# Part One - Digital content management text classifier

PROJECT OBJECTIVE: The need is to build a NLP classifier which can use input text parameters to determine the label/s of the blog.

## Importing the Libraries

In [70]:
import numpy as np 
import pandas as pd 
import re 
import nltk

import os

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

import warnings
warnings.filterwarnings('ignore')

## Importing data and EDA

In [2]:
dataset = pd.read_csv('blogtext.csv')
dataset.head(10)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...
5,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",I had an interesting conversation...
6,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Somehow Coca-Cola has a way of su...
7,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004","If anything, Korea is a country o..."
8,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Take a read of this news article ...
9,3581210,male,33,InvestmentBanking,Aquarius,"09,June,2004",I surf the English news sites a l...


In [3]:
## Checking for NA/NUll

print(dataset.isna().any())
print(dataset.isnull().any())

id        False
gender    False
age       False
topic     False
sign      False
date      False
text      False
dtype: bool
id        False
gender    False
age       False
topic     False
sign      False
date      False
text      False
dtype: bool


*No Null values are present in the dataset*

In [4]:
## Shape of the data

dataset.shape

(681284, 7)

There are 68,124 records and is huge to perform analysis and computation, hence we are going to take a subset and rerun with the entire data-set once all errors are fixed and optimization is done

In [5]:
partial_dataset = dataset.head(10000)

In [6]:
partial_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10000 non-null  int64 
 1   gender  10000 non-null  object
 2   age     10000 non-null  int64 
 3   topic   10000 non-null  object
 4   sign    10000 non-null  object
 5   date    10000 non-null  object
 6   text    10000 non-null  object
dtypes: int64(2), object(5)
memory usage: 547.0+ KB


In [7]:
## removing less important features
partial_dataset.drop(['id','date'], axis=1, inplace=True)

In [8]:
partial_dataset.head()

Unnamed: 0,gender,age,topic,sign,text
0,male,15,Student,Leo,"Info has been found (+/- 100 pages,..."
1,male,15,Student,Leo,These are the team members: Drewe...
2,male,15,Student,Leo,In het kader van kernfusie op aarde...
3,male,15,Student,Leo,testing!!! testing!!!
4,male,33,InvestmentBanking,Aquarius,Thanks to Yahoo!'s Toolbar I can ...


In [9]:
## converting 'age' to object type

partial_dataset['age'] = partial_dataset['age'].astype('object')

In [10]:
partial_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   gender  10000 non-null  object
 1   age     10000 non-null  object
 2   topic   10000 non-null  object
 3   sign    10000 non-null  object
 4   text    10000 non-null  object
dtypes: object(5)
memory usage: 390.8+ KB


*Converted all the columns to object data-type*

## Data Preprocessing

### Data cleaning

In [11]:
partial_dataset['clean_data']=partial_dataset['text'].apply(lambda x: re.sub(r'[^A-Za-z]+',' ',x))

In [12]:
## to lower
partial_dataset['clean_data']=partial_dataset['clean_data'].apply(lambda x: x.lower())

In [13]:
## stripping
partial_dataset['clean_data']=partial_dataset['clean_data'].apply(lambda x: x.strip())

In [14]:
print("Actual data=======> {}".format(partial_dataset['text'][1]))



In [15]:
print("Cleaned data=======> {}".format(partial_dataset['clean_data'][1]))



#### Removing stopwords

In [16]:
stopwords=set(stopwords.words('english'))

In [17]:
partial_dataset['clean_data'] = partial_dataset['clean_data'].apply(lambda x: ' '.join([words for words in x.split() if words not in stopwords]))

In [18]:
partial_dataset['clean_data'][16]

'urllink wonderful oh gyup sal favorite pork restaurant official pork outstripped beef top meat import korea see urllink joongang ilbo link wonder mad cow disease avian flu virus going around thing seems safe pork course may know oh gyup sal literally means layers fat maybe super healthy well bad hours inline skating call rollerblading seems tough time rolling korean tongue beforehand drowned lovely concoction called oh ship say joo mix baek say ju literally hundred year alcohol soju literally booze thus translation combined drink called year alcohol intersting eh'

### Target/label merger and transformation

In [19]:
partial_dataset['labels'] = partial_dataset.apply(lambda col: [col['gender'],str(col['age']),col['topic'],col['sign']], axis=1)

In [20]:
partial_dataset.head()

Unnamed: 0,gender,age,topic,sign,text,clean_data,labels
0,male,15,Student,Leo,"Info has been found (+/- 100 pages,...",info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,male,15,Student,Leo,These are the team members: Drewe...,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,male,15,Student,Leo,In het kader van kernfusie op aarde...,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,male,15,Student,Leo,testing!!! testing!!!,testing testing,"[male, 15, Student, Leo]"
4,male,33,InvestmentBanking,Aquarius,Thanks to Yahoo!'s Toolbar I can ...,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


In [21]:
cleaned_partial_dataset = partial_dataset[['clean_data', 'labels']]

In [22]:
cleaned_partial_dataset.head()

Unnamed: 0,clean_data,labels
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


### Vectorisation

Lets perform count vectorizer with bi-grams and tri-grams to get the count vectors of the X data

In [23]:
vectorizer=CountVectorizer(binary=True, ngram_range=(1,2))

In [24]:
X = cleaned_partial_dataset['clean_data']
y = cleaned_partial_dataset['labels']

In [25]:
X=vectorizer.fit_transform(X)

In [26]:
X[1]

<1x643302 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [27]:
## get some features

vectorizer.get_feature_names()[:4]

['aa', 'aa amazing', 'aa anger', 'aa compared']

In [28]:
label_counts=dict()

for labels in cleaned_partial_dataset.labels.values:
    for label in labels:
        if label in label_counts:
            label_counts[label]+=1
        else:
            label_counts[label]=1

In [29]:
label_counts

{'male': 5916,
 '15': 602,
 'Student': 1137,
 'Leo': 301,
 '33': 136,
 'InvestmentBanking': 70,
 'Aquarius': 571,
 'female': 4084,
 '14': 212,
 'indUnk': 3287,
 'Aries': 4198,
 '25': 386,
 'Capricorn': 215,
 '17': 1185,
 'Gemini': 150,
 '23': 253,
 'Non-Profit': 71,
 'Cancer': 504,
 'Banking': 16,
 '37': 33,
 'Sagittarius': 1097,
 '26': 234,
 '24': 655,
 'Scorpio': 971,
 '27': 1054,
 'Education': 270,
 '45': 16,
 'Engineering': 127,
 'Libra': 491,
 'Science': 63,
 '34': 553,
 '41': 20,
 'Communications-Media': 99,
 'BusinessServices': 91,
 'Sports-Recreation': 80,
 'Virgo': 236,
 'Taurus': 812,
 'Arts': 45,
 'Pisces': 454,
 '44': 3,
 '16': 440,
 'Internet': 118,
 'Museums-Libraries': 17,
 'Accounting': 4,
 '39': 79,
 '35': 2315,
 'Technology': 2654,
 '36': 1708,
 'Law': 11,
 '46': 7,
 'Consulting': 21,
 'Automotive': 14,
 '42': 14,
 'Religion': 9,
 '13': 42,
 'Fashion': 1622,
 '38': 46,
 '43': 6,
 'Publishing': 4,
 '40': 1,
 'Marketing': 156,
 'LawEnforcement-Security': 10,
 'HumanReso

In [30]:
## Preprocessing Labels

binarizer=MultiLabelBinarizer(classes=sorted(label_counts.keys()))

In [31]:
y = binarizer.fit_transform(cleaned_partial_dataset.labels)

In [32]:
y[1]

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1])

### Splitting data into train and test split

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## Model building and evaluation

In [50]:
model=LogisticRegression(solver='lbfgs')

In [53]:
model=OneVsRestClassifier(model)

In [54]:
model.fit(X_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None)

In [55]:
y_pred = model.predict(X_test)

In [58]:
def print_evaluation_scores(y_test, y_pred):
    print('Accuracy score: ', accuracy_score(y_test, y_pred))
    print('F1 score: ', f1_score(y_test, y_pred, average='micro'))
    print('Average precision score: ', average_precision_score(y_test, y_pred, average='micro'))
    print('Average recall score: ', recall_score(y_test, y_pred, average='micro'))

In [59]:
print_evaluation_scores(y_test, y_pred)

Accuracy score:  0.304
F1 score:  0.6351104804378673
Average precision score:  0.45302758557509915
Average recall score:  0.5221666666666667


In [72]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.06      0.12        16
           1       1.00      0.05      0.09        63
           2       0.86      0.19      0.31       192
           3       0.87      0.26      0.40       131
           4       0.69      0.23      0.35       326
           5       0.00      0.00      0.00        71
           6       0.90      0.05      0.09       183
           7       1.00      0.02      0.04       133
           8       1.00      0.03      0.05        74
           9       0.96      0.35      0.51       330
          10       1.00      0.11      0.20        44
          11       1.00      0.75      0.85       189
          12       0.75      0.66      0.70       686
          13       0.96      0.50      0.66       498
          14       0.00      0.00      0.00         9
          15       1.00      0.17      0.29        12
          16       0.00      0.00      0.00        26
          17       0.00    

### Predictions

true vs predicted labels for any 5 entries

In [80]:
y_pred_inversed = binarizer.inverse_transform(y_pred)
y_test_inversed = binarizer.inverse_transform(y_test)

In [57]:
for i in range(5):
    print('Text:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_test[i],
        ','.join(y_test_inversed[i]),
        ','.join(y_pred_inversed[i])
    ))

Text:	  (0, 20)	1
  (0, 21)	1
  (0, 4892)	1
  (0, 4965)	1
  (0, 8672)	1
  (0, 8806)	1
  (0, 13692)	1
  (0, 13792)	1
  (0, 18103)	1
  (0, 18191)	1
  (0, 55068)	1
  (0, 55085)	1
  (0, 85438)	1
  (0, 85534)	1
  (0, 102031)	1
  (0, 102260)	1
  (0, 103588)	1
  (0, 103619)	1
  (0, 117927)	1
  (0, 117995)	1
  (0, 128843)	1
  (0, 128851)	1
  (0, 131554)	1
  (0, 131724)	1
  (0, 141348)	1
  :	:
  (0, 498235)	1
  (0, 498620)	1
  (0, 513546)	1
  (0, 514356)	1
  (0, 515865)	1
  (0, 515996)	1
  (0, 552721)	1
  (0, 552939)	1
  (0, 559897)	1
  (0, 564667)	1
  (0, 564896)	1
  (0, 567161)	1
  (0, 567905)	1
  (0, 570626)	1
  (0, 570852)	1
  (0, 573316)	1
  (0, 573321)	1
  (0, 586020)	1
  (0, 586198)	1
  (0, 587562)	1
  (0, 587581)	1
  (0, 615450)	1
  (0, 615874)	1
  (0, 636616)	1
  (0, 636762)	1
True labels:	16,Cancer,indUnk,male
Predicted labels:	16,Cancer,indUnk,male


Text:	  (0, 1917)	1
  (0, 1929)	1
  (0, 2073)	1
  (0, 2089)	1
  (0, 3990)	1
  (0, 4322)	1
  (0, 8672)	1
  (0, 8901)	1
  (0, 15527)	1
  