In [1]:
import pandas as pd
import numpy as np
import glob

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity

import scipy.sparse
from scipy.sparse import csr_matrix, hstack

import matplotlib.pyplot as plt
import seaborn as sns

import sys
sys.path.append('/home/ec2-user/anaconda3/envs/JupyterSystemEnv/lib/python3.6/site-packages')
from xgboost.sklearn import XGBClassifier

  from numpy.core.umath_tests import inner1d


In [2]:
db = pd.read_csv('./sampleorig.csv')
db.describe()

Unnamed: 0.1,Unnamed: 0,id,age
count,10000.0,10000.0,10000.0
mean,188146.6728,2360750.0,24.1309
std,108352.699739,1283130.0,7.651197
min,74.0,8173.0,13.0
25%,94432.75,1103575.0,17.0
50%,188383.5,2581386.0,24.0
75%,282350.75,3542478.0,27.0
max,375944.0,4334776.0,48.0


In [3]:
db.head()

Unnamed: 0.1,Unnamed: 0,id,gender,age,topic,sign,date,text
0,74,3539003,female,14,indUnk,Aries,"07,June,2004",O= optimist P= pessimist My...
1,141,3705830,male,25,Non-Profit,Cancer,"22,June,2004",whatever it is they put into freeze pop...
2,145,3705830,male,25,Non-Profit,Cancer,"26,July,2004","&nbsp; in my head, and in response to a..."
3,220,3429420,male,15,Student,Aquarius,"23,May,2004",Friends are like ships Anchored their l...
4,226,3429420,male,15,Student,Aquarius,"21,June,2004",There have been times in my life when I...


In [4]:
db.topic.value_counts()

indUnk                     3658
Student                    2110
Technology                  665
Education                   499
Arts                        457
Communications-Media        320
Non-Profit                  228
Internet                    220
Engineering                 198
Law                         149
Government                  107
Publishing                  107
Consulting                  102
Science                      96
Marketing                    93
Accounting                   89
Fashion                      89
BusinessServices             61
Telecommunications           61
Advertising                  59
Sports-Recreation            56
Biotech                      55
Military                     55
Religion                     48
Chemicals                    46
Banking                      43
Transportation               41
Manufacturing                37
Museums-Libraries            37
LawEnforcement-Security      35
Agriculture                  25
Investme

In [5]:
db = db[['gender','text','id']]
db.head()

Unnamed: 0,gender,text,id
0,female,O= optimist P= pessimist My...,3539003
1,male,whatever it is they put into freeze pop...,3705830
2,male,"&nbsp; in my head, and in response to a...",3705830
3,male,Friends are like ships Anchored their l...,3429420
4,male,There have been times in my life when I...,3429420


In [6]:
dbm = db.groupby(['gender','id'])\
    .apply(lambda x: ' '.join(x.text))\
    .to_frame()\
    .rename(columns={0:'text'})
dbm.reset_index(inplace=True)
dbm.head()

Unnamed: 0,gender,id,text
0,female,28417,Perhaps so. But I hardly inferred ...
1,female,48923,Leigha is performing in a ur...
2,female,66895,It's been a while since anything ...
3,female,78196,Something went terribly wrong...
4,female,96600,Something that I have been thinki...


#### Analyzing the class balance

In [7]:
dbm.gender.value_counts()

female    1872
male      1840
Name: gender, dtype: int64

#### Analyzing the unique number of bloggers

In [8]:
dbm.id.nunique()

3712

#### Feature engineering

In [9]:
dbm.text = dbm.text.str.strip()
dbm['clen'] = dbm.text.str.len()
dbm['numb'] = dbm.text.str.count(r'\d')/dbm['clen']
dbm['caps'] = dbm.text.str.count(r'[A-Z]')/dbm['clen']
dbm['wordlen'] = dbm.text.apply(lambda x: len(str(x).split()))
dbm['schar'] = (dbm.text.str.count(r'\W')-dbm.text.str.count(r'\W\S'))/dbm['clen']
dbm['unique_wlen_percent']=(dbm.text.apply(lambda x: len(set(str(x).split()))))/dbm['wordlen']
dbm['clen_wlen']=dbm['clen']/dbm['wordlen']
dbm.reset_index(inplace=True)
dbm.drop(['index'], axis=1, inplace=True)
dbm.head()

Unnamed: 0,gender,id,text,clen,numb,caps,wordlen,schar,unique_wlen_percent,clen_wlen
0,female,28417,Perhaps so. But I hardly inferred that every l...,4817,0.003322,0.022628,848,0.047955,0.587264,5.680425
1,female,48923,Leigha is performing in a urlLink one-woman s...,3489,0.000287,0.007165,613,0.050731,0.539967,5.69168
2,female,66895,It's been a while since anything new came up o...,451,0.004435,0.02439,84,0.035477,0.857143,5.369048
3,female,78196,Something went terribly wrong late last night....,25627,0.001327,0.04866,4636,0.068014,0.410699,5.527826
4,female,96600,Something that I have been thinking is that we...,1950,0.006154,0.019487,355,0.047692,0.577465,5.492958


#### Analyzing the blog sentiment

In [10]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [12]:
sentiment_val = dbm.text.apply(lambda x: SentimentIntensityAnalyzer().polarity_scores(x))

pol_values=[]
for i in sentiment_val:
    pol_val = list(i.values())
    pol_values.append(pol_val)

sent = pd.DataFrame(pol_values)
sent.rename(columns = {0:'neg',1:'neu',2:'pos',3:'compound'}, inplace=True)

dbm = pd.merge(dbm,sent,how='inner',left_index=True,right_index=True)
dbm.head()

Unnamed: 0,gender,id,text,clen,numb,caps,wordlen,schar,unique_wlen_percent,clen_wlen,neg,neu,pos,compound
0,female,28417,Perhaps so. But I hardly inferred that every l...,4817,0.003322,0.022628,848,0.047955,0.587264,5.680425,0.103,0.747,0.15,0.994
1,female,48923,Leigha is performing in a urlLink one-woman s...,3489,0.000287,0.007165,613,0.050731,0.539967,5.69168,0.042,0.84,0.118,0.9958
2,female,66895,It's been a while since anything new came up o...,451,0.004435,0.02439,84,0.035477,0.857143,5.369048,0.0,0.909,0.091,0.743
3,female,78196,Something went terribly wrong late last night....,25627,0.001327,0.04866,4636,0.068014,0.410699,5.527826,0.176,0.699,0.125,-0.9999
4,female,96600,Something that I have been thinking is that we...,1950,0.006154,0.019487,355,0.047692,0.577465,5.492958,0.022,0.858,0.12,0.9742


In [13]:
#Cleaning
print(len(dbm))
dbm.text = dbm.text.str.lower()
dbm.text = dbm.text.str.replace('[^a-zA-Z]'," ")
dbm.text = dbm.text.str.replace('urllink'," ")
dbm.text = dbm.text.str.replace('nbsp'," ")
dbm.text = dbm.text.str.replace(r'\n',' ')
dbm.text = dbm.text.str.replace(r'\s+',' ')
dbm.text = dbm.text.str.replace('([ ]{2,})',' ')
dbm = dbm[~dbm.text.str.match(r'^\s*$')]
dbm = dbm[dbm.text.apply(lambda x:len(x))>10]
print(len(dbm))
dbm.head()

3712
3712


Unnamed: 0,gender,id,text,clen,numb,caps,wordlen,schar,unique_wlen_percent,clen_wlen,neg,neu,pos,compound
0,female,28417,perhaps so but i hardly inferred that every la...,4817,0.003322,0.022628,848,0.047955,0.587264,5.680425,0.103,0.747,0.15,0.994
1,female,48923,leigha is performing in a one woman show right...,3489,0.000287,0.007165,613,0.050731,0.539967,5.69168,0.042,0.84,0.118,0.9958
2,female,66895,it s been a while since anything new came up o...,451,0.004435,0.02439,84,0.035477,0.857143,5.369048,0.0,0.909,0.091,0.743
3,female,78196,something went terribly wrong late last night ...,25627,0.001327,0.04866,4636,0.068014,0.410699,5.527826,0.176,0.699,0.125,-0.9999
4,female,96600,something that i have been thinking is that we...,1950,0.006154,0.019487,355,0.047692,0.577465,5.492958,0.022,0.858,0.12,0.9742


### Modeling

In [14]:
def add_feature(X, feature_to_add):
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

In [15]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
dbm['gender'] = labelencoder_X_1.fit_transform(dbm.loc[:,'gender'])

In [16]:
dbm.head()

Unnamed: 0,gender,id,text,clen,numb,caps,wordlen,schar,unique_wlen_percent,clen_wlen,neg,neu,pos,compound
0,0,28417,perhaps so but i hardly inferred that every la...,4817,0.003322,0.022628,848,0.047955,0.587264,5.680425,0.103,0.747,0.15,0.994
1,0,48923,leigha is performing in a one woman show right...,3489,0.000287,0.007165,613,0.050731,0.539967,5.69168,0.042,0.84,0.118,0.9958
2,0,66895,it s been a while since anything new came up o...,451,0.004435,0.02439,84,0.035477,0.857143,5.369048,0.0,0.909,0.091,0.743
3,0,78196,something went terribly wrong late last night ...,25627,0.001327,0.04866,4636,0.068014,0.410699,5.527826,0.176,0.699,0.125,-0.9999
4,0,96600,something that i have been thinking is that we...,1950,0.006154,0.019487,355,0.047692,0.577465,5.492958,0.022,0.858,0.12,0.9742


#### Splitting data into train and test

In [17]:
dbm2 = dbm.drop(['gender','id'],axis=1)

X_train, X_test, y_train, y_test = train_test_split(dbm2, 
                                                    dbm['gender'], 
                                                    random_state=0, test_size=0.25)
print(len(X_train))
X_train.columns

2784


Index(['text', 'clen', 'numb', 'caps', 'wordlen', 'schar',
       'unique_wlen_percent', 'clen_wlen', 'neg', 'neu', 'pos', 'compound'],
      dtype='object')

#### Word n-gram model

In [18]:
vect = TfidfVectorizer(max_df=0.5,min_df=0.005,lowercase=True,stop_words='english',norm='l1',ngram_range=(1,2),max_features=3000).fit(X_train['text'])
X_train_word = vect.transform(X_train['text'])
X_test_word = vect.transform(X_test['text'])

In [19]:
rf = RandomForestClassifier(oob_score='True')
params = {'max_depth': [100], 'n_estimators':[1000]}
gs_w = GridSearchCV(estimator=rf,scoring='accuracy',
                  param_grid=params,
                  n_jobs=-1,
                  cv=2,
                  verbose=5)

gs_w.fit(X_train_word, y_train)

best_parameters = gs_w.best_params_  
print("best parameters are",best_parameters) 

best_result = gs_w.best_score_  
print("Best Score is",best_result)

predictions_rf_w = gs_w.best_estimator_.predict(X_test_word)
auc = roc_auc_score(y_test, predictions_rf_w)
auc

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] max_depth=100, n_estimators=1000 ................................
[CV] max_depth=100, n_estimators=1000 ................................
[CV]  max_depth=100, n_estimators=1000, score=0.6353194544149318, total=  12.4s
[CV]  max_depth=100, n_estimators=1000, score=0.6513299784327822, total=  12.5s


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   13.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   13.4s finished


best parameters are {'max_depth': 100, 'n_estimators': 1000}
Best Score is 0.6433189655172413


0.667255216693419

#### Character n-gram model

In [20]:
vect_c = TfidfVectorizer(max_df=0.5,min_df=0.005,lowercase=True,stop_words='english',norm='l1',ngram_range=(2,6),analyzer='char',max_features=3000).fit(X_train['text'])
X_train_char = vect_c.transform(X_train['text'])
X_test_char = vect_c.transform(X_test['text'])

In [21]:
rf = RandomForestClassifier(oob_score='True')
params = {'max_depth': [100], 'n_estimators':[1000]}
gs_w = GridSearchCV(estimator=rf,scoring='accuracy',
                  param_grid=params,
                  n_jobs=-1,
                  cv=2,
                  verbose=5)

gs_w.fit(X_train_char, y_train)

best_parameters = gs_w.best_params_  
print("best parameters are",best_parameters) 

best_result = gs_w.best_score_  
print("Best Score is",best_result)

predictions_rf = gs_w.best_estimator_.predict(X_test_char)
auc = roc_auc_score(y_test, predictions_rf)
auc

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] max_depth=100, n_estimators=1000 ................................
[CV] max_depth=100, n_estimators=1000 ................................
[CV]  max_depth=100, n_estimators=1000, score=0.6274228284278536, total=  43.8s
[CV]  max_depth=100, n_estimators=1000, score=0.6376707404744788, total=  45.7s


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   49.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   49.6s finished


best parameters are {'max_depth': 100, 'n_estimators': 1000}
Best Score is 0.6325431034482759


0.6319957196361691

In [None]:
X_train_comb = hstack([X_train_word,X_train_char])
X_test_comb = hstack([X_test_word,X_test_char])

for i in X_train.columns:
    if i != 'text' and i != 'username':
        X_train_comb = add_feature(X_train_comb,X_train[i])

for i in X_test.columns:
    if i != 'text' and i != 'username':
        X_test_comb = add_feature(X_test_comb,X_test[i])

#### Performing topic modelling - Latent Dirichlet Allocation

In [23]:
print(X_train_comb.shape)

print(X_train_comb.shape)

print(X_test_comb.shape)

print(X_test_comb.shape)        

(2784, 6000)
(2784, 6011)
(928, 6000)
(928, 6011)


In [24]:
lda = LatentDirichletAllocation(n_components=5, max_iter=50,learning_method='online', random_state=0,)
lda.fit(X_train_word)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=50, mean_change_tol=0.001,
             n_components=5, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [25]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = f"Topic #{topic_idx}: "
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [26]:
tf_feature_names = vect.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words=10)

Topic #0: male jewish lounge rick greg detroit dean danger dangerous leather
Topic #1: jewish lounge rick greg detroit dean danger dangerous leather pregnant
Topic #2: originally katie confused flowers game life cat stupid note right just
Topic #3: awake jewish lounge rick greg detroit dean danger dangerous leather
Topic #4: know time really think love don good today day blog



In [27]:
tr_topics = lda.transform(X_train_word)
tt_topics = lda.transform(X_test_word)
tr1 = pd.DataFrame(tr_topics)

tr1.mean(axis=0)

0    0.100653
1    0.100464
2    0.101250
3    0.100657
4    0.596976
dtype: float64

#### We can observe that the topic modelling does not lead to well differentiated topics. And, topic-4 is highly abundant.

In [28]:
X_train_comb2 = hstack([X_train_comb,tr_topics])
X_test_comb2 = hstack([X_test_comb,tt_topics])

X_test_comb2.shape

(2784, 6011)
(2784, 6016)


(928, 6016)

In [29]:
rf = RandomForestClassifier(oob_score='True')
params = {'max_depth': [100], 'n_estimators':[500]}
gs_w = GridSearchCV(estimator=rf,scoring='accuracy',
                  param_grid=params,
                  n_jobs=-1,
                  cv=2,
                  verbose=5)

gs_w.fit(X_train_comb2, y_train)

best_parameters = gs_w.best_params_  
print("best parameters are",best_parameters) 

best_result = gs_w.best_score_  
print("Best Score is",best_result)

predictions_rff = gs_w.best_estimator_.predict(X_test_comb2)
auc = roc_auc_score(y_test, predictions_rff)
auc

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] max_depth=100, n_estimators=500 .................................
[CV] max_depth=100, n_estimators=500 .................................
[CV]  max_depth=100, n_estimators=500, score=0.6209619526202441, total=  22.1s
[CV]  max_depth=100, n_estimators=500, score=0.6376707404744788, total=  22.6s


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   25.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   25.3s finished


best parameters are {'max_depth': 100, 'n_estimators': 500}
Best Score is 0.6293103448275862


0.6357829111126619

In [30]:
xgb = XGBClassifier(n_jobs=-1, n_estimators=300, silent=0,max_depth=5)
xgb.fit(X_train_comb2, y_train)

predictions_xgb = xgb.predict(X_test_comb2)
auc = roc_auc_score(y_test, predictions_xgb)
auc

[22:25:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[22:25:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[22:25:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[22:25:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 0 pruned nodes, max_depth=5
[22:25:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=5
[22:25:54] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=5
[22:25:54] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=5
[22:25:54] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[22:25:54] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_

[22:26:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 0 pruned nodes, max_depth=5
[22:26:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=5
[22:26:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=5
[22:26:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=5
[22:26:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=5
[22:26:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 26 extra nodes, 0 pruned nodes, max_depth=5
[22:26:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 24 extra nodes, 0 pruned nodes, max_depth=5
[22:26:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 0 pruned nodes, max_depth=5
[22:26:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 18 extra nodes, 0 pruned nodes, max_

[22:26:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=5
[22:26:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 0 pruned nodes, max_depth=5
[22:26:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=5
[22:26:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=5
[22:26:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 40 extra nodes, 0 pruned nodes, max_depth=5
[22:26:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=5
[22:26:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5
[22:26:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=5
[22:26:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 24 extra nodes, 0 pruned nodes, max_

[22:26:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 20 extra nodes, 0 pruned nodes, max_depth=5
[22:26:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=5
[22:26:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=5
[22:26:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=5
[22:26:49] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[22:26:49] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 20 extra nodes, 0 pruned nodes, max_depth=5
[22:26:49] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[22:26:49] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[22:26:50] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_

[22:27:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=5
[22:27:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 40 extra nodes, 0 pruned nodes, max_depth=5
[22:27:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[22:27:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5


  if diff:


0.6458952706632237

#### We get the best accuracy of 66.73% from word ngram based Random Forest model