In [1]:
import gensim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('SMSSpamCollection.tsv',sep='\t',header=None)

In [3]:
data.rename(columns = {0:'target',1:'text'},inplace=True)

In [4]:
data.head()

Unnamed: 0,target,text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [5]:
data['clean_text']= data['text'].apply(lambda x : gensim.utils.simple_preprocess(x))

In [6]:
data.head()

Unnamed: 0,target,text,clean_text
0,ham,I've been searching for the right words to tha...,"[ve, been, searching, for, the, right, words, ..."
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."
2,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, don, think, he, goes, to, usf, he, lives..."
3,ham,Even my brother is not like to speak with me. ...,"[even, my, brother, is, not, like, to, speak, ..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[have, date, on, sunday, with, will]"


In [7]:
data['target']=data['target'].apply(lambda x : 1 if x=='spam' else 0)

In [8]:
data.head()

Unnamed: 0,target,text,clean_text
0,0,I've been searching for the right words to tha...,"[ve, been, searching, for, the, right, words, ..."
1,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."
2,0,"Nah I don't think he goes to usf, he lives aro...","[nah, don, think, he, goes, to, usf, he, lives..."
3,0,Even my brother is not like to speak with me. ...,"[even, my, brother, is, not, like, to, speak, ..."
4,0,I HAVE A DATE ON SUNDAY WITH WILL!!,"[have, date, on, sunday, with, will]"


In [9]:
x_train,x_test,y_train,y_test=train_test_split(data['clean_text'],data['target'],test_size=.2,stratify=data['target'])


In [10]:
x_train

4891                       [convey, my, regards, to, him]
3921                         [how, izzit, still, raining]
1039     [easy, ah, sen, got, selected, means, its, good]
1744                    [think, have, the, wrong, number]
2560    [also, fine, when, will, you, complete, the, c...
                              ...                        
2763    [married, local, women, looking, for, discreet...
3463    [actually, fuck, that, just, do, whatever, do,...
3575    [they, said, dun, haf, passport, or, smth, lik...
2469    [final, chance, claim, ur, worth, of, discount...
2608    [knock, knock, txt, whose, there, to, to, ente...
Name: clean_text, Length: 4454, dtype: object

In [11]:
## Create tags for doc2vec
x_train_tag = [gensim.models.doc2vec.TaggedDocument(v,[i]) for i,v in enumerate(x_train)]
x_test_tag = [gensim.models.doc2vec.TaggedDocument(v,[i]) for i,v in enumerate(x_test)]

In [12]:
type(x_train_tag)

list

In [13]:
x_train_tag[0:4]

[TaggedDocument(words=['convey', 'my', 'regards', 'to', 'him'], tags=[0]),
 TaggedDocument(words=['how', 'izzit', 'still', 'raining'], tags=[1]),
 TaggedDocument(words=['easy', 'ah', 'sen', 'got', 'selected', 'means', 'its', 'good'], tags=[2]),
 TaggedDocument(words=['think', 'have', 'the', 'wrong', 'number'], tags=[3])]

In [53]:
## input is a tagged sentence (Tokenized)

d2v_model = gensim.models.Doc2Vec(x_train_tag,vector_size=300,window=7,min_count=2)

In [54]:
d2v_model

<gensim.models.doc2vec.Doc2Vec at 0x7fc688062e10>

In [55]:
## model generating vector of an unseen sentence

d2v_model.infer_vector(['My','name','is','Batman']) 

array([ 8.3593035e-04, -1.1307865e-03,  4.9551734e-04,  5.1871687e-04,
        1.2362729e-03, -2.5805749e-04, -2.8533326e-03,  1.8625248e-03,
       -2.7508932e-04,  4.1242549e-03,  2.4943845e-04, -3.1732905e-03,
       -1.4246483e-03,  2.1703569e-03,  1.6573903e-03,  2.0310064e-03,
       -2.5676871e-03, -2.1009329e-03,  1.2339117e-03, -1.4682449e-03,
        2.4065534e-03,  1.3484266e-03,  1.5538665e-03,  1.4943494e-03,
        4.0867561e-03, -1.9621878e-04, -1.5910048e-03,  2.2449753e-04,
       -2.6633225e-03,  6.8015826e-04, -9.6130396e-05,  3.7614398e-03,
        1.5108391e-03,  2.7931395e-03,  2.7198521e-03, -9.1674406e-04,
        1.3492510e-04, -2.1685541e-03, -1.1514629e-03,  1.0709256e-03,
        2.0055931e-04, -1.9819068e-03, -3.7105955e-04,  1.6843779e-03,
        1.2345868e-03,  2.6607409e-04, -6.5664609e-04,  1.5882093e-04,
       -2.3795965e-03,  1.2785771e-03, -4.5578472e-05,  1.2444291e-03,
       -3.2604251e-03,  2.5253324e-03, -1.7642049e-03,  2.3631849e-04,
      

In [56]:
x_train_tag[0].words

['convey', 'my', 'regards', 'to', 'him']

In [57]:
x_train_vector = [d2v_model.infer_vector(v.words) for v in x_train_tag]
x_test_vector = [d2v_model.infer_vector(v.words) for v in x_test_tag]

In [58]:
x_train_vector[0:1]

[array([-1.74489268e-03, -4.54734953e-04, -2.04657693e-03,  2.57067778e-03,
         3.31424992e-03,  1.02076493e-03, -3.71570559e-03,  5.51030599e-03,
         2.62131705e-03,  8.02996662e-03, -8.68789502e-04, -4.77889134e-03,
         8.96295009e-04,  3.81821278e-03,  7.30178086e-04,  6.62578037e-04,
        -1.00892000e-02, -6.46034535e-03, -1.81929080e-03, -1.55918405e-03,
         2.87509337e-03,  4.93583083e-03,  4.83064726e-03,  4.46677394e-03,
         7.45631196e-03, -2.61607557e-03, -2.27962784e-03,  6.40177168e-03,
        -4.37194062e-03,  9.97150782e-04, -9.31084563e-04,  1.13205090e-02,
        -5.33366925e-04,  4.96078236e-03,  5.23082307e-03, -3.76676419e-03,
        -7.25382706e-05, -5.14470972e-03, -6.39987178e-03, -1.95052405e-03,
         1.37822644e-03, -6.30762847e-03,  2.83576502e-03,  3.61302635e-04,
        -6.31209230e-04, -2.61341128e-03, -3.08233127e-03,  1.27335417e-03,
        -3.22323479e-03,  5.23877752e-05, -2.15634634e-03, -1.93642522e-03,
        -3.1

### RanndomForest Classifier

In [60]:
rf = RandomForestClassifier(n_estimators=500,max_depth=100,n_jobs=-1 )

In [61]:
rf_model = rf.fit(x_train_vector,np.ravel(y_train))

In [62]:
pred_proba = rf_model.predict_proba(x_test_vector)[:,1]

In [63]:
pred_proba

array([0.098, 0.076, 0.054, ..., 0.1  , 0.066, 0.006])

In [64]:
pred = rf_model.predict(x_test_vector)

In [65]:
roc_auc_score(y_test,pred_proba)

0.8874221928573912

In [66]:
precision_score(y_test,pred)

0.9666666666666667

In [67]:
recall_score(y_test,pred)

0.19463087248322147

In [44]:
## We can see doc2vec model is better than word2vec model(in End-to-End word2vec NLP LinkedIn Practise.ipynb) ##