In [46]:
import os
import pandas as pd
import nltk
import numpy as np
import re
from nltk import PorterStemmer
from nltk import WordNetLemmatizer 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score,precision_score,recall_score
from string import punctuation

In [2]:
os.getcwd()

'/Users/manpreetsi/My Python Jupyter Notebook'

In [None]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


## Frequency Based Models' NLP Data Prep Steps :-
### 1. Import data in a Table
### 2. Clean the data i.e. remove punctuations
### 3. Tokenize the data
### 4. Remove Stopwords
### 5. Apply Stemming or Lemmatization
### 6. Train-Test Split
### 7. Vectorize the data (Use either 7.1 or 7.2)
#### 7.1 De-Tokenize -> TFIDF Vectorizer
#### 7.2 Analyser -> TFIDF vectorizer

In [3]:
data = pd.read_csv('SMSSpamCollection.tsv',sep='\t',header=None)

In [4]:
data.shape

(5568, 2)

In [5]:
data.head()

Unnamed: 0,0,1
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [6]:
data.rename(columns={0:'target',1:'text_data'},inplace=True)

In [7]:
data.head()

Unnamed: 0,target,text_data
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


### Step 1:5

In [8]:
def data_pre_processing(text):
    text = "".join([char for char in text if char not in punctuation]); ## Remove punctuation
    text = re.split(" ",text.lower()); ## Tokenize
    text = [word for word in text if word not in stopwords.words('english')] ## Remove Stopwords
    text = [WordNetLemmatizer().lemmatize(word) for word in text] ## Lemmatize the words
    return text

In [9]:
data['processed_text_data']=data['text_data'].apply(lambda x : data_pre_processing(x))

In [10]:
data

Unnamed: 0,target,text_data,processed_text_data
0,ham,I've been searching for the right words to tha...,"[ive, searching, right, word, thank, breather,..."
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
2,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, go, usf, life, around, though]"
3,ham,Even my brother is not like to speak with me. ...,"[even, brother, like, speak, treat, like, aid,..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]"
...,...,...,...
5563,spam,This is the 2nd time we have tried 2 contact u...,"[2nd, time, tried, 2, contact, u, u, £750, pou..."
5564,ham,Will ü b going to esplanade fr home?,"[ü, b, going, esplanade, fr, home]"
5565,ham,"Pity, * was in mood for that. So...any other s...","[pity, , mood, soany, suggestion]"
5566,ham,The guy did some bitching but I acted like i'd...,"[guy, bitching, acted, like, id, interested, b..."


In [None]:
## Feature Engineering

In [None]:
## Feature 1 :- Len of the text

In [11]:
def count_len(text):
    return len(text)-text.count(" ")

In [12]:
data['len_count']=data['text_data'].apply(lambda x : count_len(x))

In [13]:
data.head()

Unnamed: 0,target,text_data,processed_text_data,len_count
0,ham,I've been searching for the right words to tha...,"[ive, searching, right, word, thank, breather,...",160
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",128
2,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, go, usf, life, around, though]",49
3,ham,Even my brother is not like to speak with me. ...,"[even, brother, like, speak, treat, like, aid,...",62
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]",28


In [None]:
## Feature 1 :- % of pucntuations

In [14]:
def percent_punct(text):
    count = sum([1 for char in text if char in punctuation])
    return round(count/(len(text)-text.count(" "))*100,2)

In [15]:
data['punct_percent']=data['text_data'].apply(lambda x : percent_punct(x))

In [16]:
data.head()

Unnamed: 0,target,text_data,processed_text_data,len_count,punct_percent
0,ham,I've been searching for the right words to tha...,"[ive, searching, right, word, thank, breather,...",160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",128,4.69
2,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, go, usf, life, around, though]",49,4.08
3,ham,Even my brother is not like to speak with me. ...,"[even, brother, like, speak, treat, like, aid,...",62,3.23
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]",28,7.14


### Step 6

In [None]:
## Split train test before TFIDF

In [17]:
x_train,x_test,y_train,y_test= train_test_split(data[['processed_text_data','len_count','punct_percent']],data['target'],test_size = .20,stratify=data['target'])



In [18]:
x_train.reset_index(drop=False,inplace=True);
x_test.reset_index(drop=False,inplace=True);
y_train.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)

In [19]:
y_train_df = pd.DataFrame(y_train,columns=['target'])
y_train_df['target']= y_train_df['target'].apply(lambda x : 1 if x=='spam' else 0)
y_train_df['target']=y_train_df['target'].astype('category')

y_test_df = pd.DataFrame(y_test,columns=['target'])
y_test_df['target']= y_test_df['target'].apply(lambda x : 1 if x=='spam' else 0)
y_test_df['target']=y_test_df['target'].astype('category')

In [20]:
type(y_train_df.loc[0,'target'])

numpy.int64

In [21]:
type(x_train)

pandas.core.frame.DataFrame

In [22]:
x_train.head()

Unnamed: 0,index,processed_text_data,len_count,punct_percent
0,1611,"[sef, dey, laugh, meanwhile, hows, darling, an...",46,6.52
1,3003,"[also, hi, wesley, howve]",25,4.0
2,2027,"[ugh, cant, u, apologize, admit, u, wrong, ask...",59,5.08
3,2689,"[urgent, urgent, 800, free, flight, europe, gi...",130,3.85
4,1264,"[seriously, tell, exact, word, right]",41,4.88


In [23]:
def de_tokenize(text):
    return " ".join(text)

In [24]:
x_train['de_tokenize']=x_train['processed_text_data'].apply(lambda x : de_tokenize(x))

x_test['de_tokenize']=x_test['processed_text_data'].apply(lambda x : de_tokenize(x))

In [199]:
x_train.head()

Unnamed: 0,index,processed_text_data,len_count,punct_percent,de_tokenize
0,1611,"[sef, dey, laugh, meanwhile, hows, darling, an...",46,6.52,sef dey laugh meanwhile hows darling anjie
1,3003,"[also, hi, wesley, howve]",25,4.0,also hi wesley howve
2,2027,"[ugh, cant, u, apologize, admit, u, wrong, ask...",59,5.08,ugh cant u apologize admit u wrong ask take u ...
3,2689,"[urgent, urgent, 800, free, flight, europe, gi...",130,3.85,urgent urgent 800 free flight europe give away...
4,1264,"[seriously, tell, exact, word, right]",41,4.88,seriously tell exact word right


## TFIDF Fit

In [181]:
tfidf = TfidfVectorizer(lowercase=True)

In [182]:
X_TfidfVector = tfidf.fit(x_train['de_tokenize']) ## Using only train data to define our vocabulary corpus !

In [183]:
X_TfidfVector

TfidfVectorizer()

### IMP NOTE :- Never use transform as we have used x_train only to define our corpus of vocabulary and not test data


In [184]:
## input is list of sentences  (Detokenized)

x_train_feature = X_TfidfVector.transform(x_train['de_tokenize']) 
x_test_feature = X_TfidfVector.transform(x_test['de_tokenize'])

In [185]:
x_train_text_df = pd.DataFrame(x_train_feature.toarray())
x_test_text_df=pd.DataFrame(x_test_feature.toarray())

In [186]:
x_train_text_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7826,7827,7828,7829,7830,7831,7832,7833,7834,7835
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [187]:
x_train_final_df = pd.concat([x_train[['len_count','punct_percent']].reset_index(drop=True),x_train_text_df],axis=1)
x_test_final_df = pd.concat([x_test[['len_count','punct_percent']].reset_index(drop=True),x_test_text_df],axis=1)

In [188]:
x_train_final_df.head()

Unnamed: 0,len_count,punct_percent,0,1,2,3,4,5,6,7,...,7826,7827,7828,7829,7830,7831,7832,7833,7834,7835
0,46,6.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,25,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,59,5.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,130,3.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,41,4.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [189]:
x_test_final_df.head()

Unnamed: 0,len_count,punct_percent,0,1,2,3,4,5,6,7,...,7826,7827,7828,7829,7830,7831,7832,7833,7834,7835
0,58,5.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,70,2.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,71,16.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,42,4.76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,65,3.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Modelling

## RandomForest Classifier

In [190]:
rf = RandomForestClassifier(n_estimators=500,max_depth=100,n_jobs=100);

In [191]:
rf_model = rf.fit(x_train_final_df,np.ravel(y_train_df))

In [192]:
pred_proba = rf_model.predict_proba(x_test_final_df)[:,1]

In [193]:
pred = rf_model.predict(x_test_final_df)

In [194]:
pred_proba

array([0.02630402, 0.01321805, 0.04880505, ..., 0.00103179, 0.00897731,
       0.00115292])

In [195]:
roc_auc_score(y_test_df,pred_proba)

0.9920923601210141

In [196]:
precision_score(y_test_df,pred)

1.0

In [197]:
recall_score(y_test_df,pred)

0.8322147651006712

In [198]:
## Top 10 RF Features

pd.DataFrame({'Feature':x_train_final_df.columns,'RF_Importance':rf_model.feature_importances_}).sort_values(by=['RF_Importance'],ascending=False).head(10)
                                                                                                            

Unnamed: 0,Feature,RF_Importance
0,len_count,0.045353
1649,1647,0.035901
7120,7118,0.025883
2990,2988,0.02475
1872,1870,0.021881
4596,4594,0.02083
6060,6058,0.019387
6535,6533,0.015009
6808,6806,0.013534
5471,5469,0.013421


## GBM Classifier

In [47]:
gbm = GradientBoostingClassifier(n_estimators=500,learning_rate=.1,max_depth=5)

In [48]:
gbm_model = gbm.fit(x_train_final_df,np.ravel(y_train_df))

In [70]:
pred_proba = gbm_model.predict_proba(x_test_final_df)[:,1]

In [71]:
pred = gbm_model.predict(x_test_final_df)

In [72]:
roc_auc_score(y_test_df,pred_proba)

0.98334666342108

In [73]:
precision_score(y_test_df,pred)

0.9558823529411765

In [74]:
recall_score(y_test_df,pred)

0.87248322147651

In [75]:
## Top 10 GBM Features

pd.DataFrame({'Feature':x_train_final_df.columns,'GBM_Importance':gbm_model.feature_importances_}).sort_values(by=['GBM_Importance'],ascending=False).head(10)


Unnamed: 0,Feature,GBM_Importance
0,len_count,0.301741
1649,1647,0.138937
7120,7118,0.087844
6808,6806,0.03219
2990,2988,0.02263
6060,6058,0.020944
6535,6533,0.020288
595,593,0.016621
1872,1870,0.016189
7550,7548,0.015051


In [163]:
x_train_final_df.loc[:,1647]

0       0.000000
1       0.000000
2       0.000000
3       0.224211
4       0.000000
          ...   
4449    0.000000
4450    0.000000
4451    0.000000
4452    0.000000
4453    0.000000
Name: 1647, Length: 4454, dtype: float64