### Importing the required modules/packages

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import re
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

### Loading file and looking into the dimensions of data

In [2]:
raw_data = pd.read_csv("C:\\MSA\\Spring_Semester\\7152\\nlp\\SMSSpamCollection.tsv",sep='\t',names=['label','text'])
pd.set_option('display.max_colwidth',100)
raw_data.head()

#print(raw_data.shape)
#print(pd.crosstab(raw_data['label'],columns = 'label'))
#pd.crosstab(raw_data['label'],columns = 'label',normalize=True)
#raw_data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [23]:
print(raw_data.shape)
#print(pd.crosstab(raw_data['label'],columns = 'label'))
pd.crosstab(raw_data['label'],columns = 'label',normalize=True)
#raw_data.head()

(5572, 4)


col_0,label
label,Unnamed: 1_level_1
ham,0.865937
spam,0.134063


### Creating new features and cleaning function

In [3]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

def punct_pc(text):
    punct_count = sum([1 for char in text if char in string.punctuation])
    return (punct_count/(len(text) - text.count(' ')))*100

raw_data['text_length'] = raw_data['text'].apply(lambda x : len(x)-x.count(' '))
raw_data['Punct_pc'] = raw_data['text'].apply(lambda x : punct_pc(x))

def clean_data(text):
    punct = "".join([word.lower() for word in text if word not in string.punctuation])
    splt = re.split('\W+',punct)
    txt = [ps.stem(word) for word in splt if word not in stopwords]
    return txt

### Creating test train splits 

In [4]:
X_train,X_test,Y_train,Y_test = train_test_split(raw_data[['text','text_length','Punct_pc']],raw_data['label'],test_size=0.2,random_state=123)

In [33]:
print(pd.crosstab(Y_train,columns = 'label',normalize=True))
print(pd.crosstab(Y_test,columns = 'label',normalize=True))
X_train.head()

col_0     label
label          
ham    0.866726
spam   0.133274
col_0    label
label         
ham    0.86278
spam   0.13722


Unnamed: 0,text,text_length,Punct_pc
385,"Double mins and txts 4 6months FREE Bluetooth on Orange. Available on Sony, Nokia Motorola phone...",128,3.125
4003,Did you get any gift? This year i didnt get anything. So bad,48,4.166667
1283,"Ever green quote ever told by Jerry in cartoon ""A Person Who Irritates u Always Is the one Who L...",128,11.71875
2327,The Xmas story is peace.. The Xmas msg is love.. The Xmas miracle is jesus.. Hav a blessed month...,105,10.47619
1103,Black shirt n blue jeans... I thk i c ü...,33,18.181818


### Vectorizing using TFIDF

In [7]:
Tfidf_Vect = TfidfVectorizer(analyzer=clean_data)
Tfidf_vect_fit = Tfidf_Vect.fit(X_train['text'])

X_train_Tfidf_vect = Tfidf_vect_fit.transform(X_train['text'])
X_test_Tfidf_vect = Tfidf_vect_fit.transform(X_test['text'])

X_train_vect = pd.concat([X_train[['text_length','Punct_pc']].reset_index(drop=True) ,
                         pd.DataFrame(X_train_Tfidf_vect.toarray())],axis=1)


X_test_vect = pd.concat([X_test[['text_length','Punct_pc']].reset_index(drop=True) , 
                        pd.DataFrame(X_test_Tfidf_vect.toarray())],axis=1)
X_train_vect.head()


Unnamed: 0,text_length,Punct_pc,0,1,2,3,4,5,6,7,...,7075,7076,7077,7078,7079,7080,7081,7082,7083,7084
0,128,3.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,48,4.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,128,11.71875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,105,10.47619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,33,18.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.252939,0.0


In [30]:
X_test_vect.head()

Unnamed: 0,text_length,Punct_pc,0,1,2,3,4,5,6,7,...,7075,7076,7077,7078,7079,7080,7081,7082,7083,7084
0,73,4.109589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,65,4.615385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,55,3.636364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,23,4.347826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,124,7.258065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### CountVectorizer

In [8]:
Count_Vect = CountVectorizer(analyzer=clean_data)
Count_vect_fit = Count_Vect.fit(X_train['text'])

X_train_Count_vect = Count_vect_fit.transform(X_train['text'])
X_test_Count_vect = Count_vect_fit.transform(X_test['text'])

X_train_Count_vect = pd.concat([X_train[['text_length','Punct_pc']].reset_index(drop=True) ,
                         pd.DataFrame(X_train_Count_vect.toarray())],axis=1)


X_test_Count_vect = pd.concat([X_test[['text_length','Punct_pc']].reset_index(drop=True) , 
                        pd.DataFrame(X_test_Count_vect.toarray())],axis=1)
X_train_Count_vect.head()

Unnamed: 0,text_length,Punct_pc,0,1,2,3,4,5,6,7,...,7075,7076,7077,7078,7079,7080,7081,7082,7083,7084
0,128,3.125,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,48,4.166667,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,128,11.71875,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,105,10.47619,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,33,18.181818,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### RF - Tuning the hyperparameters for best model with TFIDF vector

In [9]:
rf = RandomForestClassifier(random_state=123,n_jobs=3)
param = {'n_estimators' : [10,25,50,100,300], 'max_depth' : [10, 20, 50,100, None],'max_features' : [10,50,'auto']}

In [10]:
grid = GridSearchCV(rf,param,cv=5,n_jobs=3)

rf_grid_fit_1 = grid.fit(X_train_vect, Y_train)
pd.DataFrame(rf_grid_fit_1.cv_results_).sort_values('mean_test_score',ascending=False)[0:10]

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_max_depth,param_max_features,param_n_estimators,params,rank_test_score,split0_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
74,26.992298,0.621765,0.976442,1.0,,auto,300,"{'max_depth': None, 'max_features': 'auto', 'n_estimators': 300}",1,0.982063,...,0.970852,1.0,0.968575,1.0,0.980899,1.0,4.335585,0.289018,0.005586,0.0
59,28.310359,0.702857,0.976442,0.99972,100.0,auto,300,"{'max_depth': 100, 'max_features': 'auto', 'n_estimators': 300}",1,0.980942,...,0.969731,0.999719,0.970819,1.0,0.979775,0.999439,1.839396,0.269796,0.005065,0.000177
57,5.898225,0.381109,0.976217,0.99972,100.0,auto,50,"{'max_depth': 100, 'max_features': 'auto', 'n_estimators': 50}",3,0.982063,...,0.965247,0.999719,0.975309,0.99972,0.979775,1.0,0.399882,0.035476,0.005904,0.000177
58,10.445192,0.538609,0.975768,0.999663,100.0,auto,100,"{'max_depth': 100, 'max_features': 'auto', 'n_estimators': 100}",4,0.982063,...,0.966368,0.999719,0.969697,0.99972,0.980899,0.999439,0.336311,0.137234,0.006444,0.000112
72,6.218115,0.366572,0.974422,1.0,,auto,50,"{'max_depth': None, 'max_features': 'auto', 'n_estimators': 50}",5,0.983184,...,0.963004,1.0,0.968575,1.0,0.979775,1.0,0.582722,0.199845,0.007484,0.0
52,4.650476,0.455022,0.974422,0.998373,100.0,50,50,"{'max_depth': 100, 'max_features': 50, 'n_estimators': 50}",5,0.980942,...,0.966368,0.998597,0.968575,0.998598,0.977528,0.998038,0.332224,0.114794,0.005824,0.000275
73,10.627047,0.598689,0.974422,1.0,,auto,100,"{'max_depth': None, 'max_features': 'auto', 'n_estimators': 100}",5,0.982063,...,0.965247,1.0,0.968575,1.0,0.979775,1.0,0.230061,0.204467,0.006475,0.0
56,3.743849,0.58261,0.9733,0.999046,100.0,auto,25,"{'max_depth': 100, 'max_features': 'auto', 'n_estimators': 25}",8,0.979821,...,0.960762,0.999719,0.973064,0.998037,0.978652,0.999159,0.259261,0.092646,0.006773,0.000577
71,4.008862,0.432861,0.973076,0.99972,,auto,25,"{'max_depth': None, 'max_features': 'auto', 'n_estimators': 25}",9,0.979821,...,0.963004,1.0,0.967452,0.999159,0.978652,0.99972,0.300188,0.250203,0.00665,0.000307
51,3.608676,0.375041,0.972852,0.99742,100.0,50,25,"{'max_depth': 100, 'max_features': 50, 'n_estimators': 25}",10,0.982063,...,0.965247,0.998036,0.967452,0.996354,0.976404,0.996916,0.279051,0.173165,0.006076,0.000718


In [15]:
RF_results_TFIDF = pd.DataFrame(rf_grid_fit_1.cv_results_).sort_values('mean_test_score',ascending=False)
RF_results_TFIDF.to_csv("C:\\MSA\\Spring_Semester\\7152\\nlp\\RF_Results_TFIDF.csv",header=True)

### RF Tuning for CountVectorizer

In [11]:
rf = RandomForestClassifier(random_state=123,n_jobs=3)
param = {'n_estimators' : [10,25,50,100,300], 'max_depth' : [10, 20, 50,100, None],'max_features' : [10,50,'auto']}

grid = GridSearchCV(rf,param,cv=5,n_jobs=3)

rf_grid_fit_2 = grid.fit(X_train_Count_vect, Y_train)
pd.DataFrame(rf_grid_fit_2.cv_results_).sort_values('mean_test_score',ascending=False)[0:10]

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_max_depth,param_max_features,param_n_estimators,params,rank_test_score,split0_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
58,10.187043,0.64841,0.974871,0.999439,100.0,auto,100,"{'max_depth': 100, 'max_features': 'auto', 'n_estimators': 100}",1,0.982063,...,0.965247,0.999719,0.970819,0.999439,0.980899,0.999159,0.262687,0.270129,0.006283,0.0002507858
57,6.785375,0.390667,0.974871,0.999495,100.0,auto,50,"{'max_depth': 100, 'max_features': 'auto', 'n_estimators': 50}",1,0.983184,...,0.965247,0.999719,0.970819,0.999159,0.978652,0.99972,1.211698,0.282491,0.006246,0.0002748055
73,10.81799,0.481357,0.974647,0.999944,,auto,100,"{'max_depth': None, 'max_features': 'auto', 'n_estimators': 100}",3,0.982063,...,0.964126,1.0,0.970819,0.99972,0.979775,1.0,0.309986,0.131527,0.006481,0.0001121705
59,28.217441,0.599345,0.973974,0.99972,100.0,auto,300,"{'max_depth': 100, 'max_features': 'auto', 'n_estimators': 300}",4,0.982063,...,0.964126,0.999719,0.96633,0.99972,0.979775,0.99972,2.200441,0.163023,0.007315,6.291332e-08
56,4.609863,0.356288,0.973749,0.998822,100.0,auto,25,"{'max_depth': 100, 'max_features': 'auto', 'n_estimators': 25}",5,0.980942,...,0.966368,0.999719,0.969697,0.998878,0.977528,0.998318,1.055001,0.164012,0.005238,0.0004824089
74,27.118944,0.589048,0.973076,1.0,,auto,300,"{'max_depth': None, 'max_features': 'auto', 'n_estimators': 300}",6,0.980942,...,0.961883,1.0,0.967452,1.0,0.978652,1.0,4.080222,0.312986,0.00723,0.0
72,6.004991,0.357068,0.972403,0.999944,,auto,50,"{'max_depth': None, 'max_features': 'auto', 'n_estimators': 50}",7,0.9787,...,0.964126,1.0,0.968575,0.99972,0.976404,1.0,0.452314,0.117556,0.005332,0.0001121705
69,23.628693,0.606407,0.971954,1.0,,50,300,"{'max_depth': None, 'max_features': 50, 'n_estimators': 300}",8,0.982063,...,0.959641,1.0,0.965208,1.0,0.977528,1.0,1.467595,0.177319,0.00827,0.0
66,3.67539,0.550488,0.971954,0.999383,,50,25,"{'max_depth': None, 'max_features': 50, 'n_estimators': 25}",8,0.979821,...,0.964126,0.999719,0.968575,0.999159,0.973034,0.998598,0.152918,0.314125,0.005309,0.000482321
71,4.098917,0.419298,0.97173,0.99972,,auto,25,"{'max_depth': None, 'max_features': 'auto', 'n_estimators': 25}",10,0.979821,...,0.960762,1.0,0.967452,0.999159,0.975281,0.99972,0.481341,0.174017,0.006779,0.0003071916


In [16]:
RF_results_CountVector = pd.DataFrame(rf_grid_fit_2.cv_results_).sort_values('mean_test_score',ascending=False)
RF_results_CountVector.to_csv("C:\\MSA\\Spring_Semester\\7152\\nlp\\RF_Results_CountVector.csv",header=True)

### Gradient Boosting parameter tuning

In [13]:
gb = GradientBoostingClassifier(random_state=123)
param = {'n_estimators' : [100,150,250], 'learning_rate' :[0.01,0.1],'max_depth':[3,10],'max_features': [300,'auto',None]}

In [14]:
gb_grid = GridSearchCV(gb,param,cv=5,n_jobs=-1)

gb_grid_fit = gb_grid.fit(X_train_vect, Y_train)
pd.DataFrame(gb_grid_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:10]

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_learning_rate,param_max_depth,param_max_features,param_n_estimators,params,rank_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
29,39.994312,0.237416,0.974647,1.0,0.1,10,300,250,"{'learning_rate': 0.1, 'max_depth': 10, 'max_features': 300, 'n_estimators': 250}",1,...,0.963004,1.0,0.969697,1.0,0.980899,1.0,0.67801,0.023927,0.00725,0.0
28,26.803623,0.212121,0.974198,1.0,0.1,10,300,150,"{'learning_rate': 0.1, 'max_depth': 10, 'max_features': 300, 'n_estimators': 150}",2,...,0.961883,1.0,0.968575,1.0,0.979775,1.0,0.482787,0.020428,0.007702,0.0
27,20.117713,0.186901,0.973525,1.0,0.1,10,300,100,"{'learning_rate': 0.1, 'max_depth': 10, 'max_features': 300, 'n_estimators': 100}",3,...,0.961883,1.0,0.96633,1.0,0.979775,1.0,0.342364,0.008686,0.007851,0.0
20,11.104559,0.182823,0.971057,0.99972,0.1,3,300,250,"{'learning_rate': 0.1, 'max_depth': 3, 'max_features': 300, 'n_estimators': 250}",4,...,0.957399,1.0,0.967452,0.99972,0.983146,0.99972,0.126442,0.015359,0.009329,0.000177
35,485.688527,0.195584,0.968813,1.0,0.1,10,,250,"{'learning_rate': 0.1, 'max_depth': 10, 'max_features': None, 'n_estimators': 250}",5,...,0.955157,1.0,0.961841,1.0,0.975281,1.0,75.116782,0.026989,0.008695,0.0
32,542.93009,0.246132,0.968813,1.0,0.1,10,auto,250,"{'learning_rate': 0.1, 'max_depth': 10, 'max_features': 'auto', 'n_estimators': 250}",5,...,0.955157,1.0,0.961841,1.0,0.975281,1.0,3.636683,0.037661,0.008695,0.0
19,7.969673,0.166501,0.967691,0.984182,0.1,3,300,150,"{'learning_rate': 0.1, 'max_depth': 3, 'max_features': 300, 'n_estimators': 150}",7,...,0.956278,0.987658,0.962963,0.986259,0.974157,0.980656,0.180202,0.005557,0.007968,0.002491
34,336.191472,0.221054,0.965672,1.0,0.1,10,,150,"{'learning_rate': 0.1, 'max_depth': 10, 'max_features': None, 'n_estimators': 150}",8,...,0.955157,1.0,0.958474,1.0,0.970787,1.0,3.510692,0.025745,0.007458,0.0
31,336.762781,0.207033,0.965672,1.0,0.1,10,auto,150,"{'learning_rate': 0.1, 'max_depth': 10, 'max_features': 'auto', 'n_estimators': 150}",8,...,0.955157,1.0,0.958474,1.0,0.970787,1.0,2.161775,0.009401,0.007458,0.0
26,165.633316,0.192965,0.965448,0.999832,0.1,3,,250,"{'learning_rate': 0.1, 'max_depth': 3, 'max_features': None, 'n_estimators': 250}",10,...,0.951794,0.999719,0.965208,1.0,0.976404,0.99972,0.869293,0.012998,0.007956,0.000137


In [17]:
GB_results = pd.DataFrame(gb_grid_fit.cv_results_).sort_values('mean_test_score',ascending=False)
GB_results.to_csv("C:\\MSA\\Spring_Semester\\7152\\nlp\\GB_Results.csv",header=True)

### Extreme Gradient Boosting tuning

In [12]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

print(XGBClassifier())
xgb1 = XGBClassifier(random_state=123,silent=False,n_jobs=4)
param = {'learning_rate':[0.01,0.1],'max_depth':[3,15],'n_estimators':[100,200],
         'colsample_bytree':[0.5,1]}
#'subsample':[0.5,1]
xgb_grid = GridSearchCV(xgb1,param,cv=5,n_jobs=4)

xgb_grid_fit = xgb_grid.fit(X_train_vect, Y_train)
pd.DataFrame(xgb_grid_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:10]


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,params,rank_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
6,177.456918,0.749632,0.974422,0.994223,0.5,0.1,15,100,"{'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 100}",1,...,0.967489,0.995231,0.969697,0.995233,0.982022,0.994113,19.904182,0.077554,0.005412,0.00088
14,348.700755,0.904826,0.973974,0.995569,1.0,0.1,15,100,"{'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 100}",2,...,0.963004,0.996073,0.968575,0.996074,0.980899,0.995234,39.874489,0.194613,0.006959,0.000449
15,620.893216,0.704965,0.972852,0.998373,1.0,0.1,15,200,"{'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 200}",3,...,0.963004,0.998597,0.967452,0.998317,0.977528,0.998598,188.441804,0.166772,0.006475,0.00021
7,362.568149,0.751972,0.972627,0.997644,0.5,0.1,15,200,"{'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 200}",4,...,0.966368,0.997756,0.967452,0.998317,0.980899,0.997477,30.102964,0.105621,0.005466,0.000456
5,141.659736,0.749317,0.971954,0.984126,0.5,0.1,3,200,"{'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}",5,...,0.963004,0.985975,0.967452,0.984857,0.976404,0.984301,36.022152,0.096564,0.005845,0.001275
13,234.655857,0.816191,0.971281,0.982556,1.0,0.1,3,200,"{'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}",6,...,0.961883,0.984853,0.970819,0.982894,0.970787,0.982058,18.387455,0.111131,0.005385,0.001445
3,374.056465,0.705983,0.967018,0.983173,0.5,0.01,15,200,"{'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 15, 'n_estimators': 200}",7,...,0.952915,0.985975,0.960718,0.983735,0.975281,0.982618,50.774477,0.075085,0.008755,0.001586
2,178.342406,0.701015,0.963204,0.978405,0.5,0.01,15,100,"{'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 15, 'n_estimators': 100}",8,...,0.951794,0.980645,0.957351,0.980931,0.969663,0.978694,15.606337,0.027351,0.00731,0.00221
4,85.75233,0.664009,0.962531,0.976161,0.5,0.1,3,100,"{'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}",9,...,0.950673,0.980084,0.952862,0.978127,0.967416,0.975049,12.471069,0.072705,0.009156,0.002564
12,147.069318,0.829275,0.960287,0.973356,1.0,0.1,3,100,"{'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}",10,...,0.947309,0.976718,0.950617,0.972518,0.967416,0.974769,22.306959,0.126858,0.009477,0.002115


In [18]:
XGB_results = pd.DataFrame(xgb_grid_fit.cv_results_).sort_values('mean_test_score',ascending=False)
XGB_results.to_csv("C:\\MSA\\Spring_Semester\\7152\\nlp\\XGB_Results.csv",header=True)

### Random Forest with Best Tuning Parameters for TFIDF vectorized data

In [38]:
import time

rf_final_1 = RandomForestClassifier(n_estimators = 50, max_depth = 100, max_features='auto',n_jobs=-1,random_state=123)

start = time.time()
rf_model_1 = rf_final_1.fit(X_train_vect, Y_train)
end = time.time()
fit_time = end - start

start = time.time()
Y_pred = rf_model_1.predict(X_test_vect)
end = time.time()
predict_time = end-start

precision, recall, fscore, train_support = score(Y_test, Y_pred, pos_label='spam', average='binary')
print('Fit_time : {} / Predict_time : {} / Precision: {} / Recall: {} / Accuracy: {}'.format(round(fit_time,3),round(predict_time,3),
    round(precision, 3), round(recall, 3), round((Y_pred==Y_test).sum()/len(Y_pred), 3)))



Fit_time : 3.572 / Predict_time : 0.224 / Precision: 1.0 / Recall: 0.863 / Accuracy: 0.981
[ 0.06426574  0.01079169  0.00062025 ...,  0.          0.00029655  0.        ]


In [39]:
sorted(zip(rf_model_1.feature_importances_ ,X_train_vect.columns),reverse=True)[0:5]

[(0.064265735980574709, 'text_length'),
 (0.027785497274975954, 1570),
 (0.027031553955556422, 6445),
 (0.022934517772735242, 2735),
 (0.022415369311974347, 4211)]

### Another RF with similar results and restricted hyperparameters for TFIDF vectorized data

In [13]:
rf_final_2 = RandomForestClassifier(n_estimators = 10, max_depth = 100, max_features='auto',n_jobs=-1,random_state=123)

start = time.time()
rf_model_2 = rf_final_2.fit(X_train_vect, Y_train)
end = time.time()
fit_time = end - start

start = time.time()
Y_pred = rf_model_2.predict(X_test_vect)
end = time.time()
predict_time = end-start

precision, recall, fscore, train_support = score(Y_test, Y_pred, pos_label='spam', average='binary')
print('Fit_time : {} / Predict_time : {} / Precision: {} / Recall: {} / Accuracy: {}'.format(round(fit_time,3),round(predict_time,3),
    round(precision, 3), round(recall, 3), round((Y_pred==Y_test).sum()/len(Y_pred), 3)))

Fit_time : 0.922 / Predict_time : 0.188 / Precision: 1.0 / Recall: 0.824 / Accuracy: 0.976


### RF with best tuned parameters for Count vectorized

In [35]:
rf_final_3 = RandomForestClassifier(n_estimators = 50, max_depth = 100, max_features='auto' , n_jobs=-1,random_state=123)

start = time.time()
rf_model_3 = rf_final_3.fit(X_train_Count_vect, Y_train)
end = time.time()
fit_time = end - start

start = time.time()
Y_pred = rf_model_3.predict(X_test_Count_vect)
end = time.time()
predict_time = end-start

precision, recall, fscore, train_support = score(Y_test, Y_pred, pos_label='spam', average='binary')
print('Fit_time : {} / Predict_time : {} / Precision: {} / Recall: {} / Accuracy: {}'.format(round(fit_time,3),round(predict_time,3),
    round(precision, 3), round(recall, 3), round((Y_pred==Y_test).sum()/len(Y_pred), 3)))

Fit_time : 3.86 / Predict_time : 0.321 / Precision: 1.0 / Recall: 0.817 / Accuracy: 0.975


### Gradient Boosting model with Best Tuned hyperparameters

In [36]:
import time

gb_final_1 =  GradientBoostingClassifier(n_estimators = 250 , max_depth = 3 , max_features=300, learning_rate = 0.1,random_state=123)

start = time.time()
gb_model_1 = gb_final_1.fit(X_train_vect, Y_train)
end = time.time()
fit_time = end - start

start = time.time()
Y_pred = gb_model_1.predict(X_test_vect)
end = time.time()
predict_time = end-start

precision, recall, fscore, train_support = score(Y_test, Y_pred, pos_label='spam', average='binary')
print('Fit_time : {} / Predict_time : {} / Precision: {} / Recall: {} / Accuracy: {}'.format(fit_time,predict_time,
    round(precision, 3), round(recall, 3), round((Y_pred==Y_test).sum()/len(Y_pred), 3)))


Fit_time : 12.5836923122406 / Predict_time : 0.272244930267334 / Precision: 0.985 / Recall: 0.85 / Accuracy: 0.978


In [40]:
sorted(zip(gb_model_1.feature_importances_ ,X_train_vect.columns),reverse=True)[0:5]

[(0.045961823614495662, 'text_length'),
 (0.03592707123333648, 1570),
 (0.028181258450186025, 6445),
 (0.024202504689814313, 5509),
 (0.018469949110446127, 591)]

### Xtreme Gradient Boosting with best tuning parameters

In [37]:
import time

xgb_final_1 =  XGBClassifier(max_depth = 15 ,colsample_bytree=0.5, learning_rate = 0.1,n_estimator=100,random_state=123,n_jobs=3)

start = time.time()
xgb_model_1 = xgb_final_1.fit(X_train_vect, Y_train)
end = time.time()
fit_time = end - start

start = time.time()
Y_pred = xgb_model_1.predict(X_test_vect)
end = time.time()
predict_time = end-start

precision, recall, fscore, train_support = score(Y_test, Y_pred, pos_label='spam', average='binary')
print('Fit_time : {} / Predict_time : {} / Precision: {} / Recall: {} / Accuracy: {}'.format(fit_time,predict_time,
    round(precision, 3), round(recall, 3), round((Y_pred==Y_test).sum()/len(Y_pred), 3)))

Fit_time : 95.34738516807556 / Predict_time : 0.5919156074523926 / Precision: 0.985 / Recall: 0.85 / Accuracy: 0.978


In [43]:
sorted(zip(xgb_model_1.feature_importances_ ,X_train_vect.columns),reverse=True)[0:5]

[(0.16893654, 'text_length'),
 (0.11735729, 'Punct_pc'),
 (0.063170098, 1570),
 (0.029266879, 6157),
 (0.027818024, 6457)]

In [53]:
X_train_vect.columns[0]

'text_length'