# SMS Spam ML4thDay Feature Extraction

In [1]:
import pandas as pd
data = pd.read_csv('D:\MLWorkshop\SMSSpamCollection.csv',sep = '\t',  names = ['label', 'message'])
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
text = data['message']
label = data['label']

In [3]:
# Number of Words

#x = lambda a : a + 10
#print(x(5))
data['word_count'] = data['message'].apply(lambda x: len(str(x).split(" ")))
data[['message','word_count']].head()

Unnamed: 0,message,word_count
0,"Go until jurong point, crazy.. Available only ...",20
1,Ok lar... Joking wif u oni...,6
2,Free entry in 2 a wkly comp to win FA Cup fina...,28
3,U dun say so early hor... U c already then say...,11
4,"Nah I don't think he goes to usf, he lives aro...",13


In [4]:
#Number of characters

data['char_count'] = data['message'].str.len() ## this also includes spaces
data[['message','char_count']].head()


Unnamed: 0,message,char_count
0,"Go until jurong point, crazy.. Available only ...",111
1,Ok lar... Joking wif u oni...,29
2,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,U dun say so early hor... U c already then say...,49
4,"Nah I don't think he goes to usf, he lives aro...",61


In [5]:
data[['word_count','char_count']].head()

Unnamed: 0,word_count,char_count
0,20,111
1,6,29
2,28,155
3,11,49
4,13,61


In [6]:
#Average Word Length

def avg_word(sentence):
  words = sentence.split()
#print(words)
  return (sum(len(word) for word in words)/len(words))
data['avg_word'] = data['message'].apply(lambda x: avg_word(x))
data[['message','avg_word']].head()

Unnamed: 0,message,avg_word
0,"Go until jurong point, crazy.. Available only ...",4.6
1,Ok lar... Joking wif u oni...,4.0
2,Free entry in 2 a wkly comp to win FA Cup fina...,4.571429
3,U dun say so early hor... U c already then say...,3.545455
4,"Nah I don't think he goes to usf, he lives aro...",3.769231


In [7]:
#Number of stopwords

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
data['stopwords'] = data['message'].apply(lambda x: len([x for x in x.split() if x in stop]))
data[['message','stopwords']].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,message,stopwords
0,"Go until jurong point, crazy.. Available only ...",4
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,5
3,U dun say so early hor... U c already then say...,2
4,"Nah I don't think he goes to usf, he lives aro...",5


In [8]:
#Number of special characters

data['hastags'] = data['message'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
data[['message','hastags']].head()

Unnamed: 0,message,hastags
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [9]:
#Number of numerics

data['numerics'] = data['message'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
data[['message','numerics']].head()


Unnamed: 0,message,numerics
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,2
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [10]:
#Number of Uppercase words

data['upper'] = data['message'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
data[['message','upper']].head()

Unnamed: 0,message,upper
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,2
3,U dun say so early hor... U c already then say...,2
4,"Nah I don't think he goes to usf, he lives aro...",1


In [11]:
import textblob

In [12]:
pos_family = {
 'noun' : ['NN','NNS','NNP','NNPS'],
 'pron' : ['PRP','PRP$','WP','WP$'],
 'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
 'adj' : ['JJ','JJR','JJS'],
 'adv' : ['RB','RBR','RBS','WRB']
}
# function to check and get the part of speech tag count of a words in a given sentence
from textblob import TextBlob, Word, Blobber
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
               cnt += 1
    except:
        pass
    return cnt
data['noun_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'noun'))
data['verb_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'verb'))
data['adj_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'adj'))
data['adv_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'adv'))
data['pron_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'pron'))
data[['message','noun_count','verb_count','adj_count', 'adv_count', 'pron_count' ]].head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,message,noun_count,verb_count,adj_count,adv_count,pron_count
0,"Go until jurong point, crazy.. Available only ...",9,1,3,3,0
1,Ok lar... Joking wif u oni...,4,1,1,0,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,14,3,4,0,0
3,U dun say so early hor... U c already then say...,3,3,2,3,0
4,"Nah I don't think he goes to usf, he lives aro...",1,5,0,3,3


In [13]:
features = data[['word_count','char_count','avg_word','stopwords','hastags','numerics',
'upper','noun_count','verb_count','adj_count', 'adv_count', 'pron_count']]


In [19]:
#label = data['label']

import numpy as np
classes_list = ["ham","spam"]
label_index = data['label'].apply(classes_list.index)
label = np.asarray(label_index)
print(label[:10])

[0 0 1 0 0 1 0 0 1 1]


In [20]:
import numpy as np
features_array = np.asarray(features)

In [16]:
features_array

array([[ 20.        , 111.        ,   4.6       , ...,   3.        ,
          3.        ,   0.        ],
       [  6.        ,  29.        ,   4.        , ...,   1.        ,
          0.        ,   0.        ],
       [ 28.        , 155.        ,   4.57142857, ...,   4.        ,
          0.        ,   0.        ],
       ...,
       [ 10.        ,  57.        ,   4.8       , ...,   1.        ,
          1.        ,   0.        ],
       [ 26.        , 125.        ,   3.84615385, ...,   3.        ,
          1.        ,   4.        ],
       [  6.        ,  26.        ,   3.5       , ...,   1.        ,
          0.        ,   2.        ]])

In [17]:
# data split into train and text

import numpy as np
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_array, label, test_size=0.33, random_state=42)


In [21]:
print(x_train)
print(y_train)

[[  5.         19.          3.        ...   0.          1.
    0.       ]
 [ 43.        221.          4.1627907 ...   3.          2.
    6.       ]
 [  4.         28.          6.25      ...   0.          0.
    0.       ]
 ...
 [  5.         45.          8.2       ...   1.          0.
    0.       ]
 [  5.         26.          4.4       ...   1.          1.
    0.       ]
 [  8.         39.          4.        ...   0.          1.
    1.       ]]
[0 0 0 ... 0 0 0]


In [22]:
print(x_test)
print(y_test)

[[ 34.         147.           3.35294118 ...   8.           1.
    2.        ]
 [ 24.         116.           3.875      ...   1.           5.
    3.        ]
 [ 24.         102.           3.29166667 ...   2.           2.
    2.        ]
 ...
 [ 10.          44.           3.5        ...   1.           0.
    2.        ]
 [  8.          35.           3.5        ...   0.           0.
    1.        ]
 [ 30.         133.           3.46666667 ...   2.           1.
    6.        ]]
[0 0 0 ... 0 0 0]


In [18]:
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.svm import SVC

model_SVM = SVC()
model_SVM.fit(x_train, y_train)
y_pred_SVM = model_SVM.predict(x_test)
print("SVM")
print("Accuracy score =", accuracy_score(y_test, y_pred_SVM))
print(metrics.classification_report(y_test, y_pred_SVM))

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100,max_depth=None,min_samples_split=2, random_state=0)
rf.fit(x_train,y_train)
y_pred_rf = rf.predict(x_test)
print("random")
print("Accuracy score =", accuracy_score(y_test, y_pred_rf))
print(metrics.classification_report(y_test, y_pred_rf))

from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(x_train,y_train)
y_pred_LR = LR.predict(x_test)
print("Logistic Regression")
print("Accuracy score =", accuracy_score(y_test, y_pred_LR))
print(metrics.classification_report(y_test, y_pred_LR ))

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors = 5)
neigh.fit(x_train,y_train)
y_pred_KNN = neigh.predict(x_test)
print("KNN")
print("Accuracy score =", accuracy_score(y_test, y_pred_KNN))
print(metrics.classification_report(y_test, y_pred_KNN ))

from sklearn.naive_bayes import GaussianNB
naive = GaussianNB()
naive.fit(x_train,y_train)
y_pred_naive = naive.predict(x_test)
print("Naive Bayes")
print("Accuracy score =", accuracy_score(y_test, y_pred_naive))
print(metrics.classification_report(y_test, y_pred_naive ))

from sklearn.ensemble import GradientBoostingClassifier
gradient = GradientBoostingClassifier(n_estimators=100,max_depth=None,min_samples_split=2, random_state=0)
gradient.fit(x_train,y_train)
y_pred_gradient = gradient.predict(x_test)
print("Gradient Boosting")
print("Accuracy score =", accuracy_score(y_test, y_pred_gradient))
print(metrics.classification_report(y_test, y_pred_gradient ))

from sklearn.tree import DecisionTreeClassifier
decision = DecisionTreeClassifier()
decision.fit(x_train,y_train)
y_pred_decision = decision.predict(x_test)
print("Decision Tree")
print("Accuracy score =", accuracy_score(y_test, y_pred_decision))
print(metrics.classification_report(y_test, y_pred_decision ))

SVM
Accuracy score = 0.933115823817292
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      1593
           1       0.83      0.63      0.71       246

    accuracy                           0.93      1839
   macro avg       0.89      0.80      0.84      1839
weighted avg       0.93      0.93      0.93      1839

random
Accuracy score = 0.965742251223491
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1593
           1       0.94      0.79      0.86       246

    accuracy                           0.97      1839
   macro avg       0.96      0.89      0.92      1839
weighted avg       0.97      0.97      0.96      1839

Logistic Regression
Accuracy score = 0.9380097879282219
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      1593
           1       0.83      0.67      0.74       246

    accuracy                           0.94  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


KNN
Accuracy score = 0.9352909189777052
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1593
           1       0.79      0.71      0.75       246

    accuracy                           0.94      1839
   macro avg       0.87      0.84      0.85      1839
weighted avg       0.93      0.94      0.93      1839

Naive Bayes
Accuracy score = 0.9320282762370854
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      1593
           1       0.72      0.80      0.76       246

    accuracy                           0.93      1839
   macro avg       0.84      0.88      0.86      1839
weighted avg       0.94      0.93      0.93      1839

Gradient Boosting
Accuracy score = 0.9439912996193583
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      1593
           1       0.78      0.81      0.80       246

    accuracy                           0