In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from wordcloud import WordCloud

import nltk
from nltk.corpus import stopwords

nltk.download('Stopwords')
nltk.download('punkt')

[nltk_data] Error loading Stopwords: Package 'Stopwords' not found in
[nltk_data]     index
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\psaur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
df = pd.read_csv('spam.csv', encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [9]:
df.drop(columns = ['Unnamed: 2','Unnamed: 3', 'Unnamed: 4'], inplace = True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
df.rename(columns = {'v1': 'target', 'v2': 'text'}, inplace = True)

Data Pre-Processing

In [11]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
df.duplicated().sum()

403

In [13]:
len(df)

5572

In [15]:
df = df.drop_duplicates(keep = 'first')
len(df)

5169

Feature Engg

In [16]:
from nltk.stem.porter import PorterStemmer
import string
ps = PorterStemmer()

In [21]:
# Lowercase transformation and text pre-processing function
def transform_text(text):
    # Transform the text to lowercase
    text = text.lower()
    text = nltk.word_tokenize(text)

    # Removing special characters
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
    
    # Removing the stop words and punctuataions
    text: y[:]
    y.clear()

    # Loop through the tokens and remove stopwords and puncuations.
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)

    # Stemming using Porter Stemmer
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))

    # join the processed tokes back into a single string
    return " ".join(y)

    

In [22]:
transform_text('Go until jurong point, crazy.. Avilable only in bugis n great world la e buffet... Cine there got amore wat...')

'go jurong point crazi .. avil bugi n great world la e buffet ... cine got amor wat ...'

In [23]:
df['transform_text'] = df['text'].apply(transform_text)
df.head()

Unnamed: 0,target,text,transform_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi .. avail bugi n great wo...
1,0,Ok lar... Joking wif u oni...,ok lar ... joke wif u oni ...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor ... u c alreadi say ...
4,0,"Nah I don't think he goes to usf, he lives aro...",nah n't think goe usf live around though


In [25]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
tfid = TfidfVectorizer(max_features = 500)

In [28]:
X = tfid.fit_transform(df['transform_text']).toarray()
y = df['target'].values

Train Test Split

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.20, random_state =2)

Model Training

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [40]:
svc = SVC(kernel = 'sigmoid', gamma = 1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth = 5)
lrc = LogisticRegression(solver = 'liblinear', penalty = 'l1')
rfc = RandomForestClassifier(n_estimators = 50, random_state =2)
abc = AdaBoostClassifier(n_estimators = 50, random_state =2)
bc = BaggingClassifier(n_estimators = 50, random_state =2)
etc = ExtraTreesClassifier(n_estimators = 50, random_state =2)
gbdt = GradientBoostingClassifier(n_estimators = 50, random_state =2)
xgb = BaggingClassifier(n_estimators = 50, random_state =2)

In [49]:
models = {
    'SVC': svc,
    'KNN': knc,
    'MultinomialNB': mnb,
    'DecisionTree': dtc,
    'LogisticRegression': lrc,
    'RandomForest': rfc,
    'AdaBoost': abc,
    'Bagging': bc,
    'ExtraTrees': etc,
    'GradientBoosting': gbdt
}


In [46]:
from sklearn.metrics import accuracy_score, precision_score

def train_classifier(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    
    return accuracy, precision

In [50]:
accuracies = []
precisions = []

for name, model in models.items():
    acc, prec = train_classifier(
        model, X_train, y_train, X_test, y_test
    )
    
    print(f"\nFor: {name}")
    print("Accuracy:", acc)
    print("Precision:", prec)
    
    accuracies.append(acc)
    precisions.append(prec)



For: SVC
Accuracy: 0.9709864603481625
Precision: 0.9426229508196722

For: KNN
Accuracy: 0.9264990328820116
Precision: 1.0

For: MultinomialNB
Accuracy: 0.9758220502901354
Precision: 0.9669421487603306

For: DecisionTree
Accuracy: 0.9352030947775629
Precision: 0.8736842105263158

For: LogisticRegression
Accuracy: 0.9700193423597679
Precision: 0.9734513274336283

For: RandomForest
Accuracy: 0.9729206963249516
Precision: 0.9583333333333334

For: AdaBoost
Accuracy: 0.9758220502901354
Precision: 0.9669421487603306

For: Bagging
Accuracy: 0.9632495164410058
Precision: 0.8787878787878788

For: ExtraTrees
Accuracy: 0.97678916827853
Precision: 0.9596774193548387

For: GradientBoosting
Accuracy: 0.9535783365570599
Precision: 0.95
