In [None]:
import nltk
import numpy as np
import pandas as pd
from wordcloud import WordCloud
from nltk.corpus import stopwords

nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sarveshmhadgut/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sarveshmhadgut/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [4]:
df.dropna(axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.rename(columns={"v2": "text", "v1": "target"}, inplace=True)
df = df[["text", "target"]]
df

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,spam
5568,Will Ì_ b going to esplanade fr home?,ham
5569,"Pity, * was in mood for that. So...any other s...",ham
5570,The guy did some bitching but I acted like i'd...,ham


In [7]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df["target"] = encoder.fit_transform(df["target"])

df.head()

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
df.duplicated().sum()

np.int64(403)

In [9]:
len(df)

5572

In [10]:
df = df.drop_duplicates(keep="first")
df.duplicated().sum()

np.int64(0)

In [11]:
import string
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

In [12]:
english_stopwords = stopwords.words("english")


def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)

    y = []
    for i in text:
        if i.isalnum():
            y.append(i)

    text = y.copy()
    y.clear()

    for i in text:
        if i not in english_stopwords and i not in string.punctuation:
            y.append(i)

    text = y.copy()
    y.clear()

    for i in text:
        y.append(ps.stem(i))

    return " ".join(y)

In [13]:
transform_text(
    "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
)

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [14]:
processed_col = df["text"].apply(transform_text)
df.insert(1, "processed_text", processed_col)

In [15]:
df

Unnamed: 0,text,processed_text,target
0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...,0
1,Ok lar... Joking wif u oni...,ok lar joke wif u oni,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...,1
3,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say,0
4,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though,0
...,...,...,...
5567,This is the 2nd time we have tried 2 contact u...,2nd time tri 2 contact u pound prize 2 claim e...,1
5568,Will Ì_ b going to esplanade fr home?,b go esplanad fr home,0
5569,"Pity, * was in mood for that. So...any other s...",piti mood suggest,0
5570,The guy did some bitching but I acted like i'd...,guy bitch act like interest buy someth els nex...,0


In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tfidf = TfidfVectorizer(max_df=500)

In [17]:
X = tfidf.fit_transform(df["processed_text"]).toarray()

y = df["target"].values

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [20]:
svc = SVC(kernel="sigmoid", gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver="liblinear", penalty="l1")
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2)

In [21]:
classifiers = {
    "Support Vector Machine": svc,
    "K Nearest Neighbor": knc,
    "Naive Bayes": mnb,
    "Decision Tree": dtc,
    "Logistic Regression": lrc,
    "Random Forest": rfc,
    "AdaBoost": abc,
    "Bagging": bc,
    "ExtraTrees": etc,
    "GradientBoosting": gbdt,
    "XGBoost": xgb,
}

In [22]:
from sklearn.metrics import accuracy_score, precision_score, classification_report


def train_classifier(clfs, X_train, y_train, X_test, y_test):
    clfs.fit(X_train, y_train)
    y_pred = clfs.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy, precision

In [23]:
report = []
for name, clfs in classifiers.items():
    current_accuracy, current_precision = train_classifier(
        clfs, X_train, y_train, X_test, y_test
    )

    report.append(
        [
            name,
            np.round(current_accuracy * 100, 3),
            np.round(current_precision * 100, 3),
        ]
    )

In [24]:
report_df = pd.DataFrame(report, columns=["Classifier", "Accuracy", "Precision"])
report_df

Unnamed: 0,Classifier,Accuracy,Precision
0,Support Vector Machine,97.195,93.939
1,K Nearest Neighbor,89.652,100.0
2,Naive Bayes,96.132,99.065
3,Decision Tree,93.424,80.8
4,Logistic Regression,94.391,86.555
5,Random Forest,97.002,99.138
6,AdaBoost,91.489,78.788
7,Bagging,96.132,87.77
8,ExtraTrees,97.389,99.167
9,GradientBoosting,95.068,95.192
