In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import re

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer

import joblib

In [2]:
file = '..\\data\\raw\\spam.csv'
df = pd.read_csv(file, encoding = 'latin1')
df
s_det = df.copy()

In [3]:
s_det.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [4]:
s_det.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis = 1 , inplace = True)
s_det

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
s_det.rename(columns = {'v1' : 'target' , 'v2' : 'text'} , inplace = True)
s_det

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
s_det.isnull().sum()

target    0
text      0
dtype: int64

In [7]:
s_det.duplicated().sum()

np.int64(403)

In [8]:
s_det.drop_duplicates(inplace = True)
s_det.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5169 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  5169 non-null   object
 1   text    5169 non-null   object
dtypes: object(2)
memory usage: 121.1+ KB


In [9]:
s_det.to_csv("..\\data\\processed\\processed_spam.csv", index_label= 'sn')

In [10]:
s_det = pd.read_csv("..\\data\\processed\\processed_spam.csv")

In [11]:
s_det.drop(columns=['sn'], inplace = True)

In [12]:
s_det_1 = s_det.copy()

In [13]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

s_det['target'] = encoder.fit_transform(s_det['target'])

print("Label mapping:", dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))

Label mapping: {'ham': np.int64(0), 'spam': np.int64(1)}


In [14]:
s_det.reset_index(drop= True)
s_det

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5164,1,This is the 2nd time we have tried 2 contact u...
5165,0,Will Ì_ b going to esplanade fr home?
5166,0,"Pity, * was in mood for that. So...any other s..."
5167,0,The guy did some bitching but I acted like i'd...


In [15]:
nltk.download('stopwords')
words = stopwords.words('english')
print(words)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pralad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
s_det

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5164,1,This is the 2nd time we have tried 2 contact u...
5165,0,Will Ì_ b going to esplanade fr home?
5166,0,"Pity, * was in mood for that. So...any other s..."
5167,0,The guy did some bitching but I acted like i'd...


In [17]:
data = list()

for datas in s_det['text']:
    temp_ = re.sub('[^a-zA-Z]',' ',datas).lower().split()
    temp_ = [word for word in temp_ if word not in words]  # Remove stop words per word
    data.append(' '.join(temp_))

# print(data)
df = pd.DataFrame(data)

s_det['text_1'] = df
s_det_1['text_1'] = df

In [18]:
s_det

Unnamed: 0,target,text,text_1
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though
...,...,...,...
5164,1,This is the 2nd time we have tried 2 contact u...,nd time tried contact u u pound prize claim ea...
5165,0,Will Ì_ b going to esplanade fr home?,b going esplanade fr home
5166,0,"Pity, * was in mood for that. So...any other s...",pity mood suggestions
5167,0,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...


In [19]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
spam_msg = list()

for msg in s_det['text_1']:
    words = msg.split()
    stemmed_word = [stemmer.stem(word) for word in words]
    spam_msg.append(' '.join(stemmed_word))
    
df1 = pd.DataFrame(spam_msg, columns=['text_2'])
s_det['text_2'] = df1['text_2']
s_det_1['text_2'] = df1['text_2']

print(df1)

                                                 text_2
0     go jurong point crazi avail bugi n great world...
1                                 ok lar joke wif u oni
2     free entri wkli comp win fa cup final tkt st m...
3                   u dun say earli hor u c alreadi say
4                  nah think goe usf live around though
...                                                 ...
5164  nd time tri contact u u pound prize claim easi...
5165                              b go esplanad fr home
5166                                  piti mood suggest
5167  guy bitch act like interest buy someth els nex...
5168                                     rofl true name

[5169 rows x 1 columns]


In [20]:
s_det

Unnamed: 0,target,text,text_1,text_2
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...,free entri wkli comp win fa cup final tkt st m...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though,nah think goe usf live around though
...,...,...,...,...
5164,1,This is the 2nd time we have tried 2 contact u...,nd time tried contact u u pound prize claim ea...,nd time tri contact u u pound prize claim easi...
5165,0,Will Ì_ b going to esplanade fr home?,b going esplanade fr home,b go esplanad fr home
5166,0,"Pity, * was in mood for that. So...any other s...",pity mood suggestions,piti mood suggest
5167,0,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...,guy bitch act like interest buy someth els nex...


In [21]:
s_det_1

Unnamed: 0,target,text,text_1,text_2
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...,free entri wkli comp win fa cup final tkt st m...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though,nah think goe usf live around though
...,...,...,...,...
5164,spam,This is the 2nd time we have tried 2 contact u...,nd time tried contact u u pound prize claim ea...,nd time tri contact u u pound prize claim easi...
5165,ham,Will Ì_ b going to esplanade fr home?,b going esplanade fr home,b go esplanad fr home
5166,ham,"Pity, * was in mood for that. So...any other s...",pity mood suggestions,piti mood suggest
5167,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...,guy bitch act like interest buy someth els nex...


In [22]:
tfid = TfidfVectorizer(max_features = 3000)

x = tfid.fit_transform(s_det['text_2']).toarray()
y = s_det['target']

joblib.dump(tfid,'..\\model\\vectorizer.joblib')

x_train,x_test,y_train,y_test = train_test_split(x,y,random_state = 2, train_size = 0.8)

In [23]:
svc = SVC(kernel = 'sigmoid', gamma = 1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth = 5)
lrc = LogisticRegression(solver = 'liblinear', penalty = 'l1')
rfc = RandomForestClassifier(n_estimators = 50, random_state = 2)
abc = AdaBoostClassifier(n_estimators = 50, random_state = 2)
xgb = XGBClassifier(n_estimators = 50, random_state = 2)

In [24]:
models = {
    'SVC' : svc,
    'KNN' : knc,
    'MNB' : mnb,
    'DTC' : dtc,
    'LRC' : lrc,
    'RFC' : rfc,
    'ABC' : abc,
    'XGB' : xgb
}

In [25]:
def train_models(model,x_train,y_train,x_test,y_test):
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    return accuracy , precision

In [None]:
for name , model in models.items():
    current_accuracy , current_precision = train_models(model,x_train,y_train,x_test,y_test)
    print('for', name)
    print('accuracy score' , current_accuracy)
    print('precision score' , current_precision)
    
    joblib.dump(svc , f"..\\model\\{name}.joblib")

for SVC
accuracy score 0.9758220502901354
precision score 0.9669421487603306
for KNN
accuracy score 0.9090909090909091
precision score 1.0
for MNB
accuracy score 0.9738878143133463
precision score 1.0
for DTC
accuracy score 0.9342359767891683
precision score 0.8240740740740741
for LRC
accuracy score 0.9564796905222437
precision score 0.9514563106796117
for RFC
accuracy score 0.9748549323017408
precision score 0.9827586206896551


In [None]:
print(nltk.__version__)

In [None]:
print(re.__version__)