In [1]:
# ensure required packages are installed in the notebook environment


import numpy 
import pandas as pd
import matplotlib.pyplot as plt


from wordcloud import WordCloud

import nltk 
from nltk.corpus import stopwords



In [2]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to C:\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
df = pd.read_csv('spam.csv')

In [4]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'], inplace=True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.rename(columns={'v1': 'target', 'v2': 'text'}, inplace=True)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
#data preprocessing
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df.duplicated().sum()

np.int64(403)

In [9]:
len(df)

5572

In [10]:
df = df.drop_duplicates(keep='first')
len(df)

5169

In [11]:
#feature engineering

from nltk.stem.porter import PorterStemmer

import string
ps = PorterStemmer()

In [12]:
# Lowercase transformation and text preprocessing function
def transform_text(text):
    # Transform the text to lowercase
    text = text.lower()
    
    # Tokenization using NLTK
    text = nltk.word_tokenize(text)
    
    # Removing special characters
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
            
    # Removing stop words and punctuation
    text = y[:]
    y.clear()
    
    # Loop through the tokens and remove stopwords and punctuation
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
        
    # Stemming using Porter Stemmer
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))
    
    # Join the processed tokens back into a single string
    return " ".join(y)

In [13]:
df['transformed_text'] = df['text'].apply(transform_text)
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
tfid = TfidfVectorizer(max_features = 500)

In [15]:
X = tfid.fit_transform(df['transformed_text']).toarray()
y = df['target'].values

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test , y_train, y_test  = train_test_split(X,y,test_size= 0.20, random_state= 2)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [18]:
svc = SVC(kernel = "sigmoid", gamma = 1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver = 'liblinear', penalty = 'l1')
rfc = RandomForestClassifier(n_estimators= 50, random_state=2)
abc = AdaBoostClassifier(n_estimators= 50, random_state =2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators= 50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators = 50, random_state = 2)

In [19]:
clfs = {
    'SVC': svc,
    'KNN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb
    
}

In [20]:
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs, X_train, y_train, X_test, y_test):
    clfs.fit(X_train, y_train)
    y_pred = clfs.predict(X_test)
    accuracy = (y_test, y_pred)
    precision = precision_score(y_test,y_pred)
    return accuracy, precision

In [21]:
import os

os.environ.setdefault("LOKY_MAX_CPU_COUNT", str(os.cpu_count()))
accuracy_scores = {}
precision_scores = {}

classifiers = {
    'SVC': svc,
    'KNN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb
}


for name, clf in classifiers.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print()
    print("For: ", name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)
    
    accuracy_scores[name] = current_accuracy
    precision_scores[name] = current_precision


For:  SVC
Accuracy:  (array([0, 0, 0, ..., 0, 0, 0], shape=(1034,)), array([0, 0, 0, ..., 0, 0, 0], shape=(1034,)))
Precision:  0.9333333333333333


[WinError 2] The system cannot find the file specified
  File "c:\Users\priya\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\priya\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 556, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\priya\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1038, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                   


For:  KNN
Accuracy:  (array([0, 0, 0, ..., 0, 0, 0], shape=(1034,)), array([0, 0, 0, ..., 0, 0, 0], shape=(1034,)))
Precision:  1.0

For:  NB
Accuracy:  (array([0, 0, 0, ..., 0, 0, 0], shape=(1034,)), array([0, 0, 0, ..., 0, 0, 0], shape=(1034,)))
Precision:  0.9655172413793104

For:  DT
Accuracy:  (array([0, 0, 0, ..., 0, 0, 0], shape=(1034,)), array([0, 0, 0, ..., 0, 0, 0], shape=(1034,)))
Precision:  0.9101123595505618

For:  LR
Accuracy:  (array([0, 0, 0, ..., 0, 0, 0], shape=(1034,)), array([0, 0, 0, ..., 0, 0, 0], shape=(1034,)))
Precision:  0.9629629629629629

For:  RF
Accuracy:  (array([0, 0, 0, ..., 0, 0, 0], shape=(1034,)), array([0, 0, 0, ..., 0, 0, 0], shape=(1034,)))
Precision:  0.9421487603305785

For:  Adaboost
Accuracy:  (array([0, 0, 0, ..., 0, 0, 0], shape=(1034,)), array([0, 0, 0, ..., 1, 0, 0], shape=(1034,)))
Precision:  0.8734177215189873

For:  Bgc
Accuracy:  (array([0, 0, 0, ..., 0, 0, 0], shape=(1034,)), array([0, 0, 0, ..., 0, 0, 0], shape=(1034,)))
Precision