In [None]:
import pandas as pd

# **Data Preparation**

**Loading data**

In [None]:
# Load the raw SMS dataset from `spam.csv`, decoding the file as latin-1.
df = pd.read_csv('spam.csv', encoding='latin-1')
# Print a plain-text preview of the first rows; the bare `df.shape` below is
# the cell's displayed value, i.e. the (rows, columns) of the loaded frame.
print(df.head())
df.shape

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


(5572, 5)

In [None]:
# Count how many messages carry each label in column `v1`
# (the captured output shows far more 'ham' than 'spam' rows).
df['v1'].value_counts()

Unnamed: 0_level_0,count
v1,Unnamed: 1_level_1
ham,4825
spam,747


In [None]:
# Separate targets and features by column name: `v1` holds the label
# and `v2` holds the message text.
y = df['v1']
X = df['v2']

In [None]:
# Sanity check: list the distinct label values present in `y`.
unique_values = y.unique()
print(unique_values)

['ham' 'spam']


**Cleaning the data**

In [None]:
import nltk
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter
from nltk.corpus import wordnet
import string

In [None]:
# Fetch the NLTK stop-word corpus read by `stopwords.words('english')`
# in the cleaning function.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Fetch the Punkt tokenizer data, then import the word tokenizer used by the
# cleaning function.
# NOTE(review): newer NLTK releases may also need the 'punkt_tab' resource for
# `word_tokenize` — confirm against the installed NLTK version.
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def is_ponctuation(word):
    """Return True when every character of `word` is in `string.punctuation`.

    The empty string also yields True, since it contains no character that
    could fail the check.
    """
    for symbol in word:
        if symbol not in string.punctuation:
            return False
    return True

In [None]:
def is_multiple_white_spaces(token):
    """Tell whether `token` is made up of whitespace characters only.

    The empty string counts as whitespace-only (vacuously true).
    """
    # `str.isspace()` is False for '', so the empty case is handled explicitly.
    return not token or token.isspace()

In [None]:
# Lazily-filled cache so the stop-word list and the stemmer are built once,
# not once per message.
_CLEANING_CACHE = {}


def _cleaning_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use only."""
    if not _CLEANING_CACHE:
        _CLEANING_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _CLEANING_CACHE['stemmer'] = PorterStemmer()
    return _CLEANING_CACHE['stop_words'], _CLEANING_CACHE['stemmer']


def cleaning_data(text):
    """Normalise one raw message into a space-separated string of stemmed tokens.

    Steps, in order: tokenize, lower-case, drop punctuation-only and
    whitespace-only tokens, drop English stop words, then stem what is left.

    Stop words are removed *before* stemming. The Porter stemmer rewrites
    several stop words (for example "this" -> "thi", "was" -> "wa"), so
    filtering after stemming would let them slip through.

    Args:
        text: Raw message text (a string).

    Returns:
        The cleaned message as a single string with tokens joined by one
        space. It is empty when every token was filtered out.
    """
    stop_words, stemmer = _cleaning_resources()

    # Tokenize and lower-case so stop-word matching is case-insensitive.
    tokens = [token.lower() for token in word_tokenize(text)]

    # Discard tokens that carry no lexical content. Whitespace-only tokens are
    # dropped rather than replaced by ' ', which would yield runs of spaces
    # once the tokens are joined.
    kept = [
        token for token in tokens
        if not is_ponctuation(token)
        and not is_multiple_white_spaces(token)
        and token not in stop_words
    ]

    # Stem last, then rebuild the text from the surviving tokens.
    return ' '.join(stemmer.stem(token) for token in kept)

In [None]:
# Clean every raw message. The input is read from the untouched `df` column
# rather than from `X` itself, so re-running this cell does not clean
# already-cleaned text a second time.
X = [cleaning_data(text) for text in df['v2']]

In [None]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
y_mapped = y.map(dict(ham=0, spam=1))
# Spot-check one encoded value of each class (row 0 is ham, row 2 is spam).
print(y_mapped[0], y_mapped[2], sep='\n')

0
1


In [None]:
# NOTE(review): numpy is imported here but not referenced in the visible
# cells of this notebook.
import numpy as np


# Wrap the encoded labels and the cleaned texts in single-column frames.
y_mapped_df = pd.DataFrame(y_mapped)
X_df = pd.DataFrame(X)



# Persist both frames as CSV files, without the index column or a header row.
y_mapped_df.to_csv('y_mapped_df.csv', index=False, header=False)
X_df.to_csv('X_df.csv', index=False, header=False)

# **Text representation using feature hashing (`HashingVectorizer`)**

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer

# Width of the hashed feature space (number of columns in the output matrix).
n_features = 10000

# HashingVectorizer is stateless, so `transform` can be called without a
# prior `fit`. alternate_sign=False keeps every feature value non-negative,
# which MultinomialNB requires.
v = HashingVectorizer(n_features=n_features, alternate_sign=False)

# Keep the result as a SciPy sparse matrix. A dense copy of a
# 5572 x 10000 float64 matrix would take roughly 446 MB, and
# train_test_split, GridSearchCV and MultinomialNB all accept sparse input.
transformed_X = v.transform(X)

# **Building the model**

**Splitting the data**

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
# NOTE(review): the classes are imbalanced (far more ham than spam); consider
# passing stratify=y_mapped so both subsets keep the same class ratio.
X_train, X_val, y_train, y_val = train_test_split(transformed_X, y_mapped, test_size=0.2, random_state=42)

**Searching for the best parameters**

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: the additive smoothing strength `alpha` and
# whether class priors are learned from the data (`fit_prior`).
param_grid = {
    'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0],
    'fit_prior': [True, False],
}

# Exhaustive search with 5-fold cross-validation on the training split,
# selecting the combination with the best mean accuracy.
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy')
# `y_train` is already a 1-D pandas Series, so it is passed as-is;
# `Series.ravel()` is deprecated in recent pandas releases.
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)


Best Parameters:  {'alpha': 0.1, 'fit_prior': True}


In [None]:
# Build the final classifier from the grid-search winner rather than
# hand-copying its values, so this cell stays correct if the search result
# changes.
nb_model = MultinomialNB(**grid_search.best_params_)

# `y_train` is a 1-D pandas Series and can be passed directly; the deprecated
# `Series.ravel()` call is unnecessary.
nb_model.fit(X_train, y_train)

# Score the model on the held-out validation split.
y_pred = nb_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f'Validation Accuracy: {accuracy * 100:.2f}%')

Validation Accuracy: 98.12%


In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the validation split (0 = ham,
# 1 = spam). This is more informative than accuracy alone, because the
# classes are imbalanced.
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.98      0.88      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

