# Importing Necessaries Libraries

In [1]:
import pandas as pd
import re
import nltk
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
pd.options.mode.chained_assignment = None

# Loading Dataset

In [2]:
train_data  = pd.read_csv(r'Data/spam.csv',encoding='latin-1')
train_data=train_data[['v1','v2']]

In [3]:
train_data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ï¿½_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# Data Preprocessing

Converting ham to 0 and spam to 1 in columnn 'v1'

In [4]:
train_data['v1']=train_data['v1'].apply(lambda x : 1 if x =='spam' else 0)

Removing Duplicates data

In [5]:
train_data= train_data.drop_duplicates()


In [6]:
train_data

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ï¿½_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


# Text Cleaning

In [7]:
stemmer = PorterStemmer()
def clean_text(text):
    text = text.lower()  # Lowercase all characters
    text = re.sub(r'@\S+', '', text)  # Remove Twitter handles
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'pic.\S+', '', text)
    text = re.sub(r"[^a-zA-Z+']", ' ', text)  # Keep only characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text + ' ')  # Keep words with length > 1 only
    return text


In [8]:
train_data['v2'] = train_data['v2'].apply(lambda x: clean_text(x))


In [9]:
train_data

Unnamed: 0,v1,v2
0,0,go until jurong point crazy available only ...
1,0,ok lar joking wif oni
2,1,free entry in wkly comp to win fa cup final tk...
3,0,u dun say so early hor c already then say
4,0,nah don't think he goes to usf he lives aroun...
...,...,...
5567,1,this is the nd time we have tried contact u...
5568,0,will going to esplanade fr home
5569,0,pity was in mood for that so any other s...
5570,0,the guy did some bitching but acted like i'd b...


In [10]:
train_data['v2'] =train_data['v2'].str.lower()

train_data['v2'] =train_data['v2'].apply(word_tokenize)

In [11]:
stemmer = PorterStemmer()
def stem_words(words):
    return [stemmer.stem(word) for word in words]
train_data['v2'] = train_data['v2'].apply(stem_words)

In [12]:
# Convert tokenized words back to text
train_data['v2'] = train_data['v2'].apply(lambda x: ' '.join(x))
train_data

Unnamed: 0,v1,v2
0,0,go until jurong point crazi avail onli in bugi...
1,0,ok lar joke wif oni
2,1,free entri in wkli comp to win fa cup final tk...
3,0,u dun say so earli hor c alreadi then say
4,0,nah do n't think he goe to usf he live around ...
...,...,...
5567,1,thi is the nd time we have tri contact u have ...
5568,0,will go to esplanad fr home
5569,0,piti wa in mood for that so ani other suggest
5570,0,the guy did some bitch but act like i 'd be in...


In [13]:
# Initialize the Count Vectorizer
count_vectorizer = CountVectorizer(max_features=5000)

# Apply the vectorizer to the 'v2' column
features = count_vectorizer.fit_transform(train_data['v2'])

# Convert the result to a dense array (if needed)
features = features.toarray()

In [14]:
features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
train_data

Unnamed: 0,v1,v2
0,0,go until jurong point crazi avail onli in bugi...
1,0,ok lar joke wif oni
2,1,free entri in wkli comp to win fa cup final tk...
3,0,u dun say so earli hor c alreadi then say
4,0,nah do n't think he goe to usf he live around ...
...,...,...
5567,1,thi is the nd time we have tri contact u have ...
5568,0,will go to esplanad fr home
5569,0,piti wa in mood for that so ani other suggest
5570,0,the guy did some bitch but act like i 'd be in...


# Training Model

In [16]:
X_train, X_test, y_train, y_test = train_test_split(features, train_data['v1'], test_size=0.2, random_state=42)

### Naive Bayes Multinomial Distribution Model

In [17]:
classifier = MultinomialNB()
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
report = classification_report(y_test,y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.99      0.98      0.99       889
           1       0.90      0.93      0.92       145

    accuracy                           0.98      1034
   macro avg       0.94      0.96      0.95      1034
weighted avg       0.98      0.98      0.98      1034



In [18]:
# Perform 5-fold cross-validation (you can adjust 'cv' as needed)
cv_scores = cross_val_score(classifier, features, train_data['v1'], cv=5)

# Print the cross-validation scores
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Score: {cv_scores.mean()}')

Cross-Validation Scores: [0.98065764 0.97582205 0.97485493 0.97485493 0.97579864]
Mean CV Score: 0.976397639969966
