IMPORTING THE REQUIRED LIBRARIES

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


LOADING THA DATASET TO PANDAS DATAFRAME

In [2]:
# Load your SMS dataset
spam_data = pd.read_csv('spam.csv', encoding='latin-1')


FIRST 5 ROWS OF THE DATASET

In [3]:
spam_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


LAST 5 ROWS OF THE DATASET

In [4]:
spam_data.tail()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


DATASET INFORMATION USING info() METHOD

In [5]:
spam_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


CHECKING THE NUMBER OF MISSING VALUES IN EACH COLUMN

In [6]:
spam_data.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

checking the number of spam messages and legit messages

DELETE THE LAST THREE COLUMNS

In [7]:
spam_data=spam_data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)

In [8]:
spam_data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


CHANGING THE COLUMN NAMES

In [9]:
spam_data=spam_data.rename(columns={'v1':'label','v2':'text'})

In [10]:
spam_data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
spam_data['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

ham-->REPRESENTS LEGIT MESSAGES
spam-->REPRESENTS SPAM MESSAGES

SPLIT THE DATA INTO TRAINING AND TEST DATA

In [12]:
# Split the data into training and testing sets
X = spam_data['text']
y = spam_data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



TF-IDF VECTORIZATION

In [13]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # We can adjust max_features as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)



MULTINOMIAL NAIVE BAYES CLASSIFIER

In [14]:
# Initialize and train the Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)



In [15]:
# Predict labels for the test set using Naive Bayes
y_pred_nb = nb_classifier.predict(X_test_tfidf)



LOGISTIC REGRESSION

In [16]:
# Initialize and train the Logistic Regression classifier
lr_classifier = LogisticRegression()
lr_classifier.fit(X_train_tfidf, y_train)

In [17]:
# Predict labels for the test set using Logistic Regression
y_pred_lr = lr_classifier.predict(X_test_tfidf)



SUPPORT VECTOR MACHINE

In [18]:
# Initialize and train the Support Vector Machine (SVM) classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)



In [19]:
# Initialize and train the Support Vector Machine (SVM) classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)



In [20]:
# Predict labels for the test set using SVM
y_pred_svm = svm_classifier.predict(X_test_tfidf)



In [21]:
# Evaluate the Naive Bayes model
accuracy_nb = accuracy_score(y_test, y_pred_nb)
confusion_nb = confusion_matrix(y_test, y_pred_nb)
report_nb = classification_report(y_test, y_pred_nb)



In [22]:
# Evaluate the Logistic Regression model
accuracy_lr = accuracy_score(y_test, y_pred_lr)
confusion_lr = confusion_matrix(y_test, y_pred_lr)
report_lr = classification_report(y_test, y_pred_lr)



In [23]:
# Evaluate the SVM model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
confusion_svm = confusion_matrix(y_test, y_pred_svm)
report_svm = classification_report(y_test, y_pred_svm)



In [24]:
# Print the results for all three classifiers
print("Naive Bayes Model:")
print("Accuracy:", accuracy_nb)
print("\nConfusion Matrix:\n", confusion_nb)
print("\nClassification Report:\n", report_nb)

print("\nLogistic Regression Model:")
print("Accuracy:", accuracy_lr)
print("\nConfusion Matrix:\n", confusion_lr)
print("\nClassification Report:\n", report_lr)

print("\nSupport Vector Machine (SVM) Model:")
print("Accuracy:", accuracy_svm)
print("\nConfusion Matrix:\n", confusion_svm)
print("\nClassification Report:\n", report_svm)

# Now we can use these trained models to classify SMS messages as spam or legitimate.


Naive Bayes Model:
Accuracy: 0.9668161434977578

Confusion Matrix:
 [[965   0]
 [ 37 113]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115


Logistic Regression Model:
Accuracy: 0.967713004484305

Confusion Matrix:
 [[964   1]
 [ 35 115]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       0.99      0.77      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115


Support Vector Machine (SVM) Model:
Accuracy: 0.9829596412556054

Confusion Matrix:
 [[963   2]
 [ 17 133