In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


# Load the dataset
df = pd.read_csv('spam.csv',encoding='latin-1')

In [2]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
#dropping unnecessary columns
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [4]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Download the stopwords dataset
!pip install nltk
import nltk
nltk.download('stopwords')


[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
    --------------------------------------- 0.0/1.5 MB 330.3 kB/s eta 0:00:05
   -- ------------------------------------- 0.1/1.5 MB 573.4 kB/s eta 0:00:03
   --- ------------------------------------ 0.1/1.5 MB 722.1 kB/s eta 0:00:02
   ------ --------------------------------- 0.3/1.5 MB 1.2 MB/s eta 0:00:02
   ------------- -------------------------- 0.5/1.5 MB 1.9 MB/s eta 0:00:01
   --------------------- ------------------ 0.8/1.5 MB 2.5 MB/s eta 0:00:01
   ---------------------- ----------------- 0.8/1.5 MB 2.6 MB/s eta 0:00:01
   ---------------------- ----------------- 0.8/1.5 MB 2.6 MB

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aesha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [8]:
# Rename the columns "v1 and "v2" to new names

new_column_names = {"v1":"Category","v2":"Message"}
df.rename(columns = new_column_names,inplace = True)

In [9]:
# Encode the labels
le = LabelEncoder()
df['Category'] = le.fit_transform(df['Category'])  # 0 for 'ham' and 1 for 'spam'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['Category'], test_size=0.2, random_state=42)

In [10]:
# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=5)

# Fit and transform the training data, transform the testing data
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Train Naive Bayes model
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)

# Train Logistic Regression model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfidf, y_train)

# Train Support Vector Machine model
svc = SVC()
svc.fit(X_train_tfidf, y_train)

In [12]:
from sklearn.metrics import classification_report

# Predictions
y_pred_nb = nb.predict(X_test_tfidf)
y_pred_lr = lr.predict(X_test_tfidf)
y_pred_svc = svc.predict(X_test_tfidf)

# Evaluation
print("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_lr))
print("Support Vector Machine Classification Report:\n", classification_report(y_test, y_pred_svc))

Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.85      0.92       150

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       0.97      0.76      0.85       150

    accuracy                           0.97      1115
   macro avg       0.97      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Support Vector Machine Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.85      0.91       150

    accuracy                           0.98      1115
 

In [13]:
#Hyper parameter tuning
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Logistic Regression
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['newton-cg', 'lbfgs', 'liblinear']
}

# Initialize GridSearchCV
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='f1')
grid.fit(X_train_tfidf, y_train)

# Best parameters and model
best_params = grid.best_params_
best_model = grid.best_estimator_

print("Best Parameters:\n", best_params)

Best Parameters:
 {'C': 10, 'solver': 'lbfgs'}


In [14]:
# Predictions
y_pred_best = best_model.predict(X_test_tfidf)

# Evaluation
print("Tuned Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_best))

Tuned Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.97      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [15]:
# Function to classify custom messages
def classify_message(message):
    message_tfidf = tfidf.transform([message])
    prediction = best_model.predict(message_tfidf)
    return 'spam' if prediction[0] == 1 else 'ham'

In [16]:
# Example usage
custom_message1 = "Congratulations! You've won a 1000Rs Dmart gift card. Click here to claim your prize."
custom_message2 = "Hi, how are you doing today?"

print(f"Message: {custom_message1}\nClassification: {classify_message(custom_message1)}")
print(f"Message: {custom_message2}\nClassification: {classify_message(custom_message2)}")

Message: Congratulations! You've won a 1000Rs Dmart gift card. Click here to claim your prize.
Classification: spam
Message: Hi, how are you doing today?
Classification: ham
