In [1]:
import os
from functools import lru_cache

import numpy as np
import pandas as pd

In [2]:
# Load the dataset (tab-separated SMS spam collection).
# The location defaults to the original local path, but can be overridden with
# the SMS_SPAM_TSV environment variable so the notebook also runs on machines
# where the file lives somewhere else.
DATA_PATH = os.environ.get(
    "SMS_SPAM_TSV",
    "D:/code/Python/Model/SpamDetector/smsspamcollection.tsv",
)
df = pd.read_csv(DATA_PATH, sep='\t')

In [3]:
# Preview the first rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum(axis=0)

label      0
message    0
length     0
punct      0
dtype: int64

In [5]:
# Check label distribution.
# The output below shows the classes are imbalanced (far more ham than spam),
# so accuracy alone is not a sufficient metric for this dataset.
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [6]:
# Drop any rows of df that contain missing (NaN) values (disabled: the null check above found none).
# df.dropna(inplace=True)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Features: the raw message text.
x = df['message']

In [9]:
# Target: the ham/spam label of each message.
y = df['label']

In [10]:
# Hold out 33% of the messages for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the label counts printed earlier are imbalanced; consider passing
# stratify=y so both splits keep the same ham/spam ratio (this would change the
# reported metrics, so it is left as-is here).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [11]:
# Cells 11-20 can be collapsed into a single Pipeline statement, sketched (commented out) in cell 21.
# Text preprocessing (tokenizing, stop-word removal, etc.) is done through the TfidfVectorizer imported below.
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [13]:
# Download the NLTK resources used by the tokenizer defined in the next cell.
# Each call is a no-op when the package is already up to date (see the log below).
# Stop-word lists, read via stopwords.words('english').
nltk.download('stopwords')
# WordNet data backing WordNetLemmatizer.
nltk.download('wordnet')
# Punkt tokenizer models used by word_tokenize.
# NOTE(review): newer NLTK releases may require 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tiwar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tiwar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tiwar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
# Define the custom tokenizer function
@lru_cache(maxsize=1)
def _nlp_resources():
    """Build the NLTK English stop-word set and the lemmatizer once, then reuse them."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def tokenize_and_lemmatize(text):
    """Tokenize a message into lower-cased, lemmatized content words.

    The text is lower-cased and split with NLTK's ``word_tokenize``; tokens that
    are not purely alphabetic or that appear in NLTK's English stop-word list
    are dropped, and the remaining tokens are lemmatized with WordNet.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        Lemmatized tokens, in their original order.
    """
    # The vectorizer calls this once per document, so the stop-word list and
    # lemmatizer come from a cached helper instead of being rebuilt every call.
    stop_words, lemmatizer = _nlp_resources()
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]

In [15]:
# TF-IDF vectorizer driven by the custom NLTK tokenizer.
# token_pattern=None: the default regex pattern is never used once a custom
# tokenizer is supplied, and leaving it at its default makes scikit-learn emit
# a "token_pattern will not be used" warning on every fit.
# NOTE(review): tokenize_and_lemmatize already removes NLTK stop words;
# stop_words='english' additionally filters its output with scikit-learn's own
# list. It is kept so the vocabulary is unchanged - confirm both are wanted.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_lemmatize,
    token_pattern=None,
    stop_words='english',
)

In [16]:
# Two-step form: fit the vectorizer on the training text (builds the vocabulary
# and term statistics), then transform the messages into TF-IDF vectors:
#   vectorizer.fit(x_train)
#   x_train_tfidf = vectorizer.transform(x_train)

# Equivalent one-step form, used here:

# Fit on the training split only and vectorize it in the same call.
x_train_tfidf = vectorizer.fit_transform(x_train)
# Reuse the already-fitted vocabulary for the test split (transform, not
# fit_transform) so no information from the test messages leaks into the features.
x_test_tfidf = vectorizer.transform(x_test)



In [17]:
# Inspect the training matrix: rows are messages, columns are vocabulary terms.
x_train_tfidf

<3733x5321 sparse matrix of type '<class 'numpy.float64'>'
	with 26510 stored elements in Compressed Sparse Row format>

In [18]:
# from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [19]:
# Classifier: multinomial Naive Bayes on the TF-IDF features.
# Alternative that was tried (requires the LinearSVC import):
# clf = LinearSVC()
clf = MultinomialNB()

In [20]:
# Train the classifier on the vectorized training messages and their labels.
clf.fit(X=x_train_tfidf, y=y_train)

In [21]:
# With a Pipeline, several operations can be performed in very few lines.
# from sklearn.pipeline import Pipeline
# text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])
# text_clf.fit(x_train,y_train)

In [22]:
# Predicted labels for the training split, from the separately fitted
# vectorizer + classifier.
# Pipeline equivalent: y_train_pred = text_clf.predict(x_train)
y_train_pred = clf.predict(X=x_train_tfidf)

In [23]:
# Predicted labels for the held-out test split.
y_test_pred = clf.predict(X=x_test_tfidf)

In [24]:
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score

In [25]:
# Score the training-set predictions (fraction of labels predicted correctly).
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
print(f"Training Accuracy: {train_accuracy}")

Training Accuracy: 0.9740155371015269


In [26]:
# Score the test-set predictions (fraction of labels predicted correctly).
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.967373572593801


In [27]:
# Per-class precision / recall / F1 on the test split.
report = classification_report(y_true=y_test, y_pred=y_test_pred)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1593
        spam       0.99      0.76      0.86       246

    accuracy                           0.97      1839
   macro avg       0.98      0.88      0.92      1839
weighted avg       0.97      0.97      0.97      1839



In [28]:
# Confusion matrix on the test split: rows are true labels, columns are predictions.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[1592    1]
 [  59  187]]


In [35]:
# Sanity check with a spam-like message.
sample_message = (
    "Congratulations! You've been selected as a winner. "
    "Text Won to 44255 congratulations free entry to the concert."
)
# Vectorize with the already-fitted vectorizer so the features match training.
sample_tfidf = vectorizer.transform(raw_documents=[sample_message])
# Classify the single vectorized message.
clf.predict(X=sample_tfidf)

array(['spam'], dtype='<U4')

In [36]:
# Sanity check with an ordinary conversational message.
sample_message = "Hi how are you doing today?"
# Vectorize with the already-fitted vectorizer so the features match training.
sample_tfidf = vectorizer.transform(raw_documents=[sample_message])
# Classify the single vectorized message.
clf.predict(X=sample_tfidf)

array(['ham'], dtype='<U4')