In [1]:
import pandas as pd


file_path = 'Spam_SMS.csv'
data = pd.read_csv(file_path)


data.head()

Unnamed: 0,Class,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    filtered_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    
    return ' '.join(filtered_tokens)

data['Processed_Message'] = data['Message'].apply(preprocess_text) #


data[['Message', 'Processed_Message']].head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sriharsha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sriharsha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Message,Processed_Message
0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


tfidf_vectorizer = TfidfVectorizer(max_features=3000) # object
X = tfidf_vectorizer.fit_transform(data['Processed_Message']) # Input Data


y = (data['Class'] == 'spam').astype(int)


In [6]:
print(X[0])

  (0, 2807)	0.24263532812921912
  (0, 1037)	0.20268022213530607
  (0, 559)	0.3689739449553289
  (0, 1355)	0.3689739449553289
  (0, 2903)	0.29681289042883874
  (0, 1051)	0.2430987695126441
  (0, 465)	0.3689739449553289
  (0, 331)	0.33212000644245443
  (0, 638)	0.33823957042715463
  (0, 1944)	0.29823016936949226
  (0, 1015)	0.17543564536203393


In [9]:
print(y)

0       0
1       0
2       1
3       0
4       0
       ..
5569    1
5570    0
5571    0
5572    0
5573    0
Name: Class, Length: 5574, dtype: int32


In [10]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = LogisticRegression()
model.fit(X_train, y_train)


y_pred = model.predict(X_test)


In [11]:


accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

accuracy*100

96.32286995515696

In [9]:
print(report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       954
           1       0.97      0.77      0.86       161

    accuracy                           0.96      1115
   macro avg       0.97      0.88      0.92      1115
weighted avg       0.96      0.96      0.96      1115



In [10]:
import pickle

# Serializing the model
model_file_path = 'spam_classifier_model.pkl'
vectorizer_file_path = 'tfidf_vectorizer.pkl'


with open(model_file_path, 'wb') as file:
    pickle.dump(model, file)


with open(vectorizer_file_path, 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)


