In [11]:
import pickle

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [12]:
# Load the raw mail dataset from the working directory into the frame `a`.
a = pd.read_csv(filepath_or_buffer="mail_data.csv")

In [13]:
# Keep every non-null cell as-is and substitute an empty string for missing ones.
a = a.where(cond=a.notna(), other='')


In [14]:
# Display (n_rows, n_columns) of the loaded frame as the cell output.
a.shape

(5572, 2)

In [15]:
# Preview the first five rows before the labels are encoded.
a.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
# Label encoding: spam -> 0, ham -> 1 (written back into the "Category" column).
is_spam = a["Category"] == "spam"
is_ham = a["Category"] == "ham"

a.loc[is_spam, "Category"] = 0
a.loc[is_ham, "Category"] = 1


In [17]:
# Preview the first five rows again to confirm the labels are now 0/1.
a.head(n=5)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
# Separate the message text (model input) from the encoded labels (target).
x, y = a["Message"], a["Category"]


In [19]:
# Split into training and testing data: 20% is held out for testing and the
# fixed random_state keeps the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

# Sanity check: sizes of the full, training and testing sets.
print(x.shape, x_train.shape, x_test.shape)

(5572,) (4457,) (1115,)


In [20]:
# Feature extraction: turn each message into a TF-IDF vector, lowercasing the
# text and dropping English stop words.
b = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit on the training messages only, then reuse the fitted vectorizer for the
# test messages so both sets share the same feature space.
x_train_features = b.fit_transform(x_train)
x_test_features = b.transform(x_test)

In [21]:
# Cast both label series to integers before they are handed to the classifier.
y_train, y_test = y_train.astype(int), y_test.astype(int)


In [22]:
# Training model: logistic regression with default settings, fitted on the
# TF-IDF features of the training messages.
model = LogisticRegression()

# Kept as the last expression so the cell still displays the fitted estimator.
model.fit(X=x_train_features, y=y_train)


In [23]:
# Prediction on training data: score the model on the same messages it was
# fitted on (an optimistic figure, useful for comparison with the test score).
pred_train = model.predict(x_train_features)

# accuracy_score expects (y_true, y_pred); the ground truth goes first.
accuracy_train = accuracy_score(y_train, pred_train)
print("Accuracy of training data:", accuracy_train)

Accuracy of training data: 0.9676912721561588


In [24]:
# Prediction on testing data: score the model on the held-out messages.
pred_test = model.predict(x_test_features)

# accuracy_score expects (y_true, y_pred); the ground truth goes first.
accuracy_test = accuracy_score(y_test, pred_test)
print("Accuracy of testing data:", accuracy_test)

Accuracy of testing data: 0.9668161434977578


In [25]:
# Building the predictive system: classify one hand-written message.
input_mail = [
    "SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"
]

# Convert the text into feature vectors with the already-fitted vectorizer.
input_mail_features = b.transform(input_mail)

prediction = model.predict(input_mail_features)
print(prediction)

# Label 1 was assigned to "ham" during encoding; anything else is reported as spam.
print("It is not a spam mail" if prediction[0] == 1 else "It is a spam mail")



[0]
It is a spam mail


In [26]:
# Classify a second, conversational message that should come out as ham.
# NOTE(review): the sample text now uses neutral wording in place of an
# offensive term; re-run this cell to refresh the recorded output.
input_mail = ["saksham pls return home your friends are here and they hungy guys already emptied ration"]

# Convert the text into feature vectors with the already-fitted vectorizer.
input_mail_features = b.transform(input_mail)

prediction = model.predict(input_mail_features)
print(prediction)

# Label 1 was assigned to "ham" during encoding; anything else is reported as spam.
if prediction[0] == 1:
    print("It is not a spam mail")
else:
    print("It is a spam mail")


[1]
It is not a spam mail


In [27]:
# Persist the fitted classifier and the fitted TF-IDF vectorizer side by side:
# both are needed at inference time, because new text must be transformed with
# the same vectorizer before the model can score it.
# NOTE: `pickle` is imported in the import cell at the top of the notebook.
# Pickle files should only ever be loaded from trusted sources, and ideally with
# the same scikit-learn version that wrote them.
with open("spam_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(b, vectorizer_file)

print("Model and vectorizer saved successfully!")

Model and vectorizer saved successfully!


In [32]:
# Per-class precision / recall / F1 on the held-out test set.
# Rows follow the label encoding used earlier: 0 = spam, 1 = ham.
# NOTE: `classification_report` is imported in the import cell at the top of
# the notebook.
y_pred = model.predict(x_test_features)
print("📊 Classification Report:\n")
print(classification_report(y_test, y_pred))

📊 Classification Report:

              precision    recall  f1-score   support

           0       1.00      0.76      0.86       155
           1       0.96      1.00      0.98       960

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

