In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#df=pd.read_csv('spam.csv')
try:
  df=pd.read_csv('spam.csv',encoding='utf-8')
except UnicodeDecodeError:
        df=pd.read_csv('spam.csv',encoding='latin-1')

In [3]:
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


1. data cleaning

In [5]:
df.isnull().sum()

Unnamed: 0,0
v1,0
v2,0
Unnamed: 2,5522
Unnamed: 3,5560
Unnamed: 4,5566


In [6]:
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [7]:
df.columns

Index(['v1', 'v2'], dtype='object')

In [8]:
df = df.rename(columns={'v1':'Label','v2':'Message'})

In [9]:
df

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [10]:
df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
df['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
ham,4825
spam,747


2.convert categorical to numerical

In [12]:
from sklearn.preprocessing  import LabelEncoder
encoder = LabelEncoder()
df['Label'] = encoder.fit_transform(df['Label'])

In [13]:
df

Unnamed: 0,Label,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


3.train and test the spam detector

In [14]:
x=df['Message']
y=df['Label']

In [15]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [16]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(4457,)
(1115,)
(4457,)
(1115,)


4.model building

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
model_pipeline = Pipeline([
    ('tfidf',TfidfVectorizer(stop_words='english')),
    ('classifier',MultinomialNB())
])

In [19]:
model_pipeline.fit(x_train,y_train)

In [20]:
y_pred = model_pipeline.predict(x_test)

In [21]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [22]:
accuracy=accuracy_score(y_pred,y_test)*100
print('accuracy score:',accuracy)

accuracy score: 96.68161434977578


In [23]:

from sklearn.metrics import classification_report

In [24]:
report=classification_report(y_pred,y_test,target_names=['ham','spam'])
print('classification_report:',report)

classification_report:               precision    recall  f1-score   support

         ham       1.00      0.96      0.98      1002
        spam       0.75      1.00      0.86       113

    accuracy                           0.97      1115
   macro avg       0.88      0.98      0.92      1115
weighted avg       0.98      0.97      0.97      1115



example

In [25]:

test_spam = ["WINNER! Your account has won a FREE prize! Text 'CLAIM' to 88899 now."]
pred_spam = model_pipeline.predict(test_spam)[0]
result_spam = 'SPAM' if pred_spam == 1 else 'NOT SPAM (Ham)'
print(f"'{test_spam[0]}' -> Predicted: {result_spam}")

'WINNER! Your account has won a FREE prize! Text 'CLAIM' to 88899 now.' -> Predicted: SPAM
