In [1]:
import pandas as pd

df = pd.read_csv("spam_data/spam.csv", encoding='latin1')
print(df.head())
print(df.columns)


     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


**CHeck for missing values**

In [2]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


**Unnamed 2,3,4 have mostly NaN values and they are not usefull for spam detcetion so we gotta remove them using garbage collector**

In [3]:
df=df[['v1','v2']]

In [4]:
df.columns=['label','sms-message']

In [5]:
df.head()

Unnamed: 0,label,sms-message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**converting label to 0,1**

In [6]:
df['label']=df['label'].map({'ham':0,'spam':1})

In [7]:
df.head()

Unnamed: 0,label,sms-message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df.isnull().head()

Unnamed: 0,label,sms-message
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False


**'NLP' using this we clean the text**

In [9]:
df['sms-message']=df['sms-message'].str.lower()

In [10]:
df['sms-message'].head()

0    go until jurong point, crazy.. available only ...
1                        ok lar... joking wif u oni...
2    free entry in 2 a wkly comp to win fa cup fina...
3    u dun say so early hor... u c already then say...
4    nah i don't think he goes to usf, he lives aro...
Name: sms-message, dtype: object

In [11]:
import string

In [12]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [13]:
df['sms-message']=df['sms-message'].apply(lambda msg:msg.translate(str.maketrans('','',string.punctuation)))

In [14]:
df['sms-message'].head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: sms-message, dtype: object

**Removing stop words**

In [32]:
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
df['sms-message'] = df['sms-message'].apply(lambda msg: ' '.join(word for word in msg.split() if word not in stop_words))

In [33]:
df['sms-message'].head()

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry 2 wkly comp win fa cup final tkts 2...
3                  u dun say early hor u c already say
4          nah dont think goes usf lives around though
Name: sms-message, dtype: object

In [34]:
df

Unnamed: 0,label,sms-message
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun say early hor u c already say
4,0,nah dont think goes usf lives around though
...,...,...
5567,1,2nd time tried 2 contact u u å£750 pound prize...
5568,0,ì b going esplanade fr home
5569,0,pity mood soany suggestions
5570,0,guy bitching acted like id interested buying s...


**identifing TF-IDF from tht database so that model can be trained**

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [52]:
vectorizer=TfidfVectorizer()

In [53]:
X=vectorizer.fit_transform(df['sms-message'])

In [54]:
y=df['label']

In [55]:
print(X.shape)
print(y.shape)

(5572, 9376)
(5572,)


**spliting the train and test datasets!**

In [56]:
from sklearn.model_selection import train_test_split

In [57]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [58]:
from sklearn.naive_bayes import MultinomialNB

In [59]:
model=MultinomialNB()

In [60]:
model.fit(X_train,y_train)

**Evaluate the model**

In [61]:
from sklearn.metrics import accuracy_score

In [62]:
y_p=model.predict(X_test)

In [63]:
print("Accuracy:", accuracy_score(y_test, y_p))

Accuracy: 0.9659192825112107


**A classification report**

In [64]:
from sklearn.metrics import  classification_report

In [65]:
print(classification_report(y_test,y_p))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115



**Final step to make a prediction using real message**

In [77]:
def predict_message(mg):
    mg=mg.lower()
    mg=mg.translate(str.maketrans('', '', string.punctuation))
    mg=' '.join(word for word in mg.split() if word not in stop_words)
    mg_vectorizor=vectorizer.transform([mg])

    prediction=model.predict(mg_vectorizor)[0]
    return 'spam' if(prediction==1) else 'Ham'

In [84]:
print(predict_message('Congratulations! You have won a $1000 prize, click here now!'))
print(predict_message('hey bro Maheshwar wassup!?'))

spam
Ham


**DONE**