# Spam Detection

![](https://miro.medium.com/max/1024/1*pWzWiDuVsox3nmeK_8pqlg.png)

# **Please Upvote if you like my work. Thanks and God Bless you!!!**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the rest of the session to keep the notebook output clean.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw SMS spam collection.
# The Kaggle spam.csv file is not valid UTF-8 (it contains Latin-1 encoded bytes),
# so the encoding is given explicitly; with the default UTF-8 decoding
# read_csv fails with a UnicodeDecodeError.
sms=pd.read_csv('../input/sms-spam-collection-dataset/spam.csv',encoding='latin-1')
sms.head()

In [None]:
# Show full cell contents instead of truncating long strings, so whole messages are visible
pd.set_option('display.max_colwidth', None)

In [None]:
# Preview the first rows again, now that display.max_colwidth is None and messages are not truncated
sms.head()

In [None]:
#Remove the unnecessary 'Unnamed' columns, then give the remaining
#two columns descriptive names: v1 -> label, v2 -> message
unused_columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4']
readable_names={'v1':'label','v2':'message'}

sms=sms.drop(columns=unused_columns).rename(columns=readable_names)

In [None]:
# Check the frame after dropping the unused columns and renaming v1/v2
sms.head()

In [None]:
# (number of rows, number of columns) of the frame
sms.shape

The dataset has 5572 rows and 2 columns.

In [None]:
# Number of messages per label value
sms['label'].value_counts()

In [None]:
# Share of each class among all messages, rounded to 2 decimals and shown as a percentage
total_labels=len(sms['label'])
for caption,tag in (('Spam Ratio = ','spam'),('Ham Ratio = ','ham')):
    matching_rows=sms[sms["label"]==tag]
    print(caption,np.round(len(matching_rows)/total_labels,2)*100)

In [None]:
#New column for length of message
# (character count of the raw text, taken before any cleaning is applied)
sms['length']=sms['message'].str.len()
sms.head()

In [None]:
#Label coding 0 and 1 (ham -> 0, spam -> 1)
# The result is assigned back to the column instead of calling replace(..., inplace=True)
# on sms['label']: that form is chained assignment, which acts on a temporary
# object and leaves the frame unchanged under pandas copy-on-write.
# astype(int) guarantees an integer column for the comparisons and models below.
sms['label']=sms['label'].replace({'ham':0,'spam':1}).astype(int)

In [None]:
# Label counts again, now on the 0/1 encoded column
sms['label'].value_counts()

In [None]:
#Convert all messages to lower case
# (the patterns and stopword list applied afterwards are written in lower case)
sms['message']=sms['message'].str.lower()

In [None]:
# Preview the lower-cased messages
sms.head()

In [None]:
# Replace emails, URLs, currency markers, phone numbers and numbers with placeholder tokens.
# regex=True is passed explicitly on every call: since pandas 2.0 Series.str.replace
# treats the pattern as a literal string by default, which would turn all of
# these substitutions into silent no-ops.

#Replace email addresses with 'email_address'
sms['message']=sms['message'].str.replace(r'\w+\.*\w+@\w+\.\w+','email_address',regex=True)

#URL's with 'web_address' (http or https with any host/path, and bare www. links)
sms['message']=sms['message'].str.replace(r'https?://\S+|www\.\S+','web_address',regex=True)

#Replace money symbols with 'moneysymb'
# 'rs' only counts as a currency marker when it starts a word and is not followed
# by another letter, so ordinary words such as "years" or "hours" are left intact.
sms['message']=sms['message'].str.replace(r'[$€₹£]|\brs(?![a-z])\.*','moneysymb',regex=True)

#Replace phone numbers with 'phone_num'
# {11,12} is greedy, so a 12-digit number is consumed whole instead of
# matching its first 11 digits and leaving a stray digit behind.
sms['message']=sms['message'].str.replace(r'\d{11,12}','phone_num',regex=True)

#Replace number with 'num_ber'
# The pattern needs at least two digits, so lone digits are left untouched.
sms['message']=sms['message'].str.replace(r'\d+\.{0,1}\d+','num_ber',regex=True)

In [None]:
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default.

#Remove punctuations (anything that is neither a word character nor whitespace)
sms['message']=sms['message'].str.replace(r'[^\w\s]',' ',regex=True)

#Replace white spaces with one space
# \s+ collapses a whole run of whitespace; the character class [\s+] would
# replace characters one by one and would also match a literal '+'.
sms['message']=sms['message'].str.replace(r'\s+',' ',regex=True)

#Remove leading and trailing space
sms['message']=sms['message'].str.strip()

In [None]:
#Removing Stopwords
from nltk.corpus import stopwords

# NLTK's English stopword list extended with a few extra short forms
extra_stopwords=['u','ū','ur','im','dont','doin','ure']
sw=set(stopwords.words('english')+extra_stopwords)


def _drop_stopwords(text):
    """Return text with every whitespace-separated term found in sw removed."""
    kept_terms=[term for term in text.split() if term not in sw]
    return ' '.join(kept_terms)


sms['message']=sms['message'].apply(_drop_stopwords)

In [None]:
#New column (clean length) after removal of punctuations and stopwords
# (character count of the processed message, comparable with the 'length' column)
sms['clean_length']=sms['message'].str.len()
sms.head()

In [None]:
# Total character count over all messages, before and after cleaning
for caption,column in (('Original length','length'),('Cleaned length','clean_length')):
    print(caption,sms[column].sum())

In [None]:
#Message distribution before cleaning (spam on the left axis, ham on the right)
f,ax=plt.subplots(1,2,figsize=(15,8))

# sns.distplot is deprecated and scheduled for removal from seaborn; histplot with
# kde=True and stat='density' is its documented replacement (cut=3 keeps the
# KDE curve extending past the data range as distplot did).
panels=((1,'Spam','r'),(0,'Ham','b'))
for axis,(label_code,class_name,colour) in zip(ax,panels):
    lengths=sms.loc[sms['label']==label_code,'length']
    sns.histplot(lengths,bins=20,kde=True,stat='density',kde_kws={'cut':3},
                 ax=axis,label=f'{class_name} message distribution',color=colour)
    axis.set_xlabel(f'{class_name} message length')
    axis.legend()

plt.show()

In [None]:
#Message distribution after cleaning (spam on the left axis, ham on the right)
f,ax=plt.subplots(1,2,figsize=(15,8))

# sns.distplot is deprecated and scheduled for removal from seaborn; histplot with
# kde=True and stat='density' is its documented replacement (cut=3 keeps the
# KDE curve extending past the data range as distplot did).
panels=((1,'Spam','r'),(0,'Ham','b'))
for axis,(label_code,class_name,colour) in zip(ax,panels):
    lengths=sms.loc[sms['label']==label_code,'clean_length']
    sns.histplot(lengths,bins=20,kde=True,stat='density',kde_kws={'cut':3},
                 ax=axis,label=f'{class_name} message distribution',color=colour)
    axis.set_xlabel(f'{class_name} message length')
    axis.legend()

plt.show()

In [None]:
#Getting sense of loud words in spam
from wordcloud import WordCloud

# Messages whose label is 1, concatenated into one text for the cloud
spams=sms.loc[sms['label']==1,'message']
spam_text=' '.join(spams)

cloud_builder=WordCloud(width=1200,height=800,background_color='white',max_words=25)
spamcloud=cloud_builder.generate(spam_text)

# Draw the cloud on a red-backed figure, without axes or padding
plt.figure(figsize=(12,8),facecolor='r')
plt.imshow(spamcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

We can clearly see that words such as free, prize, win, claim, and cash are strong indicators of spam.

In [None]:
#Getting sense of loud words in Ham
from wordcloud import WordCloud

# Messages whose label is 0. They are kept under ham-specific names so the
# ham data is not stored in (and does not overwrite) variables named after spam.
hams=sms['message'][sms['label']==0]
hamcloud=WordCloud(width=1200,height=800,background_color='white',max_words=25).generate(' '.join(hams))

# Draw the cloud on a black-backed figure, without axes or padding
plt.figure(figsize=(12,8),facecolor='k')
plt.imshow(hamcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
#Convert text into vectors
#Split feature and label
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

x=sms['message']
y=sms['label']

# Split the raw text BEFORE vectorising. Fitting TF-IDF on the whole corpus
# would let the test messages shape the vocabulary and IDF weights
# (data leakage), which inflates the evaluation scores.
msg_train,msg_test,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=7)

# Learn vocabulary and IDF weights from the training messages only, then
# project the held-out messages into that same feature space.
tf_vec=TfidfVectorizer()
xtrain=tf_vec.fit_transform(msg_train)
xtest=tf_vec.transform(msg_test)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the training split
# and predict labels for the held-out split
naiv=MultinomialNB().fit(xtrain,ytrain)
pred=naiv.predict(xtest)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped the accuracy is unchanged, but the confusion matrix is transposed and
# precision/recall are exchanged in the classification report.
print('Accuracy score', accuracy_score(ytest,pred))
print('-----------------------------------------')
print('Confusion Matrix')
print(confusion_matrix(ytest,pred))
print('-----------------------------------------')
print('Classification Report')
print(classification_report(ytest,pred))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so repeated runs give the same model and scores;
# the value matches the random_state used for the train/test split.
model=RandomForestClassifier(random_state=7)
model.fit(xtrain,ytrain)
pred=model.predict(xtest)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped the accuracy is unchanged, but the confusion matrix is transposed and
# precision/recall are exchanged in the classification report.
print('Accuracy score', accuracy_score(ytest,pred))
print('-----------------------------------------')
print('Confusion Matrix')
print(confusion_matrix(ytest,pred))
print('-----------------------------------------')
print('Classification Report')
print(classification_report(ytest,pred))