![](https://bl3301files.storage.live.com/y4pXqMEhHh_1akHrM0TzWl-xSKzVyNxFtWrbwVPbfoZpMf7vj7zd2ayPElLwPmkisD0wx6MgU-xO2Uuk60lwOPmLD6msaNAWF4uBDA2YWXJU5f6YAEE2TLjTLLZmmO7WgRZDFqe5edrQALlJKS3W9TJuX2PcVGfR6X3AHDKK9thFDMe3CgCEsT39zdiDzua8ano/dataset-card.jpg?psid=1&width=600&height=281)

### Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.rcParams['patch.force_edgecolor']=True

### Importing the dataset

In [None]:
# Load the raw CSV from the relative input directory using pandas' Python parser engine.
# NOTE(review): no explicit encoding is passed — confirm the file decodes correctly
# with the default on the pandas version in use.
Data = pd.read_csv("../input/spam.csv",engine='python')
# Preview the first rows of the freshly loaded frame.
Data.head()

In [None]:
# Discard the three 'Unnamed' columns, then give 'v1' and 'v2' descriptive names.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
readable_names = {'v1': 'Category', 'v2': 'Message'}
Data = Data.drop(columns=unused_columns).rename(columns=readable_names)

In [None]:
# Preview the frame after the columns were dropped and renamed.
Data.head()

In [None]:
# Report the dataset size and the number of missing values in each column.
# len(Data) is the row count; Data.index.max() is only the largest index label,
# which is one less than the row count for a default 0-based index.
print('No. of Samples: {}'.format(len(Data)))
print('No. of nulls:\n{}'.format(Data.isnull().sum()))

In [None]:
# Character count of every message, stored as a new column.
Data['Msg_Length'] = Data['Message'].apply(len)

In [None]:
# Preview the frame, now including the Msg_Length column.
Data.head()

In [None]:
# Number of messages per Category value (class balance).
Data['Category'].value_counts()

In [None]:
# Use seaborn's 'notebook' plotting context with fonts scaled by a factor of 2.
sns.set_context(context='notebook',font_scale=2)
# Histogram of message length with 100 bins, one panel per Category value.
Data.hist(column='Msg_Length',by='Category',bins=100,figsize=(16,6))

In [None]:
# Mean message length (in characters) for each class.
spam_lengths = Data.loc[Data['Category'] == 'spam', 'Msg_Length']
ham_lengths = Data.loc[Data['Category'] == 'ham', 'Msg_Length']
print('Average length of spam messages: ', spam_lengths.mean(), 'characters')
print('Average length of ham messages: ', ham_lengths.mean(), 'characters')

## Text Cleaning

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Build the cleaned corpus: one normalised, stop-word-free string per message.
# The stop-word set is built once here; rebuilding it for every word inside the
# loop repeats the same work for each token.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate the column itself so the loop covers every row, whatever the dataset size.
for message in Data['Message']:
    # Replace every non-letter with a space, lowercase, and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    corpus.append(' '.join(token for token in tokens if token not in stop_words))

In [None]:
# Show the first five cleaned messages.
corpus[0:5]

### Choosing an upper bound on the number of words for the count vectorizer, to reduce the size of the sparse matrix

In [None]:
# Collect every non-stop-word token across all messages and count how often
# each distinct word occurs.
# The stop-word set is built once instead of once per token.
stop_words = set(stopwords.words('english'))

words = []
# Iterate the column itself so the loop covers every row, whatever the dataset size.
for message in Data['Message']:
    # Replace every non-letter with a space, lowercase, and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    words.extend(token for token in tokens if token not in stop_words)

# One row per distinct word, ordered by descending frequency (value_counts default).
df = pd.DataFrame(words, columns=['Words'])
df = df['Words'].value_counts().to_frame().reset_index()
df.head()

In [None]:
# df holds one row per distinct word, so its length is the number of distinct words.
# (df.index.max() is the last index label, which is one less than the row count.)
print('Total words in whole dataset: ',len(df))
# Pick the count column by dtype rather than by name: depending on the pandas
# version, value_counts().to_frame().reset_index() names it 'Words' or 'count'.
count_column = df.select_dtypes(include='number').columns[0]
# Keep only the words that occur more than 5 times.
df = df[df[count_column] > 5]
print('Total words with frequency greater than 5 in whole dataset: ',len(df))

### Creating the bag of words model with max features = 1500

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model limited to 1500 features.
cv = CountVectorizer(max_features=1500)
bag_of_words = cv.fit_transform(corpus)

# Dense count matrix as the features; the first column of Data as the target.
X = bag_of_words.toarray()
y = Data.iloc[:, 0].values

In [None]:
# Display the feature matrix.
X

### Let's add message length as a feature.

In [None]:
# Append the message length as one extra feature column on the right of X.
# reshape(-1, 1) lets NumPy infer the row count instead of hard-coding it.
# NOTE: X is overwritten, so re-running this cell appends the column again.
msg_length_column = np.array(Data['Msg_Length']).reshape(-1, 1)
X = np.concatenate((X, msg_length_column), axis=1)
X

In [None]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

In [None]:
# Display the target array.
y

### Baseline algorithm test
Let's test some algorithms and find out the best one for this problem.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,classification_report

In [None]:
# Candidate models, each wrapped in a single-step Pipeline and paired with a display name.
# Estimators that use randomness during fitting get a fixed random_state so the
# comparison between models is repeatable from one run to the next.
SEED = 42
Pipelines = [
    ('Random Forest Classifier', Pipeline(steps=[('RF', RandomForestClassifier(random_state=SEED))])),
    ('SVC', Pipeline([('SVC', LinearSVC(random_state=SEED))])),
    ('MultinomialNB', Pipeline([('mNB', MultinomialNB())])),
    ('KNeighborsClassifier', Pipeline([('KNN', KNeighborsClassifier())])),
    ('GradientBoostingClassifier', Pipeline([('GBC', GradientBoostingClassifier(random_state=SEED))])),
    ('LogisticRegression', Pipeline([('LR', LogisticRegression())])),
]

In [None]:
# Fit every candidate on the training split and report its confusion matrix (CM)
# and classification report (CR) on the test split.
for name, model in Pipelines:
    model.fit(X_train, y_train)
    # Predict once and reuse the result for both reports.
    y_pred = model.predict(X_test)
    print('CM of '+name+':'+'\n',confusion_matrix(y_test,y_pred),'\n')
    print('CR of '+name+':'+'\n',classification_report(y_test,y_pred),'\n')

Clearly, MultinomialNB is outperforming the other models.

## Spam detection model

In [None]:
# Train the chosen multinomial Naive Bayes model and evaluate it on the test split.
Classifier = MultinomialNB()
Classifier.fit(X_train, y_train)
# Predict once and reuse the result for both reports.
y_pred = Classifier.predict(X_test)
print('Confusion Matrix: \n', confusion_matrix(y_test,y_pred))
print('Classification Report: \n', classification_report(y_test,y_pred))

## Visualizations

In [None]:
# Split the frame by class and start with empty text buffers for the word clouds.
spam = Data.loc[Data['Category'] == 'spam']
ham = Data.loc[Data['Category'] == 'ham']
spam_words, ham_words = '', ''

In [None]:
def _joined_tokens(messages, stop_words):
    """Return the cleaned words of all messages as one string.

    Each message has its non-letters replaced by spaces, is lowercased and split
    on whitespace; words found in ``stop_words`` are dropped. Every kept word is
    followed by a single space, so the result is '' when no word survives.
    """
    pieces = []
    for message in messages:
        tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
        pieces.extend(token + ' ' for token in tokens if token not in stop_words)
    # A single join avoids growing the string one word at a time.
    return ''.join(pieces)


# Build the stop-word set once rather than once per token.
stop_words = set(stopwords.words('english'))

# Assign the results directly so re-running the cell does not append duplicates,
# and so no loop variable overwrites other notebook-level names.
spam_words = _joined_tokens(spam['Message'], stop_words)
ham_words = _joined_tokens(ham['Message'], stop_words)

In [None]:
from wordcloud import WordCloud

In [None]:
# Generate one word cloud image per class, using the same canvas size and seed for both.
cloud_options = dict(width=1200, height=720, random_state=101)
spam_wordcloud = WordCloud(**cloud_options).generate(spam_words)
ham_wordcloud = WordCloud(**cloud_options).generate(ham_words)

In [None]:
# Spam word cloud: draw the image on a white 16x9 figure with the axes hidden.
fig, ax = plt.subplots(figsize=(16, 9), facecolor='w')
ax.imshow(spam_wordcloud)
ax.set_title('Spam word cloud')
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

In [None]:
# Ham word cloud: draw the image on a white 16x9 figure with the axes hidden.
fig, ax = plt.subplots(figsize=(16, 9), facecolor='w')
ax.imshow(ham_wordcloud)
ax.set_title('Ham word cloud')
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

## I hope you enjoyed reading this kernel. Any suggestions? :)