# Spam Message Classifier

**Table of Contents**<br>
<ol>
    <li>Importing Libraries</li>
    <li>Data Visualization</li>
    <li>Data Preprocessing</li>
    <li>Model Training</li>
    <li>Conclusion</li>
</ol>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing libraries**

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

**Reading the dataset**

In [None]:
# Read the SMS spam CSV from the Kaggle input directory, decoding it as latin1.
df = pd.read_csv('../input/sms-spam-collection-dataset/spam.csv',encoding='latin1')
# Preview the first rows of the loaded frame.
df.head()


In [None]:
# Count the missing entries in each of the three unnamed trailing columns.
# Each reported value is taken from the column named in its own message.
print("Null Values entry in Unnamed:2 column={}\nNull Values entry in Unnamed:3 column={}\nNull Values entry in Unnamed:4 column={}".format(
    df['Unnamed: 2'].isnull().sum(),
    df['Unnamed: 3'].isnull().sum(),
    df['Unnamed: 4'].isnull().sum(),
))

In [None]:
# Report the (rows, columns) shape of the frame before any columns are dropped.
print("shape of dataset: {}".format(df.shape))

In [None]:
# Remove the three unnamed trailing columns from the frame in place.
unnamed_cols = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df.drop(columns=unnamed_cols, inplace=True)

In [None]:
# Preview the first ten rows after removing the unnamed columns.
df.head(10)

In [None]:
# Give the two remaining columns descriptive names: v1 -> Label, v2 -> Msg.
df.rename(columns={'v1':'Label','v2':'Msg'},inplace=True)

# Data Visualization

In [None]:
# Count the messages per class once and look the counts up by class name.
# Integer keys on a string-indexed Series rely on a deprecated positional
# fallback and on value_counts() happening to sort 'ham' first.
label_counts = df['Label'].value_counts()
print("total no. of ham msgs: {}\ntotal no. of spam msgs: {}".format(
    label_counts.get('ham', 0),   # 0 if the class is absent instead of a KeyError
    label_counts.get('spam', 0),
))
label_counts.plot.bar()

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# Store the character count of every message next to its text.
df['len'] = df['Msg'].map(len)
df.head()

Let's look at the longest message and its label.

In [None]:
# Row label of the longest message; idxmax returns the first one on ties.
longest_idx = df['len'].idxmax()
longest_row = df.loc[longest_idx]
print("Message={}\n\nLabel={}".format(longest_row['Msg'], longest_row['Label']))

Let's look at the shortest message and its label.

In [None]:
# Row label of the shortest message; idxmin returns the first one on ties.
shortest_idx = df['len'].idxmin()
shortest_row = df.loc[shortest_idx]
print("Message={}\n\nLabel={}".format(shortest_row['Msg'], shortest_row['Label']))

In [None]:
# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in newer
# matplotlib releases and the old names were later removed, so use whichever
# name this installation actually provides (and keep the default style if neither).
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

plt.figure(figsize=(10,5))
# distplot is deprecated in favour of histplot; fall back to it only on
# seaborn versions that do not have histplot yet.
if hasattr(sns, 'histplot'):
    sns.histplot(df['len'], kde=False, color='red')
else:
    sns.distplot(df['len'], kde=False, color='red', hist=True)
plt.xlabel("Message Length",size=15)
plt.ylabel("Frequency",size=15)
plt.title("Length Histogram",size=15)

In [None]:
plt.figure(figsize=(12, 8))

# Overlay the message-length histograms of the two classes on the same axes.
ham_lengths = df.loc[df['Label'] == 'ham', 'len']
spam_lengths = df.loc[df['Label'] == 'spam', 'len']
ham_lengths.plot(kind='hist', bins=35, color='red', label='Ham messages', alpha=0.6)
spam_lengths.plot(kind='hist', color='blue', label='Spam messages', alpha=0.6)

plt.legend()
plt.xlabel("Message Length")

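From this plot, spam messages appear to be shorter than ham messages. Note that the histograms show raw counts and the two classes are binned differently, so the comparison should be read with care.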

# Data Preprocessing

In [None]:
import re
import nltk
# Fetch the NLTK stop-word corpus so that stopwords.words('english') can be used below.
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
ps = PorterStemmer() # Using porterstemmer for text preprocessing

# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension re-evaluated it for every token and searched a list linearly;
# a set gives constant-time membership tests with the same result.
stop_words = set(stopwords.words('english'))

# Cleaned text per message, in the same order as the rows of df:
#   1. replace every non-letter character with a space,
#   2. lower-case and split on whitespace,
#   3. drop stop words and stem the remaining tokens,
#   4. join the stems back into a single space-separated string.
message = []
for raw_msg in df['Msg']:  # iterate values directly; no dependence on the index labels
    tokens = re.sub('[^a-zA-Z]', ' ', raw_msg).lower().split()
    stems = [ps.stem(token) for token in tokens if token not in stop_words]
    message.append(' '.join(stems))

In [None]:
# Assign the cleaned texts as a whole column in one step. Writing them one by
# one through df['clean_msg'][i] is chained assignment: pandas warns about it,
# and with copy-on-write enabled it does not update the frame at all. It also
# required a float placeholder column that was then filled with strings.
df['clean_msg'] = message
# Character count of each cleaned message.
df['clean_msg_len'] = df['clean_msg'].apply(len)
df.head()


In [None]:
# Summary of the raw message column.
df['Msg'].describe()

In [None]:
# Summary of the cleaned message column, for comparison with the raw text.
df['clean_msg'].describe()

In [None]:
# Encode the target as an integer column named 'label': 1 for 'ham', 0 for 'spam'.
# An explicit comparison keeps the dtype integer on every pandas version;
# get_dummies returns boolean columns by default on pandas >= 2.0, which would
# no longer match the 1/0 encoding described in the note below.
df['label'] = (df['Label'] == 'ham').astype(int)
# The original string column is no longer needed once it has been encoded.
df.drop(columns=['Label'], inplace=True)
df.head()


**Note**: 1-Ham, 0-Spam

# Vectorization

In [None]:
# Feature column: the cleaned message text.
X=df['clean_msg']
X

In [None]:
# Target column: the encoded class label (1 = ham, 0 = spam).
Y=df['label']


**Note: I'm using `CountVectorizer` for vectorization.**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the cleaned messages, capped at 2500 vocabulary terms,
# then expanded from the sparse result into a dense array.
cv = CountVectorizer(max_features=2500)
bow_counts = cv.fit_transform(message)
X = bow_counts.toarray()
X





In [None]:
# Turn the label column into a plain NumPy array and show the resulting type.
Y = np.array(Y)
type(Y)

# Model Training

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.10,
    random_state=0,
)


In [None]:
# Show the dimensions of each of the four arrays produced by the split.
split_shapes = [part.shape for part in (X_train, X_test, y_train, y_test)]
print("X_train shape: {}\n X_test shape: {}\nY_train shape: {}\nY_test shape: {}".format(*split_shapes))

In [None]:
# List for storing the accuracy score of the different algorithms.
# Each model cell below appends its test accuracy here, in the order the cells are run.
acc=[]

**Naive Bayes Classifier**

In [None]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Fit multinomial Naive Bayes on the training counts and predict the held-out set.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)
pred = spam_detect_model.predict(X_test)

# Score once, then both report and record the value.
nb_accuracy = metrics.accuracy_score(y_test, pred)
print("Accuracy of Naive Bayes Classifier is: {}".format(nb_accuracy))
acc.append(nb_accuracy)

**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver, evaluated on the held-out set.
LR = LogisticRegression(solver='liblinear')
LR.fit(X_train, y_train)
yhat = LR.predict(X_test)

# Score once, then both report and record the value.
lr_accuracy = metrics.accuracy_score(y_test, yhat)
print("LogisticRegression's Accuracy:{0}".format(lr_accuracy))
acc.append(lr_accuracy)

**SVM**

In [None]:
from sklearn import svm

# Support vector classifier with an RBF kernel, evaluated on the held-out set.
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)
yhat = clf.predict(X_test)

# Score once, then both report and record the value.
svm_accuracy = metrics.accuracy_score(y_test, yhat)
print("SVM's Accuracy:{0}".format(svm_accuracy))
acc.append(svm_accuracy)

**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 trees. The seed is fixed so that the reported accuracy is
# the same on every run; without it the forest is rebuilt differently each time.
Random_forest = RandomForestClassifier(n_estimators=50, random_state=0)
Random_forest.fit(X_train,y_train)
randomForest_predict = Random_forest.predict(X_test)

# Score once, then both report and record the value.
randomForest_score = metrics.accuracy_score(y_test, randomForest_predict)
print("Random Forest Score :",randomForest_score)
acc.append(randomForest_score)

**Gradient Boosting**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with a fixed seed, evaluated on the held-out set.
gbk = GradientBoostingClassifier(
    random_state=100,
    n_estimators=150,
    min_samples_split=100,
    max_depth=6,
)
gbk.fit(X_train, y_train)
gbk_predict = gbk.predict(X_test)

# Score once, then both report and record the value.
gbk_accuracy = metrics.accuracy_score(y_test, gbk_predict)
print("Gradient Boosting Score :", gbk_accuracy)
acc.append(gbk_accuracy)

**KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Try every neighbour count from 1 to 24, print the held-out accuracy for each,
# and keep the highest accuracy seen as the KNN entry in `acc`.
mx = -1
for i in range(1, 25):
    neigh = KNeighborsClassifier(n_neighbors=i)
    neigh.fit(X_train, y_train)
    yhat = neigh.predict(X_test)
    KNN_score = metrics.accuracy_score(y_test, yhat)
    print("KNN Accuracy at {} is {}".format(i, KNN_score))
    if KNN_score > mx:
        mx = KNN_score
    print("\n")
acc.append(mx)
mx

# Conclusion

In [None]:
# Bar labels, in the same order in which the model cells append to `acc`.
algo_name=['Naive Bayes Classifier','Logistic Regression','SVM','Random Forest Classifier','Gradient Boosting','KNN']

# Fail with a clear message if a model cell was skipped or run more than once.
assert len(acc) == len(algo_name), (
    "expected {} accuracy scores but found {}; reset `acc` and re-run each model cell once"
    .format(len(algo_name), len(acc))
)

# Keep `acc` itself a list so the model cells can still append to it when
# re-run; the percentages used for plotting live in a separate array.
acc_percent = np.array(acc) * 100

from numpy import median
plt.figure(figsize=(10,8))
sns.barplot(y=acc_percent,x=algo_name,estimator=median,palette="Blues_d")
plt.xlabel('Algorithm Name',size=30)
plt.xticks(rotation=45)
plt.ylabel('Accuracy',size=30)


# Please upvote if you found this notebook helpful.