# **MESSAGE SPAM CLASSIFIER**

THIS NOTEBOOK ANALYZES THE SPAM CLASSIFIER DATASET AND CLASSIFIES THE SPAM AND HAM MESSAGES USING VARIOUS MACHINE LEARNING ALGORITHMS.

In [None]:
#IMPORTING THE NECESSARY LIBRARIES
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
#READING THE CSV FILE
# Load the raw dataset into a DataFrame. The file is decoded as latin-1
# (presumably because it is not valid UTF-8 -- TODO confirm).
df = pd.read_csv(
    "../input/sms-spam-collection-dataset/spam.csv",
    encoding='latin-1',
)

In [None]:
#DISPLAYING THE FIRST 5 ROWS OF THE DATASET
# Quick look at the raw columns before any cleaning is applied.
df.head(5)

In [None]:
#DATA CLEANING
# Discard the three 'Unnamed' columns, then give the two columns that remain
# descriptive names: the first becomes 'Label', the second 'SMS'.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df = df.drop(columns=unused_columns)
df.columns = ['Label', 'SMS']
df.head()

In [None]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
encoded_labels = df['Label'].map(label_codes)

# Series.map yields NaN for any value missing from the mapping. Fail loudly
# instead of letting NaN labels flow into the models; this also catches an
# accidental second run of this cell, where the labels are already 0/1.
if encoded_labels.isna().any():
    raise ValueError(
        "Unexpected values in 'Label' (expected only 'ham'/'spam'); "
        "if this cell was already run, re-run the notebook from the top."
    )

# Rebind df only after validation, with the columns ordered as (SMS, Label).
df = df.assign(Label=encoded_labels.astype('int64'))[['SMS', 'Label']]
df.head()

**DATA VISUALIZATION**

In [None]:
# Bar chart of how many messages fall in each class (0 = ham, 1 = spam).
# The variable is passed by keyword: since seaborn 0.12 the only positional
# argument accepted by countplot is `data`, so a positional Series is no
# longer interpreted as `x`.
ax = sns.countplot(x='Label', data=df)
ax.set_xlabel('Label')
ax.set_title('Number of HAM and SPAM messages')
plt.show()

 **WORD CLOUDS**

In [None]:
from wordcloud import WordCloud,STOPWORDS

# Rows labelled 1 (spam), plus all of their text joined into a single
# space-separated string that the word cloud cell consumes.
is_spam = df['Label'] == 1
spam_msg = df[is_spam]
spam_words = ' '.join(spam_msg['SMS'].tolist())
spam_msg

In [None]:
# Rows labelled 0 (ham), plus all of their text joined into a single
# space-separated string that the word cloud cell consumes.
is_ham = df['Label'] == 0
ham_msg = df[is_ham]
ham_words = ' '.join(ham_msg['SMS'].tolist())
ham_msg

In [None]:
#HAM WORD CLOUD
# Build a 1000x500 word cloud from the concatenated ham text and render it
# on a 12x12 inch figure with the axes hidden.
ham_cloud = WordCloud(width=1000, height=500).generate(ham_words)
fig, ax = plt.subplots(figsize=(12, 12))
ax.imshow(ham_cloud)
ax.axis('off')
plt.show()

In [None]:
#SPAM WORD CLOUD
# Build a 1000x500 word cloud from the concatenated spam text and render it
# on a 12x12 inch figure with the axes hidden.
spam_cloud = WordCloud(width=1000, height=500).generate(spam_words)
fig, ax = plt.subplots(figsize=(12, 12))
ax.imshow(spam_cloud)
ax.axis('off')
plt.show()

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# NOTE: despite the short name, `ps` is a WordNet lemmatizer.
ps = WordNetLemmatizer()

# Load the English stop-word list once. Rebuilding this set for every single
# word (as a per-word `set(stopwords.words('english'))` would) repeats the
# same corpus read thousands of times for no benefit.
english_stopwords = set(stopwords.words('english'))


def clean_message(text):
    """Normalise one SMS into a space-separated string of lemmatised words.

    Steps: replace every non-letter character with a space, lower-case,
    split on whitespace, drop English stop words, lemmatise what remains.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message; empty if no words survive the filtering.
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(
        ps.lemmatize(word) for word in words if word not in english_stopwords
    )


# One cleaned string per row, in the same order as df. Iterating the column
# directly avoids label-based lookups (df['SMS'][i]), which only work while
# the index happens to be 0..n-1.
corpus = [clean_message(message) for message in df['SMS']]

**MODELS**

In [None]:
#ASSIGNING FEATURES AND TARGET VARIABLE
# Features are the cleaned messages in `corpus` (built row-by-row from df in
# the preprocessing cell), not the raw df['SMS'] text; otherwise the
# stop-word removal and lemmatisation done there would never reach the models.
X = np.array(corpus, dtype=object)
y = df['Label'].values

# Features and labels must stay aligned one-to-one.
assert len(X) == len(y), "corpus and labels have different lengths"

In [None]:
#SPLITTING THE DATA
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

**NAIVE BAYES**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Bag-of-words features, with the vocabulary capped via max_features=3700.
cv = CountVectorizer(max_features=3700)

# The vocabulary is learned from the training messages only and then applied
# unchanged to the test messages.
# NOTE: X_train / X_test are rebound here from raw text to dense count
# arrays. Later cells rely on this vectorised form, and this cell cannot be
# re-run without first re-running the split.
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

# Fit the multinomial Naive Bayes model and predict the held-out labels.
clf = MultinomialNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Naive Bayes evaluation on the test split: confusion matrix, accuracy and
# the classification report, printed one after another.
cm = confusion_matrix(y_test, y_pred)
print(
    cm,
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

**DECISION TREE CLASSIFIER**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the same count features used for Naive Bayes
# (random_state fixed for repeatable results) and predict the test labels.
dtc = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
dtc_pred = dtc.predict(X_test)

In [None]:
# Confusion matrix of the decision tree predictions on the test split.
cm_dtc = confusion_matrix(y_true=y_test, y_pred=dtc_pred)
cm_dtc

In [None]:
# Accuracy of the decision tree predictions on the test split.
acc_dtc = accuracy_score(y_true=y_test, y_pred=dtc_pred)
acc_dtc

In [None]:
# Classification report for the decision tree predictions on the test split.
dtc_report = classification_report(y_true=y_test, y_pred=dtc_pred)
print(dtc_report)

In [None]:
#COMPARING THE TWO CLASSIFICATION REPORTS
# Printed in order: Naive Bayes predictions (y_pred) first, then the
# decision tree predictions (dtc_pred).
for predictions in (y_pred, dtc_pred):
    print(classification_report(y_test, predictions))