In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score

In [None]:
#importing dataset
#the path is relative to the notebook's working directory
DATA_PATH = '../input/sms-spam-collection-dataset/spam.csv'
try:
    #first attempt uses pandas' default encoding (UTF-8)
    ds = pd.read_csv(DATA_PATH)
except UnicodeDecodeError:
    #the file holds bytes that are not valid UTF-8; latin-1 maps every byte
    #to a character, so this second read cannot fail on decoding
    ds = pd.read_csv(DATA_PATH, encoding='latin-1')
ds.head()

In [None]:
#counting the missing entries in every column
ds.isna().sum()

**TOO MANY NULL VALUES IN THE LAST THREE COLUMNS (Unnamed: 2, Unnamed: 3, Unnamed: 4), THEREFORE DROPPING THEM ALL**

In [None]:
#dropping last three columns
#drop() returns a new frame instead of mutating ds in place, and
#errors='ignore' skips columns that are already gone, so re-running this
#cell no longer raises a KeyError
ds = ds.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'], errors='ignore')

#renaming columns(v1 and v2) for better understanding
#positional assignment: assumes exactly two columns remain after the drop
ds.columns = ['category','message']

ds.head()

In [None]:
#bar chart with one bar per value of the 'category' column
sns.countplot(data = ds, x = 'category')

**THERE ARE AROUND 4800 HAM MESSAGES AND AROUND 800 SPAM MESSAGES**

In [None]:
#text cleaning
#the stopword set and the stemmer are the same for every message, so they are
#built once here instead of once per word / once per message inside the loop
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    #the NLTK stopword corpus is not installed yet; fetch it once and retry
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

clean = []
#iterating over the column directly avoids label-based ds['message'][i]
#lookups, which only work while the index is the default 0..n-1 range
for message in ds['message']:

    #replacing every character that is not an ASCII letter with a space
    text = re.sub('[^a-zA-Z]',' ', message)

    #converting all text to lowercase
    text = text.lower()

    #tokenizing on whitespace, dropping stopwords and stemming what is left
    text = [ps.stem(word) for word in text.split() if word not in stop_words]

    #joining the stems back into one space-separated string
    text = ' '.join(text)
    clean.append(text)
    

In [None]:
#first five cleaned messages
clean[:5]

In [None]:
#creating bag of words
from sklearn.feature_extraction.text import CountVectorizer

#learn the vocabulary from the cleaned messages and count the terms
cv = CountVectorizer()
word_counts = cv.fit_transform(clean)

#features: the count matrix converted to a dense array
x = word_counts.toarray()

#labels: the first column of the dataframe
y = ds.iloc[:, 0].to_numpy()

In [None]:
#encoding categorical data of y
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
#fit on the raw label column (the same column y was taken from) rather than
#on y itself: a re-run then never fits the encoder on already-encoded
#integers, so le.classes_ keeps the original label values
y=le.fit_transform(ds.iloc[:,0].values)

In [None]:
#splitting dataset into training and testing sets.
from sklearn.model_selection import train_test_split

#20% of the rows are held out for testing; random_state fixes the shuffle
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size = 0.2,
    random_state = 0,
)

# **Logistic Regression**

In [None]:
#training model
from sklearn.linear_model import LogisticRegression
#fit() returns the estimator itself, so construction and training chain
lr = LogisticRegression().fit(x_train, y_train)

#getting confusion matrix
y_pred = lr.predict(x_test)
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print('confusion matrix:\n',cm)

#checking accuracy
#lra is kept for the model comparison plot
lra = accuracy_score(y_true = y_test, y_pred = y_pred)
print('accuracy score = ',lra)

# **Kernel SVM**

In [None]:
#training model
from sklearn.svm import SVC
#fit() returns the estimator itself, so construction and training chain
svc = SVC(kernel = 'rbf').fit(x_train, y_train)

#getting confusion matrix
y_pred = svc.predict(x_test)
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print('confusion matrix:\n',cm)

#checking accuracy
#sva is kept for the model comparison plot; printing the stored value gives
#the same output as recomputing the score
sva = accuracy_score(y_true = y_test, y_pred = y_pred)
print('accuracy score = ',sva)

# **Multinomial Naive Bayes**

In [None]:
#training model
from sklearn.naive_bayes import MultinomialNB
#fit() returns the estimator itself, so construction and training chain
nb = MultinomialNB().fit(x_train, y_train)

#getting confusion matrix
y_pred = nb.predict(x_test)
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print('confusion matrix:\n',cm)

#checking accuracy
#nba is kept for the model comparison plot; printing the stored value gives
#the same output as recomputing the score
nba = accuracy_score(y_true = y_test, y_pred = y_pred)
print('accuracy score = ',nba)

# **Decision Tree**

In [None]:
#training model
from sklearn.tree import DecisionTreeClassifier
#random_state pins the randomness used while growing the tree, so the
#confusion matrix and accuracy are the same on every run; 0 matches the seed
#already used for the train/test split and the random forest
dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
dt.fit(x_train,y_train)

#getting confusion matrix
y_pred = dt.predict(x_test)
cm = confusion_matrix(y_test,y_pred)
print('confusion matrix:\n',cm)

#checking accuracy
#dta is kept for the model comparison plot; print the stored value instead
#of computing the same score a second time
dta = accuracy_score(y_test,y_pred)
print('accuracy score = ',dta)

# **Random Forest**

In [None]:
#training model
from sklearn.ensemble import RandomForestClassifier
#20 entropy-criterion trees with a fixed seed
rf = RandomForestClassifier(
    n_estimators = 20,
    criterion = 'entropy',
    random_state = 0,
).fit(x_train, y_train)

#getting confusion matrix
y_pred = rf.predict(x_test)
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print('confusion matrix:\n',cm)

#checking accuracy
#rfa is kept for the model comparison plot; printing the stored value gives
#the same output as recomputing the score
rfa = accuracy_score(y_true = y_test, y_pred = y_pred)
print('accuracy score = ',rfa)

In [None]:
#comparing accuracies
#model labels and their test accuracies, listed in the same order
name = [
    'Logistic Regression',
    'Kernel Svm',
    'Multinomial Naive Bayes',
    'Decision Tree',
    'Random Forest',
]
ac = [lra, sva, nba, dta, rfa]

#horizontal bars: accuracy on the x axis, one row per model
plt.figure(figsize = (8, 7))
sns.barplot(x = ac, y = name, palette = 'pastel')
plt.title("Plotting the Model Accuracies", fontsize=16, fontweight="bold")

**ALL MODELS REACHED ALMOST EQUAL ACCURACIES OF AROUND 97-98%**