In [16]:
import pandas as pd
messages=pd.read_csv('SMSSpamCollection',sep='\t',names=['label','message'])

### Data Cleaning and Processing

In [22]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saksh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordnet=WordNetLemmatizer()

In [24]:
corpus=[]
for i in range(0,len(messages)):
    review=re.sub('[^a-zA-Z]',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    review=[wordnet.lemmatize(word) for word in review if word not in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

### Creating the TF-IDF model

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv=TfidfVectorizer(max_features=5000)
X=cv.fit_transform(corpus).toarray()

In [29]:
X.shape

(5572, 5000)

In [31]:
y=pd.get_dummies(messages['label'],drop_first=True)

### Train Test split

In [32]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=0)

### Using Naive Bayes

In [54]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model=MultinomialNB().fit(X_train,y_train)

  return f(**kwargs)


In [55]:
y_pred=spam_detect_model.predict(X_test)

In [56]:
from sklearn.metrics import confusion_matrix
confusion_m=confusion_matrix(y_test,y_pred)
print(confusion_m)

[[955   0]
 [ 25 135]]


In [57]:
from sklearn.metrics import accuracy_score
accuracy1=accuracy_score(y_test,y_pred)
print(accuracy1)

0.9775784753363229


### Using Logistic Regression

In [58]:
from sklearn.linear_model import LogisticRegression
spam_detect_model=LogisticRegression(random_state=0).fit(X_train, y_train)

  return f(**kwargs)


In [59]:
y_pred=spam_detect_model.predict(X_test)

In [60]:
from sklearn.metrics import confusion_matrix
confusion_m=confusion_matrix(y_test,y_pred)
print(confusion_m)

[[953   2]
 [ 35 125]]


In [61]:
from sklearn.metrics import accuracy_score
accuracy2=accuracy_score(y_test,y_pred)
print(accuracy2)

0.9668161434977578


### Using Decision Tree

In [62]:
from sklearn.tree import DecisionTreeClassifier
spam_detect_model = DecisionTreeClassifier(random_state=0).fit(X_train,y_train)

In [63]:
y_pred=spam_detect_model.predict(X_test)

In [64]:
from sklearn.metrics import confusion_matrix
confusion_m=confusion_matrix(y_test,y_pred)
print(confusion_m)

[[941  14]
 [ 21 139]]


In [65]:
from sklearn.metrics import accuracy_score
accuracy3=accuracy_score(y_test,y_pred)
print(accuracy3)

0.968609865470852


### Using Random Forest

In [75]:
from sklearn.ensemble import RandomForestClassifier
spam_detect_model = RandomForestClassifier(max_depth=15, random_state=0).fit(X_train,y_train)

  spam_detect_model = RandomForestClassifier(max_depth=15, random_state=0).fit(X_train,y_train)


In [76]:
y_pred=spam_detect_model.predict(X_test)


In [77]:
from sklearn.metrics import confusion_matrix
confusion_m=confusion_matrix(y_test,y_pred)
print(confusion_m)

[[955   0]
 [ 64  96]]


In [78]:
from sklearn.metrics import accuracy_score
accuracy4=accuracy_score(y_test,y_pred)
print(accuracy4)

0.9426008968609866


### Using Support Vector Machine

In [80]:
from sklearn import svm
spam_detect_model = svm.SVC(kernel='linear').fit(X_train,y_train)

  return f(**kwargs)


In [81]:
y_pred=spam_detect_model.predict(X_test)

In [82]:
from sklearn.metrics import confusion_matrix
confusion_m=confusion_matrix(y_test,y_pred)
print(confusion_m)

[[954   1]
 [ 17 143]]


In [83]:
from sklearn.metrics import accuracy_score
accuracy5=accuracy_score(y_test,y_pred)
print(accuracy5)

0.9838565022421525


In [109]:
df={'Naive Bayes':accuracy1,'Logistic Regression':accuracy2,'Decision Tree':accuracy3,'Random Forest':accuracy4,'Support Vector Machine':accuracy5}
result=pd.DataFrame({'Accuracy':[accuracy1,accuracy2,accuracy3,accuracy4,accuracy5]},index=['Naive Bayes','Logistic Regression','Random Forest','Decision Tree','Support Vector Machine'])

In [110]:
result

Unnamed: 0,Accuracy
Naive Bayes,0.977578
Logistic Regression,0.966816
Random Forest,0.96861
Decision Tree,0.942601
Support Vector Machine,0.983857
