In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the SMS spam collection CSV (decoded as latin-1) and preview the first rows.
msg = pd.read_csv(
    "/kaggle/input/sms-spam-collection-dataset/spam.csv",
    encoding="latin-1",
)
msg.head()

In [None]:
# Remove the three "Unnamed" columns and give the remaining two descriptive names.
# The frame is rebound instead of mutated in place, and errors="ignore" makes the
# drop a no-op when the columns are already gone, so this cell can be re-run
# without raising KeyError.
msg = (
    msg
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
    .rename(columns={"v1": "label", "v2": "message"})
)
msg.head()

In [None]:
import nltk

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Print a summary of the frame: column dtypes, non-null counts and memory usage.
msg.info()

In [None]:
# Count how many messages carry each label.
msg["label"].value_counts()

## There are far more records classified as ham (not spam) than as spam
## So the model can learn the pattern of a non-spam message quite well

# Implementing wordcloud to visualize the most repeated words

In [None]:
import wordcloud

In [None]:
from wordcloud import WordCloud

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Join every spam message into one string and build a word cloud limited to 50 words.
spams = msg.loc[msg["label"] == "spam", "message"]
spam_text = ' '.join(spams)
spam_cloud = WordCloud(
    width=700,
    height=500,
    background_color="white",
    max_words=50,
).generate(spam_text)

# Render the cloud on a borderless axis inside a red-faced figure.
fig, ax = plt.subplots(figsize=(10, 8), facecolor='r')
ax.imshow(spam_cloud)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

## We can see that the words free, call, urgent and claim are among the most frequently used in spam messages

In [None]:
# Join every ham message into one string and build a word cloud limited to 50 words.
ham = msg.loc[msg["label"] == "ham", "message"]
ham_text = ' '.join(ham)
ham_cloud = WordCloud(
    width=700,
    height=500,
    background_color="white",
    max_words=50,
).generate(ham_text)

# Render the cloud on a borderless axis inside a red-faced figure.
fig, ax = plt.subplots(figsize=(10, 8), facecolor='r')
ax.imshow(ham_cloud)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

## We can clearly see some of the most frequently repeated words used in ham messages

# Data cleaning and pre-processing

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Porter stemmer instance used to stem each token when the messages are cleaned.
ps=PorterStemmer()
# Will hold one cleaned string per message.
corpus=[]

In [None]:
# The stop-word list does not change inside the loop: fetch it once and keep it
# as a set so each membership test is O(1) instead of re-fetching and scanning
# the list for every single token.
english_stopwords = set(stopwords.words('english'))

# Rebuild the corpus from scratch so that re-running this cell does not append
# a second copy of every message.
corpus = []
# Iterate over the values directly rather than msg['message'][i], which only
# works while the frame still has its default 0..n-1 index.
for text in msg['message']:
    # Replace every non-letter with a space, lowercase, and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Drop stop words, stem what remains, and store the message as one string.
    stemmed = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stemmed))

In [None]:
# Peek at the first five cleaned messages.
corpus[:5]

### Here we can see that the less important words have been removed as stop words
### Punctuation is removed
### Each sentence is lowercased and split into words
### And stemming is applied to the words

In [None]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# Count-vectorize the cleaned corpus with the vocabulary capped at 2500 terms,
# then expand the sparse result into a dense array.
cv = CountVectorizer(max_features=2500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

In [None]:
# Show the first three rows of the dense count matrix.
X[:3]

### We can see that the words are vectorized (i.e., converted to numerical values)

In [None]:
# Pull out the target column (the "ham"/"spam" strings).
label=msg["label"]

In [None]:
# Preview the first few target values.
label.head()

In [None]:
# Encode the target as integers: ham -> 1, spam -> 0.
# Series.replace() from strings to ints relies on pandas silently downcasting the
# object column, which pandas 2.2 deprecates; map() + astype(int) makes the
# integer dtype explicit. Reading from msg["label"] (rather than re-encoding
# `label`) keeps the cell safe to re-run, and astype(int) fails loudly if an
# unexpected label value is present instead of passing NaN to the models.
label = msg["label"].map({"ham": 1, "spam": 0}).astype(int)

In [None]:
# Use the encoded labels as the target vector.
y=label

# Splitting data for train and test

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing with a fixed seed.
# The classes are imbalanced (far more ham than spam), so stratify on y to keep
# the same ham/spam proportion in both the train and the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=420, stratify=y
)

# Classifying the data using algorithms

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Candidate classifiers to compare.
# The tree-based models are randomized, so they get a fixed random_state (the
# same seed as the train/test split) to make the scores reproducible across
# "Restart & Run All".
dtc = DecisionTreeClassifier(random_state=420)
ran = RandomForestClassifier(n_estimators=90, random_state=420)
knn = KNeighborsClassifier(n_neighbors=79)
svm = SVC(random_state=6)
nbc = MultinomialNB()

In [None]:
# Display name -> estimator, in the order they will be trained.
models = {
    "Decision tree": dtc,
    "Random forest": ran,
    "KNN": knn,
    "SVM": svm,
    "Naive Bayes": nbc,
}

# Filled in by the training loop: display name -> score on each split.
train_scores = {}
test_scores = {}

In [None]:
# Fit every candidate on the training split and record its score on both splits.
for name, model in models.items():
    model.fit(X_train, y_train)
    train_scores[name] = model.score(X_train, y_train)
    test_scores[name] = model.score(X_test, y_test)

In [None]:
# Put the per-model scores into one table (one row per model) and order it
# descending by train score, then by test score.
datal = {"train": train_scores, "test": test_scores}
score_frame = pd.DataFrame(datal, index=list(test_scores))
score_frame = score_frame.sort_values(by=["train", "test"], ascending=False)

# Display the table.
score_frame

## Here we can see that most of the algorithms give good accuracy, except KNN
## Among these, we can see that Naive Bayes works well for spam classification
## Naive Bayes gives low bias and low variance

# Evaluation Metrics

In [None]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [None]:
# Predict labels for the test split with the fitted Naive Bayes model.
pred=nbc.predict(X_test)

## Confusion matrix - Actual vs predicted

In [None]:
# Confusion matrix of the Naive Bayes predictions: rows are actual labels,
# columns are predicted labels.
conf = confusion_matrix(y_true=y_test, y_pred=pred)
conf

## Accuracy

In [None]:
# Fraction of test messages the Naive Bayes model labelled correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=pred)
accuracy

## Classification Report

In [None]:
# Per-class precision/recall/F1 as a nested dict.
# The target is encoded ham=1 / spam=0, so name the classes explicitly; otherwise
# the report rows are just "0" and "1" and it is easy to misread which is spam.
report = classification_report(
    y_test,
    pred,
    labels=[0, 1],
    target_names=["spam", "ham"],
    output_dict=True,
)

In [None]:
# Turn the report dict into a table and flip it so each report entry is a row.
df = pd.DataFrame(report).T

In [None]:
# Display the classification report table.
df