In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import re
import string
from wordcloud import WordCloud
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.metrics import confusion_matrix 


In [None]:
# Path of the SMS dataset CSV, kept in one named place so it is easy to change.
SPAM_CSV_PATH = '/content/drive/MyDrive/spam_classification/SPAM.csv'
d = pd.read_csv(SPAM_CSV_PATH)

In [None]:
# Preview the freshly loaded frame (the recorded output shows pandas eliding the middle rows).
d

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# Class balance check: the recorded output shows far more ham than spam messages.
d['Category'].value_counts()


ham     4825
spam     747
Name: Category, dtype: int64

In [None]:

import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib Inline
%pylab

Using matplotlib backend: agg
Populating the interactive namespace from numpy and matplotlib


In [None]:
from pylab import *
# Default canvas of 8x8 inches for the figures drawn from here on.
plt.rcParams['figure.figsize'] = 8, 8
# One bar per class, to visualise how unbalanced ham and spam are.
sns.countplot(x="Category", data=d)
plt.title('Countplot for Spam vs Ham as Imbalanced dataset')
plt.xlabel('Is the SMS Spam?')
plt.ylabel("Count")

Text(0, 0.5, 'Count')

In [None]:
def convert_lowercase(text):
    """Return ``text`` with every cased character converted to lower case."""
    return text.lower()

# Overwrite the Message column with its lower-cased form.
d['Message'] = d['Message'].apply(convert_lowercase)

In [None]:
# Compiled once at definition time. The raw string keeps `\S` and `\.` as regex
# escapes rather than (invalid) Python string escapes.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def remove_url(text):
    """Strip ``http(s)://`` and ``www.`` style URLs from ``text``.

    Each match runs up to the next whitespace character and is replaced by the
    empty string; the whitespace around it is left untouched.

    Parameters
    ----------
    text : str
        Message to clean.

    Returns
    -------
    str
        ``text`` without the matched URLs.
    """
    return _URL_PATTERN.sub('', text)

# Overwrite the Message column with the URL-free text.
d['Message'] = d['Message'].apply(remove_url)

In [None]:
# Characters to strip: the punctuation set defined by the string module.
exclude = string.punctuation

def remove_punc(text):
    """Delete every character listed in ``exclude`` from ``text``."""
    # Mapping a character to None in a translation table removes it.
    deletion_table = str.maketrans(dict.fromkeys(exclude))
    return text.translate(deletion_table)

# Overwrite the Message column with the punctuation-free text.
d['Message'] = d['Message'].apply(remove_punc)

In [None]:
import nltk
# Fetch the NLTK 'punkt' and 'stopwords' resources; the recorded output shows
# this is a no-op when they are already up to date.
nltk.download('punkt')
nltk.download('stopwords')

# Stop-word sets keyed by language, filled on first use so the corpus is read
# once instead of once per message.
_STOPWORD_CACHE = {}


def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is split with ``word_tokenize``; tokens that appear in NLTK's
    English stop-word list are discarded and the rest are rejoined with single
    spaces. Matching is exact, so callers are expected to pass text in the same
    case as the stop-word list.

    Parameters
    ----------
    text : str
        Message to filter.

    Returns
    -------
    str
        The remaining tokens joined by ``' '``.
    """
    if 'english' not in _STOPWORD_CACHE:
        # frozenset gives constant-time membership tests instead of a list scan.
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    english_stopwords = _STOPWORD_CACHE['english']
    return ' '.join(
        word for word in word_tokenize(text) if word not in english_stopwords
    )

# Overwrite the Message column with the stop-word-free text.
d['Message'] = d['Message'].apply(remove_stopwords)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def perform_stemming(text):
    """Tokenize ``text`` and rejoin the Porter stem of each token with single spaces."""
    porter = PorterStemmer()
    stems = [porter.stem(token) for token in word_tokenize(text)]
    return " ".join(stems)

# Overwrite the Message column with the stemmed text.
d['Message'] = d['Message'].apply(perform_stemming)

In [None]:
# Preview the frame after the text-cleaning cells above have rewritten Message.
d


Unnamed: 0,Category,Message
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah dont think goe usf live around though
...,...,...
5567,spam,2nd time tri 2 contact u u £750 pound prize 2 ...
5568,ham,ü b go esplanad fr home
5569,ham,piti mood soani suggest
5570,ham,guy bitch act like id interest buy someth els ...


In [None]:
# Word cloud built from every message labelled 'spam'.
spam_messages = d.loc[d['Category'] == 'spam', 'Message']
text = " ".join(spam_messages)

plt.figure(figsize=(15, 10))
wordcloud = WordCloud(
    max_words=500,
    height=800,
    width=1500,
    background_color="black",
    colormap='viridis',
).generate(text)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')  # a word cloud has no meaningful axes
plt.show()

In [None]:
# Word cloud built from every message labelled 'ham'.
ham_messages = d.loc[d['Category'] == 'ham', 'Message']
text = " ".join(ham_messages)

plt.figure(figsize=(15, 10))
wordcloud = WordCloud(
    max_words=500,
    height=800,
    width=1500,
    background_color="black",
    colormap='viridis',
).generate(text)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')  # a word cloud has no meaningful axes
plt.show()

In [None]:
# Encode the labels as integers: spam -> 0, ham -> 1.
# NOTE(review): with this encoding ham is label 1, which is the default positive
# class of sklearn's precision_score / recall_score — confirm that is intended.
# NOTE(review): cells that filter on the strings 'spam' / 'ham' no longer match
# any rows once this has run.
d['Category'] = d['Category'].replace({'spam':0,'ham':1})

In [None]:
# Features are the cleaned message strings; targets are the encoded labels.
X, y = d["Message"], d['Category'].values

# Hold out 20% of the rows, stratified on the label so both splits keep the
# same class proportions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# TF-IDF features: keep at most 2500 terms and ignore terms that occur in fewer
# than 2 training documents.
tfidf = TfidfVectorizer(max_features= 2500, min_df= 2)
# Fit on the training split only; the test split is transformed with the fitted
# vectorizer so nothing is learned from held-out rows.
# NOTE(review): X_train / X_test are rebound from text to TF-IDF matrices here,
# so re-running this cell without re-running the split cell first will not work.
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

In [None]:
from sklearn import datasets, metrics, model_selection, svm

def train_model(model):
    """Fit ``model`` on the training split and report held-out performance.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints accuracy, precision and recall for the test split, then
    draws the confusion matrix and the ROC curve for the same predictions.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``, plus either
        ``predict_proba`` or ``decision_function`` (needed for the ROC curve).

    Returns
    -------
    None
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # roc_curve wants a single score per sample for the positive class (label 1).
    # Column 1 of predict_proba is that class because classes_ is sorted (0, 1).
    # Estimators without predict_proba (e.g. a default SVC) fall back to their
    # decision function.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)

    accuracy = round(accuracy_score(y_test, y_pred), 3)
    precision = round(precision_score(y_test, y_pred), 3)
    recall = round(recall_score(y_test, y_pred), 3)

    print(f'Accuracy of the model: {accuracy}')
    print(f'Precision Score of the model: {precision}')
    print(f'Recall Score of the model: {recall}')

    # Both diagnostics must be compared against y_test: y_pred and y_score were
    # produced from X_test, so they have the test split's length.
    cm = confusion_matrix(y_test, y_pred)
    fpr, tpr, _ = metrics.roc_curve(y_test, y_score)

    sns.set_context('notebook', font_scale= 1.3)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax1)
    ax1.set(title='Confusion matrix (test split)',
            xlabel='Predicted label', ylabel='True label')

    ax2.plot(fpr, tpr, label=f'AUC = {metrics.auc(fpr, tpr):.3f}')
    ax2.plot([0, 1], [0, 1], linestyle='--', color='grey')  # chance level
    ax2.set(title='ROC curve (test split)',
            xlabel='False positive rate', ylabel='True positive rate')
    ax2.legend(loc='lower right')

    plt.show()


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Candidate classifiers, all with their library-default hyper-parameters.
classifiers = [
    MultinomialNB(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    SVC(),
]
# Fit each candidate on the full training split. cross_val_score below works
# on clones, so this fit does not influence the cross-validation scores.
for cls in classifiers:
    cls.fit(X_train, y_train)

# Display name for each position in `classifiers`.
pipe_dict = dict(enumerate(["NaiveBayes", "RandomForest", "KNeighbours", "SVC"]))

# Cross-validation: mean accuracy over 10 folds of the training split.
for i in pipe_dict:
    model = classifiers[i]
    cv_score = cross_val_score(model, X_train, y_train, scoring="accuracy", cv=10)
    mean_accuracy = cv_score.mean()
    print("%s: %f " % (pipe_dict[i], mean_accuracy))

NaiveBayes: 0.979585 
RandomForest: 0.976443 
KNeighbours: 0.910480 
SVC: 0.977563 


In [None]:
# Multinomial Naive Bayes fitted on the TF-IDF training matrix.
classifier = MultinomialNB()
classifier.fit(X_train,y_train)
# NOTE(review): these are predictions for the training rows themselves, not for
# the held-out test split, so they say little about generalisation.
y_pred = classifier.predict(X_train)  

In [None]:
from sklearn.metrics import confusion_matrix  
# Evaluate on the held-out split: scoring the model on the rows it was fitted on
# overstates its quality. Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, classifier.predict(X_test))
cm

array([[116,  33],
       [  3, 963]])

In [None]:
# Hand-written example message.
# NOTE(review): nothing visible here uses it; before being passed to `tfidf` /
# `classifier` it would presumably need the same cleaning steps as the training
# text — confirm against whatever consumes it.
sample = "its emergency to come to california" 
