In [None]:
import numpy as np 
import pandas as pd
# Load the raw SMS data set.
# The original cell read the file with encoding='ANSI' (a codec alias that only
# exists on Windows) and then rewrote the same file as UTF-8, so a second run
# decoded UTF-8 bytes with the wrong codec. Trying UTF-8 first makes the cell
# portable and safe to re-run; latin-1 is the fallback because it can decode
# any byte sequence.
try:
    df = pd.read_csv('spam.csv', encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv('spam.csv', encoding='latin-1')
    # Normalise the file on disk to UTF-8 once, as the original cell did.
    df.to_csv('spam.csv', encoding='utf-8', index=False)

In [None]:
# Peek at the first rows of the freshly loaded frame.
df.head()

In [None]:
# (rows, columns) of the raw data.
df.shape

In [None]:
# Preview the first rows again before cleaning starts.
df.head()

In [None]:
#1. Data Cleaning 
#2. EDA 
#3. Text Preprocessing 
#4. Model Building 
#5. evaluation
#6. improvements 
#7. creating website 
#8 . deploy to heroku 

# Data Cleaning 

In [None]:
# Drop the three 'Unnamed' columns.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps
# the cell re-runnable: executing it a second time no longer raises KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Random sample of five rows to check the remaining columns.
df.sample(5)

In [None]:
# Give the two remaining columns descriptive names.
df = df.rename(columns={'v1': 'target', 'v2': 'text'})

In [None]:
# Confirm the new column names.
df.head()

In [None]:
# The target column holds the strings 'ham' / 'spam'; label-encode them as integers.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(df['target'])
df['target'] = encoder.transform(df['target'])

In [None]:
# After label encoding: ham = 0, spam = 1.
df.head()

In [None]:
# Count missing values per column (none were present in the recorded run).
df.isnull().sum()

In [None]:
# Count fully duplicated rows (403 in the recorded run).
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of every repeated row ('first' is the default).
df = df.drop_duplicates()

In [None]:
# Re-check: this should now be 0.
df.duplicated().sum()

In [None]:
# (rows, columns) after removing duplicates.
df.shape

# 2. EDA

In [None]:
# Class balance (recorded run: 4516 ham, 653 spam).
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Derive the wedge labels from the value_counts() index instead of hard-coding
# ['ham', 'spam']: value_counts() orders by frequency, so a fixed label list is
# only correct while ham happens to be the majority class.
class_names = {0: 'ham', 1: 'spam'}
class_counts = df['target'].value_counts()
plt.pie(
    class_counts,
    labels=[class_names.get(code, code) for code in class_counts.index],
    autopct='%.2f',
)
plt.show()

In [None]:
# About 87% of the messages are ham (not spam) and about 13% are spam,
# so the data is imbalanced.

In [None]:
#now we create 3 columns in which we can find number of characters , no. of words and no. of sentences in the sms 

In [None]:
import nltk 

In [None]:
#nltk.download('punkt')

In [None]:
# Number of characters in each message.
df['num_characters'] = df['text'].map(len)

In [None]:
# Number of word tokens in each message, as split by NLTK's word tokenizer.
df['num_words'] = df['text'].map(lambda message: len(nltk.word_tokenize(message)))

In [None]:
# Check the two new length columns.
df.head()

In [None]:
#df['text'].apply(lambda x:nltk.sent_tokenize(x))  #breaks sentences in the text column

In [None]:
# Number of sentences in each message, as split by NLTK's sentence tokenizer.
df['num_sentences'] = df['text'].map(lambda message: len(nltk.sent_tokenize(message)))

In [None]:
# Check the new num_sentences column.
df.head()

In [None]:
# Summary statistics of the three length features over all messages.
df[['num_characters','num_words','num_sentences']].describe()

In [None]:
# max: 910 characters, 220 words and 28 sentences in a single message
# on average: 79 characters, 18.5 words and 1.98 sentences per message
# min: 2 characters, 1 word and 1 sentence in a single message


In [None]:
# Summary statistics of the length features for ham messages (target == 0).
length_cols = ['num_characters', 'num_words', 'num_sentences']
df.loc[df['target'] == 0, length_cols].describe()

In [None]:
# Summary statistics of the length features for spam messages (target == 1).
length_cols = ['num_characters', 'num_words', 'num_sentences']
df.loc[df['target'] == 1, length_cols].describe()

In [None]:
# can see that length of spam message is larger than ham messages 

In [None]:
#lets visualize through graph

In [None]:
import seaborn as sns

# Overlay the character-count distributions: ham in the default colour, spam in red.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df.loc[df['target'] == 0, 'num_characters'], ax=ax)
sns.histplot(df.loc[df['target'] == 1, 'num_characters'], color='red', ax=ax)

In [None]:
# Overlay the word-count distributions: ham in the default colour, spam in red.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df.loc[df['target'] == 0, 'num_words'], ax=ax)
sns.histplot(df.loc[df['target'] == 1, 'num_words'], color='red', ax=ax)

In [None]:
# Pairwise relationships between the columns, coloured by class.
sns.pairplot(data=df, hue='target')

In [None]:
# Correlate only the numeric columns: df still contains the raw 'text' strings,
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# The feature most strongly correlated with the target is num_characters (0.38).
# The three length features (target excluded) are also very strongly correlated with each other, so we cannot keep all of
# them; we keep only num_characters because its correlation with the target is higher than that of num_words or num_sentences.


# 3.Data preprocessing


In [None]:
# Preprocessing steps applied to every message:
#   1. lower-case
#   2. tokenization
#   3. removing special characters
#   4. removing stop words and punctuation
#   5. stemming

from nltk.stem.porter import PorterStemmer

import string  
# Shared Porter stemmer instance used by transform_text.
ps=PorterStemmer()


In [None]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def transform_text(text):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps: lower-case, tokenize with ``nltk.word_tokenize``, keep only
    alphanumeric tokens, drop English stop words, then Porter-stem the rest.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stems joined by single spaces ('' if nothing survives).
    """
    # The original called stopwords.words('english') once per token, rebuilding
    # the list and scanning it linearly each time; a cached set gives the same
    # membership answers at a fraction of the cost.
    stop_words = _english_stopwords()
    tokens = nltk.word_tokenize(text.lower())
    # str.isalnum() already rejects every pure-punctuation token, so the
    # original's extra string.punctuation test could never filter anything.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]
    return " ".join(ps.stem(tok) for tok in kept)

In [None]:
from nltk.corpus import stopwords


In [None]:
# Try the preprocessing function on one sample message.
transform_text('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')

In [None]:
# Apply the preprocessing function to every message.
df['transformed_text'] = df['text'].map(transform_text)

In [None]:
# Check the new transformed_text column.
df.head()

In [None]:
#now we will create a word cloud 
# in this all the important words will be highlighted(make the words big ) 

In [None]:
from wordcloud import WordCloud

# Word cloud of all preprocessed spam messages joined into one string.
wc = WordCloud(
    width=500,
    height=500,
    min_font_size=10,
    background_color='white',
)
spam_text = df.loc[df['target'] == 1, 'transformed_text'].str.cat(sep=" ")
spam_wc = wc.generate(spam_text)

In [None]:
# Render the spam word cloud.
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(spam_wc)

In [None]:
# WordCloud.generate() returns the WordCloud object itself, so calling it on
# `wc` again would make ham_wc and spam_wc the same object and silently
# overwrite the spam cloud. Build a separate instance with the same settings.
ham_wc = WordCloud(
    width=500,
    height=500,
    min_font_size=10,
    background_color='white',
).generate(df[df['target']==0]['transformed_text'].str.cat(sep=" "))
plt.figure(figsize=(12,6))
plt.imshow(ham_wc)

In [None]:
# Preview the frame again before building the word lists.
df.head()

In [None]:
# Goal: find the 30 most frequent words in spam and in ham.
# First step: flatten every preprocessed spam message into one list of words.
spam_corpus = [
    word
    for msg in df[df['target'] == 1]['transformed_text'].tolist()
    for word in msg.split()
]

In [None]:
# Total number of words across all spam messages.
len(spam_corpus)

In [None]:
# Count how often each word occurs in spam_corpus.
from collections import Counter 
Counter(spam_corpus)

In [None]:
# The 30 most common words in the spam corpus, as (word, count) pairs.
Counter(spam_corpus).most_common(30)

In [None]:
# The same top-30 list as a DataFrame (column 0 = word, column 1 = count).
pd.DataFrame(Counter(spam_corpus).most_common(30))

In [None]:
# Bar plot of the 30 most common spam words.
# x and y are passed by keyword: since seaborn 0.12 the only positional
# argument barplot accepts is `data`, so the positional form raises TypeError.
# The top-30 frame is also built once instead of twice.
top_spam = pd.DataFrame(Counter(spam_corpus).most_common(30), columns=['word', 'count'])
plt.figure(figsize=(12,6))
sns.barplot(x=top_spam['word'], y=top_spam['count'])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Same flattening for ham: one list holding every word of every ham message.
ham_corpus = [
    word
    for msg in df[df['target'] == 0]['transformed_text'].tolist()
    for word in msg.split()
]

In [None]:
# Total number of words across all ham messages.
len(ham_corpus)

In [None]:
# Bar plot of the 30 most common ham words.
# x and y are passed by keyword: since seaborn 0.12 the only positional
# argument barplot accepts is `data`, so the positional form raises TypeError.
# The top-30 frame is also built once instead of twice.
top_ham = pd.DataFrame(Counter(ham_corpus).most_common(30), columns=['word', 'count'])
plt.figure(figsize=(12,6))
sns.barplot(x=top_ham['word'], y=top_ham['count'])
plt.xticks(rotation='vertical')
plt.show()

# Model Building 


In [None]:
# We start with Naive Bayes because it is generally believed to perform well on text data.
# Later we will also try ensemble learning to improve the results.

In [None]:
# Preview the frame that feeds the vectorizers.
df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# Bag-of-words vectorizer with default settings.
cv=CountVectorizer()

In [None]:
# Bag-of-words counts as a dense array: one row per message, one column per vocabulary term.
# NOTE(review): the vocabulary is fitted on the full data set, before the train/test split.
bow_matrix = cv.fit_transform(df['transformed_text'])
X = bow_matrix.toarray()

In [None]:
X.shape  # recorded run: 5169 messages x 6717 vocabulary terms

In [None]:
# Show the (truncated) count matrix.
X

In [None]:
# Target vector as a NumPy array (0 = ham, 1 = spam).
y = df['target'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the count features, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
# Precision is the metric that matters most here: the aim is to minimise
# false positives (ham messages wrongly flagged as spam).
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
# One instance of each Naive Bayes variant, all with default settings.
gnb=GaussianNB()
bnb=BernoulliNB()
mnb=MultinomialNB()

In [None]:
# Gaussian NB: fit on the training split, then predict the test split.
# (fit() returns the estimator itself, so the calls can be chained.)
y_pred1 = gnb.fit(X_train, y_train).predict(X_test)

In [None]:
# Report accuracy, confusion matrix and precision for Gaussian NB.
for label, metric in (
    ('accuracy of gaussian', accuracy_score),
    ('Confusion matrix of gaussian', confusion_matrix),
    ('precision score of gaussian', precision_score),
):
    print(label, metric(y_test, y_pred1))

In [None]:
# The precision score of Gaussian NB is very low.
# Precision is the share of messages predicted as spam that really are spam: TP / (TP + FP).

In [None]:
# Multinomial NB: fit, predict, then report the three metrics.
y_pred2 = mnb.fit(X_train, y_train).predict(X_test)
for label, metric in (
    ('accuracy of multinomial', accuracy_score),
    ('Confusion matrix of multinomial', confusion_matrix),
    ('precision score of multinomial', precision_score),
):
    print(label, metric(y_test, y_pred2))

In [None]:
#accuracy is good but precision score is still low and in this precision score is important 

In [None]:
# Bernoulli NB: fit, predict, then report the three metrics.
y_pred3 = bnb.fit(X_train, y_train).predict(X_test)
for label, metric in (
    ('accuracy of bernoulli', accuracy_score),
    ('Confusion matrix of bernoulli', confusion_matrix),
    ('precision score of bernoulli', precision_score),
):
    print(label, metric(y_test, y_pred3))

In [None]:
#accuracy is good and precision is also good so bernoulli is best here 

In [None]:
#now using tfidf vectorizer instead of count vectorizer 

In [None]:
# TF-IDF vectorizer with default settings.
tfidf=TfidfVectorizer()

In [None]:
# TF-IDF features as a dense array; the target vector y is reused unchanged.
tfidf_matrix = tfidf.fit_transform(df['transformed_text'])
X1 = tfidf_matrix.toarray()

In [None]:
# Show the (truncated) TF-IDF matrix.
X1

In [None]:
# (messages, vocabulary terms) of the TF-IDF matrix.
X1.shape

In [None]:
from sklearn.model_selection import train_test_split

# Same 80/20 split and seed as before, now on the TF-IDF features.
# This rebinds X_train / X_test / y_train / y_test for all later cells.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Gaussian NB: refit on the current training split, then predict the test split.
# (fit() returns the estimator itself, so the calls can be chained.)
y_pred1 = gnb.fit(X_train, y_train).predict(X_test)

In [None]:
# Report accuracy, confusion matrix and precision for Gaussian NB.
for label, metric in (
    ('accuracy of gaussian', accuracy_score),
    ('Confusion matrix of gaussian', confusion_matrix),
    ('precision score of gaussian', precision_score),
):
    print(label, metric(y_test, y_pred1))

In [None]:
# The precision score of Gaussian NB is very low.
# Precision is the share of messages predicted as spam that really are spam: TP / (TP + FP).

In [None]:
# Multinomial NB: refit, predict, then report the three metrics.
y_pred2 = mnb.fit(X_train, y_train).predict(X_test)
for label, metric in (
    ('accuracy of multinomial', accuracy_score),
    ('Confusion matrix of multinomial', confusion_matrix),
    ('precision score of multinomial', precision_score),
):
    print(label, metric(y_test, y_pred2))

In [None]:
#here precision performs really better as its not giving any false positive 

In [None]:
# Bernoulli NB: refit, predict, then report the three metrics.
y_pred3 = bnb.fit(X_train, y_train).predict(X_test)
for label, metric in (
    ('accuracy of bernoulli', accuracy_score),
    ('Confusion matrix of bernoulli', confusion_matrix),
    ('precision score of bernoulli', precision_score),
):
    print(label, metric(y_test, y_pred3))

In [None]:
#here we can go either with mnb or bnb but we will go with mnb as its precision is very good although accuracy is low 

In [None]:
#we choose tfidf --> MNB

In [None]:
#now we bring many ML models and compare it with Multinomial MNB 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# Shared hyper-parameters for the ensemble models below.
N_ESTIMATORS = 50
SEED = 2

# Candidate models to compare against Multinomial Naive Bayes.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=SEED)
abc = AdaBoostClassifier(n_estimators=N_ESTIMATORS, random_state=SEED)
bc = BaggingClassifier(n_estimators=N_ESTIMATORS, random_state=SEED)
etc = ExtraTreesClassifier(n_estimators=N_ESTIMATORS, random_state=SEED)
gbdt = GradientBoostingClassifier(n_estimators=N_ESTIMATORS, random_state=SEED)
xgb = XGBClassifier(n_estimators=N_ESTIMATORS, random_state=SEED)

In [None]:
# Short display name -> estimator, in the order they will be trained and reported.
clfs = dict(
    [
        ('SVC', svc),
        ('KN', knc),
        ('NB', mnb),
        ('DT', dtc),
        ('LR', lrc),
        ('RF', rfc),
        ('AdaBoost', abc),
        ('BgC', bc),
        ('ETC', etc),
        ('GBDT', gbdt),
        ('xgb', xgb),
    ]
)

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features and labels
    X_test, y_test : held-out features and labels

    Returns
    -------
    tuple
        ``(accuracy, precision)`` computed with ``accuracy_score`` and
        ``precision_score`` on the test split.
    """
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    scores = (
        accuracy_score(y_test, predicted),
        precision_score(y_test, predicted),
    )
    return scores

In [None]:
# Smoke-test the helper on a single model; returns (accuracy, precision).
train_classifier(svc,X_train,y_train,X_test,y_test)

In [None]:
# Train every candidate and collect its scores, in the same order as clfs.
accuracy_scores, precision_scores = [], []

for name, clf in clfs.items():
    print('name', name)
    scores = train_classifier(clf, X_train, y_train, X_test, y_test)
    current_accuracy, current_precision = scores
    print(current_accuracy, current_precision, sep='\n')
    accuracy_scores += [current_accuracy]
    precision_scores += [current_precision]



In [None]:
# One row per classifier, best precision first.
performance_df = pd.DataFrame(
    {
        'Algorithm': list(clfs),
        'Accuracy': accuracy_scores,
        'Precision': precision_scores,
    }
).sort_values(by='Precision', ascending=False)

In [None]:
# Show the comparison table, sorted by precision.
performance_df

In [None]:
#here we consider Naive bayes as it has highest precision 
#we can too use random forest but in terms of textual data we find Naive bayes better 

In [None]:
# Long format (Algorithm, variable, value) so both metrics can share one plot.
performance_df1 = performance_df.melt(id_vars="Algorithm")

In [None]:
# Show the melted table.
performance_df1

In [None]:
# Grouped bars: accuracy and precision side by side for every algorithm.
sns.catplot(
    data=performance_df1,
    kind='bar',
    x='Algorithm',
    y='value',
    hue='variable',
    height=5,
)
plt.ylim(0.5, 1.0)
plt.xticks(rotation='vertical')
plt.show()

# Model Improvement