In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### <center> Imported libraries

In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd 
import sklearn 
from nltk import word_tokenize
import string
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score
from sklearn.ensemble import  RandomForestClassifier
from xgboost import XGBClassifier


### <center>Loaded the Dataset

In [None]:
data = pd.read_csv('../input/spamfilter/emails.csv')

In [None]:
data

### <center> EDA

In [None]:
# Select the rows flagged by DataFrame.duplicated() (called with its default arguments).
# Note: these rows are only displayed here; `data` itself is left unchanged.
duplicate = data[data.duplicated()]
duplicate

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.isna().sum()

In [None]:
data['spam'].value_counts()

In [None]:
# Class balance of the target. The column is passed by keyword (x=..., data=...)
# because a positional Series is interpreted differently across seaborn versions.
sns.countplot(x='spam', data=data)

### <center>Preprocessing

In [None]:
def count_words(text):
    """Return the number of tokens that ``word_tokenize`` produces for ``text``."""
    return len(word_tokenize(text))

In [None]:
# Fetch the NLTK resources *before* they are used: word_tokenize (called by
# count_words) needs the Punkt models, and the cleaning cells further down read
# the English stop-word list. nltk.download() skips packages that are already
# up to date, so re-running this cell is cheap.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')  # presumably required by newer NLTK releases — harmless otherwise
nltk.download('stopwords')

# Apply count_words to every email and store the token count in a new column.
data['count'] = data['text'].apply(count_words)

In [None]:
data['count']

In [None]:
data.groupby('spam')['count'].mean()

In [None]:
def process_text(text):
    """Strip punctuation and English stop words from a piece of text.

    Every character listed in ``string.punctuation`` is removed, the result is
    split on whitespace, and words whose lowercase form is an NLTK English
    stop word are dropped. The casing of the words that are kept is preserved.

    Args:
        text: The raw text to clean (a ``str``).

    Returns:
        The remaining words joined by single spaces.
    """
    # Build the stop-word set once per call. The previous version evaluated
    # stopwords.words('english') inside the comprehension, i.e. once per word,
    # and tested membership against a list.
    english_stopwords = set(stopwords.words('english'))

    no_punc = ''.join(char for char in text if char not in string.punctuation)
    return ' '.join(
        word for word in no_punc.split() if word.lower() not in english_stopwords
    )

In [None]:
data['text']=data['text'].apply(process_text)

In [None]:
# After cleaning the text, we will carry out stemming to reduce inflected words to their root form.
data['text']

In [None]:
# English stop words plus the literal token 'Subject'.
stopword = set(stopwords.words('english'))
stopword.add('Subject')


def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated token found in ``stopword`` removed.

    The comparison is an exact (case-sensitive) match; ``text`` is passed
    through ``str()`` first, and the kept tokens are re-joined with single spaces.
    """
    kept = []
    for token in str(text).split():
        if token not in stopword:
            kept.append(token)
    return " ".join(kept)


data['text'] = data['text'].apply(remove_stopwords)
data.head()

In [None]:
from nltk.stem import PorterStemmer
# Single shared stemmer instance; the stemming() helper defined below reads it as a global.
stemmer = PorterStemmer()
# Bare expression: shows the stemmer object as the cell output.
stemmer

In [None]:
def stemming(text):
    """Stem every whitespace-separated word of ``text`` with the global ``stemmer``.

    The previous implementation iterated over ``text`` directly, which yields
    single characters, and joined the results with ``''`` — so no word was ever
    actually stemmed. Words are now split on whitespace, stemmed one by one and
    re-joined with single spaces.

    Args:
        text: The cleaned email text (a ``str``).

    Returns:
        The text with each word replaced by its stem.
    """
    return ' '.join(stemmer.stem(word) for word in text.split())

In [None]:
data['text']=data['text'].apply(stemming)
data['text']

In [None]:
data.head()

In [None]:
# Concatenate the processed emails of each class into one space-separated string per class.
spam = " ".join(data.loc[data['spam'] == 1, 'text'])
non_spam = " ".join(data.loc[data['spam'] == 0, 'text'])

### <center>Visualization of the Most Frequent Words

In [None]:
def return_top_words(text,words = 10):
    """Return the most frequent non-stop-word tokens of ``text``.

    Tokens are lowercased *before* the stop-word check, so capitalised stop
    words ("The", "We", ...) are filtered as well; previously the check ran on
    the original casing and let them through.

    Args:
        text: The text to analyse.
        words: How many of the most common tokens to return (default 10).

    Returns:
        A list of lowercase tokens, most frequent first.
    """
    # A set gives O(1) membership tests; the local name deliberately avoids
    # shadowing the `stopwords` module imported at the top of the notebook.
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    lowered_tokens = (token.lower() for token in nltk.tokenize.word_tokenize(text))
    frequencies = nltk.FreqDist(
        token for token in lowered_tokens if token not in english_stopwords
    )
    return [token for token, _count in frequencies.most_common(words)]
top_10_spam = return_top_words(spam,10)
top_10_non_spam = return_top_words(non_spam,10)

In [None]:
print(top_10_spam)
print(top_10_non_spam)

In [None]:
# Import packages
import matplotlib.pyplot as plt
%matplotlib inline
# Define a function to plot word cloud
def plot_cloud(wordcloud):
    """Draw ``wordcloud`` as an image on a new 40x30 figure with the axes hidden."""
    plt.figure(figsize=(40, 30))
    plt.imshow(wordcloud)
    plt.axis("off")

In [None]:
# Build the word cloud for the spam corpus (collocations disabled, wordcloud's
# STOPWORDS excluded).
wordcloud = WordCloud(
    width=800,
    height=800,
    random_state=0,
    background_color="white",
    collocations=False,
    stopwords=STOPWORDS,
).generate(spam)

# Render it on a square figure without axes.
fig, ax = plt.subplots(figsize=(8, 8), facecolor=None)
ax.imshow(wordcloud)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

In [None]:
# Build the word cloud for the non-spam corpus (collocations disabled,
# wordcloud's STOPWORDS excluded).
wordcloud = WordCloud(
    width=800,
    height=800,
    random_state=1,
    background_color='white',
    collocations=False,
    stopwords=STOPWORDS,
).generate(non_spam)

# Render it on a square figure without axes.
fig, ax = plt.subplots(figsize=(8, 8), facecolor=None)
ax.imshow(wordcloud)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

### <center> Applying the models

In [None]:
X = data['text']
y = data['spam']

In [None]:
# TF-IDF feature extraction: learn the vocabulary and IDF weights from the
# processed emails and encode every email as a row of TF-IDF scores, in one pass.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split below, so the held-out scores are presumably slightly
# optimistic — consider fitting on the training portion only.
vectorizer = TfidfVectorizer()
X_ct = vectorizer.fit_transform(X)
X_ct

In [None]:
# Splitting the data
X_train,X_test,y_train,y_test = train_test_split(X_ct,y,test_size=0.2,random_state=42)

In [None]:
# 5-fold cross-validation of Multinomial Naive Bayes on the full TF-IDF matrix.
# The previous n_splits=5000 produced thousands of tiny folds (one model fit
# each), and the unseeded shuffle made the score change between runs; a fixed
# random_state keeps the folds — and the reported accuracy — reproducible.
nb = MultinomialNB()
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(nb, X_ct, data['spam'], cv=kfold, scoring="accuracy")
print("Accuracy using Cross Validation is :",np.mean(cv_scores)*100," %")

In [None]:
#x_train is the training data set. y_train is the set of labels to all the data in x_train
#The shape attribute for numpy arrays returns the dimensions of the array. If Y has n rows and m columns, then Y. shape is (n,m) 
print(X_train.shape)
print(y_train.shape)

In [None]:
print(X_test.shape)
print(y_test.shape)

In [None]:
# Fit Multinomial Naive Bayes on the training split and score it on the test split.
nb = MultinomialNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred)
print("accuracy score is: ", nb_accuracy)
print(classification_report(y_test, y_pred))

In [None]:
# Plot the ROC curve of the fitted Naive Bayes model on the test split.
# NOTE(review): plot_roc_curve is, as far as I know, deprecated in scikit-learn 1.0
# and removed in 1.2 (replacement: RocCurveDisplay.from_estimator) — confirm
# against the installed version.
plot_roc_curve(nb,X_test,y_test)

In [None]:
# Plot the confusion matrix of the fitted Naive Bayes model on the test split.
# NOTE(review): plot_confusion_matrix is, as far as I know, deprecated in
# scikit-learn 1.0 and removed in 1.2 (replacement:
# ConfusionMatrixDisplay.from_estimator) — confirm against the installed version.
plot_confusion_matrix(nb,X_test,y_test)

In [None]:
# Fit a k-nearest-neighbours classifier (default settings) on the training split
# and score it on the test split.
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)

y_pred1 = knn_classifier.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred1)
print("accuracy_score is :", knn_accuracy)
print(classification_report(y_test, y_pred1))

In [None]:
# Fit a random forest on the training split and score it on the test split.
# The forest is randomised (bootstrap samples, feature sub-sampling), so the seed
# is fixed to make the reported metrics reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred2 = rf.predict(X_test)
print("accuracy score is: ",accuracy_score(y_test,y_pred2))
print(classification_report(y_test,y_pred2))

In [None]:
# Fit an XGBoost classifier (default settings) on the training split and score
# it on the test split.
xg = XGBClassifier()
xg.fit(X_train, y_train)

y_pred4 = xg.predict(X_test)
xg_accuracy = accuracy_score(y_test, y_pred4)
print("accuracy score is: ", xg_accuracy)
print(classification_report(y_test, y_pred4))