In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import nltk
import pandas as pd
import string
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import regex as re
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

In [None]:
# Load the e-mail dataset from the Kaggle input directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/kaggle/input/spam-filter/emails.csv')

In [None]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

### Observing Data

In [None]:
# Print the column names, dtypes and non-null counts of the frame.
data.info()

In [None]:
# Count missing values per column.
data.isna().sum()

In [None]:
# Class balance: number of rows for each value of the `spam` label.
data.loc[:, 'spam'].value_counts()

### Preprocessing

In [None]:
# Removing Punctuation
def remove_punctuation(text):
    """Return `text` with every character listed in ``string.punctuation`` removed.

    `text` is iterated element by element (character by character for a
    string); elements found in ``string.punctuation`` are skipped and the
    remaining ones are concatenated unchanged.
    """
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch)
    return "".join(kept)
# Strip punctuation from every e-mail body, then preview the result.
data["text"] = data['text'].apply(remove_punctuation)
data.head()

In [None]:
# Removing Stopwords
# NLTK's English stop-word list is lowercase, but the text is only lowercased
# later (in the tokenizing step), so stop words must be matched
# case-insensitively here or capitalised ones ("The", "And", ...) survive.
stopword = set(stopwords.words('english'))
# 'Subject' is kept as an exact-case entry; presumably it targets the
# "Subject:" header prefix once its colon has been stripped.
stopword.add('Subject')


def remove_stopwords(text):
    """Return `text` with stop words removed.

    The input is coerced to ``str`` and split on whitespace. A word is dropped
    when it appears in `stopword` either exactly or after lowercasing; the
    remaining words are joined with single spaces.
    """
    return " ".join(
        word
        for word in str(text).split()
        if word not in stopword and word.lower() not in stopword
    )


data['text'] = data['text'].apply(remove_stopwords)
data.head()

In [None]:
# Tokenizing
'''Tokenization is the process of breaking text into smaller pieces called tokens. 
These smaller pieces can be sentences, words, or sub-words.'''
def tokenize(text):
    """Split `text` into word tokens on runs of non-word characters.

    Returns a list of non-empty strings. Empty strings that ``re.split``
    produces when `text` starts or ends with a separator (or is empty) are
    discarded so they do not show up as bogus tokens downstream.
    """
    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    return [token for token in re.split(r"\W+", text) if token]
# Lowercase each e-mail body and replace it with its list of tokens.
data['text'] = data['text'].apply(lambda body: tokenize(body.lower()))
data.head()

In [None]:
# Lemmatizing
'''Lemmatizing is the process of reducing a word to its root form.'''
lemmatizer = WordNetLemmatizer()


def lemmatize_words(text):
    """Lemmatize each token in `text` and join the results with single spaces.

    `text` is an iterable of tokens (the token lists produced by the
    tokenizing step); the return value is one space-separated string.
    """
    lemmas = map(lemmatizer.lemmatize, text)
    return " ".join(lemmas)


data['text'] = data["text"].apply(lemmatize_words)
data.head()

In [None]:
# Splitting the data according to spam and non-spam 
# Concatenate all texts of each class into one space-separated corpus string.
spam = " ".join(data.loc[data['spam'] == 1, 'text'])
non_spam = " ".join(data.loc[data['spam'] == 0, 'text'])

In [None]:
# Finding most repeated words in the data
def return_top_words(text, words=10):
    """Return the `words` most frequent non-stop-word tokens in `text`.

    `text` is tokenized with ``nltk.tokenize.word_tokenize``; tokens are
    lowercased *before* being compared with NLTK's English stop-word list, so
    capitalised stop words are filtered too. The result is a list of tokens
    ordered from most to least frequent.
    """
    # A set gives constant-time membership tests; the local name deliberately
    # avoids shadowing the imported `stopwords` corpus module.
    stop_set = set(nltk.corpus.stopwords.words('english'))
    lowered = (token.lower() for token in nltk.tokenize.word_tokenize(text))
    frequencies = nltk.FreqDist(token for token in lowered if token not in stop_set)
    return [token for token, _count in frequencies.most_common(words)]

In [None]:
# Ten most frequent words in each class corpus.
top_10_spam = return_top_words(spam, words=10)
top_10_non_spam = return_top_words(non_spam, words=10)

In [None]:
# Show both top-10 lists, one per line (spam first).
print(top_10_spam, top_10_non_spam, sep="\n")

### WordCloud

In [None]:
# Build the word cloud for the spam corpus.
# The stop-word set gets its own name so it does not rebind `stopwords`,
# which is the NLTK corpus object imported at the top of the notebook and is
# needed again if the stop-word cell is re-run.
# The former per-row token loop was removed: its `tokens` result was never
# used, and the cloud is generated from the `spam` string directly.
wordcloud_stopwords = set(STOPWORDS)

wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=wordcloud_stopwords,
                      min_font_size=10).generate(spam)

In [None]:
# Render the current word cloud on a square, axis-free figure.
fig, ax = plt.subplots(figsize=(8, 8), facecolor=None)
ax.imshow(wordcloud)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

In [None]:
# Build the word cloud for the non-spam corpus.
# The stop-word set gets its own name so it does not rebind `stopwords`,
# which is the NLTK corpus object imported at the top of the notebook and is
# needed again if the stop-word cell is re-run.
# The former per-row token loop was removed: its `tokens` result was never
# used, and the cloud is generated from the `non_spam` string directly.
wordcloud_stopwords = set(STOPWORDS)

wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=wordcloud_stopwords,
                      min_font_size=10).generate(non_spam)

In [None]:
# Render the current word cloud on a square, axis-free figure.
fig, ax = plt.subplots(figsize=(8, 8), facecolor=None)
ax.imshow(wordcloud)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

In [None]:
# Features are the preprocessed texts; the target is the spam label.
X, y = data['text'], data['spam']

In [None]:
# TF-IDF (Term Frequency - Inverse Document Frequency)
'''This is a technique to quantify a word in documents, we generally compute a weight to each word
which signifies the importance of the word in the document and corpus. 
This method is a widely used technique in Information Retrieval and Text Mining.'''
# Fit the vocabulary and IDF weights on the training rows only, so that no
# statistics from the held-out rows leak into the features.
# train_test_split picks rows from the row count and random_state alone, so
# calling it with the same test_size/random_state as the split of X_ct
# further down selects exactly the same training rows. Keep the two calls in
# sync if either parameter changes.
vectorizer = TfidfVectorizer()
vectorizer.fit(train_test_split(X, test_size=0.2, random_state=42)[0])
# Transform every row with the train-fitted vectorizer; the later split then
# divides this matrix into train and test parts.
X_ct = vectorizer.transform(X)

In [None]:
# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_ct,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

In [None]:
# Shapes of the test features and labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")

### **KNN Classifier**

In [None]:
# Train a k-nearest-neighbours classifier with default settings and evaluate it
# on the held-out rows.
knn_classifier = KNeighborsClassifier().fit(X_train, y_train)
y_pred1 = knn_classifier.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred1)
print("accuracy score is :", knn_accuracy)
print(classification_report(y_test, y_pred1))

#### Accuracy Score for KNN Classifier is 97%

### **Naive Bayes**

In [None]:
# Train a multinomial Naive Bayes classifier with default settings and evaluate
# it on the held-out rows.
nb = MultinomialNB().fit(X_train, y_train)
y_pred2 = nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred2)
print("accuracy score is: ", nb_accuracy)
print(classification_report(y_test, y_pred2))

#### Accuracy score for Naive Bayes Classifier is 89%

### Random Forest

In [None]:
# Train a random forest and evaluate it on the held-out rows.
# The forest is randomised (bootstrap samples, feature sub-sampling), so the
# seed is fixed, using the same value as the train/test split, to make the
# reported scores reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred3 = rf.predict(X_test)
print("accuracy score is: ", accuracy_score(y_test, y_pred3))
print(classification_report(y_test, y_pred3))

#### Accuracy Score for Random Forest is 97%

### XGBoost

In [None]:
# Train an XGBoost classifier with default settings and evaluate it on the
# held-out rows.
xg = XGBClassifier().fit(X_train, y_train)
y_pred4 = xg.predict(X_test)
xg_accuracy = accuracy_score(y_test, y_pred4)
print("accuracy score is: ", xg_accuracy)
print(classification_report(y_test, y_pred4))

#### Accuracy Score for XGBoost is 98%