In [None]:
%matplotlib inline
from IPython.display import Image, HTML

In [None]:
import os
os.getcwd()

In [None]:
# os.chdir("/downloads/Phishingin")

In [None]:
Image(filename="spam.png",width=1000,height=500)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the email dataset from the working directory. Later cells read a
# `text` column (raw email text) and a `spam` column (compared against 0 / 1).
data=pd.read_csv('emails.csv')
# Preview the first 15 rows.
data.head(15)

In [None]:
data.shape

In [None]:
# Remove every literal "Subject:" marker from the email text.
# The vectorised .str accessor treats the pattern as plain text (regex=False)
# and leaves missing values as NaN, whereas calling x.replace on a float NaN
# inside a lambda raises AttributeError.
data['text'] = data['text'].str.replace("Subject:", "", regex=False)

In [None]:
data.head()

In [None]:
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK resources used by the pre-processing step: the stop-word
# lists (read through stopwords.words) and WordNet (presumably what
# WordNetLemmatizer relies on; confirm against the installed NLTK version).
nltk.download('stopwords')
nltk.download('wordnet')

# PRE-PROCESSING MODULE

In [None]:
# Shared NLP helpers used by cleaning(): tokenizer, stemmer and lemmatizer.
# They are created once here so the function does not rebuild them per email.
tk=TweetTokenizer()
ps = PorterStemmer()
lem=WordNetLemmatizer()
# Standalone tokens dropped in addition to stop words. The original code
# stripped the substrings "co" and "https" (presumably URL residue such as
# "https://t.co/..."); only whole tokens are removed now so that ordinary
# words containing "co" ("company", "second", "account") stay intact.
_CLEANING_NOISE_TOKENS = frozenset({"co", "https"})

# Lazily filled cache of the English stop-word list (see _english_stopwords).
_cleaning_stopword_cache = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _cleaning_stopword_cache
    if _cleaning_stopword_cache is None:
        _cleaning_stopword_cache = frozenset(stopwords.words('english'))
    return _cleaning_stopword_cache


def cleaning(s):
    """Normalise one raw text into a space-joined string of processed tokens.

    Steps, in order:
      1. cast to ``str`` and lower-case;
      2. turn every non-word character into a space, delete digit runs,
         collapse whitespace and delete underscores;
      3. tokenise with the module-level ``tk`` tokenizer;
      4. drop English stop words and the noise tokens "co" / "https";
      5. stem each remaining token with ``ps`` and lemmatise it with ``lem``.

    Parameters
    ----------
    s : any
        The raw text; non-string values are converted with ``str``.

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' if none survive).
    """
    s = str(s).lower()
    # After this substitution the text holds only word characters and spaces,
    # so no separate handling of punctuation or commas is needed afterwards.
    s = re.sub(r'\W', ' ', s)
    s = re.sub(r'\d+', '', s)
    s = re.sub(r'\s+', ' ', s)
    # "_" counts as a word character, so it survives the \W substitution.
    s = s.replace('_', '')

    # Build the exclusion set once per call instead of once per token.
    excluded = _english_stopwords() | _CLEANING_NOISE_TOKENS
    kept = [word for word in tk.tokenize(s) if word not in excluded]
    processed = [lem.lemmatize(ps.stem(word)) for word in kept]
    return ' '.join(processed)




# Store the pre-processed version of every email next to the raw text.
data['content'] = data['text'].map(cleaning)

In [None]:
data['content'][1]

In [None]:
# One long string holding the cleaned text of every email (input to the word cloud).
all_words = ' '.join(data['content'])

# EXPLORATORY DATA ANALYSIS

In [None]:
from wordcloud import WordCloud

# Word cloud over the cleaned text of all emails, on a white background.
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
    background_color="white",
).generate(all_words)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()


In [None]:
# Cleaned text of the emails labelled spam == 0, joined into one string.
normal_words = ' '.join(data.loc[data['spam'] == 0, 'content'])

# Word cloud for that subset (library-default background colour).
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(normal_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Cleaned text of the emails labelled spam == 1, joined into one string.
negative_words = ' '.join(data.loc[data['spam'] == 1, 'content'])

# Word cloud for that subset (library-default background colour).
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(negative_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()


In [None]:
def hashtag_extract(x):
    """Return, for each string in ``x``, the list of its word-character runs.

    Despite the name, no '#' prefix is required: every maximal run of
    word characters (letters, digits, underscore) is collected.

    Parameters
    ----------
    x : iterable of str
        The texts to split.

    Returns
    -------
    list of list of str
        One list of matches per input text, in input order.
    """
    word_run = re.compile(r'\w+')
    return [word_run.findall(text) for text in x]

In [None]:
# Token lists (one list per email) for the spam == 0 emails ...
HT_regular = hashtag_extract(data.loc[data['spam'] == 0, 'content'])

# ... and for the spam == 1 emails.
HT_negative = hashtag_extract(data.loc[data['spam'] == 1, 'content'])

In [None]:
# Flatten the per-email token lists into one flat token list per class.
# A nested comprehension is linear in the total number of tokens, whereas
# sum(list_of_lists, []) copies the growing accumulator on every addition.
# NOTE: the names are reassigned in place, so run this cell exactly once
# after the extraction cell; re-running it on already-flat lists would split
# the tokens into single characters.
HT_regular = [token for tokens in HT_regular for token in tokens]
HT_negative = [token for tokens in HT_negative for token in tokens]

In [None]:
import seaborn as sns

In [None]:
# Frequency of every token in the spam == 0 emails.
a = nltk.FreqDist(HT_regular)
d = pd.DataFrame({
    'Hashtag': list(a),
    'Count': list(a.values()),
})

# Keep the 10 most frequent tokens (the column is called 'Hashtag',
# but it holds plain tokens).
d = d.nlargest(10, "Count")

plt.figure(figsize=(16,5))
ax = sns.barplot(x="Hashtag", y="Count", data=d)
ax.set_ylabel('Count')
plt.show()

In [None]:
# Frequency of every token in the spam == 1 emails.
b = nltk.FreqDist(HT_negative)
e = pd.DataFrame({
    'Hashtag': list(b),
    'Count': list(b.values()),
})

# Keep the 10 most frequent tokens (the column is called 'Hashtag',
# but it holds plain tokens).
e = e.nlargest(10, "Count")

plt.figure(figsize=(16,5))
ax = sns.barplot(x="Hashtag", y="Count", data=e)
ax.set_ylabel('Count')
plt.show()



In [None]:
from sklearn.feature_extraction.text import  CountVectorizer

In [None]:
# Unigram bag-of-words vocabulary learned from the cleaned emails.
vectorizer = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=stopwords.words('english'),
)
vectorizer.fit(data['content'])

In [None]:
# Encode every cleaned email with the fitted vectorizer and convert the result
# to a dense array via .toarray() (one row per email).
# NOTE(review): a dense matrix can be memory-hungry for a large vocabulary.
X=vectorizer.transform(data['content']).toarray()

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Re-weight the count matrix with TF-IDF.
# NOTE(review): the transformer is fitted on the full matrix before the
# train/test split further down, so test rows influence the fitted weights.
# X is overwritten, so this cell should be run only once per count matrix.
tran = TfidfTransformer()
tran.fit(X)

X = tran.transform(X).toarray()


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the emails for testing; random_state fixes the shuffle.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    data['spam'].values,
    test_size=0.25,
    random_state=0,
)

# LOGISTIC REGRESSION

In [None]:
import warnings
# Suppress every warning from here on.
# NOTE(review): this also hides convergence and deprecation warnings raised by the models below.
warnings.filterwarnings("ignore") 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Logistic-regression classifier with the library's default settings.
model_log=LogisticRegression()

In [None]:
# Fit the classifier on the training split.
model_log.fit(train_x,train_y)

In [None]:
test_y#actual values

In [None]:
# Accuracy of the logistic-regression model on the held-out test split.
model_log.score(test_x,test_y)

In [None]:
# Predicted class label for every test email (compared with test_y below).
y_pred = model_log.predict(test_x)
y_pred

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error

In [None]:
# The matrix is transposed before labelling: rows are named 'Predicted'
# and columns 'True', each indexed by the model's class labels.
cm_df = pd.DataFrame(
    confusion_matrix(test_y, y_pred).T,
    index=pd.Index(model_log.classes_, name='Predicted'),
    columns=pd.Index(model_log.classes_, name='True'),
)
print(cm_df)

In [None]:
# Text report of classification metrics for the logistic-regression predictions.
print(classification_report(test_y, y_pred))

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score, cohen_kappa_score

# A ROC curve needs a continuous ranking score. Feeding it the hard 0/1
# predictions yields a single operating point and understates the AUC, so the
# predicted probability of the positive class (column 1, i.e. spam == 1) is
# used instead.
y_score_log = model_log.predict_proba(test_x)[:, 1]
fpr, tpr, _ = roc_curve(test_y, y_score_log)

# Area under the ROC curve.
roc_auc = auc(fpr, tpr)
print('ROC AUC: %0.2f' % roc_auc)

# ROC curve with the chance diagonal for reference.
plt.figure(figsize=(15,10))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# NAIVE BAYES CLASSIFIER

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier fitted on the same training split as the
# logistic-regression model.
model = GaussianNB()
model.fit(train_x,train_y)

In [None]:
# Accuracy of the Naive Bayes model on the data it was trained on.
model.score(train_x,train_y)

In [None]:
# Accuracy of the Naive Bayes model on the held-out test split.
model.score(test_x,test_y)

In [None]:
# Predicted class label for every test email from the Naive Bayes model.
y_predict = model.predict(test_x)
y_predict

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error

In [None]:
# The matrix is transposed before labelling: rows are named 'Predicted'
# and columns 'True', each indexed by the model's class labels.
cm_df = pd.DataFrame(
    confusion_matrix(test_y, y_predict).T,
    index=pd.Index(model.classes_, name='Predicted'),
    columns=pd.Index(model.classes_, name='True'),
)
print(cm_df)

In [None]:
# Text report of classification metrics for the Naive Bayes predictions.
print(classification_report(test_y, y_predict))

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score, cohen_kappa_score

# A ROC curve needs a continuous ranking score. Feeding it the hard 0/1
# predictions yields a single operating point and understates the AUC, so the
# predicted probability of the positive class (column 1, i.e. spam == 1) is
# used instead.
y_score_nb = model.predict_proba(test_x)[:, 1]
fpr, tpr, _ = roc_curve(test_y, y_score_nb)

# Area under the ROC curve.
roc_auc = auc(fpr, tpr)
print('ROC AUC: %0.2f' % roc_auc)

# ROC curve with the chance diagonal for reference.
plt.figure(figsize=(15,10))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# Performance Comparison

In [None]:
# One fresh, unfitted instance of each classifier for the comparison table.
MLA = [
    LogisticRegression(),
    GaussianNB(),
]

In [None]:
from sklearn.metrics import mean_squared_error,confusion_matrix, precision_score, recall_score, auc,roc_curve

In [None]:
# Column order of the comparison table. 'MLA Precission' keeps its original
# spelling so any code that already reads that column keeps working.
MLA_columns = [
    'MLA Name',
    'MLA Train Accuracy',
    'MLA Test Accuracy',
    'MLA Precission',
    'MLA Recall',
    'MLA AUC',
]

# Collect one record per classifier and build the DataFrame once, instead of
# growing it cell by cell through .loc.
comparison_rows = []
for alg in MLA:
    predicted = alg.fit(train_x, train_y).predict(test_x)

    # ROC/AUC must be computed from a continuous score, not from hard labels:
    # use the positive-class probability when available, otherwise the
    # estimator's decision values.
    if hasattr(alg, 'predict_proba'):
        scores = alg.predict_proba(test_x)[:, 1]
    else:
        scores = alg.decision_function(test_x)
    fp, tp, th = roc_curve(test_y, scores)

    comparison_rows.append({
        'MLA Name': alg.__class__.__name__,
        'MLA Train Accuracy': round(alg.score(train_x, train_y), 4),
        'MLA Test Accuracy': round(alg.score(test_x, test_y), 4),
        'MLA Precission': precision_score(test_y, predicted),
        'MLA Recall': recall_score(test_y, predicted),
        'MLA AUC': auc(fp, tp),
    })

MLA_compare = pd.DataFrame(comparison_rows, columns=MLA_columns)

# Best test accuracy first; reassign rather than sorting in place so the cell
# stays a plain expression of its inputs.
MLA_compare = MLA_compare.sort_values(by=['MLA Test Accuracy'], ascending=False)
MLA_compare

In [None]:
# Bar chart of the test accuracy of each compared classifier.
plt.subplots(figsize=(15,6))
sns.barplot(
    x="MLA Name",
    y="MLA Test Accuracy",
    data=MLA_compare,
    palette='hot',
    edgecolor=sns.color_palette('dark',7),
)
plt.xticks(rotation=90)
# The y-axis shows 'MLA Test Accuracy', so the title says "Test"
# (it previously read "Train").
plt.title('MLA Test Accuracy Comparison')
plt.show()
