In [84]:
import pandas as pd
import numpy as np
import re
import math
# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
# NLP libraries
from sklearn.feature_extraction.text import CountVectorizer
import nltk

In [85]:
plt.style.use('ggplot')

In [86]:
email_df = pd.read_csv('./data/phishing_all_data.csv')

In [87]:
email_df.shape

(27416, 2)

In [88]:
email_df['phishing'].value_counts()

False    17787
True      9629
Name: phishing, dtype: int64

In [89]:
phishing_df = email_df[email_df['phishing']].reset_index(drop=True)
ham_df = email_df[~email_df['phishing']].reset_index(drop=True)

In [90]:
# import the nltk stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords 

ENGLISH_STOP_WORDS = stopwords.words('english')
stemmer = nltk.stem.PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rosswillett/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [91]:
def custom_tokenizer(text):
    # allow alphanumeric characters and spaces only
    alpha_only_text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # split sentence into words
    listofwords = alpha_only_text.split(' ')
    listofstemmed_words = []
    
    # remove stopwords and any tokens that are just empty strings
    for word in listofwords:
        if (not word in ENGLISH_STOP_WORDS) and (word!=''):
            # Stem words
            stemmed_word = stemmer.stem(word)
            listofstemmed_words.append(stemmed_word)

    return listofstemmed_words

## Analyzing Email Words

### Analyzing Top Phishing Words

In [92]:
phishing_word_vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    min_df=0.1,
)
phishing_word_vectorizer.fit(phishing_df['content'])
phishing_word_vectorized = phishing_word_vectorizer.transform(phishing_df['content'])



In [93]:
phishing_word_df = pd.DataFrame(
    data=phishing_word_vectorized.toarray(),
    columns=phishing_word_vectorizer.get_feature_names_out(),
)
phishing_word_sum_df = pd.DataFrame(
    {"counts": phishing_word_vectorized.toarray().sum(axis=0)},
    index=phishing_word_vectorizer.get_feature_names_out(),
).sort_values("counts", ascending=False)

In [94]:
phishing_word_sum_df

Unnamed: 0,counts
account,17517
bank,12665
money,12633
email,9127
us,8582
...,...
oper,1177
write,1174
recent,1165
hear,1160


In [95]:
top_phish_words_df = phishing_word_df.loc[:,phishing_word_sum_df.index]

### Analyzing Top Ham Emails

In [96]:
ham_word_vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    min_df=0.1,
)
ham_word_vectorizer.fit(ham_df['content'])
ham_word_vectorized = ham_word_vectorizer.transform(ham_df['content'])



In [97]:
ham_word_sum_df = pd.DataFrame(
    {"counts": ham_word_vectorized.toarray().sum(axis=0)},
    index=ham_word_vectorizer.get_feature_names_out(),
).sort_values("counts", ascending=False)

In [98]:
ham_word_sum_df

Unnamed: 0,counts
use,6406
get,5232
one,5047
list,4682
time,4375
would,4287
new,4243
like,4200
work,4097
email,3568


### Analyzing Phishing NGrams

In [99]:
phishing_ngram_vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    min_df=10,
    ngram_range=(2, 3),
)
phishing_ngram_vectorizer.fit(phishing_df['content'])
phishing_ngram_vectorized = phishing_ngram_vectorizer.transform(phishing_df['content'])



In [100]:
phishing_ngram_df = pd.DataFrame(
    {"counts": phishing_ngram_vectorized.toarray().sum(axis=0)},
    index=phishing_ngram_vectorizer.get_feature_names_out(),
).sort_values("counts", ascending=False)

In [101]:
phishing_ngram_df.head(30)

Unnamed: 0,counts
x x,53760
x x x,49243
next kin,3365
unit state,1895
bank account,1772
email address,1704
secur compani,1441
state dollar,1256
unit state dollar,1183
hundr thousand,1174


### Analyzing Ham NGrams

In [102]:
ham_ngram_vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    min_df=10,
    ngram_range=(2, 3),
)
ham_ngram_vectorizer.fit(ham_df['content'])
ham_ngram_vectorized = ham_ngram_vectorizer.transform(ham_df['content'])



In [103]:
ham_ngram_df = pd.DataFrame(
    {"counts": ham_ngram_vectorized.toarray().sum(axis=0)},
    index=ham_ngram_vectorizer.get_feature_names_out(),
).sort_values("counts", ascending=False)

In [104]:
ham_ngram_df.head(30)

Unnamed: 0,counts
mail list,1845
linux user,1109
user group,1092
linux user group,1076
irish linux,1071
irish linux user,1071
inform list maintain,1068
inform list,1068
unsubscript inform list,1068
unsubscript inform,1068


In [105]:
combined_word_df = pd.merge(left=phishing_word_df, right=ham_word_df, how='outer', on='word')
combined_word_df.fillna(0)

NameError: name 'ham_word_df' is not defined

In [None]:
combined_word_df.sample(10)

## Vectorize All Emails

In [None]:
email_word_vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
)
email_word_vectorizer.fit(email_df['content'])
email_word_vectorized = email_word_vectorizer.transform(email_df['content'])

In [None]:
email_word_df = pd.DataFrame(
    data=email_word_vectorized.toarray(),
    columns=email_word_vectorizer.get_feature_names_out(),
)
email_word_df.head(10)

### Build DF to Build Models On

In [None]:
combined_top_words_df = list(set(ham_word_sum_df.index.to_list() + phishing_word_sum_df.index.to_list()))
combined_top_words_df

In [None]:
working_df = email_word_df.loc[:, combined_top_words_df]

In [None]:
working_df['is_phishing'] = np.where(email_df['phishing'], 1, 0)
working_df

## Building Initial Models

In [106]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler

In [107]:
x = working_df.iloc[:, :-1]
y = working_df['is_phishing']

In [108]:
log_reg_model = LogisticRegression()
log_reg_model.fit(x, y)
log_reg_model.score(x, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9195360373504523

In [112]:
minMaxScaler = MinMaxScaler()
minMaxScaler.fit(x)
x_mm_scaled = minMaxScaler.transform(x)
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(x_mm_scaled, y)
knn_model.score(x_mm_scaled, y)

0.9269769477677269

In [113]:
dec_tree_model = DecisionTreeClassifier()
dec_tree_model.fit(x, y)
dec_tree_model.score(x, y)

0.9657134519988329