In [101]:
import pandas as pd
import numpy as np
import re
import math

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Model selection libraries
from sklearn.model_selection import train_test_split, GridSearchCV

# Processing libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer

# Modeling libraries
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Model Evaluation Libraries
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Import Dimensionality Reduction libraries
from sklearn.decomposition import PCA

# Import pipeline library
from sklearn.pipeline import Pipeline

from tempfile import mkdtemp

In [131]:
# Import warnings and supress them
import warnings
warnings.filterwarnings('ignore')

In [12]:
plt.style.use('ggplot')

In [13]:
email_df = pd.read_csv('./data/phishing_all_data.csv')

In [14]:
email_df.shape

(27416, 2)

In [15]:
email_df['phishing'].value_counts()

False    17787
True      9629
Name: phishing, dtype: int64

In [17]:
email_df

Unnamed: 0,content,phishing
0,Your opinion about me? 1. Over 2. Jada 3. Kusr...,False
1,What's up? Do you want me to come online? If y...,False
2,So u workin overtime nigpun?,False
3,"Also sir, i sent you an email about how to log...",False
4,Please Stay At Home. To encourage the notion o...,True
...,...,...
27411,You appear to be using an email application th...,False
27412,CNET Investor Dispatch Quote LookupEnter symbo...,False
27413,Todays Headlines from The Register -----------...,False
27414,"Hi Everyone, There seem to be several bonehead...",False


In [34]:
email_df['link_count'] = email_df['content'].str.count('http')

In [36]:
email_df['secure_link_count'] = email_df['content'].str.count('https')

In [58]:
def getWordCount(text):
    # allow alphanumeric characters and spaces only
    alpha_only_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    no_space_alpha_text = re.sub(r'\s\s*', '\s', text)
    
    # split sentence into words
    listofwords = alpha_only_text.split(' ')
    
    return len(listofwords)

In [59]:
email_df['word_count'] = email_df['content'].apply(getWordCount)

In [60]:
email_df

Unnamed: 0,content,phishing,link_count,secure_link_count,word_count
0,Your opinion about me? 1. Over 2. Jada 3. Kusr...,False,0,0,26
1,What's up? Do you want me to come online? If y...,False,0,0,17
2,So u workin overtime nigpun?,False,0,0,5
3,"Also sir, i sent you an email about how to log...",False,0,0,33
4,Please Stay At Home. To encourage the notion o...,True,0,0,24
...,...,...,...,...,...
27411,You appear to be using an email application th...,False,2,0,1317
27412,CNET Investor Dispatch Quote LookupEnter symbo...,False,0,0,747
27413,Todays Headlines from The Register -----------...,False,24,0,412
27414,"Hi Everyone, There seem to be several bonehead...",False,2,0,108


In [61]:
X = email_df.drop(columns=['phishing'])
y = email_df.loc[:, ['phishing']]

In [94]:
X_remainder, X_test, y_remainder, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1337,
    test_size=0.2,
)
X_remainder.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_remainder.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [95]:
X_train, X_validation, y_train, y_validation = train_test_split(
    X_remainder,
    y_remainder,
    stratify=y_remainder,
    random_state=1337,
    test_size=0.2,
)
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_validation.reset_index(drop=True, inplace=True)

In [64]:
phishing_df = X_train[y_train['phishing']].reset_index(drop=True)
ham_df = X_train[~y_train['phishing']].reset_index(drop=True)

In [65]:
# import the nltk stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords 

ENGLISH_STOP_WORDS = stopwords.words('english')
stemmer = nltk.stem.PorterStemmer()

[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


In [66]:
def custom_tokenizer(text):
    # allow alphanumeric characters and spaces only
    alpha_only_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    alpha_only_text_no_double_space = re.sub(r'\s\s*', '\s', text)
    
    # split sentence into words
    listofwords = alpha_only_text.split(' ')
    listofstemmed_words = []
    
    # remove stopwords and any tokens that are just empty strings
    for word in listofwords:
        if (not word in ENGLISH_STOP_WORDS) and (word!=''):
            # Stem words
            stemmed_word = stemmer.stem(word)
            listofstemmed_words.append(stemmed_word)

    return listofstemmed_words

## Analyzing Email Words

### Analyzing Top Phishing Words

In [67]:
phishing_word_vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    min_df=0.1,
)
phishing_word_vectorizer.fit(phishing_df['content'])
phishing_word_vectorized = phishing_word_vectorizer.transform(phishing_df['content'])



In [68]:
phishing_word_df = pd.DataFrame(
    data=phishing_word_vectorized.toarray(),
    columns=phishing_word_vectorizer.get_feature_names_out(),
)
phishing_word_sum_df = pd.DataFrame(
    {"counts": phishing_word_vectorized.toarray().sum(axis=0)},
    index=phishing_word_vectorizer.get_feature_names_out(),
).sort_values("counts", ascending=False)

In [69]:
phishing_word_sum_df

Unnamed: 0,counts
account,10950
bank,8184
money,8176
email,6016
fund,5315
...,...
still,760
write,754
recent,753
hear,752


In [70]:
top_phish_words_df = phishing_word_df.loc[:,phishing_word_sum_df.index]

### Analyzing Top Ham Emails

In [71]:
ham_word_vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    min_df=0.1,
)
ham_word_vectorizer.fit(ham_df['content'])
ham_word_vectorized = ham_word_vectorizer.transform(ham_df['content'])



In [72]:
ham_word_sum_df = pd.DataFrame(
    {"counts": ham_word_vectorized.toarray().sum(axis=0)},
    index=ham_word_vectorizer.get_feature_names_out(),
).sort_values("counts", ascending=False)

In [73]:
ham_word_sum_df

Unnamed: 0,counts
use,4153
get,3363
one,3214
list,2993
new,2783
would,2770
time,2752
like,2737
work,2714
email,2389


### Analyzing Phishing NGrams

In [74]:
phishing_ngram_vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    min_df=10,
    ngram_range=(2, 3),
)
phishing_ngram_vectorizer.fit(phishing_df['content'])
phishing_ngram_vectorized = phishing_ngram_vectorizer.transform(phishing_df['content'])



In [75]:
phishing_ngram_df = pd.DataFrame(
    {"counts": phishing_ngram_vectorized.toarray().sum(axis=0)},
    index=phishing_ngram_vectorizer.get_feature_names_out(),
).sort_values("counts", ascending=False)

In [76]:
phishing_ngram_df.head(30)

Unnamed: 0,counts
next kin,2229
email address,1171
unit state,1161
bank account,1121
secur compani,913
state dollar,753
hundr thousand,750
unit state dollar,713
fax number,658
south africa,623


### Analyzing Ham NGrams

In [77]:
ham_ngram_vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    min_df=10,
    ngram_range=(2, 3),
)
ham_ngram_vectorizer.fit(ham_df['content'])
ham_ngram_vectorized = ham_ngram_vectorizer.transform(ham_df['content'])



In [78]:
ham_ngram_df = pd.DataFrame(
    {"counts": ham_ngram_vectorized.toarray().sum(axis=0)},
    index=ham_ngram_vectorizer.get_feature_names_out(),
).sort_values("counts", ascending=False)

In [79]:
ham_ngram_df.head(30)

Unnamed: 0,counts
mail list,1176
linux user,715
user group,703
linux user group,697
irish linux user,694
irish linux,694
unsubscript inform list,692
unsubscript inform,692
inform list maintain,692
inform list,692


In [80]:
combined_word_df = pd.merge(left=phishing_word_df, right=ham_word_df, how='outer', on='word')
combined_word_df.fillna(0)

NameError: name 'ham_word_df' is not defined

In [None]:
combined_word_df.sample(10)

## Vectorize All Emails

## Building Initial Models

In [83]:
email_word_vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    min_df=0.01
)
email_word_vectorizer.fit(['content'])
email_word_vectorized = email_word_vectorizer.transform(X_train['content'])



In [136]:
cv_transf = ColumnTransformer([
    (
        'count_vectorizer',
        CountVectorizer(
            tokenizer=custom_tokenizer,
            min_df=0.01
        ),
        'content',
    )
],
remainder='passthrough'
)

In [None]:
X_train = cv_transf.fit_transform(X_train)

In [85]:
X_train = pd.DataFrame(
    data=X_train.toarray(),
    columns=cv_transf.get_feature_names_out(),
)
X_train.head(10)

Unnamed: 0,0,0100,0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,...,ye,year,yet,york,you2,youi,youll,your,youv,Unnamed: 21
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
X_train = pd.concat([X_train, X_train_word_df], axis=1)
X_train.drop(columns=['content'], inplace=True)

In [97]:
X_train

Unnamed: 0,link_count,secure_link_count,word_count,0,0100,0x0,0x1,0x2,0x3,0x4,...,ye,year,yet,york,you2,youi,youll,your,youv,Unnamed: 21
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,96,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,12,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,8,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17540,2,0,238,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17541,0,0,12,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17542,0,0,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17543,0,0,9,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)
log_reg_model.score(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9439156454830436

In [112]:
minMaxScaler = MinMaxScaler()
minMaxScaler.fit(x)
x_mm_scaled = minMaxScaler.transform(x)
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(x_mm_scaled, y)
knn_model.score(x_mm_scaled, y)

0.9269769477677269

In [100]:
log_reg_model.score(X_validation, y_validation)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- content
Feature names seen at fit time, yet now missing:
- 0
- 0100
- 0x0
- 0x1
- 0x2
- ...


In [113]:
dec_tree_model = DecisionTreeClassifier()
dec_tree_model.fit(x, y)
dec_tree_model.score(x, y)

0.9657134519988329

### Build Test Pipelines

In [None]:
# combined_top_words_df = list(set(ham_word_sum_df.index.to_list() + phishing_word_sum_df.index.to_list()))
# combined_top_words_df

In [123]:
cachedir = mkdtemp()
mod_pipeline = Pipeline([
        ('vectorize', cv_transf),
        ('scaler', StandardScaler()),
        ('model', LogisticRegression())
    ],
    memory=cachedir
)

In [137]:
basic_model_cv = GridSearchCV(
    estimator=mod_pipeline,
    cv=5,
    param_grid=[
        {
            'scaler': [None, StandardScaler(), MinMaxScaler()],
            'model': [LogisticRegression()]
        },
        {
            'scaler': [StandardScaler(), MinMaxScaler()],
            'model': [KNeighborsClassifier()]
        },
        {
            'scaler': [None],
            'model': [DecisionTreeClassifier()]
        }
    ],
    verbose=2
)

In [138]:
basic_model_cv.fit(X_remainder, y_remainder)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ............model=LogisticRegression(), scaler=None; total time=  33.7s
[CV] END ............model=LogisticRegression(), scaler=None; total time=  33.7s
[CV] END ............model=LogisticRegression(), scaler=None; total time=  33.7s
[CV] END ............model=LogisticRegression(), scaler=None; total time=  33.7s
[CV] END ............model=LogisticRegression(), scaler=None; total time=  33.7s
[CV] END model=LogisticRegression(), scaler=StandardScaler(); total time=  26.5s
[CV] END model=LogisticRegression(), scaler=StandardScaler(); total time=  26.7s
[CV] END model=LogisticRegression(), scaler=StandardScaler(); total time=  27.0s
[CV] END model=LogisticRegression(), scaler=StandardScaler(); total time=  27.7s
[CV] END model=LogisticRegression(), scaler=StandardScaler(); total time=  27.2s
[CV] END ..model=LogisticRegression(), scaler=MinMaxScaler(); total time=  26.6s
[CV] END ..model=LogisticRegression(), scaler=Min

In [143]:
basic_model_cv.best_params_

{'model': LogisticRegression(), 'scaler': None}

In [141]:
basic_model_cv.best_score_

0.9505291563776448