In [45]:
'''
external classifier.ipynb

0. read in data (df_test2, df_test2_subject_cleaned)
1. classifier predicting is_external (binary) from body (using Random Forest)
        input: df_test2.csv, rows = 6467, 1.25% of original enron dataset
2. classifier predicting is_external (binary) from subject (using Random Forest)
        input: df_test2_subject_cleaned.csv, rows = 6235, 1.25% of original enron dataset
3. hyperparameter tuning of classifier predicting is_external (binary) from body (using Random Forest, Grid Search)       
'''

'\nexternal classifier.py \n\n0. read in data (df_test2, df_test2_subject_cleaned)\n1. classifier predicting is_external (binary) from body (using Random Forest)\n        input: df_test2.csv, rows = 6467, 1.25% of original enron dataset\n2. classifier predicting is_external (binary) from subject (using Random Forest)\n        input: df_test2_subject_cleaned.csv, rows = 6235, 1.25% of original enron dataset\n3. hyperparameter tuning of classifier predicting is_external (binary) from body (using Random Forest, Grid Search)       \n'

In [46]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
import sys
import copy
import sklearn
import nltk
import re
import email
import random


In [47]:
import xgboost


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD 
from sklearn.preprocessing import normalize 


In [49]:
#
# 0. read in data (df_test2, df_test2_subject_cleaned)
#


In [50]:
# df_test2.csv, rows = 6467, 1.25% of original enron dataset
# output: df


In [51]:
df = pd.read_csv('df_test2.csv')
print(df.shape)
print(df.columns)


(6467, 13)
Index(['index_orig', 'file', 'message', 'email_to', 'email_from', 'cc', 'bcc',
       'subject', 'x_from', 'x_to', 'date_sent', 'body', 'is_external'],
      dtype='object')


In [52]:
# filter out null body, is_external (sanity check)

df.dropna(subset = ['body', 'is_external'], inplace = True)
df.reset_index(inplace = True)
print(df.shape)
print(df.columns)


(6467, 14)
Index(['index', 'index_orig', 'file', 'message', 'email_to', 'email_from',
       'cc', 'bcc', 'subject', 'x_from', 'x_to', 'date_sent', 'body',
       'is_external'],
      dtype='object')


In [53]:
# convert is_external to int

df['is_external'] = df['is_external'].astype(int)


In [54]:
df['is_external'].describe()


count    6467.000000
mean        0.329519
std         0.470075
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: is_external, dtype: float64

In [55]:
# df_test2_subject_cleaned.csv, rows = 6235, 1.25% of original enron dataset
# output: df_sub


In [56]:
df_sub = pd.read_csv('df_test2_clean_subject.csv')
print(df_sub.shape)
print(df_sub.columns)


(6241, 13)
Index(['index_orig', 'file', 'message', 'email_to', 'email_from', 'cc', 'bcc',
       'subject', 'x_from', 'x_to', 'date_sent', 'body', 'is_external'],
      dtype='object')


In [57]:
# filter out null subject, is_external (sanity check)

df_sub.dropna(inplace = True, subset = ['subject', 'is_external'])
df_sub.reset_index(inplace = True)
print(df_sub.shape)
print(df_sub.columns)


(6241, 14)
Index(['index', 'index_orig', 'file', 'message', 'email_to', 'email_from',
       'cc', 'bcc', 'subject', 'x_from', 'x_to', 'date_sent', 'body',
       'is_external'],
      dtype='object')


In [58]:
# convert is_external to int

df_sub['is_external'] = df_sub['is_external'].astype(int)


In [59]:
df_sub['is_external'].describe()


count    6241.000000
mean        0.328954
std         0.469871
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: is_external, dtype: float64

In [60]:
#
# 1. classifier predicting is_external (binary) from body (using Random Forest)
#         input: df_test2.csv, rows = 6467, 1.25% of original enron dataset
#


In [61]:
# Assign body and is_external to X, y

X, y = df['body'], df['is_external']


In [62]:
# clean and lemmatize
# from https://stackabuse.com/text-classification-with-python-and-scikit-learn/

documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
    
    # remove all single characters. I'm keeping for now
    #document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(the_word) for the_word in document]
    document = ' '.join(document)
    
    documents.append(document)
    

In [63]:
# vectorize

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

vectorizer = CountVectorizer(max_features=50, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()


In [64]:
# apply tfidf

from sklearn.feature_extraction.text import TfidfTransformer

tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()


In [65]:
# split train and test (80/20)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [66]:
# fit training data using Random Forest

from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(X_train, y_train) 


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [67]:
# metrics for training set

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

y_pred = classifier.predict(X_train)
print(confusion_matrix(y_train,y_pred))
print(classification_report(y_train,y_pred))
print(accuracy_score(y_train, y_pred))
print(roc_auc_score(y_train, y_pred))


[[3427   43]
 [ 233 1470]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.96      3470
           1       0.97      0.86      0.91      1703

    accuracy                           0.95      5173
   macro avg       0.95      0.93      0.94      5173
weighted avg       0.95      0.95      0.95      5173

0.9466460467813648
0.9253953440360375


In [68]:
# metrics for test set

y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))


[[780  86]
 [200 228]]
              precision    recall  f1-score   support

           0       0.80      0.90      0.85       866
           1       0.73      0.53      0.61       428

    accuracy                           0.78      1294
   macro avg       0.76      0.72      0.73      1294
weighted avg       0.77      0.78      0.77      1294

0.7789799072642968
0.7167015605102415


In [69]:
# look at default hyperparameters

classifier.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [70]:
#
# 2. classifier predicting is_external (binary) from subject (using Random Forest)
#        input: df_test2_subject_cleaned.csv, rows = 6235, 1.25% of original enron dataset#
#


In [71]:
# Assign body and is_external to X, y

X, y = df_sub['subject'], df_sub['is_external']


In [72]:
# clean and lemmatize
# from https://stackabuse.com/text-classification-with-python-and-scikit-learn/

documents = []

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
    
    # remove all single characters. I'm keeping for now
    #document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(the_word) for the_word in document]
    document = ' '.join(document)
    
    documents.append(document)


In [73]:
# vectorize

vectorizer = CountVectorizer(max_features=500, min_df=2, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()

# apply tfidf

tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

# split train and test (80/20)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# fit training data using Random Forest

classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train) 


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [77]:
# metrics for training set

y_pred = classifier.predict(X_train)
print(confusion_matrix(y_train,y_pred))
print(classification_report(y_train,y_pred))
print(accuracy_score(y_train, y_pred))
print(roc_auc_score(y_train, y_pred))


[[3254   96]
 [ 571 1071]]
              precision    recall  f1-score   support

           0       0.85      0.97      0.91      3350
           1       0.92      0.65      0.76      1642

    accuracy                           0.87      4992
   macro avg       0.88      0.81      0.83      4992
weighted avg       0.87      0.87      0.86      4992

0.866386217948718
0.8117983165778901


In [75]:
# metrics for test set

y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))


[[728 110]
 [291 120]]
              precision    recall  f1-score   support

           0       0.71      0.87      0.78       838
           1       0.52      0.29      0.37       411

    accuracy                           0.68      1249
   macro avg       0.62      0.58      0.58      1249
weighted avg       0.65      0.68      0.65      1249

0.6789431545236189
0.5803529432259639


In [76]:
# look at default hyperparameters

classifier.get_params()


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [33]:
#
# 3. hyperparameter tuning of classifier predicting is_external (binary) from body (using Random Forest, Grid Search)       
#


In [34]:
# Assign body and is_external to X, y

X, y = df['body'], df['is_external']


In [35]:
# clean and lemmatize
# from https://stackabuse.com/text-classification-with-python-and-scikit-learn/

documents = []

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
    
    # remove all single characters. I'm keeping for now
    #document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(the_word) for the_word in document]
    document = ' '.join(document)
    
    documents.append(document)


In [36]:
# vectorize
vectorizer = CountVectorizer(max_features=50, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()

# apply tfidf
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()


In [37]:
# hyperparameters to tune

''' 
n_estimators = number of trees in the foreset
max_features = max number of features considered for splitting a node
max_depth = max number of levels in each decision tree
min_samples_split = min number of data points placed in a node before the node is split
min_samples_leaf = min number of data points allowed in a leaf node
bootstrap = method for sampling data points (with or without replacement)
'''


' \nn_estimators = number of trees in the foreset\nmax_features = max number of features considered for splitting a node\nmax_depth = max number of levels in each decision tree\nmin_samples_split = min number of data points placed in a node before the node is split\nmin_samples_leaf = min number of data points allowed in a leaf node\nbootstrap = method for sampling data points (with or without replacement)\n'

In [38]:
# default hyperparameters from first run above

''' 
'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 50,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 2000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}
'''
 

" \n'bootstrap': True,\n 'ccp_alpha': 0.0,\n 'class_weight': None,\n 'criterion': 'gini',\n 'max_depth': 50,\n 'max_features': 'auto',\n 'max_leaf_nodes': None,\n 'max_samples': None,\n 'min_impurity_decrease': 0.0,\n 'min_impurity_split': None,\n 'min_samples_leaf': 2,\n 'min_samples_split': 5,\n 'min_weight_fraction_leaf': 0.0,\n 'n_estimators': 2000,\n 'n_jobs': None,\n 'oob_score': False,\n 'random_state': 0,\n 'verbose': 0,\n 'warm_start': False}\n"

In [39]:
# choose small hyperparameter range of target hyperparameters and around the default values

from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [False,True],
    'max_depth': [25,50,75],
    'max_features': ['auto'],
    'min_samples_leaf': [1,2,3],
    'min_samples_split': [4,5,6],
    'n_estimators': [1500, 2000, 2500]
}


In [40]:
# create classifier classes

classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier_grid = GridSearchCV(estimator = classifier, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)


In [41]:
# fit (3-fold cross validation)

classifier_grid.fit(X_train, y_train)


Fitting 3 folds for each of 162 candidates, totalling 486 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 110.7min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 280.3min
[Parallel(n_jobs=-1)]: Done 486 out of 486 | elapsed: 379.4min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=0,
                                   

In [42]:
# find best model from GridSearch

best_grid = classifier_grid.best_estimator_
print(best_grid)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=50, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, n_estimators=1500,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)


In [43]:
# metrics for training set

y_pred = best_grid.predict(X_train)
print(confusion_matrix(y_train,y_pred))
print(classification_report(y_train,y_pred))
print(accuracy_score(y_train, y_pred))
print(roc_auc_score(y_train, y_pred))


[[3310   60]
 [1256  361]]
              precision    recall  f1-score   support

           0       0.72      0.98      0.83      3370
           1       0.86      0.22      0.35      1617

    accuracy                           0.74      4987
   macro avg       0.79      0.60      0.59      4987
weighted avg       0.77      0.74      0.68      4987

0.7361138961299378
0.6027243916179907


In [44]:
# metrics for test set

y_pred = best_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))


[[796  28]
 [365  58]]
              precision    recall  f1-score   support

           0       0.69      0.97      0.80       824
           1       0.67      0.14      0.23       423

    accuracy                           0.68      1247
   macro avg       0.68      0.55      0.51      1247
weighted avg       0.68      0.68      0.61      1247

0.6848436246992783
0.5515676283596135
