In [38]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

In [39]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

In [40]:
source_filename = "enron_random_clean1_senders.pkl"
master = pd.read_pickle(source_filename)
master["label"] = master.entity.apply(lambda x: 1 if x == "signature" else 0)
master = master[master.nSigBlocks > 0]
master["filename_cat"] = master.filename.astype('category')

In [4]:
excluded_columns = [
    "line", 
    "filename", 
    "entity", 
    "label", 
    "nSig", 
    "firstchar", 
    'nlines',
     'len_avg',
    'len_min',
    'len_max',
    'nBlanks',
    'nNonBlanks',
    'nSigBlocks',
    'pred_label',
    "next_label",
    "sigToLinesRatio",
    "pred_file",
    "next_file",
    "lineNo",
#   "len",
    "pred_named_entity",
    "named_entity",
    "next_named_entity",
    "sender", 
    "sender_name",
    "filename_cat"]

In [5]:
def split_df(df):
    filenames = df.filename.unique()
    from sklearn.model_selection import train_test_split
    train_filenames, test_filenames = train_test_split(filenames, test_size=0.2, random_state=42)
    train = df[df.filename.isin(train_filenames)]
    test = df[df.filename.isin(test_filenames)]
    featured_columns = [c for c in df.columns if c not in excluded_columns]
    X_train = train.loc[:, featured_columns]
    X_test = test.loc[:, featured_columns]
    y_train = train.label
    y_test = test.label
    return X_train, X_test, y_train, y_test, train, test

In [6]:
no_shift_columns = ["prev_same_entity", "next_same_entity"]

In [7]:
def shift_df(df):
    prev_val = -1
    next_val = 2
    df["pred_file"] = df.filename.shift(1, fill_value=prev_val)
    df["next_file"] = df.filename.shift(-1, fill_value=next_val)
    columns_to_shift = [c for c in df.columns if c not in excluded_columns and c not in no_shift_columns]
        
    print(f"Columns to shift: {columns_to_shift}")
    for col in columns_to_shift:
        print(f"Shifting columng {col}")
        pred_name = f"prev_{col}"
        df[pred_name] = df[col].shift(1, fill_value=prev_val)
        df[pred_name] = df.apply(lambda row: row[pred_name] if row.pred_file == row.filename else prev_val, axis=1)
        
        next_name = f"next_{col}"
        df[next_name] = df[col].shift(-1, fill_value=next_val)
        df[next_name] = df.apply(lambda row: row[next_name] if row.next_file == row.filename else next_val, axis=1)
    return df

In [8]:
from sklearn import svm
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter

In [9]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GroupKFold

# Grid Search

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
parameters = {
    'C': [0.002, 0.07, 0.1, 0.5, 1, 1.25, 1.5, 2, 10, 12, 16, 32, 64, 0.047, 0.052, 0.057, 0.062, 0.067, 0.1, 0.5, 1, 1.25, 1.5, 2],
#     'C': [0.002, 0.07, 0.1, 0.5, 1, 1.25, 1.5, 2, 10, 12],
#     'C': [0.002, 0.07, 0.1, 0.5, 1, 1.25, 1.5, 2],
    'class_weight': ["balanced", {1: 1}, {1: 2}, {1: 2.5}, {1: 2.25}, {1: 3}]
}

In [18]:
shifted = shift_df(master)

Columns to shift: ['len', 'blank', 'email', 'url', 'phone', 'sigdelimiter', 'special', 'words', 'header', 'name', 'endquote', 'tabs1', 'tabs2', 'tabs3', 'punct20', 'punct50', 'punct90', 'reply', 'startpunct', 'replypunct', 'wrote', 'alphanum90', 'alphanum50', 'alphanum10', 'num90', 'num50', 'num10', 'title', 'many_titles', 'person', 'org', 'posFromEnd', 'last', 'prevlast', 'last5', 'last11', 'posRatio', 'posRatioFromEnd', 'posRatioNB', 'lenRatio', 'lenRatioMax', 'less_avg_len', 'more_avg_len', 'less_avg_len75', 'less_avg_len50', 'has_sender', 'has_sender_name']
Shifting columng len
Shifting columng blank
Shifting columng email
Shifting columng url
Shifting columng phone
Shifting columng sigdelimiter
Shifting columng special
Shifting columng words
Shifting columng header
Shifting columng name
Shifting columng endquote
Shifting columng tabs1
Shifting columng tabs2
Shifting columng tabs3
Shifting columng punct20
Shifting columng punct50
Shifting columng punct90
Shifting columng reply
Shif

In [31]:
X_train, X_test, y_train, y_test, train, test = split_df(master)

In [15]:
from imblearn.over_sampling import SMOTE

In [107]:
# Before SMOTE
print("Before SMOTE:")
print(train.nSigBlocks.value_counts())
print(train.nSigBlocks.value_counts(normalize=True))

Before SMOTE:
1    1851
0    387 
Name: nSigBlocks, dtype: int64
1    0.827078
0    0.172922
Name: nSigBlocks, dtype: float64


In [125]:
X_train, X_test, y_train, y_test, train, test = split_df(shifted)
# Add temporarily
X_train["label"] = train.label
X_train["nSigBlocks"] = train.nSigBlocks
X_train["filename_code"] = train.filename_cat.cat.codes

# Apply smote
sm = SMOTE(random_state=42, sampling_strategy=.4)
X_train, y_train_nSigBlocks = sm.fit_resample(X_train, train.nSigBlocks)
y_train = X_train.label

# After SMOTE
print("Afer SMOTE:")
print(X_train.nSigBlocks.value_counts())
print(X_train.nSigBlocks.value_counts(normalize=True))

# Cleanup
train_filenames = X_train["filename_code"]
X_train.drop(["label", "filename_code", "nSigBlocks"], axis=1, inplace=True)

assert "label" not in X_train.columns
assert "filename_code" not in X_train.columns
assert len(y_train) == len(X_train)

Afer SMOTE:
1    1851
0    740 
Name: nSigBlocks, dtype: int64
1    0.714396
0    0.285604
Name: nSigBlocks, dtype: float64


In [130]:
svc = svm.LinearSVC(random_state=42, dual=True, max_iter=100000)
clf = GridSearchCV(svc, parameters, scoring="f1", cv=GroupKFold(n_splits=10), n_jobs=-1, refit=False, verbose=10)
clf.fit(X_train, y_train, groups=train_filenames)

print(f"Best score: {clf.best_score_:.3}")
print()
print(f"Best params: {clf.best_params_}")

Fitting 10 folds for each of 144 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:   54.7s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed:  

Best score: 0.819

Best params: {'C': 0.057, 'class_weight': {1: 1}}


[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed: 26.0min finished


In [32]:
linear = svm.LinearSVC(C=16, random_state=42, dual=True, class_weight={1: 2.25}, max_iter=100000)
linear.fit(X_train, y_train)
linear_pred = linear.predict(X_test)
print(f1_score(y_true=y_test, y_pred=linear_pred))

0.7910447761194029




In [41]:
print(classification_report(y_true=y_test, y_pred=linear_pred, digits=3))

              precision    recall  f1-score   support

           0      0.988     0.957     0.972       507
           1      0.707     0.898     0.791        59

    accuracy                          0.951       566
   macro avg      0.847     0.927     0.881       566
weighted avg      0.958     0.951     0.953       566



In [42]:
test["pred"] = linear_pred
TP = test[(test.label == 1) & (test.pred == 1)] # is signature and predicted signature
TN = test[(test.label == 0) & (test.pred == 0)] # is not signature and predicted not signature

FP = test[(test.label == 0) & (test.pred == 1)] # is not signature but predicted signature
FN = test[(test.label == 1) & (test.pred == 0)] # is signature but predicted not signature

In [43]:
print(confusion_matrix(y_test, linear_pred))

[[485  22]
 [  6  53]]


In [44]:
X_train.columns

Index(['len', 'blank', 'email', 'url', 'phone', 'sigdelimiter', 'special',
       'words', 'header', 'name', 'endquote', 'tabs1', 'tabs2', 'tabs3',
       'punct20', 'punct50', 'punct90', 'reply', 'startpunct', 'replypunct',
       'wrote', 'alphanum90', 'alphanum50', 'alphanum10', 'num90', 'num50',
       'num10', 'title', 'many_titles', 'person', 'org', 'posFromEnd', 'last',
       'prevlast', 'last5', 'last11', 'posRatio', 'posRatioFromEnd',
       'posRatioNB', 'lenRatio', 'lenRatioMax', 'less_avg_len', 'more_avg_len',
       'less_avg_len75', 'less_avg_len50', 'prev_same_entity',
       'next_same_entity', 'has_sender', 'has_sender_name'],
      dtype='object')

# Add blocking

In [45]:
def update_prediction(row):    
    same_file = row.pred_file == row.filename and row.next_file == row.filename
    if not same_file:
        return row.pred
    
    if row.pred_predict == 1 and row.next_predict == 1:        
        return 1
    if row.pred_predict == 0 and row.next_predict == 0:        
        return 0
        
    return row.pred

In [46]:
test["pred"] = linear_pred
test["pred_predict"] = test.pred.shift(1)
test["next_predict"] = test.pred.shift(-1)
test["pred_file"] = test.filename.shift(1)
test["next_file"] = test.filename.shift(-1)
test["new_pred"] = test.apply(lambda row: update_prediction(row), axis=1)
print(classification_report(y_true=test.label, y_pred=test.new_pred, digits=3))
print("\nBefore blocking:")
print(classification_report(y_true=test.label, y_pred=test.pred, digits=3))

              precision    recall  f1-score   support

           0      0.990     0.970     0.980       507
           1      0.783     0.915     0.844        59

    accuracy                          0.965       566
   macro avg      0.886     0.943     0.912       566
weighted avg      0.968     0.965     0.966       566


Before blocking:
              precision    recall  f1-score   support

           0      0.988     0.957     0.972       507
           1      0.707     0.898     0.791        59

    accuracy                          0.951       566
   macro avg      0.847     0.927     0.881       566
weighted avg      0.958     0.951     0.953       566



In [48]:
print(f"{precision_score(y_true=test.label, y_pred=test.pred):.3f}")

0.557


In [43]:
print(f"Precision with blocking: {precision_score(y_true=test.label, y_pred=test.new_pred):.3f}")
print(f"Recall with blocking: {recall_score(y_true=test.label, y_pred=test.new_pred):.3f}")
print(f"F1 with blocking: {f1_score(y_true=test.label, y_pred=test.new_pred):.3f}")

Precision with blocking: 0.573
Recall with blocking: 0.729
F1 with blocking: 0.642


In [103]:
print(classification_report(y_true=test.loc[test.blank==0].label, y_pred=test.loc[test.blank==0].pred, digits=3))

              precision    recall  f1-score   support

           0      0.986     0.953     0.969       579
           1      0.585     0.826     0.685        46

    accuracy                          0.944       625
   macro avg      0.785     0.890     0.827       625
weighted avg      0.956     0.944     0.948       625



In [47]:
print(confusion_matrix(test.label, test.new_pred))

[[492  15]
 [  5  54]]


In [48]:
print(confusion_matrix(test.label, test.pred))

[[485  22]
 [  6  53]]


In [65]:
((15-18)/18)*100

-16.666666666666664