In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

In [35]:
from sklearn import svm
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter, defaultdict
from sklearn.model_selection import GroupKFold, GroupShuffleSplit

In [54]:
source_filename = "enron_random_clean1_senders.pkl"
master = pd.read_pickle(source_filename)
master["label"] = master.entity.apply(lambda x: 1 if x == "signature" else 0)
master = master[master.nSigBlocks > 0]

In [5]:
excluded_columns = ["line", 
                    "filename", 
                    "entity", 
                    "label", 
                    "nSig", 
                    "firstchar", 
                    'nlines',
                     'len_avg',
                    'len_min',
                    'len_max',
                    'nBlanks',
                    'nNonBlanks',
                    'nSigBlocks',
                    'pred_label',
                    "next_label",
                    "sigToLinesRatio",
                    "pred_file",
                    "next_file",
                    "lineNo",
#                     "len",
                    "pred_named_entity",
                    "named_entity",
                    "next_named_entity",
                    "sender", 
                    "sender_name"
                    
                    
                    
 ]

In [6]:
def split_df(df):
    filenames = df.filename.unique()
    from sklearn.model_selection import train_test_split
    train_filenames, test_filenames = train_test_split(filenames, test_size=0.2, random_state=42)
    train = df[df.filename.isin(train_filenames)]
    test = df[df.filename.isin(test_filenames)]
    featured_columns = [c for c in df.columns if c not in excluded_columns]
    X_train = train.loc[:, featured_columns]
    X_test = test.loc[:, featured_columns]
    y_train = train.label
    y_test = test.label
    return X_train, X_test, y_train, y_test, train, test

In [7]:
X_train, X_test, y_train, y_test, train, test = split_df(master)

In [57]:
group_kfold = GroupKFold(n_splits=5)
results = defaultdict(list)
for train_index, test_index in group_kfold.split(X_test, y_test, test.filename):
    curr_X_train = X_test.iloc[train_index, :]
    curr_y_train = y_test.iloc[train_index]
    
    curr_X_test = X_test.iloc[test_index, :]
    curr_y_test = y_test.iloc[test_index]
    
    
    curr_linear = svm.LinearSVC(C=10, random_state=42, dual=True, class_weight={1: 2}, max_iter=100000)
    curr_linear.fit(curr_X_train, curr_y_train)
    curr_pred = curr_linear.predict(curr_X_test)
    results["precision"].append(precision_score(curr_y_test, curr_pred))
    results["recall"].append(recall_score(curr_y_test, curr_pred))
    results["f1"].append(f1_score(curr_y_test, curr_pred))
    
print_kfold_results(results)



precision: 0.53
recall: 0.78
f1: 0.56




In [46]:
for k, v in results.items():
    print(f"{k}: {np.mean(v):.2f}")

precision: 0.60
recall: 0.81
f1: 0.64


# Cross validation with blocking

In [49]:
def update_prediction(row):
    same_file = row.pred_file == row.filename and row.next_file == row.filename
    if not same_file:
        return row.pred
    
    if row.pred_predict == 1 and row.next_predict == 1:
        return 1
    if row.pred_predict == 0 and row.next_predict == 0:
        return 0
        
    return row.pred

In [56]:
def print_kfold_results(results):
    for k, v in results.items():
        print(f"{k}: {np.mean(v):.2f}")

In [50]:
group_kfold = GroupKFold(n_splits=5)
results = defaultdict(list)
for train_index, test_index in group_kfold.split(X_test, y_test, test.filename):
    curr_X_train = X_test.iloc[train_index, :]
    curr_y_train = y_test.iloc[train_index]
    
    curr_test = test.iloc[test_index, :]
    curr_X_test = X_test.iloc[test_index, :]
    curr_y_test = y_test.iloc[test_index]
    
    
    curr_linear = svm.LinearSVC(C=0.5, random_state=42, dual=True, class_weight={1: 2}, max_iter=100000)
    curr_linear.fit(curr_X_train, curr_y_train)
    curr_pred = curr_linear.predict(curr_X_test)
    
    # Blocking
    curr_test["pred"] = curr_pred
    curr_test["pred_predict"] = curr_test.pred.shift(1)
    curr_test["next_predict"] = curr_test.pred.shift(-1)
    curr_test["new_pred"] = curr_test.apply(lambda row: update_prediction(row), axis=1)
    
    curr_pred = curr_test.new_pred
    
    results["precision"].append(precision_score(curr_y_test, curr_pred))
    results["recall"].append(recall_score(curr_y_test, curr_pred))
    results["f1"].append(f1_score(curr_y_test, curr_pred))
    
print_kfold_results(results)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui