In [3]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('bmh')
import warnings
warnings.simplefilter('ignore')

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [4]:
def get_best_model_and_score(model, params, X, y):
    from sklearn.model_selection import GridSearchCV
    grid = GridSearchCV(model, params, scoring='roc_auc', error_score=0, cv=3, n_jobs=-1)
    grid.fit(X, y)
    print(f"Best roc auc score: {grid.best_score_:.2f}")
    print(f"Best parameters: {grid.best_params_}")
    print(f"Avg. time to fit: {grid.cv_results_['mean_fit_time'].mean()*1000:.0f} ms")
    print(f"Avg. time to predict: {grid.cv_results_['mean_score_time'].mean()*1000:.0f} ms")
    return grid.best_estimator_

In [5]:
def apply_best_estimator(estimator):
    print(f"... using estimator:\n{estimator}")
    estimator.fit(X_train, y_train)
    print(f"Train accuracy: {accuracy_score(estimator.predict(X_train), y_train)}")

    submission = pd.DataFrame()
    submission['name'] = X_test_name
    submission['poi'] = estimator.predict_proba(X_test)[:, 1]
    print(f"Predicted label = {estimator.predict(X_test)}")
    return submission

In [6]:
df_train = pd.read_csv('train_data.csv')
print(df_train.shape)
print(df_train.columns)
df = df_train.drop(['name', 'email_address'], axis=1)
df = df.fillna(0)
df['poi'] = df['poi'].replace([False, True], [0, 1])
all_cols = df.columns.tolist()
all_cols.remove('poi')
fig, axes = plt.subplots(10, 2, figsize=(10, 20))
mask = df['poi'] == 1
for col, ax in zip(all_cols, axes.ravel()):
    bins = np.linspace(df[col].min(), df[col].max(), 11)
    sns.distplot(df[col][mask], kde=True, color='red', ax=ax, hist_kws={'edgecolor':'w'})
    sns.distplot(df[col][~mask], kde=True, color='blue', ax=ax, hist_kws={'edgecolor':'w'})
    ax.set_title(col)
plt.tight_layout()

FileNotFoundError: File b'train_data.csv' does not exist

In [7]:
df_train = pd.read_csv('train_data.csv').fillna(0)
df_train = df_train.drop('poi', axis=1)
df_test = pd.read_csv('test_features.csv').fillna(0)
df = pd.concat([df_train, df_test])
df = df.reset_index(drop=True)
df.shape

FileNotFoundError: File b'train_data.csv' does not exist

In [8]:
for i in range(df.shape[0]):
    try:
        last_name = df.loc[i, 'name'].split(' ')[0].lower()
        first_name = df.loc[i, 'name'].split(' ')[1].lower()
        if last_name in df.loc[i, 'email_address'] or first_name in df.loc[i, 'email_address']:
            #print(f"{i} {df.loc[i, 'name']} == {df.loc[i, 'email_address']}")
            continue
        else:
            print(f"{i} {df.loc[i, 'name']} != {df.loc[i, 'email_address']}")
    except:
        print(f"{i} {df.loc[i, 'name']} : {df.loc[i, 'email_address']}")

NameError: name 'df' is not defined

In [7]:
col_payments = [
    'salary',
    'bonus',
    'long_term_incentive',
    'deferred_income',
    'deferral_payments',
    'loan_advances',
    'other',
    'expenses',
    'director_fees',
]
df1 = df[col_payments]
df1['sum'] = df1.sum(axis=1)
df1['total_payments'] = df['total_payments']
df1['delta'] = df1['sum'] - df1['total_payments']
df1['name'] = df['name']
df1[df1['delta'] != 0.0]

Unnamed: 0,salary,bonus,long_term_incentive,deferred_income,deferral_payments,loan_advances,other,expenses,director_fees,sum,total_payments,delta,name
50,0.0,0.0,0.0,0.0,-102500.0,0.0,0.0,0.0,3285.0,-99215.0,102500.0,-201715.0,BELFER ROBERT
139,0.0,0.0,0.0,0.0,0.0,0.0,137864.0,0.0,137864.0,275728.0,15456290.0,-15180562.0,BHATNAGAR SANJAY


In [8]:
col_stock = [
    'exercised_stock_options',
    'restricted_stock',
    'restricted_stock_deferred',
]
df1 = df[col_stock]
df1['sum'] = df1.sum(axis=1)
df1['total_stock_value'] = df['total_stock_value']
df1['delta'] =df1['total_stock_value'] - df1['sum']
df1['name'] = df['name']
df1[df1['delta'] != 0]

Unnamed: 0,exercised_stock_options,restricted_stock,restricted_stock_deferred,sum,total_stock_value,delta,name
50,3285.0,0.0,44093.0,47378.0,-44093.0,-91471.0,BELFER ROBERT
139,2604490.0,-2604490.0,15456290.0,15456290.0,0.0,-15456290.0,BHATNAGAR SANJAY


#### Observations
> (1) "name" and "email_address" columns have the same meaning --> user ID    
> (2) The "TOTAL" row in the test set is the sum of all rows --> redundant   
> (3) exercised_stock_options + restricted_stock + restricted_stock_deferred = total_stock_value    
> (4) EXCEPTION: total_stock_value of "BELFER ROBERT" and "BHATNAGAR SANJAY" are incorrect     
> (5) Sum of [
    'salary',
    'bonus',
    'long_term_incentive',
    'deferred_income',
    'deferral_payments',
    'loan_advances',
    'other',
    'expenses',
    'director_fees'] = 'total_payments"   
> (6) EXCEPTION: total_payments of "BELFER ROBERT" and "BHATNAGAR SANJAY" are incorrect

In [9]:
X_train = pd.read_csv('train_data.csv')
y_train = X_train['poi'].copy()
y_train = y_train.replace([False, True], [0, 1])
X_train = X_train.drop(['poi', 'name', 'email_address', 'total_payments', 'total_stock_value'], axis=1)

X_test = pd.read_csv('test_features.csv')
X_test_name = X_test['name']
X_test = X_test.drop(['name', 'email_address', 'total_payments', 'total_stock_value'], axis=1)

X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

X_train['total_payments'] = [0.0] * X_train.shape[0]
X_test['total_payments'] = [0.0] * X_test.shape[0]
for col in col_payments:
    X_train['total_payments'] = X_train['total_payments'] + X_train[col]
    X_test['total_payments'] = X_test['total_payments'] + X_test[col]

X_train['total_stock_value'] = [0.0] * X_train.shape[0]
X_test['total_stock_value'] = [0.0] * X_test.shape[0]
for col in col_stock:
    X_train['total_stock_value'] = X_train['total_stock_value'] + X_train[col]
    X_test['total_stock_value'] = X_test['total_stock_value'] + X_test[col]
    
print(X_train.shape, X_test.shape, y_train.shape)
print(f"{np.unique(y_train)} y_train bincount: {np.bincount(y_train)}")

scaler = Normalizer()
for col in X_train.columns:
    scaler.fit(X_train[col].values.reshape(-1, 1))
    X_train[col] = scaler.transform(X_train[col].values.reshape(-1, 1))
    X_test[col] = scaler.transform(X_test[col].values.reshape(-1, 1))
    
clf = KNeighborsClassifier()

model = Pipeline([
    ('clf', clf),
])
params = {
    'clf__n_neighbors': [1, 3, 5, 7, 9, 11],
}

estimator = get_best_model_and_score(model, params, X_train, y_train)
submission = apply_best_estimator(estimator)

(113, 19) (33, 19) (113,)
[0 1] y_train bincount: [100  13]
Best roc auc score: 0.73
Best parameters: {'clf__n_neighbors': 1}
Avg. time to fit: 3 ms
Avg. time to predict: 3 ms
... using estimator:
Pipeline(memory=None,
     steps=[('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform'))])
Train accuracy: 0.8672566371681416
Predicted label = [1 1 1 1 1 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1]




In [10]:
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)
print(clf.predict(X_test))

submission = pd.DataFrame({'name': X_test_name})
submission['poi'] = y_pred[:, 1]

[1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1]


In [11]:
submission.to_csv('submission_20190220#5.csv', index=None)
submission

Unnamed: 0,name,poi
0,BELDEN TIMOTHY N,0.666667
1,BOWEN JR RAYMOND M,0.666667
2,HANNON KEVIN P,1.0
3,DELAINEY DAVID W,1.0
4,CAUSEY RICHARD A,0.666667
5,HICKERSON GARY J,1.0
6,FREVERT MARK A,0.333333
7,CHAN RONNIE,0.0
8,DONAHUE JR JEFFREY M,0.333333
9,REYNOLDS LAWRENCE,0.0
