In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import sys
import pickle
import string

import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.cross_validation import KFold, train_test_split, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.grid_search import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data, test_classifier

### Task 1: Select what features you'll use.

In [2]:
### features_list is a list of strings, each of which is a feature name.
features_list = ['poi','salary', 'deferral_payments', 'total_payments', 'loan_advances',\
    'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value',\
    'expenses', 'exercised_stock_options', 'other', 'long_term_incentive',\
    'restricted_stock', 'director_fees', 'to_messages', 'from_poi_to_this_person',\
    'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']

### Load the dictionary containing the dataset, create DataFrame from dictionary
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers

In [3]:
data_dict.pop('TOTAL', 0)
data_dict.pop('THE TRAVEL AGENCY IN THE PARK', 0) # from reading the pdf

{'bonus': 'NaN',
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'NaN',
 'exercised_stock_options': 'NaN',
 'expenses': 'NaN',
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 362096,
 'poi': False,
 'restricted_stock': 'NaN',
 'restricted_stock_deferred': 'NaN',
 'salary': 'NaN',
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 362096,
 'total_stock_value': 'NaN'}

### Task 3: Create new feature(s)

In [4]:
def compute_ratio(numerator, denominator, ratio_list):
    for name in data_dict:
        if data_dict[name][denominator] == 'NaN' or data_dict[name][denominator] == 0 \
        or data_dict[name][numerator] == 'NaN':
            data_dict[name][ratio_list] = 0
        if data_dict[name][denominator] != 'NaN' and data_dict[name][denominator] != 0 \
        and data_dict[name][numerator] != 'NaN':
            ratio = data_dict[name][numerator] / float(data_dict[name][denominator])
            data_dict[name][ratio_list] = ratio
        else:
            data_dict[name][ratio_list] = 'NaN'
    print "New feature", ratio_list, "is added to data_dict."

compute_ratio('from_this_person_to_poi', 'to_messages', 'ratio_to_poi')
compute_ratio('from_poi_to_this_person', 'from_messages', 'ratio_from_poi') 
compute_ratio('bonus', 'salary', 'ratio_bonus_to_salary')
features_list.extend(('ratio_to_poi', 'ratio_from_poi', 'ratio_bonus_to_salary'))

New feature ratio_to_poi is added to data_dict.
New feature ratio_from_poi is added to data_dict.
New feature ratio_bonus_to_salary is added to data_dict.


In [5]:
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, remove_NaN=True, remove_all_zeroes=True, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a varity of classifiers

In [6]:
def clf_accuracy(classifier):
    clf = classifier()
    clf.fit(features, labels)
    pred = clf.predict(features)
    return classifier, "Accuracy:", "%.3f" % accuracy_score(pred, labels)

### All classifiers are overfit
print clf_accuracy(GaussianNB)
print clf_accuracy(SVC)
print clf_accuracy(DecisionTreeClassifier)
print clf_accuracy(neighbors.KNeighborsClassifier)
print clf_accuracy(AdaBoostClassifier)
print clf_accuracy(RandomForestClassifier)

(<class 'sklearn.naive_bayes.GaussianNB'>, 'Accuracy:', '0.825')
(<class 'sklearn.svm.classes.SVC'>, 'Accuracy:', '1.000')
(<class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'Accuracy:', '1.000')
(<class 'sklearn.neighbors.classification.KNeighborsClassifier'>, 'Accuracy:', '0.902')
(<class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'Accuracy:', '1.000')
(<class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'Accuracy:', '0.965')


### Task 5: Tune your classifier to achieve better than .3 precision and recall 

#### Feature scaling

In [7]:
### Fill all missing values with zero, so they can be discounted by MinMaxScaler
df = pd.DataFrame.from_dict(data_dict, orient='index', dtype=np.float)
df = df.replace('NaN', 0)

### Remove string data in column email_address
del df['email_address']

### Transform all the features into floating points and seperate the label
features_df = df.drop(['poi'], axis=1)
labels_df = df['poi']

In [8]:
scaler = MinMaxScaler()
features_rescaled = scaler.fit_transform(features_df)

### Update features array, turn labels into array
features = features_rescaled
labels = labels_df.as_matrix().astype(int)

#### Select Top Features

In [9]:
### Use SelectKBest to find the cut-off point in the number of features selected
### Chi-square scores work better with normalized data
### Hard-coded the features according to the results in the write-up because SelectKBest produces different outcomes

#skb = SelectKBest(chi2, k='all')
#fit = skb.fit(features, labels)

#temp = features_list[1:]
#d = {'features': temp, 'scores': fit.scores_}
#chi2_df = pd.DataFrame(d).sort_values(by='scores', ascending=False)
#chi2_df

In [10]:
### Visualize the score of each data, select cut-off point at the sharpest decline
#chi2_df.plot.bar(x='features')

In [11]:
features_selected = ['poi', 'bonus', 'long_term_incentive', 'exercised_stock_options', 'restricted_stock_deferred']

#### Validation

In [12]:
### Extract features and labels from dataset
data = featureFormat(my_dataset, features_selected, sort_keys = True, remove_all_zeroes = True, remove_NaN = True)
labels, features = targetFeatureSplit(data)

In [13]:
### Use StratifiedShuffleSplit to maximize the 
sss = StratifiedShuffleSplit(labels, n_iter=100, test_size=0.25, train_size=0.6, random_state=46)
for train_idx, test_idx in sss: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append(features[ii] )
            labels_train.append(labels[ii] )
        for jj in test_idx:
            features_test.append(features[jj] )
            labels_test.append(labels[jj] )
    
print len(labels_train)
print len(labels_test)

75
32


In [14]:
### Try DecisionTree, no need to perform feature scaling
clf = DecisionTreeClassifier()
params = {'min_samples_split': range(2, 11),
         'class_weight': ['balanced', {True: 12, False: 1}, {True: 10, False: 1}, {True: 8, False: 1}],
         'splitter': ['random', 'best']
         }
clf_gs = GridSearchCV(clf, param_grid=params, scoring='f1')
clf_gs.fit(features_train, labels_train)
clf_estimator = clf_gs.best_estimator_
clf_params = clf_gs.best_params_
print "Best Estimator: ", clf_estimator
print "Best Params: ", clf_params
test_classifier(clf_estimator, my_dataset, features_selected, folds = 1000)

Best Estimator:  DecisionTreeClassifier(class_weight={False: 1, True: 8}, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random')
Best Params:  {'min_samples_split': 2, 'splitter': 'random', 'class_weight': {False: 1, True: 8}}
DecisionTreeClassifier(class_weight={False: 1, True: 8}, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random')
	Accuracy: 0.79050	Precision: 0.32455	Recall: 0.31520	F1: 0.31981	F2: 0.31703
	Total predictions: 32000	True positives: 1576	False positives: 3280	False negatives: 3424	True negatives: 23720



In [15]:
forest = RandomForestClassifier()
params = {'n_estimators': range(1, 10),
          'criterion': ['gini', 'entropy'],
          'min_samples_split': range(1, 6, 2),
          'class_weight': ['balanced', {True: 12, False: 1}, {True: 10, False: 1}, {True: 8, False: 1}]
          }
forest_gs = GridSearchCV(forest, param_grid=params, scoring='f1')
forest_gs.fit(features, labels)
forest_estimator = forest_gs.best_estimator_
forest_params = forest_gs.best_params_
print "Best Estimator: ", forest_estimator
print "Best Params: ", forest_params
test_classifier(forest_estimator, my_dataset, features_selected, folds = 1000)

  'precision', 'predicted', average, warn_for)


Best Estimator:  RandomForestClassifier(bootstrap=True, class_weight={False: 1, True: 12},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=1, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Best Params:  {'min_samples_split': 5, 'n_estimators': 1, 'criterion': 'gini', 'class_weight': {False: 1, True: 12}}
RandomForestClassifier(bootstrap=True, class_weight={False: 1, True: 12},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=1, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
	Accuracy: 0.76197	Precision: 0.29754	Recall: 0.38460	F1: 0.33551	F2: 0.36334
	Total predictions: 32000	True positives: 19

In [16]:
scaler = MinMaxScaler()
knn = KNeighborsClassifier()
pipeline = Pipeline([('scaler', scaler), ('knn', knn)])
params = {'knn__n_neighbors': range(1, 10),
          'knn__weights': ['uniform', 'distance'],
          'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
          }
knn_gs = GridSearchCV(pipeline, param_grid=params, scoring='recall')
knn_gs.fit(features, labels)
knn_estimator = knn_gs.best_estimator_
knn_params = knn_gs.best_params_
print "Best Estimator: ", knn_estimator
print "Best Params: ", knn_params
test_classifier(knn_estimator, my_dataset, features_selected, folds = 1000)

Best Estimator:  Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform'))])
Best Params:  {'knn__algorithm': 'auto', 'knn__weights': 'uniform', 'knn__n_neighbors': 3}
Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform'))])
	Accuracy: 0.82441	Precision: 0.33855	Recall: 0.12980	F1: 0.18765	F2: 0.14806
	Total predictions: 32000	True positives:  649	False positives: 1268	False negatives: 4351	True negatives: 25732



In [27]:
svc = SVC()
scaler = MinMaxScaler()
pipe = Pipeline([('scaler', scaler), ('svc', svc)])
params = {'svc__kernel':['rbf', 'linear'], 
          'svc__degree': range(1, 3),
          'svc__C': range(8, 25, 2),
          'svc__class_weight': [{True: 14, False: 1}, {True: 12, False: 1}, {True: 10, False: 1}]
          }
svc_gs = GridSearchCV(pipe, param_grid=params, scoring='precision', cv=sss)
svc_gs.fit(features, labels)
svc_estimator = svc_gs.best_estimator_
svc_params = svc_gs.best_params_
print "Best Estimator: ", svc_estimator
print "Best Params: ", svc_params
test_classifier(svc_estimator, my_dataset, features_selected, folds = 1000)

Best Estimator:  Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svc', SVC(C=22, cache_size=200, class_weight={False: 1, True: 10}, coef0=0.0,
  decision_function_shape=None, degree=1, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
Best Params:  {'svc__class_weight': {False: 1, True: 10}, 'svc__degree': 1, 'svc__kernel': 'rbf', 'svc__C': 22}
Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svc', SVC(C=22, cache_size=200, class_weight={False: 1, True: 10}, coef0=0.0,
  decision_function_shape=None, degree=1, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
	Accuracy: 0.58109	Precision: 0.22649	Recall: 0.69600	F1: 0.34176	F2: 0.49201
	Total predictions: 32000	True positives: 3480	False positives: 11885	False negatives: 1520	True negatives: 15115



#### Test one newly implemented feature

In [33]:
### Ratio-bonus_to_salary
features_selected.append('ratio_bonus_to_salary')

data = featureFormat(my_dataset, features_selected, sort_keys = True, remove_all_zeroes = True, remove_NaN = True)
labels, features = targetFeatureSplit(data)
sss = StratifiedShuffleSplit(labels, n_iter=100, test_size=0.25, train_size=0.6, random_state=46)
for train_idx, test_idx in sss: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append(features[ii] )
            labels_train.append(labels[ii] )
        for jj in test_idx:
            features_test.append(features[jj] )
            labels_test.append(labels[jj] )
test = DecisionTreeClassifier()
params = {'min_samples_split': range(2, 11),
         'class_weight': ['balanced', {True: 12, False: 1}, {True: 10, False: 1}, {True: 8, False: 1}],
         'splitter': ['random', 'best']
         }
test_gs = GridSearchCV(clf, param_grid=params, scoring='f1')
test_gs.fit(features_train, labels_train)
test_estimator = test_gs.best_estimator_
test_params = test_gs.best_params_
print "Best Estimator: ", test_estimator
print "Best Params: ", test_params
test_classifier(test_estimator, my_dataset, features_selected, folds = 1000)

Best Estimator:  DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random')
Best Params:  {'min_samples_split': 3, 'splitter': 'random', 'class_weight': 'balanced'}
DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random')
	Accuracy: 0.77200	Precision: 0.29441	Recall: 0.32880	F1: 0.31066	F2: 0.32129
	Total predictions: 32000	True positives: 1644	False positives: 3940	False negatives: 3356	True negatives: 23060



In [34]:
### With the new feature ratio_salary_to_bonus, the DecisionTreeClassifier no longer satisfies the 0.3 requirements.

#### Task 6: Dump your classifier, dataset, and features_list so anyone can check your results. 

In [35]:
dump_classifier_and_data(clf_estimator, my_dataset, features_list)