In [3]:
# %load poi_id.py
# %load poi_id.py
#!/usr/bin/python

import math
import sys
import pickle
import numpy as np
from sklearn.grid_search import GridSearchCV
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
# Feature names handed to featureFormat. "poi" is the label and has to come
# first; percent_to_poi / percent_from_poi are engineered under Task 3 below.
features_list = [
    'poi',
    'bonus',
    'salary',
    'percent_to_poi',
    'percent_from_poi',
    'exercised_stock_options',
    'total_payments',
    'restricted_stock_deferred',
]

### Load the dictionary containing the dataset
# NOTE(review): text mode ("r") is only safe for protocol-0 pickles read by
# Python 2; the mode would have to become "rb" under Python 3 -- confirm
# before porting.
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

# Number of records in the dataset.
print(len(data_dict))

# Number of fields carried by one (arbitrary) record.
for record in data_dict.values():
    print(len(record))
    break

### Task 2: Remove outliers
# Presumably 'TOTAL' is an aggregate row rather than a person -- it is
# dropped before any feature is derived from the records.
del data_dict['TOTAL']

### Task 3: Create new feature(s)
### Creating 2 new features: percent_to_poi and percent_from_poi
### These features calculate the percentage of emails to and from an employee
### that were sent or received from a person of interest.
### They were calculated by taking the number of emails to or from a poi and dividing 
### it by the total number emails sent or received
def _ratio_or_nan(numerator, denominator):
    """Return numerator / denominator as a float, or 'NaN' when undefined.

    Missing values in the records are encoded as the string 'NaN'. The ratio
    is reported as 'NaN' when either operand is missing or when the
    denominator is zero, so that no float nan (from float('NaN')) and no
    ZeroDivisionError can leak out of the feature engineering step.
    """
    if numerator == 'NaN' or denominator == 'NaN':
        return 'NaN'
    denominator = float(denominator)
    if denominator == 0:
        return 'NaN'
    return float(numerator) / denominator


for person in data_dict.values():
    received_from_poi = person['from_poi_to_this_person']
    sent_to_poi = person['from_this_person_to_poi']
    if received_from_poi != 'NaN' and sent_to_poi != 'NaN':
        # The denominators are validated as well: a missing or zero message
        # total yields 'NaN' instead of nan / a crash.
        person['percent_from_poi'] = _ratio_or_nan(received_from_poi,
                                                   person['to_messages'])
        person['percent_to_poi'] = _ratio_or_nan(sent_to_poi,
                                                 person['from_messages'])
    else:
        # Both ratios are only filled in when both POI counts are known.
        person['percent_from_poi'] = 'NaN'
        person['percent_to_poi'] = 'NaN'

### Created a new feature: percent_stock_exercised
### this feature is the percent of stock an employee exercised given their total stock
for person in data_dict.values():
    person['percent_stock_exercised'] = _ratio_or_nan(
        person['exercised_stock_options'], person['total_stock_value'])
        
# Tally the records that are flagged as a person of interest.
count = sum(1 for record in data_dict.values() if record['poi'] == True)

print(count)


### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
# featureFormat receives the feature names in features_list order ("poi"
# first); targetFeatureSplit then hands back the labels and the features.
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)


### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
print("Fitting the classifier to the training set")

# Hyper-parameter search spaces. They are only referenced by the
# commented-out GridSearchCV experiments further down.
param_grid_kn = dict(
    n_neighbors=list(range(1, 11)),
    p=[1, 2],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=[5, 10, 20, 30, 50, 100, 200],
)

param_grid_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=[1, 5, 10, 15, 20, 40, 100],
    min_samples_split=[2, 3, 4, 5],
)

# Every candidate estimator is imported up front so that switching the
# experiment only means (un)commenting one assignment to `clf`.
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Naive Bayes
#clf = GaussianNB()

# Decision Trees
#clf = GridSearchCV(DecisionTreeClassifier(), param_grid_dt)
#clf = DecisionTreeClassifier(criterion = "entropy", max_depth=10)
#clf = DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=20,
            #max_features=None, max_leaf_nodes=None,
            #min_impurity_decrease=0.0, min_impurity_split=None,
            #min_samples_leaf=1, min_samples_split=5,
            #min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            #splitter='best')

# Random Forest
#clf = RandomForestClassifier(criterion="entropy", max_depth=5, n_estimators=5)

# K-Nearest Neighbors -- the classifier that is actually exported.
#clf = GridSearchCV(KNeighborsClassifier(), param_grid_kn)
clf = KNeighborsClassifier(
    n_neighbors=3,
    p=1,
    leaf_size=5,
    algorithm='auto',
    weights='uniform',
    metric='minkowski',
    metric_params=None,
    n_jobs=1,
)


### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20 in
# favour of sklearn.model_selection -- confirm the pinned version before
# upgrading.
from sklearn.cross_validation import train_test_split
from sklearn.metrics import precision_score, recall_score

# Single seeded hold-out split (test_size=0.4) used as a quick local check.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    features, labels, test_size=0.4, random_state=42)

clf = clf.fit(features_train, labels_train)
#print "Best estimator found by grid search:"
#print clf.best_estimator_
pred = clf.predict(features_test)

precision = precision_score(labels_test, pred)
recall = recall_score(labels_test, pred)

# Printed in this order: accuracy (clf.score), precision, recall.
print(clf.score(features_test, labels_test))
print(precision)
print(recall)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)

146
21
18
Fitting the classifier to the training set
0.896551724138
0.75
0.375


In [5]:
# %load tester.py
#!/usr/bin/python

""" a basic script for importing student's POI identifier,
    and checking the results that they get from it 
 
    requires that the algorithm, dataset, and features list
    be written to my_classifier.pkl, my_dataset.pkl, and
    my_feature_list.pkl, respectively

    that process should happen at the end of poi_id.py
"""

import pickle
import sys
from sklearn.cross_validation import StratifiedShuffleSplit
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

# Template for the metrics line. Each score is rendered as a fixed-point
# number whose number of decimals is supplied at format time through the
# `display_precision` keyword.
PERF_FORMAT_STRING = (
    "\tAccuracy: {:>0.{display_precision}f}"
    "\tPrecision: {:>0.{display_precision}f}"
    "\tRecall: {:>0.{display_precision}f}"
    "\tF1: {:>0.{display_precision}f}"
    "\tF2: {:>0.{display_precision}f}"
)
# Template for the confusion-count line; every count is an integer padded to
# a minimum width of four characters.
RESULTS_FORMAT_STRING = (
    "\tTotal predictions: {:4d}"
    "\tTrue positives: {:4d}"
    "\tFalse positives: {:4d}"
    "\tFalse negatives: {:4d}"
    "\tTrue negatives: {:4d}"
)

def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Cross-validate `clf` and print pooled classification metrics.

    The dataset is converted with featureFormat / targetFeatureSplit, then
    `clf` is re-fitted and evaluated on every train/test split yielded by
    StratifiedShuffleSplit (seeded with random_state=42). The predictions of
    all splits are pooled into one set of confusion counts, from which
    accuracy, precision, recall, F1 and F2 are derived and printed.

    Args:
        clf: estimator exposing fit(features, labels) and predict(features);
            it is fitted in place, once per split.
        dataset: data handed to featureFormat together with feature_list.
        feature_list: feature names handed to featureFormat.
        folds: second positional argument of StratifiedShuffleSplit
            (presumably the number of shuffle-split iterations).

    Returns:
        None. Results are printed. When a metric is undefined because a
        denominator is zero, an explanatory message is printed instead.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Only the rest of this split is skipped; later splits are
                # still evaluated.
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    except ZeroDivisionError:
        # Only the undefined-metric case is handled here; any other error
        # (including KeyboardInterrupt) propagates instead of being reported
        # as a division by zero.
        print("Got a divide by zero when trying out: {}".format(clf))
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
    else:
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print("")

# Files exchanged between poi_id.py (writer) and this script (reader); all
# three are created in / read from the current working directory.
CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"

def dump_classifier_and_data(clf, dataset, feature_list):
    """Pickle the classifier, dataset and feature list to their files.

    The files are opened in binary mode: pickle streams are bytes, so text
    mode fails on Python 3 and is platform dependent on Python 2.

    Args:
        clf: object written to CLF_PICKLE_FILENAME.
        dataset: object written to DATASET_PICKLE_FILENAME.
        feature_list: object written to FEATURE_LIST_FILENAME.
    """
    with open(CLF_PICKLE_FILENAME, "wb") as clf_outfile:
        pickle.dump(clf, clf_outfile)
    with open(DATASET_PICKLE_FILENAME, "wb") as dataset_outfile:
        pickle.dump(dataset, dataset_outfile)
    with open(FEATURE_LIST_FILENAME, "wb") as featurelist_outfile:
        pickle.dump(feature_list, featurelist_outfile)

def load_classifier_and_data():
    """Load what dump_classifier_and_data wrote.

    Returns:
        A (clf, dataset, feature_list) tuple.

    Raises:
        IOError / OSError: if one of the three files cannot be opened.
    """
    # SECURITY: unpickling executes code embedded in the file -- only load
    # pickles produced by a trusted poi_id.py run.
    with open(CLF_PICKLE_FILENAME, "rb") as clf_infile:
        clf = pickle.load(clf_infile)
    with open(DATASET_PICKLE_FILENAME, "rb") as dataset_infile:
        dataset = pickle.load(dataset_infile)
    with open(FEATURE_LIST_FILENAME, "rb") as featurelist_infile:
        feature_list = pickle.load(featurelist_infile)
    return clf, dataset, feature_list

def main():
    """Evaluate the pickled classifier on the pickled dataset.

    Reads the classifier, dataset and feature list back from disk and hands
    them, in that order, to test_classifier.
    """
    test_classifier(*load_classifier_and_data())

if __name__ == '__main__':
    main()


KNeighborsClassifier(algorithm='auto', leaf_size=5, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=1,
           weights='uniform')
	Accuracy: 0.89733	Precision: 0.68025	Recall: 0.43400	F1: 0.52991	F2: 0.46787
	Total predictions: 15000	True positives:  868	False positives:  408	False negatives: 1132	True negatives: 12592

