In [19]:
import os

# Project directory. Defaults to the original hard-coded location, but can be
# overridden with the UD120_PROJECT_DIR environment variable so the notebook
# can run on a machine with a different layout.
PROJECT_DIR = os.environ.get("UD120_PROJECT_DIR", "F:\\ud120\\final_project\\")
os.chdir(PROJECT_DIR)

# `path` is the working directory with a trailing separator; later cells build
# file names by plain concatenation (path + "name.pkl"). os.path.join(..., "")
# appends the platform separator only when one is not already present.
path = os.path.join(os.getcwd(), "")
path

'F:\\ud120\\final_project\\'

In [20]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pickle
import sys
import scipy
import matplotlib 
#import pylab
import matplotlib.pyplot as plt
from functools import partial
from sklearn.preprocessing import Imputer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit




In [89]:
# function for reading a dictionary as a dataframe
def dict_to_dataframe(dictionary):
    """Convert a dictionary of per-key records into a DataFrame.

    Parameters
    ----------
    dictionary : dict
        Maps a key (a staff name in this notebook) to a dict of
        ``feature -> value``.

    Returns
    -------
    pandas.DataFrame
        One row per dictionary key. The first column is named
        ``'staff_name'`` and holds the keys; the remaining columns are the
        features. Values are left exactly as stored, so missing values stay
        the string ``'NaN'``.
    """
    # The previous version also called
    #     df.apply(partial(pd.to_numeric, errors='ignore'))
    # but discarded the result, so no conversion ever took place. The call is
    # dropped rather than "fixed" because other cells compare against the
    # literal string 'NaN'.
    df = pd.DataFrame.from_dict(dictionary).transpose().reset_index(level=0)

    # reset_index moved the former index (the dictionary keys) to column 0.
    columns = list(df.columns)
    columns[0] = 'staff_name'
    df.columns = columns

    return df

# function for counting 'NaN' values without replacing
def count_nan(column):
    """Count the entries of `column` that equal the string 'NaN'.

    Parameters
    ----------
    column : iterable
        Values to inspect (for example a DataFrame column).

    Returns
    -------
    tuple
        ``(count, percentage)`` where ``percentage`` is ``100 * count / size``.
        An empty column yields ``(0, 0.0)`` instead of raising
        ZeroDivisionError.
    """
    values = list(column)
    if not values:
        return 0, 0.0
    k = sum(1 for value in values if value == 'NaN')
    return k, 100.0 * k / len(values)

# function for dropping 'NaN' values (nothing is substituted for them)
def column_without_nan(column):
    """Return the entries of `column` as a list, in order, skipping every
    entry that equals the string 'NaN'."""
    return [entry for entry in column if not (entry == 'NaN')]

# function for swapping the string 'NaN' for np.nan
def column_with_npnan(column):
    """Return a NumPy array with the entries of `column`, where every entry
    equal to the string 'NaN' is replaced by ``np.nan``."""
    return np.array([np.nan if entry == 'NaN' else entry for entry in column])

# function for displaying the 3 largest values
def show_three_top(data, feature):
    """Return up to three of the largest values of ``data[feature]`` in
    descending order, ignoring entries equal to the string 'NaN'."""
    present = column_without_nan(data[feature])
    present.sort(reverse=True)
    return present[:3]

def featureFormat( dictionary, features, remove_NaN=True, 
                  remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False):
    """Convert a dictionary of records into a 2-D NumPy array of floats.

    Parameters
    ----------
    dictionary : dict
        Maps a key to a dict of ``feature -> value``.
    features : list of str
        Feature names to extract, in output column order. When the first
        name is ``'poi'`` that column is excluded from the zero filters.
    remove_NaN : bool
        Replace the string ``"NaN"`` with 0 before conversion. When False the
        value is converted with ``float("NaN")`` and stays a float nan.
    remove_all_zeroes : bool
        Drop records whose tested features are all zero or missing.
    remove_any_zeroes : bool
        Drop records in which any tested feature is zero or missing.
    sort_keys : bool or str
        True: iterate keys in sorted order. A string: path of a pickle file
        holding the key order. Otherwise: the dictionary's own key order.

    Returns
    -------
    numpy.ndarray or None
        One row per retained record. Returns None (after printing a message)
        as soon as a requested feature is missing from a record.
    """
    def _is_zero_or_missing(item):
        # Every item has already been through float(), so a retained "NaN" is
        # a float nan and can never equal the string "NaN"; test it with
        # np.isnan so the filters also work when remove_NaN is False.
        return item == 0 or np.isnan(item)

    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # sort_keys at trusted files. The file is now closed deterministically.
        with open(sort_keys, "rb") as keys_file:
            keys = pickle.load(keys_file)
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    return_list = []
    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                value = dictionary[key][feature]
            except KeyError:
                print ("error: key ", feature, " not present")
                return
            if value=="NaN" and remove_NaN:
                value = 0
            tmp_list.append( float(value) )

        # exclude 'poi' class as criteria.
        if features[0] == 'poi':
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list

        # Logic for deciding whether or not to add the data point.
        append = True
        ### drop data points whose tested features are all zero / missing
        if remove_all_zeroes:
            append = not all(_is_zero_or_missing(item) for item in test_list)
        ### drop data points with any zero / missing tested feature
        if remove_any_zeroes and any(_is_zero_or_missing(item) for item in test_list):
            append = False
        ### Append the data point if flagged for addition.
        if append:
            return_list.append( np.array(tmp_list) )

    return np.array(return_list) 

def targetFeatureSplit( data ):
    """Split rows into targets and feature vectors.

    The first element of every row is taken as the target and the remaining
    elements as that row's features.

    Returns a ``(target, features)`` pair of lists, both in row order.
    """
    rows = list(data)
    target = [row[0] for row in rows]
    features = [row[1:] for row in rows]
    return target, features

# Template for the metric line printed by test_classifier: five positional
# values plus a `display_precision` keyword giving the number of decimals.
PERF_FORMAT_STRING = (
    "\tAccuracy: {:>0.{display_precision}f}"
    "\tPrecision: {:>0.{display_precision}f}"
    "\tRecall: {:>0.{display_precision}f}"
    "\tF1: {:>0.{display_precision}f}"
    "\tF2: {:>0.{display_precision}f}"
)
# Template for the confusion-count line: five integers, each at least 4 wide.
RESULTS_FORMAT_STRING = (
    "\tTotal predictions: {:4d}"
    "\tTrue positives: {:4d}"
    "\tFalse positives: {:4d}"
    "\tFalse negatives: {:4d}"
    "\tTrue negatives: {:4d}"
)


def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Cross-validate `clf` and print accuracy, precision, recall, F1 and F2.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(features, labels)`` and ``predict(features)``.
    dataset : dict
        Records in the format accepted by featureFormat.
    feature_list : list of str
        Feature names; the first entry is used as the label column.
    folds : int
        Number of shuffle-split iterations.

    Returns
    -------
    None
        Results are printed. When a metric denominator is zero a notice is
        printed instead of the scores.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    # Legacy sklearn.cross_validation signature: labels and the number of
    # iterations go to the constructor and the object itself is iterated.
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = [features[ii] for ii in train_idx]
        labels_train   = [labels[ii] for ii in train_idx]
        features_test  = [features[jj] for jj in test_idx]
        labels_test    = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print ("Warning: Found a predicted label not == 0 or 1.")
                print ("All predictions should take value 0 or 1.")
                print ("Evaluating performance for processed predictions:")
                # Stops counting the rest of this fold only; later folds still run.
                break

    total_predictions = true_negatives + false_negatives + false_positives + true_positives
    try:
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    except ZeroDivisionError:
        # Only a genuine zero denominator is reported this way; any other
        # error now propagates instead of being mislabelled by a bare except.
        print ("Got a divide by zero when trying out:", clf)
        print ("Precision or recall may be undefined due to a lack of true positive predicitons.")
        return

    print (clf)
    print (PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
    print (RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
    print ("")

# Pickle files used to hand over the classifier, the dataset and the feature
# list. `path` already ends with a separator, so the names are appended as-is.
CLF_PICKLE_FILENAME = "{}my_classifier.pkl".format(path)
DATASET_PICKLE_FILENAME = "{}my_dataset.pkl".format(path)
FEATURE_LIST_FILENAME = "{}my_feature_list.pkl".format(path)
        
def dump_classifier_and_data(clf, dataset, feature_list):
    """Pickle the classifier, the dataset and the feature list, in that
    order, to CLF_PICKLE_FILENAME, DATASET_PICKLE_FILENAME and
    FEATURE_LIST_FILENAME. Existing files are overwritten."""
    targets = (
        (CLF_PICKLE_FILENAME, clf),
        (DATASET_PICKLE_FILENAME, dataset),
        (FEATURE_LIST_FILENAME, feature_list),
    )
    for filename, payload in targets:
        with open(filename, "wb") as outfile:
            pickle.dump(payload, outfile)

def load_classifier_and_data():
    """Unpickle and return ``(clf, dataset, feature_list)`` from
    CLF_PICKLE_FILENAME, DATASET_PICKLE_FILENAME and FEATURE_LIST_FILENAME,
    reading the files in that order."""
    loaded = []
    for filename in (CLF_PICKLE_FILENAME, DATASET_PICKLE_FILENAME, FEATURE_LIST_FILENAME):
        with open(filename, "rb") as infile:
            loaded.append(pickle.load(infile))
    clf, dataset, feature_list = loaded
    return clf, dataset, feature_list

def main():
    """Reload the pickled classifier, dataset and feature list and run the
    testing script (test_classifier) on them."""
    test_classifier(*load_classifier_and_data())




In [90]:
# Load the dataset. The file is opened in a `with` block so the handle is
# closed deterministically.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("final_project_dataset.pkl", "rb") as dataset_file:
    enron_data = pickle.load(dataset_file)

# Get names and count the persons of interest
poi = [person for person, record in enron_data.items() if record['poi'] == True]
k_poi = len(poi)

# Construct the dataframe from the dictionary
enron_df = dict_to_dataframe(enron_data)

# Create columns 'staff_name', 'salary' without NaN
salary_name = enron_df.loc[enron_df['salary'] != 'NaN', ['staff_name', 'salary']]

# Find the name of the salary outlier. The column has object dtype, so it is
# converted to numbers before taking idxmax; the result is kept in a variable
# instead of being evaluated and thrown away.
salary_outlier_name = salary_name.loc[
    pd.to_numeric(salary_name['salary']).idxmax(), 'staff_name']

# Delete the record 'TOTAL' (pop with a default keeps the step re-runnable)
enron_data.pop('TOTAL', None)
enron_df = enron_df[enron_df['staff_name'] != 'TOTAL']

# Create columns 'staff_name', 'salary' without NaN after deleting the outlier
salary_name = enron_df.loc[enron_df['salary'] != 'NaN', ['staff_name', 'salary']]

# Create dataframe with replaced NaN by zero.
# DataFrame.convert_objects is deprecated/removed in pandas, so columns are
# converted with pd.to_numeric instead.
# NOTE(review): intended as a drop-in replacement for
# convert_objects(convert_numeric=True); confirm the dtype of non-numeric
# columns such as 'poi' if other cells depend on it.
def _numeric_where_possible(column):
    """Coerce `column` to numbers; return it unchanged when no value converts."""
    converted = pd.to_numeric(column, errors='coerce')
    return column if converted.isnull().all() else converted

enron_df1 = pd.DataFrame(
    {name: _numeric_where_possible(enron_df[name]) for name in enron_df.columns},
    columns=enron_df.columns,
)
enron_df1 = enron_df1.fillna(0)

# Create dataframe to check total payments: recompute both totals from their
# components and compare them with the reported totals.
enron_df2 = pd.DataFrame()
enron_df2['staff_name'] = enron_df1['staff_name']
enron_df2['total_check'] = enron_df1['bonus'] + enron_df1['director_fees'] + enron_df1['deferral_payments'] + \
    enron_df1['deferred_income'] + enron_df1['loan_advances'] + enron_df1['long_term_incentive'] + \
    enron_df1['expenses'] + enron_df1['other'] + enron_df1['salary']
enron_df2['total_payments'] = enron_df1['total_payments']
enron_df2['stock_check'] = (enron_df1['restricted_stock'] + enron_df1['exercised_stock_options'] + \
                            enron_df1['restricted_stock_deferred'])
enron_df2['total_stock_value'] = enron_df1['total_stock_value']
enron_df2['same_total'] = (enron_df2['total_check'] == enron_df2['total_payments'])
enron_df2['same_stock'] = (enron_df2['stock_check'] == enron_df2['total_stock_value'])
enron_df2['poi'] = enron_df1['poi']

# Overwrite selected fields of two records in the dictionary
enron_data['BELFER ROBERT'].update({
    'deferred_income': -102500,
    'deferral_payments': 'NaN',
    'director_fees': 102500,
    'expenses': 3285,
    'total_payments': 3285,
    'exercised_stock_options': 'NaN',
    'restricted_stock': 44093,
    'restricted_stock_deferred': -44093,
    'total_stock_value': 'NaN',
})

enron_data['BHATNAGAR SANJAY'].update({
    'director_fees': 'NaN',
    'expenses': 137864,
    'other': 'NaN',
    'total_payments': 137864,
    'exercised_stock_options': 15456290,
    'restricted_stock': 2604490,
    'restricted_stock_deferred': -2604490,
    'total_stock_value': 15456290,
})

# Rebuild the dataframe so it reflects the edited dictionary
enron_df = dict_to_dataframe(enron_data)

# Check the two edited rows
enron_df[enron_df['staff_name'].isin(['BHATNAGAR SANJAY', 'BELFER ROBERT'])].T

# Names of the finance features
finance_feature_list = [
    'bonus', 'deferral_payments', 'deferred_income', 'director_fees',
    'exercised_stock_options', 'expenses', 'loan_advances',
    'long_term_incentive', 'other', 'restricted_stock',
    'restricted_stock_deferred', 'salary', 'total_payments',
    'total_stock_value',
]

# Names of the email features
email_feature_list = [
    'to_messages', 'from_poi_to_this_person', 'from_messages',
    'from_this_person_to_poi', 'shared_receipt_with_poi',
]

# Find 1 outlier for each email feature: select the rows of the listed staff
# members together with their email columns
email_outliers_list = ['KAMINSKI WINCENTY J', 'SHAPIRO RICHARD S', 'DELAINEY DAVID W', 
                       'LAVORATO JOHN J', 'BELDEN TIMOTHY N']
email_outliers = enron_df.loc[
    enron_df['staff_name'].isin(email_outliers_list),
    ['staff_name', 'to_messages', 'from_messages',
     'from_poi_to_this_person', 'from_this_person_to_poi', 'shared_receipt_with_poi'],
]
email_outliers

# Replace string NaN by np.nan
enron_df_np = enron_df.apply(column_with_npnan)

# Setup Imputer (legacy sklearn API). Each feature is passed below as a single
# row, and axis=1 is requested so the mean is taken along that row.
imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
# Setup Scaler
scaler = MinMaxScaler()

# Setup feature list without email address
feature_list0 = ['bonus', 'deferral_payments', 'deferred_income', 'director_fees', 'expenses', 
                'exercised_stock_options', 'loan_advances', 'long_term_incentive', 'other', 'restricted_stock', 
                'restricted_stock_deferred', 'salary', 'total_payments', 'total_stock_value',
                'to_messages', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 
                 'shared_receipt_with_poi']

# Impute, then min-max scale, one feature at a time. Shapes are derived from
# the data instead of the former hard-coded 19 features x 145 records, so the
# cell keeps working when the number of records or features changes.
feature_imp = []
scaled_rows = []
for element in feature_list0:
    row = [enron_df_np[element]]
    imp.fit(row)
    imputed = imp.transform(row)[0]
    feature_imp.append(imputed)
    # MinMaxScaler scales columns, so the feature goes in as a column vector
    # and is flattened back into a single row afterwards.
    scaled = scaler.fit_transform(imputed.reshape(-1, 1))
    scaled_rows.append(np.reshape(scaled, (1, -1)))

print(np.array(scaled_rows).shape)
feature_imp_scaled = np.reshape(scaled_rows, (len(feature_list0), -1))
print(feature_imp_scaled)

# One row per feature, then transposed so each feature becomes a column
enron_df_imp_scaled = pd.DataFrame(feature_imp_scaled, index=feature_list0).T

# Complete the scaled dataframe by appending the columns that were not scaled
df1, df2, df3 = enron_df['staff_name'], enron_df['email_address'], enron_df['poi']
scaled_enron_df = pd.concat([enron_df_imp_scaled, df1, df2, df3], axis=1)
scaled_enron_df.head().T

# Dictionary form of the scaled data, keyed by row label
scaled_enron_data = scaled_enron_df.to_dict(orient="index")

# Frame intended for the correlation analysis
correlation_enron_df = pd.DataFrame(scaled_enron_df)


# Candidate feature sets; 'poi' always comes first because it is the label.
features_list01 = ['poi'] + ['salary', 'bonus', 'exercised_stock_options', 'deferred_income']
features_list02 = features_list01 + ['expenses', 'long_term_incentive', 'restricted_stock']
features_list03 = features_list01 + ['long_term_incentive', 'expenses']
features_list04 = ['poi'] + ['from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']
features_list05 = features_list01 + ['from_poi_to_this_person', 'from_this_person_to_poi',
                                     'shared_receipt_with_poi']



(19, 1, 145)
[[ 0.51765448  0.14272044  0.14272044 ...,  0.14272044  0.14272044
   0.14272044]
 [ 0.44582382  0.0266425   0.13384545 ...,  0.13384545  0.13384545
   0.13384545]
 [ 0.12082906  0.83717943  0.99878095 ...,  0.83717943  0.83717943
   0.83717943]
 ..., 
 [ 0.15206186  0.04157082  0.00118417 ...,  0.04157082  0.04157082
   0.04157082]
 [ 0.10673235  0.06770535  0.         ...,  0.06770535  0.06770535
   0.06770535]
 [ 0.2545751   0.21280397  0.08389201 ...,  0.21280397  0.21280397
   0.21280397]]


In [91]:

my_dataset = enron_data

# Candidate classifiers, kept for reference. They were previously stored in a
# triple-quoted string that was evaluated and discarded; real comments make
# clear that none of this is executed.
# clf01 =  DecisionTreeClassifier(max_depth=1)
# clf02 =  AdaBoostClassifier()
# clf03 =  RandomForestClassifier(min_samples_split=50)
# clf04 =  GaussianNB()
# clf05 =  neighbors.KNeighborsClassifier(n_neighbors=4, weights='distance')
# clf06 =  QuadraticDiscriminantAnalysis()
# clf07 =  KMeans(n_clusters=2)
# clf08 =  LogisticRegression()

# Labels and features for feature set 01, then a 70/30 train/test split
data01 = featureFormat(my_dataset, features_list01, sort_keys = True)
labels01, features01 = targetFeatureSplit(data01)

features_train01, features_test01, labels_train01, labels_test01 = \
    train_test_split(features01, labels01, test_size=0.3, random_state=42)


# Create a dataframe for engineering. An explicit copy is taken so the new
# columns are added to this frame only; pd.DataFrame(scaled_enron_df) does not
# copy the underlying data.
engineer_enron_df = scaled_enron_df.copy()


def _ratio_where_positive(numerator, denominator):
    """Element-wise numerator / denominator, with 0.0 wherever the
    denominator is not strictly positive."""
    return (numerator / denominator).where(denominator > 0, 0.0)


# Create new features. Whole columns are computed at once, replacing the
# row-by-row chained assignment (df[col][i] = ...), which pandas does not
# guarantee will write into the frame.
engineer_enron_df['coefficient_bonus_salary'] = _ratio_where_positive(
    scaled_enron_df['bonus'], scaled_enron_df['salary'])
engineer_enron_df['coefficient_from_poi_all'] = _ratio_where_positive(
    scaled_enron_df['from_poi_to_this_person'], scaled_enron_df['to_messages'])
engineer_enron_df['coefficient_to_poi_all'] = _ratio_where_positive(
    scaled_enron_df['from_this_person_to_poi'] + scaled_enron_df['shared_receipt_with_poi'],
    scaled_enron_df['from_messages'])
engineer_enron_df['coefficient_income_total'] = _ratio_where_positive(
    scaled_enron_df['deferred_income'], scaled_enron_df['total_payments'])


# Convert the engineered dataframe to a dictionary keyed by row label
engineer_enron_data = engineer_enron_df.to_dict(orient="index")
my_dataset3 = engineer_enron_data

# Label first, then the four engineered ratios and one scaled finance feature
features_list06 = ['poi'] + [
    'coefficient_bonus_salary',
    'coefficient_income_total',
    'coefficient_from_poi_all',
    'coefficient_to_poi_all',
    'exercised_stock_options',
]

# Labels and features for feature set 06, then a 70/30 train/test split
data36 = featureFormat(my_dataset3, features_list06, sort_keys=True)
labels36, features36 = targetFeatureSplit(data36)
(features_train36, features_test36,
 labels_train36, labels_test36) = train_test_split(
    features36, labels36, test_size=0.3, random_state=42)


# Pickle a decision tree together with the engineered dataset and feature set 06.
# NOTE(review): dump_classifier_and_data always writes to the same three files,
# so this dump is overwritten by the clf45 dump further down.
clf21 =  DecisionTreeClassifier(min_samples_split=15, max_depth=7)
dump_classifier_and_data(clf21, my_dataset3, features_list06)
# The return value is discarded; the call only reads the pickles back.
load_classifier_and_data()

# The string below is evaluated and discarded (disabled code); it mirrors main().
'''
### load up student's classifier, dataset, and feature_list
clf, dataset, feature_list = load_classifier_and_data()
### Run testing script
test_classifier(clf, dataset, feature_list)
'''

# Pickle a k-nearest-neighbours classifier with the unscaled dataset and
# feature set 01; these are the files main() reloads and evaluates.
clf45 =  neighbors.KNeighborsClassifier(n_neighbors=4, weights='distance')
dump_classifier_and_data(clf45, my_dataset, features_list01)
# Return value discarded here as well.
load_classifier_and_data()

# Run the evaluation when this code executes as the main module.
if __name__ == '__main__':
    main()



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='distance')
	Accuracy: 0.88107	Precision: 0.64231	Recall: 0.37800	F1: 0.47592	F2: 0.41190
	Total predictions: 14000	True positives:  756	False positives:  421	False negatives: 1244	True negatives: 11579

