In [1]:
#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")
import pandas as pd
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data, test_classifier

# FEATURE SELECTION

In [2]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
# features_list = ['poi','salary'] # You will need to use more features
### Load the dictionary containing the dataset
# with open("final_project_dataset.pkl", "r") as data_file:
#     data_dict = pickle.load(data_file)
def load_data_as_df():
    with open("final_project_dataset.pkl", "r") as data_file:
        data_dict = pickle.load(data_file)

    df = pd.DataFrame(data_dict)
    df = df.transpose()
    return df

df=load_data_as_df()

In [3]:
len(df)

146

We have total 146 data points. As in, the data of 146 different individuals supposedly working with Enron.

In [4]:
len(df[df['poi']==1])

18

Out of these 146 we have 18 are marked as POI's

In [5]:
df.head()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,long_term_incentive,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
ALLEN PHILLIP K,4175000.0,2869717.0,-3081055.0,,phillip.allen@enron.com,1729541.0,13868,2195.0,47.0,65.0,...,304805.0,152.0,False,126027.0,-126027.0,201955.0,1407.0,2902.0,4484442,1729541
BADUM JAMES P,,178980.0,,,,257817.0,3486,,,,...,,,False,,,,,,182466,257817
BANNANTINE JAMES M,,,-5104.0,,james.bannantine@enron.com,4046157.0,56301,29.0,39.0,0.0,...,,864523.0,False,1757552.0,-560222.0,477.0,465.0,566.0,916197,5243487
BAXTER JOHN C,1200000.0,1295738.0,-1386055.0,,,6680544.0,11200,,,,...,1586055.0,2660303.0,False,3942714.0,,267102.0,,,5634343,10623258
BAY FRANKLIN R,400000.0,260455.0,-201641.0,,frank.bay@enron.com,,129142,,,,...,,69.0,False,145796.0,-82782.0,239671.0,,,827696,63014


Let's remove the emai_address feature and convert every other values as float values. To, get the statistics and correlation easily

In [6]:
df = df.drop('email_address', axis=1)
df = df.astype(float)

In [7]:
df.isnull().sum()

bonus                         64
deferral_payments            107
deferred_income               97
director_fees                129
exercised_stock_options       44
expenses                      51
from_messages                 60
from_poi_to_this_person       60
from_this_person_to_poi       60
loan_advances                142
long_term_incentive           80
other                         53
poi                            0
restricted_stock              36
restricted_stock_deferred    128
salary                        51
shared_receipt_with_poi       60
to_messages                   60
total_payments                21
total_stock_value             20
dtype: int64

Let's get the features having more than 90 null values.We can remove those features as we will not be getting much information from them.

In [8]:
for i in df.columns: 
    if df[i].isnull().sum() > 90:
        print i

deferral_payments
deferred_income
director_fees
loan_advances
restricted_stock_deferred


In [9]:
df = df.drop("loan_advances", axis=1)
df = df.drop('restricted_stock_deferred', axis=1)
df = df.drop('director_fees', axis=1)
df = df.drop('deferral_payments', axis=1)
df = df.drop('deferred_income', axis=1)

In [10]:
df.corr()['poi']

bonus                     -0.013837
exercised_stock_options    0.052886
expenses                  -0.044508
from_messages             -0.074308
from_poi_to_this_person    0.167722
from_this_person_to_poi    0.112940
long_term_incentive       -0.021222
other                     -0.012457
poi                        1.000000
restricted_stock          -0.000107
salary                    -0.030884
shared_receipt_with_poi    0.228313
to_messages                0.058954
total_payments             0.040130
total_stock_value          0.025163
Name: poi, dtype: float64

In [11]:
# Since correlation value for restricted_stock is very low we will drop that
df = df.drop('restricted_stock', axis=1)

#  Removing Outliers

We will remove the rows now. Rows having less than 3 feature values will be removed from the dataset.

In [12]:
### Task 2: Remove outliers
for i in df.index:
        if df.ix[i].count() < 3:
            df = df.drop(i, axis=0)

We have 2 indexes named as TOTAL & THE TRAVEL AGENCY IN THE PARK which cannot be person working with or for the organization.

In [13]:
df=df.drop('TOTAL',axis=0)
df=df.drop('THE TRAVEL AGENCY IN THE PARK',axis=0)

# Exploring new features


We will be looking at the patterns for the ratio of messages coming from the poi to this person to the total no of to messages, and we will be doing the same for messages sent. Having a large no. of portions of emails sent by person is to a poi then likely that person is POI, and same goes for emails received.

In [14]:
### Task 3: Create new feature(s)
df['pct_from_poi'] = df['from_poi_to_this_person']/(df['to_messages'] + 1)
df['pct_to_poi'] = df['from_this_person_to_poi']/(df['from_messages'] + 1)

In [15]:
features_list = list(df.columns)
features_list.remove('poi')
features = df[features_list]
labels = df['poi']
features_list.insert(0,'poi')

In [16]:
print features_list

['poi', 'bonus', 'exercised_stock_options', 'expenses', 'from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 'long_term_incentive', 'other', 'salary', 'shared_receipt_with_poi', 'to_messages', 'total_payments', 'total_stock_value', 'pct_from_poi', 'pct_to_poi']


In [17]:

### Store to my_dataset for easy export below.
df1 = df.transpose()
df1 = df1.to_dict()
my_dataset = df1

# ### Extract features and labels from dataset for local testing
# data = featureFormat(my_dataset, features_list, sort_keys = True)
# labels, features = targetFeatureSplit(data)

In [18]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.htm
# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import Imputer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
clf1 = Pipeline([
        ('imp', Imputer(missing_values='NaN',strategy='median')),
        ('clf', GaussianNB())
    ])
clf2 = Pipeline([
        ('imp', Imputer(missing_values='NaN',strategy='median')),
        ('clf', SVC())
    ])
clf3 = Pipeline([
        ('imp', Imputer(missing_values='NaN',strategy='median')),
        ('clf', DecisionTreeClassifier())
    ])
clf4 = Pipeline([
        ('imp', Imputer(missing_values='NaN',strategy='median')),
        ('clf', RandomForestClassifier())
    ])
clf5 = Pipeline([
        ('imp', Imputer(missing_values='NaN',strategy='median')),
        ('clf', AdaBoostClassifier())
    ])

In [19]:
test_classifier(clf1, df1, features_list, folds = 1000)

Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('clf', GaussianNB())])
	Accuracy: 0.82350	Precision: 0.30521	Recall: 0.18450	F1: 0.22998	F2: 0.20035
	Total predictions: 14000	True positives:  369	False positives:  840	False negatives: 1631	True negatives: 11160



In [20]:
test_classifier(clf2, df1, features_list, folds = 1000)

Got a divide by zero when trying out: Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
Precision or recall may be undefined due to a lack of true positive predicitons.


In [21]:
test_classifier(clf3, df1, features_list, folds = 1000)

Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])
	Accuracy: 0.78957	Precision: 0.25492	Recall: 0.24600	F1: 0.25038	F2: 0.24773
	Total predictions: 14000	True positives:  492	False positives: 1438	False negatives: 1508	True negatives: 10562



In [22]:
test_classifier(clf4, df1, features_list, folds = 1000)

Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])
	Accuracy: 0.84436	Precision: 0.36896	Recall: 0.12600	F1: 0.18785	F2: 0.14511
	Total predictions: 14000	True positives:  252	False positives:  431	False negatives: 1748	True negatives: 11569



In [23]:
test_classifier(clf5, df1, features_list, folds = 1000)

Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('clf', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None))])
	Accuracy: 0.81393	Precision: 0.29037	Recall: 0.20950	F1: 0.24339	F2: 0.22186
	Total predictions: 14000	True positives:  419	False positives: 1024	False negatives: 1581	True negatives: 10976



We will take clf3 i.e DecisionTreeClassifier as it has the highest f1 score, and a good balance b/w precision and recall scores of all the algorithms then tune it further.

# Feature Selection (without PCA)

We will have a look at the DecisionTreeClassifier feature importance attributes. 

Also, we will create a new feature_list where we will include all the features having feature importances > 0

In [24]:
features_list = []
for name, importance in zip(features.columns, clf3.named_steps['clf'].feature_importances_):
    print(name, importance)
    if importance > 0:
        features_list.append(name)
        

('bonus', 0.0)
('exercised_stock_options', 0.10751146788990827)
('expenses', 0.0)
('from_messages', 0.0)
('from_poi_to_this_person', 0.080949811117107459)
('from_this_person_to_poi', 0.0)
('long_term_incentive', 0.023067594643045305)
('other', 0.11467889908256881)
('salary', 0.047782874617737003)
('shared_receipt_with_poi', 0.1747487986020097)
('to_messages', 0.0)
('total_payments', 0.0)
('total_stock_value', 0.089222160708636561)
('pct_from_poi', 0.15557541824069068)
('pct_to_poi', 0.20646297509829625)


Here I have selected the features which feature importances score greater than 0. I don't see the need for using PCA over here as we have very less features available.

We will create the new features according to the new features_list.

In [25]:
features=features[features_list]

As, shown by the feature importance score one of the new features engineered i.e "from_poi_to_this_person" into the feature list is performing above the threshold value i.e 0.

So, the modified feature list will be.

In [26]:
features_list.insert(0,'poi')

In [27]:
features_list

['poi',
 'exercised_stock_options',
 'from_poi_to_this_person',
 'long_term_incentive',
 'other',
 'salary',
 'shared_receipt_with_poi',
 'total_stock_value',
 'pct_from_poi',
 'pct_to_poi']

Since DecisionTreeClassifier doesn't require scaling because these classifiers don't rely on the Euclidean distance between data points when making decisions. 

So, there is no feature scaling done to the new features created.

We will create the new data set to be passed to test_classifier using our new features_list.

In [28]:
df2=df[features_list]
df1 = df2.transpose()
df1 = df1.to_dict()

To justify Validation

Let's train our model on the entire data set, and see the metrics of our model on the Training data . We expect the metrics of our model to be good on the training features as it is trained on the same data.

In [29]:
clf3.fit(features, labels)
pred = clf3.predict(features)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print accuracy_score(pred,labels)
print precision_score(pred,labels)
print recall_score(pred,labels)
print f1_score(pred,labels)


1.0
1.0
1.0
1.0


Our metrics comes so perfect.


Let's see metrics of our model on unseen data. 


We expect our classifier to do well as it is trained on the large dataset.

In [30]:
test_classifier(clf3, df1, features_list, folds = 1000)

Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])
	Accuracy: 0.79543	Precision: 0.24648	Recall: 0.21000	F1: 0.22678	F2: 0.21641
	Total predictions: 14000	True positives:  420	False positives: 1284	False negatives: 1580	True negatives: 10716



Metrics of our model differs alot on unseen data.

It was perfect previously because the model was trained and tested on the same data set which gave us the wrong idea about the model metrics. That's why it is necessary to perform validation.

Since, the proportion of population of POI's is very low . So, we call it as a imbalanced classification problem.

Due to the class imbalance problem, it is preferred to use a stratified shuffle split instead. This ensures that an equal ratio of POIs to non-POIs are found in the training and test sets.

As this is a imbalanced classification problem .We need to have a good precision and recall score to proove that the model is performing well. 

In context of this project, 

recall =    TP/(TP + FN)      =    POI's correctly identified/(POI's correctly identified + POI's incorrectly labelled as Non-POI's)

precision = TP/(TP + FP)     =     POI's correctly identified/(POI's correctly identified + Non POI's incorrectly labelled as POI's)

Howerver, We can see that on just removing the non important features our model metrics goes higher as compared to the previous run on all the features_list.

Where it was -

In [31]:
test_classifier(clf3, df1, features_list, folds = 1000)

Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])
	Accuracy: 0.79336	Precision: 0.23995	Recall: 0.20600	F1: 0.22168	F2: 0.21200
	Total predictions: 14000	True positives:  412	False positives: 1305	False negatives: 1588	True negatives: 10695



We have a set of parameters to be passed to DecisionTreeClassifier. These parameters are kind of knobs which decides the performance of the classifier. 


While training the model these parameters are not learnt.


Using Grid Search with parameter grid will train model on the various parameters combinations and training and tested the data splitted by StratifiedShuffleSplit. Using the scorer passed to GridSearch we have created a custom scorer to maximize recall score coming by the different model combinations. The model with the highest recall score will be selected and we will be using it's parameters to create the final model.

# Tuning DecisionTreeClassifier / Hyperparameter optimization

In machine learning, hyperparameter optimization or tuning is the problem of choosing a set of optimal hyperparameters for a learning algorithm.

The same kind of machine learning model can require different constraints, weights or learning rates to generalize different data patterns. These measures are called hyperparameters, and have to be tuned so that the model can optimally solve the machine learning problem.

Tuning is essentially selecting the best parameters for an algorithm to optimize its performance given a working environment such as hardware, specific workloads, etc. And tuning in machine learning is an automated process for doing this.

In [61]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split,StratifiedShuffleSplit
sss = StratifiedShuffleSplit(labels, 3, test_size=0.3, random_state=0)
    
for train_index, test_index in sss:
    features_train = features.iloc[train_index]
    features_test= features.iloc[test_index]
    labels_train, labels_test = labels.iloc[train_index], labels.iloc[test_index]

# Feature Selection by SelectKBest

We will be deploying SelectKBest in our pipeline to further have the best features selected

In [75]:
# Build pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import SelectKBest, f_classif

Pipeline2 = Pipeline([
        ('imp', Imputer(missing_values='NaN')),
        ('kbest', SelectKBest(f_classif)),
        ('clf', DecisionTreeClassifier())
    ])

We will be passing [3,4,5,6,7,8,9] values for k parameter of SelectKBest so that gridsearch can try all the combinations with these values and give us the best parameter value of K which should be used in the final model.

In [79]:
from sklearn.metrics import make_scorer, f1_score, recall_score
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedShuffleSplit
score = make_scorer(recall_score, greater_is_better=True)
# Build Grid
# pre-processing
# c = [x for x in range(5,8)]
# c=[3,4,5]
# estimator parameters
e = [100,200,300,400,500,600,700]
r = [0.1,0.2,0.3,0.4,0.5,0.6,0.7]
c = [d for d in range(3, 7)]

param_grid2 = {'imp__strategy': ['mean','median','most_frequent'],
               'kbest__k': [3,4,5,6,7,8,9],
              'clf__criterion': ['gini','entropy'],
              'clf__max_depth': [3,4,5,6,7,8,9],
              'clf__presort': [True,False],
               'clf__max_features': ["auto","sqrt","log2"]
              
             }

In [80]:
len(features_list)

10

We will be tuning the strategies for Imputer 'mean','median','most_frequent'.

Also, we will be tuning below mentioned parameters of DecisionTreeClassifier -

    criterion - possiible values as 'gini','entropy'
    max_depth - possible values as 3,4,5,6,7,8,9
    presort - possible values as True, False
    max_features - possible values as "auto","sqrt","log2"

# Validation definition and importance

We will be using StratifiedShuffleSplit for cross_validation in gridsearch.It will take 25% data set as test set and train the model on the rest 75% data.

In machine learning, model validation is referred to as the process where a trained model is evaluated with a testing data set. The testing data set is a separate portion of the same data set from which the training set is derived. The main purpose of using the testing data set is to test the generalization ability of a trained model.



Model validation is carried out after model training. Together with model training, model validation aims to find an optimal model with the best performance.

In [81]:
# set model parameters to grid search object
gridCV_object = GridSearchCV(estimator = Pipeline2, 
                             param_grid = param_grid2,
                             scoring = score,
                             cv = StratifiedShuffleSplit(labels_train, test_size=0.25,  n_iter=10))

# train the model
gridCV_object.fit(features, labels)

GridSearchCV(cv=StratifiedShuffleSplit(labels=[ 0.  0. ...,  0.  0.], n_iter=10, test_size=0.25, random_state=None),
       error_score='raise',
       estimator=Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('kbest', SelectKBest(k=10, score_func=<function f_classif at 0x0000000009826BA8>)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'imp__strategy': ['mean', 'median', 'most_frequent'], 'clf__criterion': ['gini', 'entropy'], 'clf__max_depth': [3, 4, 5, 6, 7, 8, 9], 'kbest__k': [3, 4, 5, 6, 7, 8, 9], 'clf__presort': [True, False], 'clf__max_features': ['auto', 'sqrt', 'log2']},
       pre_dispatch='2*n_jobs', refit=True,
       scor

In [82]:
gridCV_object.best_params_

{'clf__criterion': 'entropy',
 'clf__max_depth': 9,
 'clf__max_features': 'log2',
 'clf__presort': False,
 'imp__strategy': 'median',
 'kbest__k': 8}

Grid search gives us the above parameters. i.e 

Imputer strategy should be median.
SelectKBest k should be 8


In [84]:
from sklearn.pipeline import Pipeline
clf_final = Pipeline([
        ('imp', Imputer(missing_values='NaN',strategy='median')),
        ('kbest', SelectKBest(f_classif,k=8)),
        ('clf', DecisionTreeClassifier(max_depth=9,max_features='log2',presort=False,criterion='entropy'))])

test_classifier(clf_final, df1,features_list, folds= 1000)

Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('kbest', SelectKBest(k=8, score_func=<function f_classif at 0x0000000009826BA8>)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=9,
            max_features='log2', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])
	Accuracy: 0.81343	Precision: 0.33351	Recall: 0.30650	F1: 0.31944	F2: 0.31155
	Total predictions: 14000	True positives:  613	False positives: 1225	False negatives: 1387	True negatives: 10775



So, we can clearly see that the tuned classifier gives us the better scores for each metrics.

Below I have pasted the metrics for the classifier which is not tuned properly has a comparatively lower metrics to our tuned classifier.

In [28]:
test_classifier(clf3, df1, features_list, folds = 1000)

Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])
	Accuracy: 0.80379	Precision: 0.28571	Recall: 0.24900	F1: 0.26610	F2: 0.25557
	Total predictions: 14000	True positives:  498	False positives: 1245	False negatives: 1502	True negatives: 10755

