In [4]:
"""
Go back to your code from the last lesson, where you built a simple first iteration of a POI identifier using a decision tree 
and one feature. Copy the POI identifier that you built into the skeleton code in evaluation/evaluate_poi_identifier.py. 
Recall that at the end of that project, your identifier had an accuracy (on the test set) of 0.724. Not too bad, right? Let's dig
into your predictions a little more carefully.
"""
"""
feature_format.py
"""
#!/usr/bin/python

""" 
    A general tool for converting data from the
    dictionary format to an (n x k) python list that's 
    ready for training an sklearn algorithm

    n--no. of key-value pairs in dictionary
    k--no. of features being extracted

    dictionary keys are names of persons in dataset
    dictionary values are dictionaries, where each
        key-value pair in the dict is the name
        of a feature, and its value for that person

    In addition to converting a dictionary to a numpy 
    array, you may want to separate the labels from the
    features--this is what targetFeatureSplit is for

    so, if you want to have the poi label as the target,
    and the features you want to use are the person's
    salary and bonus, here's what you would do:

    feature_list = ["poi", "salary", "bonus"] 
    data_array = featureFormat( data_dictionary, feature_list )
    label, features = targetFeatureSplit(data_array)

    the line above (targetFeatureSplit) assumes that the
    label is the _first_ item in feature_list--very important
    that poi is listed first!
"""

import numpy as np

def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False):
    """ convert dictionary to numpy array of features

        dictionary: maps each key (one data point) to a dict of
            feature name -> value
        features: list of feature names to extract, in output column order
        remove_NaN = True will convert "NaN" string to 0.0
        remove_all_zeroes = True will omit any data points for which
            all the features you seek are 0.0
        remove_any_zeroes = True will omit any data points for which
            any of the features you seek are 0.0
        sort_keys = True sorts keys by alphabetical order. Setting the value as
            a string opens the corresponding pickle file with a preset key
            order (this is used for Python 3 compatibility, and sort_keys
            should be left as False for the course mini-projects).
        NOTE: first feature is assumed to be 'poi' and is not checked for
            removal for zero or missing values.

        Returns a numpy array with one row per retained data point and one
        column per requested feature (every value converted to float).
        If a requested feature is missing for some key, an error line is
        printed and None is returned instead.
    """
    return_list = []

    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        import pickle
        # Context manager so the key-order file is closed even if unpickling
        # fails. NOTE(review): pickle can execute arbitrary code on load;
        # only pass a path to a trusted file.
        with open(sort_keys, "rb") as key_file:
            keys = pickle.load(key_file)
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                value = dictionary[key][feature]
            except KeyError:
                # Single-argument print() behaves the same on Python 2 and 3;
                # the doubled spaces reproduce the text of the original
                # comma-separated print statement.
                print("error: key  %s  not present" % (feature,))
                return
            if value=="NaN" and remove_NaN:
                value = 0
            tmp_list.append( float(value) )

        # Logic for deciding whether or not to add the data point.
        # exclude 'poi' class as criteria.
        if features[0] == 'poi':
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list

        append = True
        ### if all features are zero and you want to remove
        ### data points that are all zero, do that here
        # (test_list only ever holds floats, so comparing against the
        # string "NaN" is unnecessary.)
        if remove_all_zeroes:
            append = any(item != 0 for item in test_list)
        ### if any features for a given data point are zero
        ### and you want to remove data points with any zeroes,
        ### handle that here
        if remove_any_zeroes and 0 in test_list:
            append = False
        ### Append the data point if flagged for addition.
        if append:
            return_list.append( np.array(tmp_list) )
    return np.array(return_list)


def targetFeatureSplit( data ):
    """
        Separate the value to predict from the predictor values.

        data is iterated row by row (for example the array returned
        from featureFormat); the first entry of each row goes into the
        target list and the remaining entries of that row go into the
        features list, keeping the row order.

        Returns the pair (target, features), both plain python lists.
    """
    # Materialise once so both passes below walk exactly the same rows.
    rows = list(data)
    target = [row[0] for row in rows]
    features = [row[1:] for row in rows]

    return target, features

"""
validate_poi.py
"""
#!/usr/bin/python
"""
    Starter code for the validation mini-project.
    The first step toward building your POI identifier!

    Start by loading/formatting the data

    After that, it's not our code anymore--it's yours!
"""

import pickle
import sys

from sklearn import tree
from sklearn.metrics import accuracy_score

# sklearn.cross_validation was removed from newer scikit-learn releases;
# sklearn.model_selection provides train_test_split there. The old module
# name stays bound so the call below works on both old and new installs.
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation

# Pickle data is binary, so open with "rb" (text mode fails on Python 3)
# and let the context manager close the handle.
# NOTE(review): pickle.load can execute arbitrary code; only load a dataset
# file that comes from a trusted source.
with open("final_project_dataset.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)

### first element is our labels, any added elements are predictor
### features. Keep this the same for the mini-project, but you'll
### have a different feature list when you do the final project.
features_list = ["poi", "salary"]

data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

# Hold out 30% of the points for testing; the fixed seed keeps the split
# reproducible between runs.
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    features, labels, test_size=0.3, random_state=42)

clf = tree.DecisionTreeClassifier()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

# sklearn metrics take (y_true, y_pred) in that order.
accuracy = accuracy_score(labels_test, pred)
# Single-argument print() gives the same output on Python 2 and runs on 3.
print('Accuracy: %s' % accuracy)

Accuracy: 0.724137931034


In [15]:
"""
How many POIs are predicted for the test set for your POI identifier?

(Note that we said test set! We are not looking for the number of POIs in the whole dataset.)
"""
from collections import Counter

# Tally the predicted labels; key 1 holds the number of test points the
# classifier flagged as POI (1 and 1.0 hash and compare equal).
pois_in_test = Counter(pred)
# Parenthesised single-argument print works on both Python 2 and Python 3.
print('Total predicted POIs: %d' % pois_in_test[1])

Total predicted POIs: 4


In [16]:
"""
How many people total are in your test set?
"""
# One prediction per test point, so len(pred) is the test-set size.
# Parenthesised single-argument print works on both Python 2 and Python 3.
print('Total people in test set: %d' % len(pred))

Total people in test set: 29


In [24]:
"""
If your identifier predicted 0. (not POI) for everyone in the test set, what would its accuracy be?
"""
# Baseline "identifier" that predicts 0. (not POI) for every test point.
all_zeros = [0.0] * len(pred)
# sklearn metrics take (y_true, y_pred); pass the true labels first to match
# the precision/recall calls elsewhere in this notebook.
accuracy = accuracy_score(labels_test, all_zeros)
# Single-argument print() gives the same output on Python 2 and runs on 3.
print('Accuracy: %s' % accuracy)

Accuracy: 0.862068965517


In [26]:
"""
As you may now see, having imbalanced classes like we have in the Enron dataset (many more non-POIs than POIs) introduces some 
special challenges, namely that you can just guess the more common class label for every point, not a very insightful strategy, 
and still get pretty good accuracy!

Precision and recall can help illuminate your performance better. Use the precision_score and recall_score available in 
sklearn.metrics to compute those quantities.

What's the precision?
"""
from sklearn.metrics import precision_score

# The question is about the POI identifier itself, so score the decision
# tree's predictions (pred) rather than the all-zeros baseline, for which
# precision is undefined because nothing is ever predicted positive.
p = precision_score(labels_test, pred)
# Single-argument print() gives the same output on Python 2 and runs on 3.
print('Precision:  %s' % p)

Precision:  0.0


In [32]:
"""
What's the recall? 

(Note: you may see a message like UserWarning: The precision and recall are equal to zero for some labels. Just like the message 
says, there can be problems in computing other metrics (like the F1 score) when precision and/or recall are zero, and it wants 
to warn you when that happens.) 

Obviously this isn't a very optimized machine learning strategy (we haven't tried any algorithms besides the decision tree, or 
tuned any parameters, or done any feature selection), and now seeing the precision and recall should make that much more apparent
than the accuracy did.
"""
from sklearn.metrics import recall_score

# The question is about the POI identifier itself, so score the decision
# tree's predictions (pred) rather than the all-zeros baseline.
r = recall_score(labels_test, pred)
# Single-argument print() gives the same output on Python 2 and runs on 3.
print('Recall:  %s' % r)

Recall:  0.0


In [34]:
"""
In the final project you'll work on optimizing your POI identifier, using many of the tools learned in this course. Hopefully one
result will be that your precision and/or recall will go up, but then you'll have to be able to interpret them. 

Here are some made-up predictions and true labels for a hypothetical test set; fill in the following boxes to practice 
identifying true positives, false positives, true negatives, and false negatives. Let's use the convention that "1" signifies a 
positive result, and "0" a negative. (this is fabricated data, just to give you some practice)

predictions = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1] 
true labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]

How many true positives are there? 6

How many true negatives are there in this example? 9

How many false positives are there? 3

How many false negatives are there? 2

What's the precision of this classifier?

P = true_positives / (true_positives + false_positives)
P = 6/(6+3)
P = 0.6666666

What's the recall of this classifier?

R = true_positives / (true_positives + false_negatives)
R = 6/(6+2)
R = 0.75

Fill in the blank:

"My true positive rate is high, which means that when a _POI_ is present in the test data, I am good at flagging him or her."
"My identifier doesn't have great _PRECISION_, but it does have good _RECALL_. That means that, nearly every time a POI shows 
up in my test set, I am able to identify him or her. The cost of this is that I sometimes get some false positives, where 
non-POIs get flagged."

"My identifier doesn't have great _RECALL_, but it does have good _PRECISION_. That means that whenever a POI gets flagged in my 
test set, I know with a lot of confidence that it's very likely to be a real POI and not a false alarm. On the other hand, the 
price I pay for this is that I sometimes miss real POIs, since I'm effectively reluctant to pull the trigger on edge cases."

"My identifier has a really great _F1-SCORE_. This is the best of both worlds. Both my false positive and false negative rates 
are _LOW_, which means that I can identify POIs reliably and accurately. If my identifier finds a POI then the person is almost 
certainly a POI, and if the identifier does not flag someone, then they are almost certainly not a POI"
"""
# Fabricated predictions and ground truth from the exercise text above;
# 1 is a positive (POI) result and 0 a negative one.
predictions = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1]
true_labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]

# sklearn metrics take (y_true, y_pred) in that order.
p = precision_score(true_labels, predictions)
# Single-argument print() gives the same output on Python 2 and runs on 3.
print('Precision:  %s' % p)

r = recall_score(true_labels, predictions)
print('Recall:  %s' % r)

Precision:  0.666666666667
Recall:  0.75
