In [2]:
"""
In this project, you will use regression to predict financial data for Enron employees and associates. Once you know some 
financial data about an employee, like their salary, what would you predict for the size of their bonus?

Run the starter code found in regression/finance_regression.py. This will draw a scatterplot, with all the data points drawn in. 
What target are you trying to predict? What is the input feature being used to predict it?

Mentally (or better yet, print out the scatterplot and use paper and pencil) sketch out the regression line that you roughly 
predict.
"""

"""
feature_format.py
"""

#!/usr/bin/python

""" 
    A general tool for converting data from the
    dictionary format to an (n x k) numpy array that's 
    ready for training an sklearn algorithm

    n--no. of key-value pairs in dictionary
    k--no. of features being extracted

    dictionary keys are names of persons in dataset
    dictionary values are dictionaries, where each
        key-value pair in the dict is the name
        of a feature, and its value for that person

    In addition to converting a dictionary to a numpy 
    array, you may want to separate the labels from the
    features--this is what targetFeatureSplit is for

    so, if you want to have the poi label as the target,
    and the features you want to use are the person's
    salary and bonus, here's what you would do:

    feature_list = ["poi", "salary", "bonus"] 
    data_array = featureFormat( data_dictionary, feature_list )
    label, features = targetFeatureSplit(data_array)

    the line above (targetFeatureSplit) assumes that the
    label is the _first_ item in feature_list--very important
    that poi is listed first!
"""
import numpy as np

def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False):
    """ convert dictionary to numpy array of features

        dictionary: maps each key (a person) to a dict of
            feature name -> value
        features: list of feature names to extract; they become the
            columns of the output, in the same order
        remove_NaN = True will convert "NaN" string to 0.0
        remove_all_zeroes = True will omit any data points for which
            all the features you seek are 0.0
        remove_any_zeroes = True will omit any data points for which
            any of the features you seek are 0.0
        sort_keys = True sorts keys by alphabetical order. Setting the value as
            a string opens the corresponding pickle file with a preset key
            order (this is used for Python 3 compatibility, and sort_keys
            should be left as False for the course mini-projects).
        NOTE: when the first feature is 'poi' it is not checked for
            removal for zero or missing values; any other first feature
            is checked exactly like the remaining ones.

        returns a numpy array with one row per retained data point, or
        None (after printing an error) when a requested feature is
        missing from the dictionary.
    """

    return_list = []

    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        import pickle
        # NOTE(review): unpickling can execute arbitrary code, so only a
        # trusted key-order file should be passed here.
        # The context manager makes sure the file handle is closed.
        with open(sort_keys, "rb") as key_file:
            keys = pickle.load(key_file)
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                value = dictionary[key][feature]
            except KeyError:
                # Single-argument print works on Python 2 and 3 and emits the
                # same text as the old print statement.
                print("error: key  %s  not present" % (feature,))
                return
            if value=="NaN" and remove_NaN:
                value = 0
            tmp_list.append( float(value) )

        # exclude 'poi' class as criteria.
        if features[0] == 'poi':
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list

        # Logic for deciding whether or not to add the data point.
        # Every entry of test_list is already a float, so only zeroes need to
        # be detected here. With remove_NaN=False a "NaN" string has become
        # float('nan'), which counts as non-zero for both filters.
        append = True
        ### if all features are zero and you want to remove
        ### data points that are all zero, do that here
        if remove_all_zeroes:
            append = any(item != 0 for item in test_list)
        ### if any features for a given data point are zero
        ### and you want to remove data points with any zeroes,
        ### handle that here
        if remove_any_zeroes and 0 in test_list:
            append = False
        ### Append the data point if flagged for addition.
        if append:
            return_list.append( np.array(tmp_list) )

    return np.array(return_list)


def targetFeatureSplit( data ):
    """ 
        split rows such as those produced by featureFormat into
        the quantity to predict and the inputs used to predict it

        the first entry of every row is taken as the target; the
        remaining entries of that row are kept together as its features

        returns (target, features) as two plain python lists of
        equal length

        (sklearn can generally handle both lists and numpy arrays as 
        input formats when training/predicting)
    """
    rows = list(data)
    # column 0 holds the label, everything after it holds the inputs
    target = [row[0] for row in rows]
    features = [row[1:] for row in rows]
    return target, features

In [16]:
"""
finance_regression.py
"""
#!/usr/bin/python

"""
    Starter code for the regression mini-project.
    
    Loads up/formats a modified version of the dataset
    (why modified?  we've removed some trouble points
    that you'll find yourself in the outliers mini-project).

    Draws a little scatterplot of the training/testing data

    You fill in the regression code where indicated:
"""    

import sys
import pickle
#sys.path.append("../tools/")
#from feature_format import featureFormat, targetFeatureSplit
### load the modified Enron dataset; pickles must be read in binary mode on
### Python 3, and the context manager closes the file handle afterwards
### NOTE(review): pickle.load can execute arbitrary code -- only load a
### dataset file that you trust
with open("final_project_dataset_modified.pkl", "rb") as dataset_file:
    dictionary = pickle.load(dataset_file)

### list the features you want to look at--first item in the 
### list will be the "target" feature
features_list = ["bonus", "salary"]
data = featureFormat( dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit( data )

### training-testing split needed in regression, just like classification
### (sklearn.cross_validation was removed from newer scikit-learn releases,
### so prefer model_selection and only fall back for old installations)
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"



### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and 
### plots it correctly. test_color above is set to "r" so that test points
### can be told apart from the blue training points.



### draw the scatterplot, with color-coded training and testing points
### (loop names differ from `target`/`features` so those lists survive)
import matplotlib.pyplot as plt
for feature_value, target_value in zip(feature_test, target_test):
    plt.scatter( feature_value, target_value, color=test_color ) 
for feature_value, target_value in zip(feature_train, target_train):
    plt.scatter( feature_value, target_value, color=train_color ) 

### labels for the legend: re-plot one point of each set in its own color
### (the "train" entry must use a training point, otherwise a test point
### ends up painted in the training color)
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
plt.scatter(feature_train[0], target_train[0], color=train_color, label="train")

### draw the regression line, once it's coded
### (reg does not exist until the regression is written, hence NameError)
try:
    plt.plot( feature_test, reg.predict(feature_test) )
except NameError:
    pass
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()


In [15]:
"""
In regression, you need training and testing data, just like in classification. This has already been set up in the starter code. 
Change the value of test_color from "b" to "r" (for "red"), and rerun. Note: For those students converting Python 2 code to 
Python 3, see below for some important remarks regarding compatibility.

You will be fitting your regression using the blue (training) points only. (You may have noticed that instead of the standard 
10%, we've put 50% of our data into the test set--that's because in Part 5, we will switch the training and testing datasets 
around and splitting the data evenly makes this more straightforward.)

Import LinearRegression from sklearn, and create/fit your regression. Name it reg so that the plotting code will show it 
overlaid on the scatterplot. Does it fall approximately where you expected it?

Extract the slope (stored in the reg.coef_ attribute) and the intercept. What are the slope and intercept?
"""

from sklearn.linear_model import LinearRegression

# Fit on the training half only; the test half is kept for scoring.
reg = LinearRegression()
reg.fit(feature_train, target_train)

# Single-argument print calls run on both Python 2 and Python 3 and produce
# the same text as the original print statements.
# coef_ holds one slope per input feature; only one feature is used here.
print('Slope:  %s' % (reg.coef_[0],))
print('Intercept:  %s' % (reg.intercept_,))

Slope:  5.44814028881
Intercept:  -102360.543294


In [8]:
"""
Imagine you were a less savvy machine learner, and didn't know to test on a holdout test set. Instead, you tested on the 
same data that you used to train, by comparing the regression predictions to the target values (i.e. bonuses) in the training 
data. What score do you find? You may not have an intuition yet for what a "good" score is; this score isn't very good (but it 
could be a lot worse).
"""
# Score on the same data the model was fit on.
# Single-argument print calls run on both Python 2 and Python 3 and produce
# the same text as the original print statements.
print('###### stats on train dataset #####')
print('r-squared score:  %s' % (reg.score(feature_train, target_train),))

###### stats on train dataset #####
r-squared score:  0.0455091926995


In [9]:
"""
Now compute the score for your regression on the test data, like you know you should. What's that score on the testing data? 
If you made the mistake of only assessing on the training data, would you overestimate or underestimate the performance of 
your regression?
"""
# Score on the held-out half that was not used for fitting.
# Single-argument print calls run on both Python 2 and Python 3 and produce
# the same text as the original print statements.
print('###### stats on test dataset #####')
print('r-squared score:  %s' % (reg.score(feature_test, target_test),))


###### stats on test dataset #####
r-squared score:  -1.48499241737


In [18]:
"""
There are lots of finance features available, some of which might be more powerful than others in terms of predicting a 
person's bonus. For example, suppose you thought about the data a bit and guess that the "long_term_incentive" feature, which 
is supposed to reward employees for contributing to the long-term health of the company, might be more closely related to a 
person's bonus than their salary is.

A way to confirm that you're right in this hypothesis is to regress the bonus against the long term incentive, and see if the 
regression score is significantly higher than regressing the bonus against the salary. Perform the regression of bonus against 
long term incentive -- what's the score on the test data?
"""
### regress bonus (target) against long_term_incentive (single input)
features_list = ["bonus", "long_term_incentive"]
data = featureFormat( dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit( data )

### training-testing split needed in regression, just like classification
### (sklearn.cross_validation was removed from newer scikit-learn releases,
### so prefer model_selection and only fall back for old installations)
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"


reg = LinearRegression()
reg.fit(feature_train, target_train)

### single-argument print calls run on both Python 2 and Python 3 and
### produce the same text as the original print statements
print('Slope:  %s' % (reg.coef_[0],))
print('Intercept:  %s' % (reg.intercept_,))
print('r-squared score train:  %s' % (reg.score(feature_train, target_train),))
print('r-squared score test:  %s' % (reg.score(feature_test, target_test),))

### draw the scatterplot, with color-coded training and testing points
### (loop names differ from `target`/`features` so those lists survive)
import matplotlib.pyplot as plt
for feature_value, target_value in zip(feature_test, target_test):
    plt.scatter( feature_value, target_value, color=test_color ) 
for feature_value, target_value in zip(feature_train, target_train):
    plt.scatter( feature_value, target_value, color=train_color ) 

### labels for the legend: re-plot one point of each set in its own color
### (the "train" entry must use a training point, otherwise a test point
### ends up painted in the training color)
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
plt.scatter(feature_train[0], target_train[0], color=train_color, label="train")

### draw the regression line; reg is fit above in this same cell, so no
### NameError guard is needed (it would only hide typos)
plt.plot( feature_test, reg.predict(feature_test) )
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()

Slope:  1.19214698985
Intercept:  554478.756215
r-squared score train:  0.217085971258
r-squared score test:  -0.59271289995


In [23]:
"""
This is a sneak peek of the next lesson, on outlier identification and removal. Go back to a setup where you are using the salary 
to predict the bonus, and rerun the code to remind yourself of what the data look like. You might notice a few data points that 
fall outside the main trend, someone who gets a high salary (over a million dollars!) but a relatively small bonus. This is an 
example of an outlier, and we'll spend lots of time on them in the next lesson.

A point like this can have a big effect on a regression: if it falls in the training set, it can have a significant effect on 
the slope/intercept; if it falls in the test set, it can make the score much lower than it would otherwise be. As things stand 
right now, this point falls into the test set (and is probably hurting the score on our test data as a result). Let's add a little 
hack to see what happens if it falls in the training set instead. Add these two lines near the bottom of finance_regression.py, 
right before plt.xlabel(features_list[1]):

reg.fit(feature_test, target_test)
plt.plot(feature_train, reg.predict(feature_train), color="b") 

Now we'll be drawing two regression lines, one fit on the test data (with outlier) and one fit on the training data (no outlier). 
Look at the plot now--big difference, huh? That single outlier is driving most of the difference. What's the slope of the new 
regression line?

(That's a big difference, and it's mostly driven by the outliers. The next lesson will dig into outliers in more detail so you 
have tools to detect and deal with them.)
"""
### back to predicting bonus (target) from salary (single input)
features_list = ["bonus", "salary"]
data = featureFormat( dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit( data )

### training-testing split needed in regression, just like classification
### (sklearn.cross_validation was removed from newer scikit-learn releases,
### so prefer model_selection and only fall back for old installations)
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

### first regression: fit on the training points
reg = LinearRegression()
reg.fit(feature_train, target_train)

### draw the scatterplot, with color-coded training and testing points
### (loop names differ from `target`/`features` so those lists survive)
import matplotlib.pyplot as plt
for feature_value, target_value in zip(feature_test, target_test):
    plt.scatter( feature_value, target_value, color=test_color ) 
for feature_value, target_value in zip(feature_train, target_train):
    plt.scatter( feature_value, target_value, color=train_color ) 

### labels for the legend: re-plot one point of each set in its own color
### (the "train" entry must use a training point, otherwise a test point
### ends up painted in the training color)
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
plt.scatter(feature_train[0], target_train[0], color=train_color, label="train")

### draw the line of the regression that was fit on the training points;
### reg is fit above in this same cell, so no NameError guard is needed
plt.plot( feature_test, reg.predict(feature_test) )

### second regression: refit the same estimator on the test points and draw
### its line in green; from here on reg holds the test-fit model
reg.fit(feature_test, target_test)
plt.plot(feature_train, reg.predict(feature_train), color="g") 

plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()

### slope of the regression fit on the test points; the single-argument
### print call runs on both Python 2 and Python 3 with identical output
print('Slope:  %s' % (reg.coef_[0],))

Slope:  2.27410114127
