# Examination of Houshold Energy Use and Electric Vehicles
### Samuel Sherman 
### May 2016 

## Data Examination

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

def read_data():
    EV = pd.read_csv('EV_train.csv', index_col='House ID')
    EV_labels = pd.read_csv('EV_train_labels.csv', index_col='House ID')
    return EV, EV_labels

EV, EV_labels = read_data()
EV.head()

Unnamed: 0_level_0,Interval_1,Interval_2,Interval_3,Interval_4,Interval_5,Interval_6,Interval_7,Interval_8,Interval_9,Interval_10,...,Interval_2871,Interval_2872,Interval_2873,Interval_2874,Interval_2875,Interval_2876,Interval_2877,Interval_2878,Interval_2879,Interval_2880
House ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11655099,0.95,0.826,0.361,0.238,0.342,0.233,0.351,0.194,0.292,0.234,...,0.664,0.783,0.601,0.639,0.417,0.439,0.226,0.19,0.71,0.728
11633257,0.353,0.327,0.358,0.292,0.285,0.304,0.361,0.342,0.355,0.348,...,0.536,0.558,0.622,0.634,0.513,0.421,0.273,0.296,0.291,0.289
11651552,0.15,0.181,0.15,0.15,0.131,0.125,0.088,0.106,0.094,1.019,...,2.125,0.881,0.481,1.194,0.138,0.119,0.038,0.088,0.056,0.113
11636092,2.088,2.075,2.121,2.098,2.046,2.081,1.847,0.42,0.399,0.364,...,0.62,0.487,0.563,0.419,0.379,0.359,0.347,0.325,0.33,0.34
11647239,1.416,1.25,1.27,1.258,1.239,1.753105,4.609256,4.619256,4.075151,1.23,...,1.596,1.667,1.569,1.664,1.58,1.635,1.568,1.565,1.575,1.571


In [11]:
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
py.sign_in('scsherm', 'ml0wer7f1s')

data = [go.Heatmap(
        x = range(1,len(EV.columns.values)+1),
        y = EV.index.values,
        z = EV.values,
        zauto = False, 
        zmax = 20, 
        zmin = 0,
        autocolorscale = False,
        colorscale=[[0, 'rgb(220,220,220)'], [0.2, 'rgb(245,195,157)'], [0.4, 'rgb(245,160,105)'], 
                    [1, 'rgb(178,10,28)']])]

layout = go.Layout(
    title='Energy use per house per interval',
    xaxis = dict(type = 'category'),
    yaxis = dict(type = 'category'),
    autosize=False,
    barmode='overlay',
    width = 1000,
    height = 1000)

EV_heatmap = go.Figure(data=data, layout=layout)
py.iplot(EV_heatmap)

The draw time for this plot will be slow for all clients.



Estimated Draw Time Too Long



In [2]:
len(EV.columns.values)/48 #number of days

60

Considering the span of intervals takes place over 60 days, there are patterns emerging that represent energy use for different households. Although, I do not know what time of day the first interval takes place, I can hypothesize that people charge their vehicles in a manner that would be consistent with their schedule. For the average person, this might be charging their vehicle at night, while they are sleeping.

In [43]:
#Energy use for EV charging
mask = zip(*np.where(EV_labels == 1))
EV_1_values = np.array([EV.iloc[i[0],i[1]] for i in mask])
EV_1_values

[1.7531050266371799,
 4.6092559999999994,
 4.619256,
 4.0751509733628195,
 1.9274940876286399,
 1.9444940876286398,
 1.35296277023583,
 4.4202932297641704,
 2.2854940876286403,
 2.0434940876286398,
 1.6034342586413401,
 4.2728217413586602,
 1.8402901278171497,
 4.5852559999999993,
 4.5642559999999994,
 4.5592559999999995,
 3.9199658721828503,
 1.9334940876286399,
 2.4014940876286404,
 2.1374475791027598,
 4.4246260915823798,
 4.8982559999999999,
 4.8722560000000001,
 1.89162990841762,
 6.9938773083853789,
 5.5578906916146211,
 1.97349408762864,
 1.97149408762864,
 1.97549408762864,
 1.86649408762864,
 2.29637469099973,
 4.6452559999999998,
 4.6422559999999997,
 4.6502559999999997,
 4.6592560000000001,
 4.6372559999999998,
 4.6012559999999993,
 4.6312559999999996,
 3.6128813090002696,
 1.9220403476899699,
 4.1552156523100301,
 1.94749408762864,
 1.5315067265807001,
 4.2967492734192998,
 1.9824940876286399,
 2.0834940876286399,
 2.6420380598613198,
 4.6902559999999998,
 4.988255999999999

In [45]:
#Energy use for no EV charging
mask2 = zip(*np.where(EV_labels == 0))
EV_0_values = np.array([EV.iloc[i[0],i[1]] for i in mask2])
EV_0_values

[0.94999999999999996,
 0.82599999999999996,
 0.36099999999999999,
 0.23800000000000002,
 0.34200000000000003,
 0.23300000000000001,
 0.35100000000000003,
 0.19399999999999998,
 0.29199999999999998,
 0.23399999999999999,
 0.26000000000000001,
 0.27399999999999997,
 0.192,
 0.32899999999999996,
 0.192,
 0.44299999999999995,
 0.26000000000000001,
 0.222,
 0.41899999999999998,
 0.249,
 0.42899999999999999,
 0.42599999999999999,
 0.373,
 0.33500000000000002,
 0.45700000000000002,
 1.1990000000000001,
 1.7350000000000001,
 1.431,
 1.151,
 1.258,
 1.2429999999999999,
 0.82499999999999996,
 0.53400000000000003,
 0.51300000000000001,
 0.77500000000000002,
 0.70799999999999996,
 0.65500000000000003,
 0.61799999999999999,
 0.59999999999999998,
 0.57700000000000007,
 0.56799999999999995,
 0.55000000000000004,
 0.35100000000000003,
 0.43099999999999999,
 0.20199999999999999,
 0.311,
 0.67900000000000005,
 0.85199999999999998,
 0.88700000000000001,
 0.77599999999999991,
 0.44700000000000001,
 0.215,

In [52]:
#mean energy use with no EV charge
EV_0_values = EV_0_values[~np.isnan(EV_0_values)]
EV_0_values.mean()

1.3494471794259362

In [53]:
#mean energy use with EV charge
EV_1_values = EV_1_values[~np.isnan(EV_1_values)]
EV_1_values.mean()

3.5141408897533064

Energy use for households with electric vehicles is, on average, higher than for households without electric vehicles.

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.metrics import recall_score, precision_score, average_precision_score, roc_curve, roc_auc_score


EV.fillna(-1, inplace = True)
X = EV.values

#create y labels for when a house has an electric vehicle
y = EV_labels.sum(axis=1) > 0
y = y.astype(int).values

# stratify train,test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)
clf_init = LogisticRegression(fit_intercept=True, penalty = 'l2', n_jobs=-1)
clf_init.fit(X_train,y_train)
coefficients = clf_init.coef_
coefficients = np.exp(coefficients) #take exponential for interoperability 

In [5]:
np.where(coefficients == coefficients.max())[1]

array([2820])

In [6]:
coefficients[0][2820]

1.4860950498090095

In [7]:
one_rows = np.where(EV_labels.iloc[:,2820] == 1)[0]
zero_rows = np.where(EV_labels.iloc[:,2820] == 0)[0]

In [8]:
EV[EV.columns.values[2820]].values[one_rows].mean()

3.5342846043974689

In [9]:
EV[EV.columns.values[2820]].values[zero_rows].mean()

1.5066442307692307

In [12]:
EV_ones = EV.iloc[np.where(y==1)[0]]

data = [go.Scatter(
        x = range(1,len(EV_ones)), 
        y = EV_ones.mean(axis=0), 
        mode = 'lines')]
    
layout2 = go.Layout(
    title='Mean Energy Use of EV Households Over Time',
    xaxis=dict(
        title='Interval',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Mean Energy Use',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
EV_Over_Time = go.Figure(data=data, layout=layout2)    
py.iplot(EV_Over_Time)

In [128]:
EV_zeros = EV.iloc[np.where(y==0)[0]]

data = [go.Scatter(
        x = range(1,len(EV_zeros)), 
        y = EV_zeros.mean(axis=0), 
        mode = 'lines')]
    
layout2 = go.Layout(
    title='Mean Energy Use of Non-EV Households Over Time',
    xaxis=dict(
        title='Interval',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Mean Energy Use',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
Non_EV_Over_Time = go.Figure(data=data, layout=layout2)    
py.iplot(Non_EV_Over_Time)

The patterns of mean energy use are clearly distinguishable from the households with electric vehicles and those without electric vehicles. There is a cyclical pattern for the energy use for most households. This would make sense as most people have habitual schedules and probably use less energy while not at home. However, the households with electric vehicles have sections which reach much higher levels of energy use during their peak hours. Additionally, the total amount of peak times, over a 60 day period, are less.

## Determining if a household has an electric vehicle

In [2]:
from sklearn.ensemble import ExtraTreesClassifier as ETF
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.linear_model import LogisticRegression as LR
from sklearn.grid_search import GridSearchCV
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold
from scipy import interp

def run_model(X, y, model):
    AUC, AUC2, thresholds, recall, precision = [], [], [], [], []
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    
    #Statified K fold
    skf = StratifiedKFold(y, n_folds=5, shuffle=True)
    for train_index, test_index in skf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test= y[train_index], y[test_index]
        
        # Initialize a classifier 
        if model == LR:
            clf = model(random_state = 2, n_jobs = -1)
        elif model == ETF or model == RF:
            clf = model(random_state = 2, n_estimators = 1000, n_jobs = -1)
        else:
            clf = model
        clf.fit(X_train, y_train)
        pred = clf.predict_proba(X_test)
        pred2 = clf.predict(X_test)
        
        #Evaluate
        fpr, tpr, thresholds = roc_curve(y_test, pred[:,1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        AUC.append(roc_auc_score(y_test, pred[:,1]))
        AUC2.append(average_precision_score(y_test, pred[:,1]))
        recall.append(recall_score(y_test, pred2))
        precision.append(precision_score(y_test, pred2))
        if model == XGB:
            skf = [1]
            break

    mean_tpr /= len(skf)
    mean_tpr[-1] = 1.0
    return recall, AUC, precision, AUC2, mean_fpr, mean_tpr, thresholds, pred2, y_test, clf

In [182]:
(rf_recall, rf_AUC, rf_precision, rf_AUC2, rf_mean_fpr, 
 rf_mean_tpr, rf_thresholds, rf_pred2, rf_y_test, rf_clf) = run_model(X, y, RF)

(etf_recall, etf_AUC, etf_precision, etf_AUC2, etf_mean_fpr, 
 etf_mean_tpr, etf_thresholds, etf_pred2, etf_y_test, etf_clf) = run_model(X, y, ETF)

(log_recall, log_AUC, log_precision, log_AUC2, log_mean_fpr, log_mean_tpr, 
 log_thresholds, log_pred2, log_y_test, log_clf) = run_model(X, y, LR)

param_grid = [{'learning_rate': [.01, .001, .1], 'n_estimators': [50, 500, 1000], 'max_depth': [3, 5, 7]}]
xgb_model = xgb.XGBClassifier()
XGB = GridSearchCV(xgb_model, param_grid, verbose = 2, cv = 2, n_jobs = -1, scoring = 'roc_auc')

(gb_recall, gb_AUC, gb_precision, gb_AUC2, gb_mean_fpr, 
 gb_mean_tpr, gb_thresholds, gb_pred2, gb_y_test, gb_clf) = run_model(X, y, XGB)

v = np.linspace(0,1)

data = [go.Scatter(x = rf_mean_fpr, y = rf_mean_tpr, 
                   mode = 'lines', 
                   name = 'RF_AUC:{}'.format(np.mean(rf_AUC))),
        go.Scatter(x = etf_mean_fpr, y = etf_mean_tpr, 
                   mode = 'lines', 
                   name = 'ETF_AUC:{}'.format(np.mean(etf_AUC))),
        go.Scatter(x = log_mean_fpr, y = log_mean_tpr, 
                   mode = 'lines', 
                   name = 'LOG_AUC:{}'.format(np.mean(log_AUC))),
        go.Scatter(x = gb_mean_fpr, y = gb_mean_tpr, 
                   mode = 'lines', 
                   name = 'XBG_AUC:{}'.format(np.mean(gb_AUC))),
        go.Scatter(x = v, y = v, 
                   mode = 'lines', 
                   name = '50/50 mark')]

layout = go.Layout(title='ROC Curves',
    xaxis=dict(title='False Postive Rate',
            titlefont=dict(family='Courier New, monospace',size=18,color='#7f7f7f')),
    yaxis=dict(title='True Postive Rate',
            titlefont=dict(family='Courier New, monospace',size=18,color='#7f7f7f')))
ROC = go.Figure(data=data, layout=layout)    
py.iplot(ROC)

Fitting 2 folds for each of 27 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed: 14.1min finished


[CV] n_estimators=50, learning_rate=0.01, max_depth=3 ................
[CV] n_estimators=50, learning_rate=0.01, max_depth=3 ................
[CV] n_estimators=500, learning_rate=0.01, max_depth=3 ...............
[CV] n_estimators=500, learning_rate=0.01, max_depth=3 ...............
[CV] n_estimators=1000, learning_rate=0.01, max_depth=3 ..............
[CV] n_estimators=1000, learning_rate=0.01, max_depth=3 ..............
[CV] n_estimators=50, learning_rate=0.01, max_depth=5 ................
[CV] n_estimators=50, learning_rate=0.01, max_depth=5 ................
[CV] ....... n_estimators=50, learning_rate=0.01, max_depth=3 -   8.5s[CV] ....... n_estimators=50, learning_rate=0.01, max_depth=3 -   8.6s[CV] ...... n_estimators=500, learning_rate=0.01, max_depth=3 - 1.5min[CV] ...... n_estimators=500, learning_rate=0.01, max_depth=3 - 1.5min[CV] ..... n_estimators=1000, learning_rate=0.01, max_depth=3 - 2.9min[CV] ..... n_estimators=1000, learning_rate=0.01, max_depth=3 - 2.9min[CV] .......

In [183]:
print "Extra Trees Classifier"
print "Recall:{}, AUC:{}, Precision:{}, PR_Curve:{}".format(np.mean(etf_recall), 
    np.mean(etf_AUC), np.mean(etf_precision), np.mean(etf_AUC2))
print "Accuracy:{}".format(np.mean(etf_pred2 == etf_y_test))

Extra Trees Classifier
Recall:0.651546391753, AUC:0.934445118253, Precision:0.888723257419, PR_Curve:0.885718136082
Accuracy:0.852201257862


In [184]:
print "Random Forest Classifier"
print "Recall:{}, AUC:{}, Precision:{}, PR_Curve:{}".format(np.mean(rf_recall), 
    np.mean(rf_AUC), np.mean(rf_precision), np.mean(rf_AUC2))
print "Accuracy:{}".format(np.mean(rf_pred2 == rf_y_test))

Random Forest Classifier
Recall:0.672164948454, AUC:0.930526659514, Precision:0.853739449791, PR_Curve:0.877922321428
Accuracy:0.87106918239


In [185]:
print "Logistic Regression"
print "Recall:{}, AUC:{}, Precision:{}, PR_Curve:{}".format(np.mean(log_recall), 
    np.mean(log_AUC), np.mean(log_precision), np.mean(log_AUC2))
print "Accuracy:{}".format(np.mean(log_pred2 == log_y_test))

Logistic Regression
Recall:0.540206185567, AUC:0.757218827261, Precision:0.801826504119, PR_Curve:0.709154272771
Accuracy:0.823899371069


In [186]:
print "Xgboost Classifier"
print "Recall:{}, AUC:{}, Precision:{}, PR_Curve:{}".format(np.mean(gb_recall), 
    np.mean(gb_AUC), np.mean(gb_precision), np.mean(gb_AUC2))
print "Accuracy:{}".format(np.mean(gb_pred2 == gb_y_test))

Xgboost Classifier
Recall:0.463917525773, AUC:0.843821430237, Precision:0.9, PR_Curve:0.776334881543
Accuracy:0.820754716981


It appears the models are doing well in predicting which households contain an electric vehicle. However, in choosing a model it would be best to determine the ultimate goals or how the model will be used by a potential client. If it is desired to alter the threshold for classification, then the Extra Trees classifier seems most promising. Observing the ROC curve, the Extra Trees Classifer has the largest AUC and remains fairly smooth through the range of different thresholds. However, if it is desired to use the model "as is", then I would recommend the Random Forest Classifier. The recall or classification of "1" is higher. Although, the two models are very close in all metrics and both will ultimately perform well.

## Determining the interval an electric vehicle is charging

In [3]:
EV_test = pd.read_csv("EV_test.csv", index_col='House ID')
EV_test.fillna(-1,inplace = True)
EV_test.head()

Unnamed: 0_level_0,Interval_1,Interval_2,Interval_3,Interval_4,Interval_5,Interval_6,Interval_7,Interval_8,Interval_9,Interval_10,...,Interval_2871,Interval_2872,Interval_2873,Interval_2874,Interval_2875,Interval_2876,Interval_2877,Interval_2878,Interval_2879,Interval_2880
House ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11643173,1.013,0.215,0.217,0.217,0.217,0.22,0.22,0.215,0.215,0.213,...,1.059,0.308,1.248,0.236,0.206,0.21,0.218,0.219,0.909,0.202
11655020,1.325,1.413,0.813,0.588,0.213,0.175,0.163,0.2,0.175,0.2,...,0.75,0.625,0.45,1.325,0.35,0.313,0.188,0.2,0.188,0.175
11644367,1.543,1.636,1.683,1.556,1.935,2.393,2.601,2.709,2.59,2.673,...,2.208,2.423,2.458,2.487,2.354,2.444,2.43,2.275,1.914,1.497
11633356,0.153,0.123,0.146,0.101,0.14,0.108,0.121,0.127,0.11,0.132,...,0.216,0.237,0.213,0.258,0.275,0.219,0.218,0.079,0.049,0.055
11649905,0.187,0.004,0.004,0.004,0.004,0.004,0.005,0.004,0.004,0.005,...,0.004,0.005,0.526,0.005,0.004,0.004,0.004,0.004,0.004,0.004


In [4]:
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
py.sign_in('scsherm', 'ml0wer7f1s')

data = [go.Scatter(
        x = range(1,len(EV_test)), 
        y = EV_test.mean(axis=0), 
        mode = 'lines')]
    
layout2 = go.Layout(
    title='Mean Energy Use of EV_test Dataset',
    xaxis=dict(
        title='Interval',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Mean Energy Use',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
EV_ts = go.Figure(data=data, layout=layout2)    
py.iplot(EV_ts)

In [46]:
XGB=None
(etf_recall, etf_AUC, etf_precision, etf_AUC2, etf_mean_fpr, 
 etf_mean_tpr, etf_thresholds, etf_pred2, etf_y_test, etf_clf) = run_model(X, y, ETF)

In [47]:
check_ones = etf_clf.predict(EV_test.values)

In [48]:
check_ones.sum()/(len(check_ones)*1.0)

0.19456366237482117

In [54]:
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
py.sign_in('scsherm', 'ml0wer7f1s')

data = [go.Scatter(
        x = range(1,len(EV_test)), 
        y = EV_test.iloc[check_ones == 1].mean(axis=0), 
        mode = 'lines')]
    
layout2 = go.Layout(
    title='Mean Energy Use of EV_test Dataset for Predicted ',
    xaxis=dict(
        title='Interval',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Mean Energy Use',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
EV_ts2 = go.Figure(data=data, layout=layout2)    
py.iplot(EV_ts2)

The rows, of which the model believes to be housholds with EV's, show a structure for mean energy use per interval similar to what was modeled from the labeled data. This leads me to believe the model is predicting these classifications well. Additionally, assuming the predictions are correct, I know the majority of cases in the test set are households without electric vehicles. 

### Model with just EV households

In [7]:
from sklearn.multiclass import OneVsRestClassifier

X_multi_label = EV.iloc[np.where(y==1)[0]].values #EV Houses only
y_multi_label = EV_labels.iloc[np.where(y==1)[0]].values #EV Houses only

X_train, X_test, y_train, y_test = train_test_split(X_multi_label, y_multi_label, 
    test_size = 0.2, random_state = 42)

In [8]:
clf = OneVsRestClassifier(ETF(n_estimators=1000), n_jobs = -1)
clf.fit(X_train, y_train)


Label not 2840 is present in all training examples.


Label not 2841 is present in all training examples.


Label not 2843 is present in all training examples.


Label not 2846 is present in all training examples.


Label not 2858 is present in all training examples.


Label not 2863 is present in all training examples.


Label not 2869 is present in all training examples.


Label not 2876 is present in all training examples.


Label not 2842 is present in all training examples.


Label not 2845 is present in all training examples.


Label not 2849 is present in all training examples.


Label not 2857 is present in all training examples.


Label not 2868 is present in all training examples.


Label not 2871 is present in all training examples.


Label not 2844 is present in all training examples.


Label not 2848 is present in all training examples.


Label not 2852 is present in all training examples.


Label not 2862 is present in all training examples.


Label not 2875 is present i

OneVsRestClassifier(estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          n_jobs=-1)

In [9]:
y_pred = clf.predict(X_test)

In [10]:
multi_label_recall = recall_score(y_pred, y_test, average = 'micro') 
multi_label_precision = precision_score(y_pred, y_test, average = 'micro')
multi_label_PRC = average_precision_score(y_pred, y_test, average = 'micro')
multi_label_AUC = roc_auc_score(y_pred, y_test, average = 'micro')

In [11]:
print "Recall: {}, Precision: {}, PR_Curve: {}, AUC: {}".format(multi_label_recall, 
        multi_label_precision, multi_label_PRC, multi_label_AUC) 

Recall: 0.843120260022, Precision: 0.181712044085, PR_Curve: 0.513711971068, AUC: 0.889672379551


In [12]:
print "{} Total values in dataset".format(len(y_test.T)*len(y_test))

279360 Total values in dataset


In [14]:
print "{}% of the values are 1".format(y_test.sum()/(len(y_test.T)*len(y_test)*1.0)*100)

7.66502004582% of the values are 1


In [15]:
print "The potential accuracy of the model is {}%".format(np.mean(y_test == y_pred)*100)

The potential accuracy of the model is 93.4686426117%


### Model with all households

In [16]:
X_multi_label2 = EV.values #All houses
y_multi_label2 = EV_labels.values #All houses

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_multi_label2, y_multi_label2, 
    test_size = 0.2, random_state = 42, stratify = y)

In [17]:
clf2 = OneVsRestClassifier(ETF(n_estimators=1000), n_jobs = -1)
clf2.fit(X_train2, y_train2)


Label not 2838 is present in all training examples.


Label not 2839 is present in all training examples.


Label not 2840 is present in all training examples.


Label not 2842 is present in all training examples.


Label not 2844 is present in all training examples.


Label not 2870 is present in all training examples.


Label not 2841 is present in all training examples.


Label not 2843 is present in all training examples.


Label not 2846 is present in all training examples.


Label not 2848 is present in all training examples.


Label not 2854 is present in all training examples.


Label not 2878 is present in all training examples.


Label not 2845 is present in all training examples.


Label not 2847 is present in all training examples.


Label not 2850 is present in all training examples.


Label not 2852 is present in all training examples.


Label not 2859 is present in all training examples.


Label not 2849 is present in all training examples.


Label not 2851 is present i

OneVsRestClassifier(estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          n_jobs=-1)

In [18]:
y_pred2 = clf2.predict(X_test2)

In [19]:
multi_label_recall2 = recall_score(y_pred2, y_test2, average = 'micro') 
multi_label_precision2 = precision_score(y_pred2, y_test2, average = 'micro')
multi_label_PRC2 = average_precision_score(y_pred2, y_test2, average = 'micro')
multi_label_AUC2 = roc_auc_score(y_pred2, y_test2, average = 'micro')

In [20]:
print "Recall: {}, Precision: {}, PR_Curve: {}, AUC: {}".format(multi_label_recall2, 
        multi_label_precision2, multi_label_PRC2, multi_label_AUC2) 

Recall: 0.895486935867, Precision: 0.0894679386777, PR_Curve: 0.492597545588, AUC: 0.937245894786


In [21]:
print "{} Total values in dataset".format(len(y_test2.T)*len(y_test2))

915840 Total values in dataset


In [22]:
print "{}% of the values are 1".format(y_test.sum()/(len(y_test2.T)*len(y_test2)*1.0)*100)

2.33807215234% of the values are 1


In [23]:
print "The potential accuracy of the model is {}%".format(np.mean(y_test2 == y_pred2)*100)

The potential accuracy of the model is 97.8812893082%


### Examining the feasability of both models on test data

In [24]:
submission = clf.predict_proba(EV_test.values)

In [25]:
count_positives = submission > 0.5

In [26]:
count_positives.sum()/2013120.

0.0068177753934191704

In [27]:
submission = pd.DataFrame(submission, index = EV_test.index, columns = EV_test.columns )
submission.head()

Unnamed: 0_level_0,Interval_1,Interval_2,Interval_3,Interval_4,Interval_5,Interval_6,Interval_7,Interval_8,Interval_9,Interval_10,...,Interval_2871,Interval_2872,Interval_2873,Interval_2874,Interval_2875,Interval_2876,Interval_2877,Interval_2878,Interval_2879,Interval_2880
House ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11643173,0.002,0.004,0.012,0.008,0.007,0.012,0.013,0.002,0.003,0.006,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11655020,0.014,0.035,0.038,0.035,0.044,0.016,0.039,0.044,0.045,0.048,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11644367,0.047,0.11,0.125,0.185,0.116,0.115,0.135,0.059,0.059,0.044,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11633356,0.003,0.004,0.026,0.032,0.021,0.026,0.029,0.015,0.007,0.011,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11649905,0.008,0.004,0.015,0.012,0.015,0.01,0.041,0.009,0.013,0.013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
submission2 = clf2.predict_proba(EV_test.values)

In [59]:
count_positives2 = submission2 > 0.5

In [60]:
count_positives2.sum()/2013120.

0.0020063384199650292

In [61]:
submission2 = pd.DataFrame(submission2, index = EV_test.index, columns = EV_test.columns )
submission2.head()

Unnamed: 0_level_0,Interval_1,Interval_2,Interval_3,Interval_4,Interval_5,Interval_6,Interval_7,Interval_8,Interval_9,Interval_10,...,Interval_2871,Interval_2872,Interval_2873,Interval_2874,Interval_2875,Interval_2876,Interval_2877,Interval_2878,Interval_2879,Interval_2880
House ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11643173,0.0,0.002,0.003,0.003,0.001,0.006,0.005,0.002,0.004,0.002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11655020,0.014,0.015,0.022,0.018,0.016,0.016,0.025,0.029,0.036,0.03,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11644367,0.0,0.001,0.003,0.012,0.013,0.016,0.012,0.006,0.004,0.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11633356,0.001,0.002,0.009,0.012,0.009,0.011,0.018,0.007,0.013,0.005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11649905,0.0,0.0,0.002,0.005,0.005,0.002,0.016,0.004,0.01,0.004,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


I feel the models are performing fairly well. The recall is showing a good metric. However, the precision needs to be improved in both cases. The accuracy is also good, but is not the best indicator of performance in this case. This is because the intervals of which an EV is charging are rare events and if I classified everything as a zero, it could potentially still have a high level of accuracy. The model trained on just the households with EV's seems to be predicting more ones. However, I would argue that this model understands the behavior of charging for these housholds better than the other model.   

In [63]:
submission.to_csv('submission.csv')

In [66]:
EV_household_classification = pd.DataFrame(check_ones, index = EV_test.index, columns = ['Has_EV?'] )
EV_household_classification.to_csv('EV_household_classification.csv')

In [62]:
from IPython.core.display import HTML #Make pretty
import urllib2
HTML(urllib2.urlopen('http://bit.ly/1Bf5Hft').read())