In [1]:
import pickle
import numpy as np
import pandas as pd

from sklearn import cross_validation
from cancel_predict import *

In [3]:
# Load the sampled data, vectorize it and split it into train / test sets.

# Seed for the random split so the partition (and every metric computed from
# it) is the same after a kernel restart.
SPLIT_SEED = 0

# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# The context managers close the file handles instead of leaking them.
with open("ontime_sample_01.pickle", 'rb') as sample_file:
    df = pickle.load(sample_file)
with open("transformer_dict_01.pickle", 'rb') as transformer_file:
    transformer_dict = pickle.load(transformer_file)

# Sorting by date does not affect a shuffled split; it is kept so the
# contiguous alternative commented out below can be swapped in unchanged.
df = df.sort_values(by="OrdinalDate")

# vectorize_data comes from the cancel_predict star import; the third return
# value is used further down for its feature_indices_ attribute.
X, y, Xenc = vectorize_data(df)

print("X shape: " + str(X.shape))
print("y shape: " + str(y.shape))

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, train_size=0.7, random_state=SPLIT_SEED)
# Alternative split that keeps the rows in their sorted order:
# X_train, X_test, y_train, y_test = contiguous_train_test_split(X, y, train_size=0.7)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

X shape: (1009425, 3101)
y shape: (1009425,)
(706597, 3101)
(302828, 3101)
(706597,)
(302828,)


In [4]:
# Fit an L2-penalised logistic regression and report metrics on both splits.
print("Training logistic regression classifier...")

clf = linear_model.LogisticRegression(penalty='l2', verbose=True)
clf.fit(X_train, y_train)

# Mean of y over the full data set; a constant vector filled with this value
# is handed to print_metrics as the baseline prediction.
base_rate = np.sum(y) / len(y)

# Test is evaluated first, then Train, so y_pred / y_baseline end up holding
# the training-set values once the cell has finished.
for header, X_split, y_split in (("\nTest", X_test, y_test),
                                 ("\nTrain", X_train, y_train)):
    print(header)
    y_pred = clf.predict_proba(X_split)
    y_baseline = np.ones(y_split.shape) * base_rate
    # Column 1 of predict_proba is the probability of the second class.
    print_metrics(y_split, y_pred[:, 1], y_baseline)

Training logistic regression classifier...
[LibLinear]
Test
-----
ROC AUC of classifier: 0.678880147687
ROC AUC score of baseline: 0.5
ROC AUC score of never: 0.5
ROC AUC score of always: 0.5

Train
-----
ROC AUC of classifier: 0.712189324403
ROC AUC score of baseline: 0.5
ROC AUC score of never: 0.5
ROC AUC score of always: 0.5


In [10]:
# Names of the categorical variables, listed in the order they are paired
# with the encoder's feature offsets below.
# NOTE(review): assumes this order matches the column order used inside
# vectorize_data -- confirm against cancel_predict.
cat_vars = [
    "UniqueCarrier",
    "OriginAirportID",
    "OriginAirportSeqID",
    "OriginCityMarketID",
    "OriginState",
    "DestAirportID",
    "DestAirportSeqID",
    "DestCityMarketID",
    "DepTimeBlk",
    "ArrTimeBlk",
    "DistanceGroup",
    "DestState",
]

# Offsets reported by the encoder returned from vectorize_data.
feat_indices = Xenc.feature_indices_

# zip stops at the shorter sequence, so any extra trailing offset in
# feat_indices is simply not printed here.
for var_name, start_col in zip(cat_vars, feat_indices):
    print(var_name, start_col)

UniqueCarrier 0
OriginAirportID 26
OriginAirportSeqID 389
OriginCityMarketID 1139
OriginState 1476
DestAirportID 1529
DestAirportSeqID 1895
DestCityMarketID 2653
DepTimeBlk 2993
ArrTimeBlk 3012
DistanceGroup 3031
DestState 3042


In [88]:
# Names of the non-categorical variables; the feature-ranking cells index
# this list by (feature index - last entry of feat_indices).
con_vars = [
    "CRSElapsedTime",
    "Distance",
    "CRSDepTime",
    "CRSArrTime",
    "WeekDay",
    "YearDay",
]

In [37]:
# Dump every offset stored in feat_indices, one per line.
for block_start in feat_indices:
    print(block_start)

0
26
389
1139
1476
1529
1895
2653
2993
3012
3031
3042
3095


In [None]:
# Build the reverse lookup of transformer_dict: for every variable, map each
# stored code back to the key it was stored under.

# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# The context manager closes the file handle instead of leaking it.
with open("transformer_dict_01.pickle", 'rb') as transformer_file:
    transformer_dict = pickle.load(transformer_file)

# transformer_dict[variable][key] -> code, so the inverse is
# inv_transformer_dict[variable][code] -> key. If two keys share a code the
# one iterated last wins, exactly as with the explicit nested loops.
inv_transformer_dict = {
    var_name: {code: original_key for original_key, code in mapping.items()}
    for var_name, mapping in transformer_dict.items()
}

In [96]:
# Show the 100 coefficients with the largest magnitude. argsort is ascending,
# so the last 100 positions are the biggest |coef| values.
ranked_features = np.abs(clf.coef_).argsort()

# feat_indices holds the first column of each categorical block plus one
# trailing entry; columns at or beyond that trailing entry are looked up in
# con_vars.
# NOTE(review): assumes cat_vars is in the same order as the encoder's
# blocks -- confirm against vectorize_data.
block_starts = np.asarray(feat_indices)
first_cont_col = block_starts[-1]

for feat_i in ranked_features[0, -100:]:
    coef_val = clf.coef_[0, feat_i]

    if feat_i >= first_cont_col:
        # Continuous column: there is no category code to decode.
        var_val = con_vars[feat_i - first_cont_col]
        code_val = "N/A"
        feat_val = "cont"
    else:
        # side="right" puts a column that is exactly a block start into the
        # block it opens. The previous `feat_i > start` scan pushed it into
        # the preceding block (and column 0 to index -1).
        ii = int(np.searchsorted(block_starts, feat_i, side="right")) - 1
        var_val = cat_vars[ii]
        code_val = feat_i - block_starts[ii]
        feat_val = inv_transformer_dict[var_val][code_val]

    # The first "value" is the decoded category, the second is its code
    # within the variable's block.
    print("coef: {0: 4f},\tvariable: {1},\tvalue: {4}\tfeature index: {2},\tvalue: {3}".format(
        coef_val, var_val, feat_i, code_val, feat_val))

coef: -0.614481,	variable: DestCityMarketID,	value: 35841	feature index: 2756,	value: 103
coef: -0.614481,	variable: DestAirportID,	value: 15841	feature index: 1649,	value: 120
coef: -0.615194,	variable: OriginAirportSeqID,	value: 1289604	feature index: 853,	value: 464
coef: -0.615638,	variable: DestAirportSeqID,	value: 1013603	feature index: 2419,	value: 524
coef:  0.615739,	variable: DestAirportSeqID,	value: 1448702	feature index: 2443,	value: 548
coef:  0.618398,	variable: OriginAirportSeqID,	value: 1157701	feature index: 584,	value: 195
coef: -0.620278,	variable: DestAirportSeqID,	value: 1014601	feature index: 2173,	value: 278
coef: -0.620281,	variable: OriginAirportSeqID,	value: 1295101	feature index: 595,	value: 206
coef: -0.624136,	variable: OriginAirportSeqID,	value: 1129804	feature index: 1114,	value: 725
coef: -0.624408,	variable: DestAirportSeqID,	value: 1334204	feature index: 2310,	value: 415
coef:  0.633512,	variable: DestAirportSeqID,	value: 1039702	feature index: 1907,	v

In [107]:
# Show the 100 most negative coefficients. Sorting -coef ascending leaves the
# most negative coefficients in the last 100 positions.
ranked_features = (-clf.coef_).argsort()

# feat_indices holds the first column of each categorical block plus one
# trailing entry; columns at or beyond that trailing entry are looked up in
# con_vars.
# NOTE(review): assumes cat_vars is in the same order as the encoder's
# blocks -- confirm against vectorize_data.
block_starts = np.asarray(feat_indices)
first_cont_col = block_starts[-1]

for feat_i in ranked_features[0, -100:]:
    coef_val = clf.coef_[0, feat_i]

    if feat_i >= first_cont_col:
        # Continuous column: there is no category code to decode.
        var_val = con_vars[feat_i - first_cont_col]
        code_val = "N/A"
        feat_val = "cont"
    else:
        # side="right" puts a column that is exactly a block start into the
        # block it opens. The previous `feat_i > start` scan pushed it into
        # the preceding block (and column 0 to index -1).
        ii = int(np.searchsorted(block_starts, feat_i, side="right")) - 1
        var_val = cat_vars[ii]
        code_val = feat_i - block_starts[ii]
        feat_val = inv_transformer_dict[var_val][code_val]

    print("coef: {0: 4f},  variable: {1},\tvariable value: {4:}\tfeature index: {2},\tcode value: {3}".format(
        coef_val, var_val, feat_i, code_val, feat_val))

coef: -0.497023,  variable: DestAirportSeqID,	variable value: 1086803	feature index: 2361,	code value: 466
coef: -0.499745,  variable: OriginAirportSeqID,	variable value: 1482801	feature index: 539,	code value: 150
coef: -0.501617,  variable: DestCityMarketID,	variable value: 31041	feature index: 2890,	code value: 237
coef: -0.501617,  variable: DestAirportID,	variable value: 11041	feature index: 1790,	code value: 261
coef: -0.504071,  variable: OriginAirportSeqID,	variable value: 1055102	feature index: 977,	code value: 588
coef: -0.504096,  variable: OriginAirportSeqID,	variable value: 1078502	feature index: 942,	code value: 553
coef: -0.504237,  variable: DestAirportSeqID,	variable value: 1532302	feature index: 2460,	code value: 565
coef: -0.504318,  variable: DestAirportSeqID,	variable value: 1295101	feature index: 2109,	code value: 214
coef: -0.504732,  variable: DistanceGroup,	variable value: 1	feature index: 3034,	code value: 3
coef: -0.505373,  variable: OriginAirportSeqID,	vari

In [104]:
# Show the 100 most positive coefficients. argsort is ascending, so the last
# 100 positions hold the largest signed values.
ranked_features = clf.coef_.argsort()

# feat_indices holds the first column of each categorical block plus one
# trailing entry; columns at or beyond that trailing entry are looked up in
# con_vars.
# NOTE(review): assumes cat_vars is in the same order as the encoder's
# blocks -- confirm against vectorize_data.
block_starts = np.asarray(feat_indices)
first_cont_col = block_starts[-1]

for feat_i in ranked_features[0, -100:]:
    coef_val = clf.coef_[0, feat_i]

    if feat_i >= first_cont_col:
        # Continuous column: there is no category code to decode.
        var_val = con_vars[feat_i - first_cont_col]
        code_val = "N/A"
        feat_val = "cont"
    else:
        # side="right" puts a column that is exactly a block start into the
        # block it opens. The previous `feat_i > start` scan pushed it into
        # the preceding block (and column 0 to index -1).
        ii = int(np.searchsorted(block_starts, feat_i, side="right")) - 1
        var_val = cat_vars[ii]
        code_val = feat_i - block_starts[ii]
        feat_val = inv_transformer_dict[var_val][code_val]

    # The first "value" is the decoded category, the second is its code
    # within the variable's block.
    print("coef: {0: 4f},\tvariable: {1},\tvalue: {4}\tfeature index: {2},\tvalue: {3}".format(
        coef_val, var_val, feat_i, code_val, feat_val))

ValueError: Sign not allowed in string format specifier

In [None]:
# Fit a random forest and report metrics on both splits.
print("Training random forest classifier....")

# The original call read `max_depth=)`, a syntax error that kept the cell
# from running. None is scikit-learn's default (no depth limit).
# TODO: pick a finite depth if training on the full matrix is too slow.
RF_MAX_DEPTH = None
# Fixed seed so the fitted forest and its metrics are reproducible.
RF_SEED = 0

# NOTE: this rebinds `clf`, replacing the logistic regression model that the
# coefficient-ranking cells read via clf.coef_.
clf = ensemble.RandomForestClassifier(verbose=True, n_jobs=2,
                                      max_depth=RF_MAX_DEPTH,
                                      random_state=RF_SEED)
clf.fit(X_train, y_train)

# The baseline handed to print_metrics is a constant vector holding the mean
# of y over the full data set.
print("\nTest")
y_pred = clf.predict_proba(X_test)
y_baseline = np.ones(y_test.shape) * (np.sum(y) / len(y))
print_metrics(y_test, y_pred[:, 1], y_baseline)

print("\nTrain")
y_pred = clf.predict_proba(X_train)
y_baseline = np.ones(y_train.shape) * (np.sum(y) / len(y))
print_metrics(y_train, y_pred[:, 1], y_baseline)


print("program complete")