In [2]:
import pandas as pd
import numpy as np

In [74]:
filename = 'on_time/On_Time_On_Time_Performance_2015_9.csv'

usecols = ["FlightDate",
           "UniqueCarrier",
           "OriginAirportID",
           "OriginAirportSeqID",
           "OriginCityMarketID",
           "OriginState",
           "DestAirportID",
           "DestAirportSeqID",
           "DestCityMarketID",
           "DestState",
           "CRSDepTime",
           "DepTimeBlk",
           "CRSArrTime",
           "ArrTimeBlk",
           "CRSElapsedTime",
           "Distance",
           "DistanceGroup",
           "Cancelled",
           "CancellationCode"]

df = pd.read_csv(filename,
                usecols=usecols)

df["CancellationCode"].fillna("NA", inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
for i, value in enumerate(df["CancellationCode"].unique()):
    print(i, value)

0 NA
1 A
2 C
3 B


In [9]:
def transform_categorical_feature(df, column_name):
    unique_values = df[column_name].unique()
#     unique_values = set( df[column_name].tolist() ) # this handle NaN better than df[column_name].unique()
    transformer_dict = {}
    for ii, value in enumerate( unique_values ):
        transformer_dict[value] = ii

    def label_map(y):
        return transformer_dict[y]
    
    df[column_name] = df[column_name].apply( label_map )
    return df


categorical_variables = ["UniqueCarrier",
                         "OriginAirportID",
                         "OriginAirportSeqID",
                         "OriginCityMarketID",
                         "OriginState",
                         "DestAirportID",
                         "DestAirportSeqID",
                         "DestCityMarketID",
                         "DepTimeBlk",
                         "ArrTimeBlk",
                         "DistanceGroup",
                         "DestState",   
                         "Cancelled",
                         "CancellationCode"]

for var in categorical_variables:
    df_transformed = transform_categorical_feature(df, var)
    
df_transformed.head()

Unnamed: 0,FlightDate,UniqueCarrier,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,OriginState,DestAirportID,DestAirportSeqID,DestCityMarketID,DestState,CRSDepTime,DepTimeBlk,CRSArrTime,ArrTimeBlk,Cancelled,CancellationCode,CRSElapsedTime,Distance,DistanceGroup
0,2015-09-01,0,0,0,0,0,0,0,0,0,900,0,1213,0,0,0,373,2475,0
1,2015-09-02,0,0,0,0,0,0,0,0,0,900,0,1213,0,0,0,373,2475,0
2,2015-09-03,0,0,0,0,0,0,0,0,0,900,0,1213,0,0,0,373,2475,0
3,2015-09-04,0,0,0,0,0,0,0,0,0,900,0,1213,0,0,0,373,2475,0
4,2015-09-05,0,0,0,0,0,0,0,0,0,900,0,1213,0,0,0,373,2475,0


In [6]:
num_of_dim = 0
for var in categorical_variables:
    dim = df_transformed[var].max() + 1
    num_of_dim += dim
    print(var + ": \t" + str(dim))

print("total number of categorical dimensions: " + str(num_of_dim))

UniqueCarrier: 	13
OriginAirportID: 	308
OriginAirportSeqID: 	308
OriginCityMarketID: 	287
OriginState: 	52
DestAirportID: 	309
DestAirportSeqID: 	309
DestCityMarketID: 	288
DepTimeBlk: 	19
ArrTimeBlk: 	19
DistanceGroup: 	11
DestState: 	52
Cancelled: 	2
CancellationCode: 	4
total number of categorical dimensions: 1981


In [73]:
df["CancellationCode"].head()

0    0
1    0
2    0
3    0
4    0
Name: CancellationCode, dtype: int64

In [82]:
feature_cols = ["UniqueCarrier",
                "OriginAirportID",
                "OriginAirportSeqID",
                "OriginCityMarketID",
                "OriginState",
                "DestAirportID",
                "DestAirportSeqID",
                "DestCityMarketID",
                "DepTimeBlk","ArrTimeBlk",
                "DistanceGroup",
                "DestState"]
# target_col = "Cancelled"
target_col = "CancellationCode"

feature_array = df_transformed[feature_cols].as_matrix()
target_array = (df_transformed[target_col] == 1).as_matrix()

print(feature_array.shape)
print(target_array.shape)

(464946, 12)
(464946,)


In [83]:
from sklearn.preprocessing import OneHotEncoder

X_enc = OneHotEncoder()

X = X_enc.fit_transform(feature_array)

y = target_array

In [84]:
from sklearn import linear_model
from sklearn import cross_validation

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y)

clf = linear_model.LogisticRegression(penalty='l2')
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [85]:
from sklearn import metrics

y_proba = clf.predict_proba(X_test)

y_pred = np.divide(y_proba[:, 1], y_proba[:, 0] + y_proba[:, 1])

error = metrics.log_loss(y_test, y_proba)
print(error)

0.0155806704411


In [86]:
y_no = np.zeros(y_test.shape)
y_yes = np.ones(y_test.shape)

n_cancelled = len(df[df["Cancelled"] == 1])
n_total = len(df["Cancelled"])
p_cancelled = n_cancelled/n_total
y_baseline = np.multiply(p, np.ones(y_test.shape))

error_no = metrics.log_loss(y_test, y_no)
print(error_no)

error_yes = metrics.log_loss(y_test, y_yes)
print(error_yes)

error_baseline = metrics.log_loss(y_test, y_baseline)
print(error_baseline) 

0.081713770216
34.4578603304
0.0172662082178


In [None]:
def transform_date_feature(seq):
    return pd.to_datetime(seq, format="%Y-%m-%d")

In [None]:
def make_feature_array(df):
    pass

In [None]:
def make_target_vector(df):
    pass

Some features that we want:
1. the time of year, to capture seasonal risk
2. the absolute date, capture long-term trends

Need to calculate a baseline... Just predict the average for all of them and calculate error.