In [2]:
# Imports and magic here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import cPickle as pickle
from sklearn.metrics import confusion_matrix
from scipy.optimize import fmin

In [3]:
# Read input datasets
train_csv = './train.csv'
test_csv = './test.csv'

# Read CSVs into Pandas DFs
df_train = pd.read_csv(train_csv, index_col = 0)
df_test = pd.read_csv(test_csv, index_col = 0)

# Add a train/test marker variable to distinguish rows
df_train['Is_Train'] = 1
df_test['Is_Train'] = 0

# Create a combined dataset for feature engineering purposes
df_all_data = pd.concat([df_train, df_test], axis = 0, join = 'outer')

In [4]:
# Identify predictor variables and response variables, and create corresponding DFs
response = 'Response'
all_predictors = [col for col in df_all_data.columns if col not in response]

df_predictors = df_all_data[all_predictors] # Extract all predictors for downstream feature engineering
df_response = df_train[response] # Extract response variable, *only from training data*
df_response_with_dummies = pd.get_dummies(df_response, prefix='Response')

In [5]:
# Identify features (categoricals) for dummification

def get_dummy_features(df, verbose=False, dummy_threshold = 200):
    features_for_dummification = []
    num_rows = len(df)

    for each_feature in df.columns:
        num_uniques = len(df[each_feature].unique())
        num_nulls = df[each_feature].isnull().sum()
        example = df[each_feature].iloc[0]
        
        # Keep track of dummy features
        if isinstance(example, str) or (num_uniques*dummy_threshold<num_rows and isinstance(example, (int, long))): 
            features_for_dummification.append(each_feature)

        if verbose==True:
            print '{}: Uniques: {}/{}. Nulls: {}. Type: {}'.format(each_feature, num_uniques, 
                                                               num_rows, num_nulls, type_of_feature)
            
            
    return features_for_dummification

In [6]:
# Turn chosen features into dummy variables
def create_dummy_features(df_raw, features_for_dummification, verbose=True):
    df_expanded = df_raw

    for each_feature in features_for_dummification:
        if verbose==True:
            print "Expanding variable: {}".format(each_feature)
        df_temp = pd.get_dummies(df_raw[each_feature], prefix=each_feature)
        df_expanded = pd.concat([df_expanded, df_temp], axis = 1, join = 'inner')
        df_expanded.drop(each_feature,inplace=True, axis=1)
        
    return df_expanded

In [7]:
# Quadratic weighted kappa computation
def compute_weighted_kappa(y_actual, y_predicted):
    matrix_o = confusion_matrix(y_actual, y_predicted)
    matrix_o = matrix_o/float(np.sum(matrix_o)) # Normalize to sum 1
    
    matrix_w = np.zeros((len(matrix_o), len(matrix_o)))
                    
    for ii in range(0,len(matrix_o)):
        for jj in range(0,len(matrix_o)):
            matrix_w[ii,jj] = float((ii-jj)**2)/(len(matrix_o)-1)**2

    matrix_e = (np.outer(np.bincount(y_actual), np.bincount(y_predicted)))
    matrix_e = matrix_e/float(np.sum(matrix_e))

    w_upr, w_lwr = 0.0, 0.0
    
    for ii in range(0, len(matrix_o)):
        for jj in range(0, len(matrix_o)):
            w_upr += matrix_w[ii,jj] * matrix_o[ii,jj]
            w_lwr += matrix_w[ii,jj] * matrix_e[ii,jj]

    kappa = 1 - w_upr / w_lwr
    
    print "DEBUG** Kappa: {}".format(kappa)
    return kappa


# Objective function to be minimized to find optimal cutpoint
obj_fun = lambda splits, df_resp, pred_resp_raw: 1.0/compute_weighted_kappa(df_resp, 
                                                    np.digitize(pred_resp_raw, sorted(splits)))

# Custom scoring function for grid search
def custom_scoring_function(estimator, X, y):
    y_actual = y
    y_pred = estimator.predict(X)
    
    initial_guess = np.array([0, 1.5, 2.5, 3, 4.2, 5.8, 6.5, 7])
    split_min = fmin(obj_fun, initial_guess, args=(y_actual, y_pred), disp=True)

    return 1.0/obj_fun(split_min, y_actual, y_pred)

In [8]:
# Identify features for dummification
features_for_dummification = get_dummy_features(df_predictors)
features_already_dummy = ['Medical_Keyword_{}'.format(num) for num in range(1,49)]
features_already_dummy.append('Is_Train') # No need to dummify train/test marker variable
features_for_dummification = sorted(list(set(features_for_dummification) - set(features_already_dummy)))
print features_for_dummification

['Employment_Info_2', 'Employment_Info_3', 'Employment_Info_5', 'Family_Hist_1', 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 'InsuredInfo_6', 'InsuredInfo_7', 'Medical_History_11', 'Medical_History_12', 'Medical_History_13', 'Medical_History_14', 'Medical_History_16', 'Medical_History_17', 'Medical_History_18', 'Medical_History_19', 'Medical_History_20', 'Medical_History_21', 'Medical_History_22', 'Medical_History_23', 'Medical_History_25', 'Medical_History_26', 'Medical_History_27', 'Medical_History_28', 'Medical_History_29', 'Medical_History_3', 'Medical_History_30', 'Medical_History_31', 'Medical_History_33', 'Medical_History_34', 'Medical_History_35', 'Medical_History_36', 'Medical_History_37', 'Medical_History_38', 'Medical_History_39', 'Medical_History_4', 'Medical_History_40', 'Me

In [9]:
# Create dummy features
df_predictors_expanded = create_dummy_features(df_predictors, features_for_dummification)

Expanding variable: Employment_Info_2
Expanding variable: Employment_Info_3
Expanding variable: Employment_Info_5
Expanding variable: Family_Hist_1
Expanding variable: Insurance_History_1
Expanding variable: Insurance_History_2
Expanding variable: Insurance_History_3
Expanding variable: Insurance_History_4
Expanding variable: Insurance_History_7
Expanding variable: Insurance_History_8
Expanding variable: Insurance_History_9
Expanding variable: InsuredInfo_1
Expanding variable: InsuredInfo_2
Expanding variable: InsuredInfo_3
Expanding variable: InsuredInfo_4
Expanding variable: InsuredInfo_5
Expanding variable: InsuredInfo_6
Expanding variable: InsuredInfo_7
Expanding variable: Medical_History_11
Expanding variable: Medical_History_12
Expanding variable: Medical_History_13
Expanding variable: Medical_History_14
Expanding variable: Medical_History_16
Expanding variable: Medical_History_17
Expanding variable: Medical_History_18
Expanding variable: Medical_History_19
Expanding variable: Me

In [10]:
# Set up predictor and response variables
df_predictors_selected = df_predictors_expanded.copy(deep=True)

# Fill NAs with median estimates (not localized approach yet)
df_predictors_selected.fillna(df_predictors_selected.median(), inplace=True)

Unnamed: 0_level_0,BMI,Employment_Info_1,Employment_Info_4,Employment_Info_6,Family_Hist_2,Family_Hist_3,Family_Hist_4,Family_Hist_5,Ht,Ins_Age,...,Product_Info_3_36,Product_Info_3_37,Product_Info_3_38,Product_Info_5_2,Product_Info_5_3,Product_Info_6_1,Product_Info_6_3,Product_Info_7_1,Product_Info_7_2,Product_Info_7_3
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.323008,0.0280,0.0000,0.2500,0.463768,0.598039,0.436620,0.526786,0.581818,0.641791,...,0,0,0,1,0,1,0,1,0,0
5,0.272288,0.0000,0.0000,0.0018,0.188406,0.519608,0.084507,0.508929,0.600000,0.059701,...,0,0,0,1,0,0,1,1,0,0
6,0.428780,0.0300,0.0000,0.0300,0.304348,0.519608,0.225352,0.508929,0.745455,0.029851,...,0,0,0,1,0,0,1,1,0,0
7,0.352438,0.0420,0.0000,0.2000,0.420290,0.519608,0.352113,0.508929,0.672727,0.164179,...,0,0,0,1,0,0,1,1,0,0
8,0.424046,0.0270,0.0000,0.0500,0.463768,0.519608,0.408451,0.508929,0.654545,0.417910,...,0,0,0,1,0,0,1,1,0,0
10,0.364887,0.3250,0.0000,1.0000,0.463768,0.294118,0.507042,0.508929,0.836364,0.507463,...,0,0,0,0,1,1,0,1,0,0
11,0.376587,0.1100,0.0000,0.8000,0.594203,0.519608,0.549296,0.508929,0.581818,0.373134,...,0,0,0,1,0,0,1,1,0,0
14,0.571612,0.1200,0.0000,1.0000,0.463768,0.490196,0.436620,0.633929,0.781818,0.611940,...,0,0,0,1,0,0,1,1,0,0
15,0.362643,0.1650,0.0000,1.0000,0.463768,0.529412,0.676056,0.508929,0.618182,0.522388,...,0,0,0,1,0,0,1,1,0,0
16,0.587796,0.0250,0.0000,0.0500,0.797101,0.519608,0.436620,0.553571,0.600000,0.552239,...,0,0,0,1,0,0,1,1,0,0


In [11]:
# Compute additional features from K-Means
# Note: This takes a really long time to run, so I've saved the clustering output data into pickled files.

from sklearn.cluster import KMeans

READ_CLUSTER_DATA_FROM_FILE = True # Set this to False to rerun the clustering code; will take 1.5 hrs

if READ_CLUSTER_DATA_FROM_FILE==False:
    # Perform the clustering
    kmeans_estimator = KMeans(n_jobs=-1, n_clusters=200, init='k-means++', 
                              precompute_distances='auto', random_state=0)
    kmeans_distance_to_cluster_center = kmeans_estimator.fit_transform(df_predictors_selected)
    kmeans_cluster_membership = kmeans_estimator.fit_predict(df_predictors_selected)
    
    # Write clustering data to files
    dist_to_clust_file = open('kmeans_distance_to_cluster_center','wb')
    clust_membership_file = open('kmeans_cluster_membership','wb')

    pickle.dump(kmeans_distance_to_cluster_center, dist_to_clust_file)
    pickle.dump(kmeans_cluster_membership, clust_membership_file)

else:
    dist_to_clust_file = open('kmeans_distance_to_cluster_center','rb')
    kmeans_distance_to_cluster_center = pickle.load(dist_to_clust_file)
    clust_membership_file = open('kmeans_cluster_membership','rb')
    kmeans_cluster_membership = pickle.load(clust_membership_file)

dist_to_clust_file.close()
clust_membership_file.close()


IOError: [Errno 2] No such file or directory: 'kmeans_distance_to_cluster_center'

In [11]:
# Create new features based on distances to K-means cluster centers
kmeans_features = pd.DataFrame(kmeans_distance_to_cluster_center)
kmeans_features.columns = ['Kmeans_dist_{}'.format(col) for col in kmeans_features.columns]
kmeans_features.reset_index(inplace=True, drop=True)
df_predictors_selected.reset_index(inplace=True,drop=True)

In [12]:
# Create new features based on count of NAs and count of Medical Terms
count_na = df_predictors_expanded.isnull().sum(axis=1)
count_medical_terms = df_predictors_expanded[[col for col in df_predictors_expanded.columns 
                                              if 'Medical_Keyword_' in col]].sum(axis=1)
count_na.reset_index(inplace=True, drop=True)
count_medical_terms.reset_index(inplace=True, drop=True)
count_na.columns = ['Count_NA']
count_medical_terms.columns = ['Count_Medical_Terms']

In [13]:
# Merge all features into a combined dataset
df_predictors_combined = pd.concat([df_predictors_selected, kmeans_features, count_na, count_medical_terms], 
                                   axis=1, join='outer')

# Split DF into training and testing
df_predictors_training = df_predictors_combined.query("Is_Train==1").drop('Is_Train',axis=1)
df_predictors_test = df_predictors_combined.query("Is_Train==0").drop('Is_Train',axis=1)


In [14]:
# Write predictors_training and response to CSV
df_predictors_training.to_csv('predictors_training.csv')
df_response.to_csv('response.csv', index=False, header='Response')

In [15]:
## ENSEMBLE TRAINING STEP

# Set up ensemble model libraries
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import cPickle as pickle
import pandas as pd

# Read input files
df_predictors_training = pd.read_csv('predictors_training.csv', index_col = 0)
df_response = pd.read_csv('response.csv', index_col=False, squeeze = True)

# Set up low-level XGBoost routines
clf_xgb = xgb.XGBRegressor(max_depth=5, nthread=6, colsample_bylevel=0.4, colsample_bytree=0.7,
                           gamma=0, silent=0, subsample=0.9, n_estimators=300, min_child_weight=50)
# bst = clf_xgb.fit(df_predictors_training, df_response)

RUN_ENSEMBLE_SEARCH = True

# Perform randomized grid search
if RUN_ENSEMBLE_SEARCH == True:
    pgrid = {'colsample_bylevel': [0.2,0.4,0.6,0.8]
            }
    
    grid = GridSearchCV(clf_xgb, param_grid=pgrid, scoring=custom_scoring_function, cv=2)
    grid.fit(df_predictors_training, df_response) # This will take a long time!

    # Store grid in pickle format
    store_grid = open('grid.pkl','wb')

    pickle.dump(grid, store_grid)
    store_grid.close()

DEBUG** Kappa: 0.586270885884
DEBUG** Kappa: 0.586270885884
DEBUG** Kappa: 0.586698949747
DEBUG** Kappa: 0.587970527642
DEBUG** Kappa: 0.589926580288
DEBUG** Kappa: 0.590825214297
DEBUG** Kappa: 0.580965764086
DEBUG** Kappa: 0.573387746112
DEBUG** Kappa: 0.569333610799
DEBUG** Kappa: 0.590964632938
DEBUG** Kappa: 0.596128158431
DEBUG** Kappa: 0.594977296274
DEBUG** Kappa: 0.595673518847
DEBUG** Kappa: 0.596509568999
DEBUG** Kappa: 0.599899107404
DEBUG** Kappa: 0.599403745435
DEBUG** Kappa: 0.599831914048
DEBUG** Kappa: 0.597765870244
DEBUG** Kappa: 0.597175815032
DEBUG** Kappa: 0.597117774157
DEBUG** Kappa: 0.604777363801
DEBUG** Kappa: 0.600714102836
DEBUG** Kappa: 0.600216254818
DEBUG** Kappa: 0.602802932521
DEBUG** Kappa: 0.6007036921
DEBUG** Kappa: 0.6018896573
DEBUG** Kappa: 0.60278260707
DEBUG** Kappa: 0.606054420469
DEBUG** Kappa: 0.607746857473
DEBUG** Kappa: 0.607018124432
DEBUG** Kappa: 0.608214176418
DEBUG** Kappa: 0.610405036104
DEBUG** Kappa: 0.608853036461
DEBUG** Kappa: 

In [17]:
grid.best_estimator_

XGBRegressor(base_score=0.5, colsample_bylevel=0.4, colsample_bytree=0.7,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=50, missing=None, n_estimators=300, nthread=6,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=0, subsample=0.9)

In [18]:
# Optimize splits

pred_response_raw = grid.best_estimator_.predict(df_predictors_training)
initial_guess = np.array([0, 1.5, 2.5, 3, 4.2, 5.8, 6.5, 7])

# Objective function to be minimized
obj_fun = lambda splits, df_resp, pred_resp_raw: 1.0/compute_weighted_kappa(df_resp, 
                                                    np.digitize(pred_resp_raw, sorted(splits)))

split_min = fmin(obj_fun, initial_guess, args=(df_response, pred_response_raw), disp=True)

print split_min, 1.0/obj_fun(split_min, df_response, pred_response_raw)


DEBUG** Kappa: 0.651355833607
DEBUG** Kappa: 0.651355833607
DEBUG** Kappa: 0.651830208884
DEBUG** Kappa: 0.653707697309
DEBUG** Kappa: 0.654939952113
DEBUG** Kappa: 0.655700796349
DEBUG** Kappa: 0.645723247863
DEBUG** Kappa: 0.639926454395
DEBUG** Kappa: 0.637815247157
DEBUG** Kappa: 0.654025867052
DEBUG** Kappa: 0.659016814538
DEBUG** Kappa: 0.659620947842
DEBUG** Kappa: 0.662763035912
DEBUG** Kappa: 0.664360574911
DEBUG** Kappa: 0.664937877093
DEBUG** Kappa: 0.668838313678
DEBUG** Kappa: 0.667968513373
DEBUG** Kappa: 0.669530374128
DEBUG** Kappa: 0.672526182565
DEBUG** Kappa: 0.670022364575
DEBUG** Kappa: 0.668032291566
DEBUG** Kappa: 0.668605504817
DEBUG** Kappa: 0.665639513579
DEBUG** Kappa: 0.670845172945
DEBUG** Kappa: 0.670980773446
DEBUG** Kappa: 0.666938462706
DEBUG** Kappa: 0.671420686197
DEBUG** Kappa: 0.671398692229
DEBUG** Kappa: 0.668913718047
DEBUG** Kappa: 0.673986670016
DEBUG** Kappa: 0.670602487412
DEBUG** Kappa: 0.671305907228
DEBUG** Kappa: 0.672687413117
DEBUG** Ka

In [19]:
# Run model
# df_oos_test_predictions = pd.DataFrame(clf.predict(df_predictors_test), columns = ['Response'], 
#                                        index=df_test.index)
df_oos_test_predictions = pd.DataFrame(np.digitize(grid.best_estimator_.predict(df_predictors_test), sorted(split_min)), 
                                       columns=['Response'], index=df_test.index)
df_oos_test_predictions.replace(to_replace=0, value=1, inplace=True)

In [20]:
df_oos_test_predictions.Response.unique()

array([4, 7, 6, 5, 8, 3, 2, 1])

In [21]:
# Create submission file

# Timestamp filename    
from time import strftime
str_timestamp = strftime('%Y%m%d%H%M%S')
df_oos_test_predictions.to_csv('./submission_{}.csv'.format(str_timestamp))