In [None]:
### SET UP THE ENVIRONMENT ###

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import sklearn.model_selection

%matplotlib inline

In [None]:
### LOAD DATA ###

# Folder that holds the per-country CSV files.
DATA_DIR = os.path.join('data')

# For every country code, 'data' is the file used for training and 'pred'
# is the file predictions are later made for.
data_paths = {
    code: {'data': os.path.join(DATA_DIR, '{}_hhold_train.csv'.format(code)),
           'pred': os.path.join(DATA_DIR, '{}_hhold_test.csv'.format(code))}
    for code in ('A', 'B', 'C')
}

# Training frames, one per country, indexed by the 'id' column.
a_data, b_data, c_data = (
    pd.read_csv(data_paths[code]['data'], index_col='id')
    for code in ('A', 'B', 'C')
)

In [None]:
### EXPLORE CLASS DISTRIBUTION ###

# One country is plotted at a time: swap c_data for a_data / b_data (and
# adjust the title) to look at the other countries.
poor_counts_c = c_data['poor'].value_counts()
poor_counts_c.plot(kind='bar', title='Number of Poor for country C')

In [None]:
### PRE-PROCESSING FUNCTIONS ###

# Standardize features
def standardize(df, numeric_only=True):
    """Return a copy of ``df`` with its numeric columns converted to z-scores.

    Each ``int64``/``float64`` column has its own mean subtracted and is then
    divided by its own standard deviation (pandas' default ``std``). Columns
    of any other dtype are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardize. It is not modified.
    numeric_only : bool, default True
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and column order as ``df``.
    """
    # Work on a copy so the caller's frame is never rewritten behind its back.
    df = df.copy()
    numeric = df.select_dtypes(include=['int64', 'float64'])

    # Nothing to scale: return the copy as-is.
    if len(numeric.columns) == 0:
        return df

    # A constant column has a standard deviation of 0; dividing by 1 instead
    # maps it to zeros rather than to NaN (0 / 0).
    spread = numeric.std().replace(0, 1)

    # subtract mean and divide by std
    df[numeric.columns] = (numeric - numeric.mean()) / spread

    return df

def pre_process_data(df, enforce_cols=None):
    """Turn a raw frame into a model-ready feature matrix.

    Steps, in order: standardize the numeric columns, one-hot encode the
    categorical columns with ``pd.get_dummies``, optionally align the columns
    to ``enforce_cols``, and replace remaining missing values with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame. The caller's frame is left untouched.
    enforce_cols : sequence of column labels, optional
        Columns the result must have, in exactly this order (typically the
        columns of the training matrix). Columns of the encoded frame that
        are not listed are dropped; listed columns that are absent are added
        and filled with 0. When ``None`` the encoded columns are kept as-is.

    Returns
    -------
    pandas.DataFrame
        The processed feature matrix, without missing values.
    """
    # Pass a copy so the caller's frame is never modified by preprocessing.
    df = standardize(df.copy())

    # create dummy variables for categoricals
    df = pd.get_dummies(df)

    # match test set and training set columns
    if enforce_cols is not None:
        # reindex drops unseen columns, adds the missing ones as 0 and, unlike
        # a drop followed by assign (which appends new columns at the end),
        # also returns the columns in the same ORDER as enforce_cols. The
        # model matches features by position/name order, so order matters.
        df = df.reindex(columns=enforce_cols, fill_value=0)

    return df.fillna(0)

In [None]:
### PRE-PROCESS DATA FOR TRAINING ###

# Features are every column except the 'poor' label; labels are flattened
# to 1-D arrays.
X_data_A = pre_process_data(a_data.drop('poor', axis=1))
y_data_A = np.ravel(a_data.poor)

X_data_B = pre_process_data(b_data.drop('poor', axis=1))
y_data_B = np.ravel(b_data.poor)

X_data_C = pre_process_data(c_data.drop('poor', axis=1))
y_data_C = np.ravel(c_data.poor)

# Hold out 25% of each country for evaluation, stratified on the label and
# with a fixed seed so the split is reproducible.
# train_test_split is taken from sklearn.model_selection, which the setup
# cell imports; the sklearn.cross_validation module is not imported there
# and has been removed from scikit-learn (0.20+).
X_train_A, X_test_A, y_train_A, y_test_A = sk.model_selection.train_test_split(
    X_data_A, y_data_A, test_size=0.25, random_state=0, stratify=y_data_A)

X_train_B, X_test_B, y_train_B, y_test_B = sk.model_selection.train_test_split(
    X_data_B, y_data_B, test_size=0.25, random_state=0, stratify=y_data_B)

X_train_C, X_test_C, y_train_C, y_test_C = sk.model_selection.train_test_split(
    X_data_C, y_data_C, test_size=0.25, random_state=0, stratify=y_data_C)

In [None]:
### CREATE AND TRAIN MODELS ###

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# sklearn.grid_search was removed in scikit-learn 0.20; its search utilities
# (GridSearchCV, ParameterGrid, ...) now live in sklearn.model_selection.
# Fall back to that module so this cell still runs on current releases while
# the name `grid_search` stays available.
try:
    from sklearn import grid_search
except ImportError:
    from sklearn import model_selection as grid_search

def train_model(features, labels, **kwargs):
    """Fit a random forest classifier and return it.

    Parameters
    ----------
    features : array-like or pandas.DataFrame
        Training features, passed straight to ``fit``.
    labels : array-like
        Training targets, passed straight to ``fit``.
    **kwargs
        Extra keyword arguments for ``RandomForestClassifier``. They override
        the defaults used here (``n_estimators=50``, ``random_state=0``).
        With no extra arguments the model is configured exactly as before.

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    # Start from the original fixed configuration; caller-supplied keyword
    # arguments take precedence instead of being silently dropped.
    params = {'n_estimators': 50, 'random_state': 0}
    params.update(kwargs)

    # instantiate model
    model = RandomForestClassifier(**params)

    # train model
    model.fit(features, labels)

    return model

# Fit one independent model per country on that country's training split.
model_a, model_b, model_c = (
    train_model(X_part, y_part)
    for X_part, y_part in ((X_train_A, y_train_A),
                           (X_train_B, y_train_B),
                           (X_train_C, y_train_C))
)

In [None]:
### ASSESS MODEL PERFORMANCE ###

# Class-probability estimates for each country's held-out split.
test_pred_A, test_pred_B, test_pred_C = (
    fitted.predict_proba(held_out)
    for fitted, held_out in ((model_a, X_test_A),
                             (model_b, X_test_B),
                             (model_c, X_test_C))
)

# Pool the held-out labels and the second probability column (index 1)
# across the three countries for the overall metrics.
y_test_all = np.concatenate([y_test_A, y_test_B, y_test_C])
test_pred_all = np.concatenate(
    [probs[:, 1] for probs in (test_pred_A, test_pred_B, test_pred_C)])

# Per-country log loss uses the full probability matrix.
perf_a = sk.metrics.log_loss(y_test_A, test_pred_A)
perf_b = sk.metrics.log_loss(y_test_B, test_pred_B)
perf_c = sk.metrics.log_loss(y_test_C, test_pred_C)

# Overall metrics use the pooled labels and pooled column-1 probabilities.
perf_all = sk.metrics.log_loss(y_test_all, test_pred_all)
roc_auc_all = sk.metrics.roc_auc_score(y_test_all, test_pred_all)

print("model A logloss: ", perf_a)
print("model B logloss: ", perf_b)
print("model C logloss: ", perf_c)
print("overall logloss: ", perf_all)

print("overall AUC-ROC: ", roc_auc_all)

In [None]:
### MAKE PREDICTIONS ###

# Read the three prediction files, indexed by the 'id' column.
a_test, b_test, c_test = (
    pd.read_csv(data_paths[code]['pred'], index_col='id')
    for code in ('A', 'B', 'C')
)

# Run the same preprocessing as for training, aligning each frame's columns
# to the corresponding training matrix.
a_test, b_test, c_test = (
    pre_process_data(raw, enforce_cols=train_matrix.columns)
    for raw, train_matrix in ((a_test, X_data_A),
                              (b_test, X_data_B),
                              (c_test, X_data_C))
)

# Class probabilities for every row of each prediction set.
a_preds = model_a.predict_proba(a_test)
b_preds = model_b.predict_proba(b_test)
c_preds = model_c.predict_proba(c_test)

In [None]:
### CREATE A SUBMISSION ###

def make_country_sub(preds, test_feat, country):
    """Build one country's slice of the submission table.

    Parameters
    ----------
    preds : 2-D array
        Probability output with one row per row of ``test_feat``; column 1
        is written to the ``poor`` column.
    test_feat : pandas.DataFrame
        Only its index is used; it becomes the index of the result.
    country : str
        Value written to every row of the ``country`` column. It is stored
        as given; no validation is performed.

    Returns
    -------
    pandas.DataFrame
        Columns ``country`` and ``poor`` (in that order), indexed like
        ``test_feat``.
    """
    # get just the poor probabilities
    country_sub = pd.DataFrame(data=preds[:, 1],  # proba p=1
                               columns=['poor'],
                               index=test_feat.index)

    # add the country code for joining later
    country_sub["country"] = country
    return country_sub[["country", "poor"]]

# convert preds to data frames
a_sub = make_country_sub(a_preds, a_test, 'A')
b_sub = make_country_sub(b_preds, b_test, 'B')
c_sub = make_country_sub(c_preds, c_test, 'C')

# Stack the per-country tables and write the submission file.
submission = pd.concat([a_sub, b_sub, c_sub])
submission.to_csv('submission-take-one.csv')

# Preview the result. This must be the cell's last expression: a bare
# expression in the middle of a cell is evaluated but never displayed.
submission.head()