In [1]:
import xgboost as xgb
import pandas as pd
pd.options.mode.chained_assignment = None

import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
train_df = pd.read_csv('./Data/Titanic/train.csv', header=0)
# test_df = pd.read_csv('./Data/Titanic/test.csv', header=0)

In [3]:
# We'll impute missing values using the median for numeric columns and the most
# common value for string columns.

# from sklearn.base import TransformerMixin
# class DataFrameImputer(TransformerMixin):
#     def fit(self, X, y=None):
#         self.fill = pd.Series([X[c].value_counts().index[0]
#             if X[c].dtype == np.dtype('O') else X[c].median() for c in X],
#             index=X.columns)
#         return self
    
#     def transform(self, X, y=None):
#         return X.fillna(self.fill)

In [4]:
feature_columns_to_use = ['Pclass', 'Sex', 'Age', 'Fare', 'Parch', 'SibSp']
nonnumeric_columns = ['Sex']

train_portion = 0.8

In [5]:
## randomizing the train and test data for the model
train_df = train_df.reindex(np.random.permutation(train_df.index))
# test_df = test_df.reindex(np.random.permutation(test_df.index))

data_size = train_df.shape[0]
cutoff_idx = int(train_portion * data_size)

final_train_df = train_df.iloc[:cutoff_idx, :]
final_test_df = train_df.iloc[cutoff_idx:, :]

In [6]:
medians = final_train_df.loc[:, feature_columns_to_use].median().to_dict()
max_counts = {k: final_train_df.loc[:, k].value_counts().index[0] for k in nonnumeric_columns}

In [7]:
for col in feature_columns_to_use:
    null_vals = final_train_df.loc[:, col].isnull().values
    if col in nonnumeric_columns:
        final_train_df.loc[null_vals, col] = max_counts[col]
    else:
        final_train_df.loc[null_vals, col] = medians[col]
        
for col in feature_columns_to_use:
    null_vals = final_test_df.loc[:, col].isnull().values
    if col in nonnumeric_columns:
        final_test_df.loc[null_vals, col] = max_counts[col]
    else:
        final_test_df.loc[null_vals, col] = medians[col]

In [8]:
le = LabelEncoder()
for feature in nonnumeric_columns:
    le.fit(train_df[feature])
    final_train_df.loc[:, feature] = le.transform(final_train_df[feature])
    final_test_df.loc[:, feature] = le.transform(final_test_df[feature])

In [9]:
def get_class_weights(train_series):
    tr_counts_df = train_series.value_counts()
    return (1. / (tr_counts_df / tr_counts_df.sum())).to_dict()

In [10]:
# Prepare the inputs for the model
# train_X = big_X_imputed[0:train_df.shape[0]].as_matrix()
# test_X = big_X_imputed[train_df.shape[0]:].as_matrix()
# train_y = train_df['Survived']
train_X = final_train_df.loc[:, feature_columns_to_use].values
train_y = final_train_df.loc[:, 'Survived'].values.flatten()

test_X = final_test_df.loc[:, feature_columns_to_use].values
test_y_ref = final_test_df.loc[:, 'Survived'].values.flatten()

tr_wght_func, test_wght_func = get_class_weights(final_train_df['Survived']), get_class_weights(final_test_df['Survived'])
tr_weights = final_train_df['Survived'].map(tr_wght_func)
test_weights = final_test_df['Survived'].map(test_wght_func)

### XGBoost
---

In [11]:
def get_xgb_imp(xgb, feat_names):
    imp_vals = xgb.booster().get_fscore()
    imp_dict = {feat_names[i]:float(imp_vals.get('f'+str(i),0.)) for i in range(len(feat_names))}
    total = np.array(list(imp_dict.values())).sum()
    return {k:v/total for k,v in imp_dict.items()}

In [54]:
gbm = xgb.XGBClassifier(max_depth=4, n_estimators=100, learning_rate=0.05).fit(train_X, train_y)
pred_unweighted = gbm.predict(test_X)

_imp_unweighted_ = get_xgb_imp(gbm, feature_columns_to_use)
imp_unwghtd = [_imp_unweighted_[f] for f in feature_columns_to_use]

print("Test score is: ", gbm.score(test_X, test_y_ref))
print("Train score is: ", gbm.score(train_X, train_y))

Test score is:  0.821229050279
Train score is:  0.879213483146


#### Weighting Classes

In [56]:
gbm_w = xgb.XGBClassifier(max_depth=4, n_estimators=100, learning_rate=0.01)
gbm_w = gbm_w.fit(train_X, train_y, sample_weight=tr_weights)
predictions = gbm_w.predict(test_X)

_imp_weighted_ = get_xgb_imp(gbm_w, feature_columns_to_use)
imp_wghtd = [_imp_weighted_[f] for f in feature_columns_to_use]

print("Test score is: ", gbm_w.score(test_X, test_y_ref, sample_weight=test_weights))
print("Train score is: ", gbm_w.score(train_X, train_y, sample_weight=tr_weights))

Test score is:  0.801735559089
Train score is:  0.837091290871


In [57]:
xgb_imp = list(zip(*zip(imp_unwghtd, imp_wghtd)))

from bqplot import pyplot as plt

plt.figure(title='XGBoost Importances')
plt.bar(feature_columns_to_use, xgb_imp, type='grouped', labels=['Unweighted', 'Weighted'],
        display_legend=True)
plt.show()

### Gradient Boosted Trees
---

In [58]:
from sklearn.ensemble import GradientBoostingClassifier

In [60]:
grad_booster = GradientBoostingClassifier(max_depth=3, n_estimators=100, learning_rate=0.01)
grad_booster.fit(train_X, train_y)

gbt_prediction = grad_booster.predict(test_X)

print("Test score is: ", grad_booster.score(test_X, test_y_ref))
print("Train score is: ", grad_booster.score(train_X, train_y))

gbt_imp_unwghtd = grad_booster.feature_importances_

Test score is:  0.804469273743
Train score is:  0.84691011236


In [61]:
unwghtd_imps = list(zip(*(zip(imp_unwghtd, gbt_imp_unwghtd))))

plt.figure(title='Unweighted Importances xgboost vs gbt')
plt.bar(feature_columns_to_use, unwghtd_imps, type='grouped', labels=['xgboost', 'gbt'], display_legend=True)
plt.show()

#### Weighting Classes

In [62]:
grad_booster_wgt = GradientBoostingClassifier(max_depth=3, n_estimators=500, learning_rate=0.01)

counts_df = final_train_df['Survived'].value_counts()
counts_dict = (1. / (counts_df / counts_df.sum())).to_dict()
train_weights = final_train_df['Survived'].map(counts_dict)

grad_booster_wgt.fit(train_X, train_y, sample_weight=tr_weights)
gbt_prediction = grad_booster_wgt.predict(test_X)

print("Test score is: ", grad_booster_wgt.score(test_X, test_y_ref, sample_weight=test_weights))
print("Train score is: ", grad_booster_wgt.score(train_X, train_y, sample_weight=tr_weights))

gbt_imp_wghtd = grad_booster_wgt.feature_importances_

Test score is:  0.797231054584
Train score is:  0.857614238576


In [63]:
wghtd_imps = list(zip(*(zip(imp_wghtd, gbt_imp_wghtd))))

plt.figure(title='Class Weighted Importances xgboost vs gbt')
plt.bar(feature_columns_to_use, wghtd_imps, type='grouped', labels=['xgboost', 'gbt'], display_legend=True)
plt.show()

#### Grid Search with XGBoost -- Unweighted

In [64]:
from sklearn.grid_search import GridSearchCV

In [66]:
xgb_classifier = xgb.XGBClassifier(n_estimators=100)

xgb_params_dict = {'learning_rate': [0.001, 0.01, 0.1],
                   'max_depth': [3, 4, 5],
                   'subsample': [0.75, 0.9, 1.0]}

clf = GridSearchCV(xgb_classifier, xgb_params_dict, n_jobs=-1, cv=5)
## assigning to supress output
_ = clf.fit(train_X, train_y)

xgb_classifier.set_params(**clf.best_params_)
xgb_classifier.fit(train_X, train_y)

print('Train score: ', xgb_classifier.score(train_X, train_y))
print('Test score: ', xgb_classifier.score(test_X, test_y_ref))

Train score:  0.877808988764
Test score:  0.826815642458


#### Grid Search with XGBoost -- Weighted

In [67]:
xgb_cl_wghtd = xgb.XGBClassifier(n_estimators=100)

xgb_params_dict = {'learning_rate': [0.001, 0.01, 0.1],
                   'max_depth': [3, 4, 5],
                   'subsample': [0.75, 0.9, 1.0]}

clf = GridSearchCV(xgb_cl_wghtd, xgb_params_dict, n_jobs=-1, cv=5, fit_params={'sample_weight': tr_weights})
## assigning to supress output
_ = clf.fit(train_X, train_y)

xgb_cl_wghtd.set_params(**clf.best_params_)
xgb_cl_wghtd.fit(train_X, train_y, sample_weight=tr_weights)

print('Train score: ', xgb_cl_wghtd.score(train_X, train_y, sample_weight=tr_weights))
print('Test score: ', xgb_cl_wghtd.score(test_X, test_y_ref, sample_weight=test_weights))

Train score:  0.856705996067
Test score:  0.785373608903


### Random Forest
---

In [68]:
from sklearn.ensemble import RandomForestClassifier

In [69]:
rf_clas = RandomForestClassifier(max_depth=3, n_estimators=100, max_features='auto')
rf_clas.fit(train_X, train_y)

rf_pred = rf_clas.predict(test_X)

print("Test score is: ", rf_clas.score(test_X, test_y_ref))
print("Train score is: ", rf_clas.score(train_X, train_y))

rf_imp_unwghtd = rf_clas.feature_importances_

Test score is:  0.804469273743
Train score is:  0.845505617978


In [70]:
unwghtd_imps = list(zip(*(zip(imp_unwghtd, rf_imp_unwghtd))))

plt.figure(title='Unweighted Importances xgboost vs Random Forest')
plt.bar(feature_columns_to_use, unwghtd_imps, type='grouped', labels=['xgboost', 'Random Forest'], display_legend=True)
plt.show()

In [74]:
rf_clas_wgt = RandomForestClassifier(max_depth=3, n_estimators=100, max_features='auto')
rf_clas_wgt.fit(train_X, train_y, sample_weight=tr_weights.values)

rf_pred = rf_clas.predict(test_X)

print("Test score is: ", rf_clas.score(test_X, test_y_ref, sample_weight=test_weights))
print("Train score is: ", rf_clas.score(train_X, train_y, sample_weight=tr_weights))

rf_imp_unwghtd = rf_clas.feature_importances_

Test score is:  0.771131425543
Train score is:  0.825917408259
