In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import pyspark

import datetime as dt

import tabula
import joblib

In [13]:
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 60)
pd.set_option('display.precision', 3)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import sklearn
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer, OneHotEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, cross_validate \
                                    ,cross_val_predict, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix,recall_score,precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

from sklearn.metrics import roc_curve, auc

import itertools
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
from imblearn.over_sampling import SMOTE

import catboost
from catboost import CatBoostClassifier

In [142]:
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.svm import SVC

In [3]:
df = pd.read_pickle('./data/clean/clean_for_classifier_df.pkl')

In [4]:
col_names = ['claim_number', 'date_received', 'incident_date', 'airport_code', 'airport_name',
           'airline', 'claim_type', 'claim_site', 'item_category', 'close_amount', 'disposition']
           

In [5]:
# Create Dataset limited to top 40 airports and top 14 airlines
passengers = pd.read_pickle('./data/clean/usa2016-17-enplanements.pkl')

top_airports = list(passengers.airport_code.iloc[:40].unique())
top_airports.remove('MCI')
top_airports.remove('SFO')
top_airlines = df.airline.value_counts()[:14].index.tolist()


df = df[df.airport_code.isin(top_airports)]
df = df[df.airline.isin(top_airlines)]
df = df[(df.claim_site == 'Checked Baggage') | (df.claim_site == 'Checkpoint')]
df = df[(df.claim_type == 'PropertyLoss') | (df.claim_type == 'PropertyDamage')]

In [6]:
passengers = None
top_airports = None
top_airlines = None

In [7]:
df.dropna(inplace=True)

In [8]:
df['binary_disposition'] = df['disposition']
df['binary_disposition'] = df['binary_disposition'].where(df['binary_disposition'] == 'Deny', other='Compensate')

# Change some text to make it more human readable
df.claim_site[df.claim_site == '-'] = 'Unknown'
df.claim_type[df.claim_type == '-'] = 'Unknown'

# Feature Engineering

### Count of items claimed

# TODO: Consider only using this for where claim_type is related to property.
df['num_items_or_incidents_claimed'] = df['item_category'].str.split(pat=';').apply(lambda x: len(x))
df['num_items_or_incidents_claimed'] = df['num_items_or_incidents_claimed'].where(df['claim_type'].str.contains('property', case=False) == True, other= 0)

### Time calculation
wait_period = df.date_received - df.incident_date
df['days_waited_to_file_claim'] = wait_period.dt.days

# Drop days where the 'date_received" was reported before 'incident_date'
df = df[df.days_waited_to_file_claim >= 0]

df.reset_index(inplace=True,drop=True)
df['bin_dispos_onehot'] = df['binary_disposition'].apply(lambda x: 1 if x == 'Compensate' else 0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [9]:
# df['Week/Year_inc_date'] = df['incident_date'].apply(lambda x: "%d/%d" % (x.week, x.year))
# df['Week_inc_date'] = df['incident_date'].apply(lambda x: "%d" % (x.week))
df['Month_inc_date'] = df['incident_date'].apply(lambda x: "%d" % (x.month))
# df['Year_inc_date'] = df['incident_date'].apply(lambda x: "%d" % (x.year))

# df['Week_received'] = df['date_received'].apply(lambda x: "%d" % (x.week))
df['Month_received'] = df['date_received'].apply(lambda x: "%d" % (x.month))
# df['Year_received'] = df['date_received'].apply(lambda x: "%d" % (x.year))

In [10]:
df['squared_days_waited'] = df['days_waited_to_file_claim'].apply(lambda x: x**2)

In [17]:
import joblib
savedir = 'data/clean'
joblib.dump(df, f'{savedir}/df_for_plots')

['data/clean/df_for_plots']

# Modeling Prep

In [11]:
X_small_df = df[['airport_code', 'airline', 'claim_type', 'claim_site', 'item_category', 'days_waited_to_file_claim', 'Month_received', 'Month_inc_date', 'squared_days_waited']]
y_small = df['binary_disposition'].apply(lambda x: 1 if x == 'Compensate' else 0)

In [12]:
categorical = ['airport_code', 'airline', 'claim_type', 'claim_site', 'Month_inc_date']
continuous =  ['days_waited_to_file_claim'] #, 'squared_days_waited']

In [60]:
# Save fit models to be transformed later in 
import joblib
trans_dir = './web_app/stat_models/transformers'

enc = OneHotEncoder(sparse=False)
enc.fit(X_small_df[categorical])
joblib.dump(enc, f'{trans_dir}/onehotencode.joblib')
# onehotarray = enc.transform(X[categorical])

ss = StandardScaler()
ss.fit(X_small_df[continuous])
joblib.dump(ss, f'{trans_dir}/standardscaler.joblib')


mlb = MultiLabelBinarizer(sparse_output=False)
mlb.fit(X_small_df['item_category'].str.replace(' ','').str.split(pat=';'))
joblib.dump(mlb, f'{trans_dir}/item_category.joblib')

  return self.partial_fit(X, y)


['./web_app/stat_models/transformers/item_category.joblib']

In [18]:
enc = OneHotEncoder(sparse=False)
onehotarray = enc.fit_transform(X_small_df[categorical])

ss = StandardScaler()
continuousarray = ss.fit_transform(X_small_df[continuous])

mlb = MultiLabelBinarizer(sparse_output=False)
onehot_itemcategories = mlb.fit_transform(X_small_df['item_category'].str.replace(' ','').str.split(pat=';'))

X_small = np.concatenate((onehotarray, continuousarray, onehot_itemcategories), axis=1)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_small, y_small, test_size=0.3, random_state=42, stratify=y_small)

# Random Forest Classifier

In [20]:
%%time
rf = RandomForestClassifier(random_state=42)

criterions = ['gini']#, 'entropy']
n_ests = [300]
m_depths = [30]
param_grid = dict(criterion=criterions, n_estimators=n_ests, max_depth=m_depths)

grid_rf = GridSearchCV(rf, param_grid, scoring='roc_auc', cv=6, n_jobs=1)

grid_rf.fit(X_train, y_train)

print(grid_rf.best_score_)
print(grid_rf.best_params_)
print(grid_rf.best_estimator_)

y_pred = grid_rf.predict(X_test)
print(roc_auc_score(y_test, y_pred))

0.6916683486382915
{'criterion': 'gini', 'max_depth': 30, 'n_estimators': 300}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
0.6083105773516357
CPU times: user 8min 26s, sys: 312 ms, total: 8min 26s
Wall time: 8min 26s


In [21]:
savedir = 'models'
joblib.dump(grid_rf.best_estimator_, f'{savedir}/rf_subset_airports_airlines_12.04.2018.joblib',compress=9)

['models/rf_subset_airports_airlines_12.04.2018.joblib']

# Gradient Boosting Classifier

In [18]:
%%time
gbclf = GradientBoostingClassifier(random_state=42)

n_ests = [300]
m_depths = [5, 10, 15]
learning_rates = [0.05, 0.1, 0.15]
param_grid = dict()#n_estimators=n_ests, max_depth=m_depths, learning_rate = learning_rates)

gb = GridSearchCV(gbclf, param_grid, scoring='roc_auc', cv=6, n_jobs=5)

gb.fit(X_train, y_train)

print(gb.best_score_)
# print(grid_rf.best_params_)
# print(grid_rf.best_estimator_)

y_pred = gb.predict(X_test)
print(roc_auc_score(y_test, y_pred))

savedir = 'models'
joblib.dump(gb.best_estimator_, f'{savedir}/gradboost_12.04.2018.joblib',compress=9)

0.6839737567441151
0.5794886547601418
CPU times: user 1min 48s, sys: 414 ms, total: 1min 48s
Wall time: 8min 13s


# Logisic Regression Classifier

In [126]:
%%time
logitclf = LogisticRegression(random_state=42)

solvers = ['newton-cg', 'sag', 'lbfgs']
penalties = ['l2']
tols = [.001, .0001, .00001]
Cs = [0.5, 1.0, 1.5]
param_grid = dict(solver=solvers, penalty=penalties, tol=tols, C=Cs)

logit = GridSearchCV(logitclf, param_grid, scoring='roc_auc', cv=8, n_jobs=1)

logit.fit(X_train, y_train)

print(logit.best_score_)
# print(grid_rf.best_params_)
# print(grid_rf.best_estimator_)

y_pred = logit.predict(X_test)
print(roc_auc_score(y_test, y_pred))

savedir = 'models'
joblib.dump(logit.best_estimator_, f'{savedir}/logit_12.04.2018.joblib',compress=9)



0.6731556467229697
0.5960707825108541
CPU times: user 1h 13min 14s, sys: 1min 11s, total: 1h 14min 26s
Wall time: 26min 54s


# Naive Bayes Bernoulli Classifier

In [123]:
%%time
bNBclf = BernoulliNB()

solvers = ['newton-cg', 'sag', 'lbfgs']
penalties = ['l2']
n_ests = [300]
m_depths = [30]
param_grid = dict()

bNB = GridSearchCV(bNBclf, param_grid, scoring='roc_auc', cv=8, n_jobs=1)

bNB.fit(X_train, y_train)

print(bNB.best_score_)
# print(grid_rf.best_params_)
# print(grid_rf.best_estimator_)

y_pred = bNB.predict(X_test)
print(roc_auc_score(y_test, y_pred))

savedir = 'models'
joblib.dump(bNB.best_estimator_, f'{savedir}/bernoulliNB_12.04.2018.joblib',compress=9)

0.6422614422937925
0.5849968847221777
CPU times: user 8.91 s, sys: 1.28 s, total: 10.2 s
Wall time: 2.68 s


# Support Vector Machine Classifier

In [19]:
%%time
SVCclf = SVC(random_state=42, gamma='auto')

kernels = [#'linear', 
           #'poly',
           #'sigmoid', 
           'rbf']
Cs = [5, 10, 15]
tols = [0.0001, 0.001, 0.01]
param_grid = dict(kernel=kernels)#, tol=tols, C=Cs)

SVC = GridSearchCV(SVCclf, param_grid, scoring='roc_auc', cv=8, n_jobs=4)

SVC.fit(X_train, y_train)

print(SVC.best_score_)
# print(grid_rf.best_params_)
# print(grid_rf.best_estimator_)

y_pred = SVC.predict(X_test)
print(roc_auc_score(y_test, y_pred))

savedir = 'models'
joblib.dump(SVC.best_estimator_, f'{savedir}/supportvector_12.04.2018.joblib',compress=9)



0.6580844464200435
0.5566937910955249
CPU times: user 25min 8s, sys: 367 ms, total: 25min 9s
Wall time: 2h 7min 41s


# Extra Trees Classifier

In [22]:
%%time
exclf = ExtraTreesClassifier(random_state=42)

criterions = ['gini']#, 'entropy']
n_ests = [300]
m_depths = [30] #, 30, 40]
param_grid = dict(criterion=criterions, n_estimators=n_ests, max_depth=m_depths)

extra = GridSearchCV(exclf, param_grid, scoring='roc_auc', cv=8, n_jobs=5)

extra.fit(X_train, y_train)

print(extra.best_score_)
# print(grid_rf.best_params_)
# print(grid_rf.best_estimator_)

y_pred = extra.predict(X_test)
print(roc_auc_score(y_test, y_pred))

savedir = 'models'
joblib.dump(extra.best_estimator_, f'{savedir}/extratrees_12.04.2018.joblib',compress=9)

0.68361139641582
0.6011070003410419
CPU times: user 3min 4s, sys: 420 ms, total: 3min 4s
Wall time: 8min 5s


# Catboost Gradient Descent

In [182]:
%%time
cboost = CatBoostClassifier(silent=True, random_seed=42)
    
param_grid = dict() # n_estimators=n_ests)

cat_clf = GridSearchCV(cboost, param_grid, scoring='roc_auc', cv=8, n_jobs=5)

cat_clf.fit(X_train, y_train)

print(cat_clf.best_score_)
print(cat_clf.best_params_)
print(cat_clf.best_estimator_)

y_pred = cat_clf.predict(X_test)
print(roc_auc_score(y_test, y_pred))


savedir = 'models'
joblib.dump(cat_clf.best_estimator_, f'{savedir}/catboost_month_incident_12.06.2018.joblib',compress=3)

0.7022735521377326
{}
<catboost.core.CatBoostClassifier object at 0x7fa6e1e8c518>
0.6278820646943781
CPU times: user 3min 46s, sys: 7.67 s, total: 3min 54s
Wall time: 5min 46s


# Create Lists to use in Web App

In [93]:
import joblib

itemlist = None
flat_list = None
flat_list2 = None

itemlist = []
X_small_df['item_category'].str.split(';').map(lambda x: itemlist.append(x) if x is not np.nan else 0)
flat_list = [item for sublist in itemlist for item in sublist]

flat_list2 = [item.strip() for item in flat_list]

featuredirs = './web_app/featurelists'
airports = X_small_df.airport_code.unique().tolist()
airlines = sorted(X_small_df.airline.unique().tolist())[1:]
claim_types = X_small_df.claim_type.unique().tolist()
claim_sites = X_small_df.claim_site.unique().tolist()
item_cats = sorted(list(set(flat_list2)))[2:]

joblib.dump(airports, f'{featuredirs}/airports.joblib')
joblib.dump(airlines, f'{featuredirs}/airlines.joblib')
joblib.dump(claim_types, f'{featuredirs}/claim_types.joblib')
joblib.dump(claim_sites, f'{featuredirs}/claim_sites.joblib')
joblib.dump(item_cats, f'{featuredirs}/item_category.joblib')

['./web_app/featurelists/item_category.joblib']

In [96]:
feat_names = list(enc.categories_[0]) + list(enc.categories_[1]) + list(enc.categories_[2]) + list(enc.categories_[3]) + list(enc.categories_[4])+ list(X_small_df[continuous].columns) + list(mlb.classes_)

values = sorted(zip(feat_names, cat_clf.best_estimator_.feature_importances_), key=lambda x: x[1] * -1)

joblib.dump(values, './models/catboost_best_12.05_feature_importance_list')

['./models/catboost_best_12.05_feature_importance_list']

In [None]:
values[:5]

# Catboost Variations Not Using

In [177]:
# %%time
# cboost = CatBoostClassifier(silent=True, one_hot_max_size=5, random_seed=42)
    
# param_grid = dict() # n_estimators=n_ests)

# cat_clf = GridSearchCV(cboost, param_grid, scoring='roc_auc', cv=8, n_jobs=5)

# cat_clf.fit(X_train, y_train)

# print(cat_clf.best_score_)
# print(cat_clf.best_params_)
# print(cat_clf.best_estimator_)

# y_pred = cat_clf.predict(X_test)
# print(roc_auc_score(y_test, y_pred))


# savedir = 'models'
# joblib.dump(cat_clf.best_estimator_, f'{savedir}/catboost_month_incident_oh5_12.06.2018.joblib',compress=3)

0.7022735521377326
{}
<catboost.core.CatBoostClassifier object at 0x7fa6e1e8ea20>
0.6278820646943781
CPU times: user 3min 47s, sys: 6.84 s, total: 3min 53s
Wall time: 5min 12s


In [160]:
# %%time
# cboost = CatBoostClassifier(silent=True, random_seed=42)
    
# param_grid = dict() # n_estimators=n_ests)

# cat_clf = GridSearchCV(cboost, param_grid, scoring='roc_auc', cv=8, n_jobs=5)

# cat_clf.fit(X_train, y_train)

# print(cat_clf.best_score_)
# print(cat_clf.best_params_)
# print(cat_clf.best_estimator_)

# y_pred = cat_clf.predict(X_test)
# print(roc_auc_score(y_test, y_pred))


# savedir = 'models'
# joblib.dump(cat_clf.best_estimator_, f'{savedir}/catboost_month_inc_and_squared_days_12.06.2018.joblib',compress=9)

0.7017768352354733
{}
<catboost.core.CatBoostClassifier object at 0x7fa6f46b7470>
0.6276856711426504
CPU times: user 3min 16s, sys: 4.89 s, total: 3min 21s
Wall time: 4min 14s


In [161]:
# %%time
# cboost = CatBoostClassifier(silent=True, one_hot_max_size=200, random_seed=42)
    
# param_grid = dict() # n_estimators=n_ests)

# cat_clf = GridSearchCV(cboost, param_grid, scoring='roc_auc', cv=8, n_jobs=5)

# cat_clf.fit(X_train, y_train)

# print(cat_clf.best_score_)
# print(cat_clf.best_params_)
# print(cat_clf.best_estimator_)

# y_pred = cat_clf.predict(X_test)
# print(roc_auc_score(y_test, y_pred))


# savedir = 'models'
# joblib.dump(cat_clf.best_estimator_, f'{savedir}/catboost_month_inc_and_squared_days_tweaked_12.06.2018.joblib',compress=9)

0.7017768352354733
{}
<catboost.core.CatBoostClassifier object at 0x7fa6e2519908>
0.6276856711426504
CPU times: user 3min 49s, sys: 7.37 s, total: 3min 57s
Wall time: 5min 16s


# XGBoost Model Variations

In [43]:
import xgboost
from xgboost import XGBClassifier

In [44]:
%%time
xgb = XGBClassifier(silent=False, random_seed=42)

nests = [100, 300, 500]
lrates=[.02, .05]
colsample_bytrees = [0.5, 1]
subsamples = [0.3, 0.8]
scale_pos_weights = [0.5, 1]

param_grid = dict(learning_rate=lrates, n_estimators=nests, colsample_bytree=colsample_bytrees, subsample=subsamples, scale_pos_weight=scale_pos_weights)

xgb_clf = GridSearchCV(xgb, param_grid, scoring='roc_auc', cv=8, n_jobs=5)

xgb_clf.fit(X_train, y_train)

print(xgb_clf.best_score_)
print(xgb_clf.best_params_)
print(xgb_clf.best_estimator_)

y_pred = xgb_clf.predict(X_test)
print(roc_auc_score(y_test, y_pred))


savedir = 'models'
joblib.dump(xgb_clf.best_estimator_, f'{savedir}/xgb_12.05.2018.joblib',compress=9)

[17:12:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:12:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:12:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:12:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:12:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:12:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:12:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:12:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:12:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_