# Questions

* Should I calculate business days since incident? or Total days?
* How to do timeseries?
  * Only predict a date based on the info that's happened BEFORE it?
  * Book by week? exact day of year? month?
 ---
* Could I make an EITHER/OR case?
  * First do a classifier for Compensate or Deny. If compensate, then do a Regression for HOW MUCH they'll get back?
  * I could also do models that ONLY take into account how much they asked for, IF I have enough data
* Why is my RF 410 MB?

### 1st simple model, "Deny" vs. "Compensate": I got ~0.68 ROC AUC!

### *Is it an issue with 370 airports and 170 airlines?*

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import pyspark

import datetime as dt

import tabula
import joblib

In [3]:
# Notebook-wide pandas display settings: up to 300 columns and 60 rows are
# shown, and floats are rendered with three decimal places.
_display_options = {
    'display.max_columns': 300,
    'display.max_rows': 60,
    'display.precision': 3,
    'display.float_format': lambda x: '%.3f' % x,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

import sklearn
from sklearn.preprocessing import StandardScaler, Binarizer, LabelBinarizer, MultiLabelBinarizer, OneHotEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, cross_validate \
                                    ,cross_val_predict, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix,recall_score,precision_score, f1_score
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

from sklearn.metrics import roc_curve, auc

import itertools
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
from imblearn.over_sampling import SMOTE

In [4]:
# Directory that fitted models are written to (used in the joblib.dump path).
# NOTE(review): this name shadows the built-in dir(); renaming it means
# updating every cell that formats a path with it.
dir = 'models/preliminary'

-----

In [99]:
# A function that does the cleaning if you pass the df

def colnames_dt_drops_dtypes(df):
    """Return a cleaned copy of a claims frame with a binary outcome column.

    The frame passed in is left untouched; all work happens on a new frame.

    Steps, in order:
      1. Parse 'date_received' and 'incident_date' with ``pd.to_datetime``.
      2. Remove rows whose 'disposition' is "-".
      3. Remove rows that contain any missing value.
      4. Cast 'claim_number' and 'close_amount' to int64 (fractional parts
         of float values are truncated by the cast).
      5. Add 'binary_disposition': 'Deny' where 'disposition' is 'Deny',
         'Compensate' for every other disposition.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'date_received', 'incident_date',
        'disposition', 'claim_number' and 'close_amount'.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, including the new 'binary_disposition' column.
    """
    # assign() builds a new frame, so the caller's columns are not overwritten.
    cleaned = df.assign(
        date_received=pd.to_datetime(df['date_received']),
        incident_date=pd.to_datetime(df['incident_date']),
    )

    # .copy() detaches the filtered result so the column assignments below
    # write to this frame rather than to a slice of another one.
    cleaned = cleaned[cleaned['disposition'] != "-"].dropna().copy()

    cleaned['claim_number'] = cleaned['claim_number'].astype('int64')
    cleaned['close_amount'] = cleaned['close_amount'].astype('int64')

    # Keep 'Deny' as-is; every other outcome is folded into 'Compensate'.
    cleaned['binary_disposition'] = cleaned['disposition'].where(
        cleaned['disposition'] == 'Deny', other='Compensate'
    )
    return cleaned

-----

In [127]:
# Column labels applied to the raw claims spreadsheet, in sheet order.
col_names = [
    'claim_number',
    'date_received',
    'incident_date',
    'airport_code',
    'airport_name',
    'airline',
    'claim_type',
    'claim_site',
    'item_category',
    'claim_amount',
    'status',
    'close_amount',
    'disposition',
]
           

In [128]:
# Load the 2002-2006 claims workbook, labelling its columns with col_names.
raw_claims_path = 'raw/claims-2002-2006.xls'
df = pd.read_excel(raw_claims_path, names=col_names)

In [129]:
# Remove the columns that are not used further on (the disposition and date
# columns are deliberately kept here), then discard every row that has a
# missing value.
unused_columns = ['claim_number', 'airport_name', 'status']
df = df.drop(columns=unused_columns)
df = df.dropna()

In [130]:
# Drop the rows with the latest date_received values (entries received after
# 2006).
# NOTE(review): the original note said 24 dates, but this selects 23 rows --
# confirm which count is intended.
latest_received_index = df.date_received.sort_values(ascending=False).head(23).index
df = df.drop(index=latest_received_index)

In [131]:
# Clean the claims frame and derive the modelling columns.
df = df.dropna()
df['date_received'] = pd.to_datetime(df['date_received'])

# Blank out incident_date cells that are not datetime objects so the dropna()
# below removes those rows. isinstance() is used instead of an exact type
# comparison so datetime subclasses (e.g. pandas Timestamps) are kept too.
df['incident_date'] = df.incident_date.apply(lambda x: x if isinstance(x, dt.datetime) else np.nan)
df = df.dropna()
df['incident_date'] = pd.to_datetime(df['incident_date'])

# Rows whose disposition is "-" are removed. .copy() detaches the filtered
# frame so the column assignments below write to it directly rather than to a
# slice of the previous frame.
df = df[df['disposition'] != "-"].copy()

# NOTE(review): the int64 cast truncates any fractional part of the amounts.
df['close_amount'] = df['close_amount'].astype('int64')
df['claim_amount'] = df['claim_amount'].astype('int64')

# Two-class outcome: 'Deny' is kept, every other disposition becomes 'Compensate'.
df['binary_disposition'] = df['disposition'].where(df['disposition'] == 'Deny', other='Compensate')

# Time calculation: whole days between the incident and receipt of the claim
wait_period = df.date_received - df.incident_date
df['days_until_filed_claim'] = wait_period.dt.days

# Drop rows where 'date_received' was reported before 'incident_date'
df = df[df.days_until_filed_claim >= 0].copy()

# Change some text to make it more human readable. A single .loc assignment
# is used because chained indexing (df.col[mask] = ...) may write to a
# temporary object and leave df unchanged.
df.loc[df.claim_site == '-', 'claim_site'] = 'Unknown'
df.loc[df.claim_type == '-', 'claim_type'] = 'Unknown'

# Reduce item_category to the top-level category only by cutting everything
# from the first ';' onward (only 27 categories instead of 300+)
df['item_category'] = df['item_category'].str.replace(';.+', '', regex=True)

In [134]:
# Show the first rows (head() default) after sorting by close_amount, largest first.
df.sort_values(by='close_amount', ascending=False).head()

Unnamed: 0,date_received,incident_date,airport_code,airline,claim_type,claim_site,item_category,claim_amount,close_amount,disposition,binary_disposition,days_until_filed_claim
2703,2003-02-21,2003-01-08,LAS,Southwest Airlines,Passenger Property Loss,Checked Baggage,Other,250000,250000,Deny,Deny,44
36935,2004-06-03,2004-04-02,F,Northwest Airlines,Passenger Property Loss,Other,Other,45178,45178,Deny,Deny,62
50725,2004-11-24,2003-12-27,HPN,USAir,Personal Injury,Checkpoint,"Clothing - Shoes, belts, accessories, etc.",50150,20000,Settle,Compensate,333
47057,2004-10-09,2004-09-02,LAX,USAir,Passenger Property Loss,Checked Baggage,Luggage (all types including footlockers),14518,14518,Deny,Deny,37
52575,2004-12-21,2004-10-08,CMH,American Airlines,Property Damage,Checked Baggage,Photographic Film,13060,13060,Approve in Full,Compensate,74


In [505]:
# Reduced frame without the identifier, raw disposition and date columns;
# binary_disposition is intentionally kept.
# errors='ignore' skips labels that are no longer present: claim_number and
# airport_name are already dropped from df earlier in the notebook, so a plain
# drop raises KeyError when the cells run top to bottom.
df_simple = df.drop(
    columns=['claim_number', 'airport_name', 'disposition', 'date_received', 'incident_date'],
    errors='ignore',
)

In [135]:
# Feature frame: everything except the two outcome labels, the raw dates and
# close_amount.
# NOTE(review): close_amount is presumably excluded because it is only known
# once a claim is closed -- confirm.
non_feature_columns = [
    'binary_disposition',
    'disposition',
    'date_received',
    'incident_date',
    'close_amount',
]
X = df.drop(columns=non_feature_columns)

# Target vector: 1 when the claim was compensated, 0 otherwise.
y = df['binary_disposition'].apply(lambda label: int(label == 'Compensate'))

In [136]:
# Columns that get one-hot encoded.
categorical = [
    'airport_code',
    'airline',
    'claim_type',
    'claim_site',
    'item_category',
]
# Columns that get standardised.
continuous = [
    'claim_amount',
    'days_until_filed_claim',
]

In [137]:
# Build the model matrix: dense one-hot columns for the categorical features
# followed by the standardised continuous features.
# NOTE(review): both transformers are fit on all rows, before the train/test
# split that follows, so test-set statistics feed into the fit -- consider
# fitting on the training rows only.
enc = OneHotEncoder(sparse=False)
ss = StandardScaler()

onehotarray = enc.fit(X[categorical]).transform(X[categorical])
continuousarray = ss.fit(X[continuous]).transform(X[continuous])

X = np.hstack((onehotarray, continuousarray))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [138]:
# Hold out 30% of the rows for testing; the split is stratified on y and
# seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [142]:
# Grid-search a random forest with 5-fold cross-validation, scored by ROC AUC.
# The grid currently holds a single candidate (gini, 300 trees); 'entropy' and
# 100 trees were tried as alternatives and are switched off.
rf = RandomForestClassifier(random_state=42)

criterions = ['gini']
n_ests = [300]
param_grid = {'criterion': criterions, 'n_estimators': n_ests}

grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

# Report the best CV score, its parameters and the refitted estimator.
for summary in (grid_rf.best_score_, grid_rf.best_params_, grid_rf.best_estimator_):
    print(summary)

0.6863849896669834
{'criterion': 'gini', 'n_estimators': 300}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)


In [None]:
# Directory that fitted models are written to (used in the joblib.dump path).
# NOTE(review): this name shadows the built-in dir(); renaming it means
# updating every cell that formats a path with it.
dir = 'models/preliminary'

In [143]:
# Persist the best estimator found by the grid search. The output directory is
# created first so the dump does not fail with FileNotFoundError on a fresh
# checkout.
import os

os.makedirs(dir, exist_ok=True)
joblib.dump(grid_rf.best_estimator_, f'{dir}/rf_num2_with_requested_amount_but_no_dates')

['models/preliminary/rf_num2_with_requested_amount_but_no_dates']