#Questions

* Should I calculate business days since incident? or Total days?
* How to do timeseries?
  * Only predict a date based on the info that's happened BEFORE it?
  * Book by week? exact day of year? month?

In [270]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import pyspark

import datetime as dt

import tabula
import joblib

In [165]:
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 60)
pd.set_option('display.precision', 3)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import sklearn
from sklearn.preprocessing import StandardScaler, Binarizer, LabelBinarizer, MultiLabelBinarizer, OneHotEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, cross_validate \
                                    ,cross_val_predict, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix,recall_score,precision_score, f1_score
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

from sklearn.metrics import roc_curve, auc

import itertools
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
from imblearn.over_sampling import SMOTE

In [115]:
dir = 'models/preliminary'

-----

In [99]:
# A function that does the cleaning if you pass the df

def colnames_dt_drops_dtypes(df):
    col_names = ['claim_number', 'date_received', 'incident_date', 'airport_code', 'airport_name',
           'airline', 'claim_type', 'claim_site', 'item_category', 'close_amount', 'disposition']
    df['date_received'] = pd.to_datetime(df['date_received'])
    df['incident_date'] = pd.to_datetime(df['incident_date'])
    df = df[df['disposition'] != "-"]
    df = df.dropna()
    df['claim_number'] = df['claim_number'].astype('int64')
    df['close_amount'] = df['close_amount'].astype('int64')
    df['binary_disposition'] = df['disposition']
    df['binary_disposition'] = df['binary_disposition'].where(df['binary_disposition'] == 'Deny', other='Compensate')
    return df

-----

In [272]:
col_names = ['claim_number', 'date_received', 'incident_date', 'airport_code', 'airport_name',
           'airline', 'claim_type', 'claim_site', 'item_category', 'close_amount', 'disposition']
           

In [273]:
df = pd.read_excel('raw/claims-2010-2013.xls', names=col_names)

df['date_received'] = pd.to_datetime(df['date_received'])
df['incident_date'] = pd.to_datetime(df['incident_date'])
df = df[df['disposition'] != "-"]
df = df.dropna()

df['claim_number'] = df['claim_number'].astype('int64')
df['close_amount'] = df['close_amount'].astype('int64')
df['binary_disposition'] = df['disposition']
df['binary_disposition'] = df['binary_disposition'].where(df['binary_disposition'] == 'Deny', other='Compensate')

# Time calculation
wait_period = df.date_received - df.incident_date
df['days_until_filed_claim'] = wait_period.dt.days

# Drop days where the 'date_received" was reported before 'incident_date'
df = df[df.days_until_filed_claim >= 0]

# Calculate 

In [274]:
df.sort_values(by='close_amount', ascending=False).head()

Unnamed: 0,claim_number,date_received,incident_date,airport_code,airport_name,airline,claim_type,claim_site,item_category,close_amount,disposition,binary_disposition,days_until_filed_claim
27818,2012110798436,2012-07-27,2011-12-30 17:30:00,DTW,Detroit Metropolitan Wayne County Airport,Delta Air Lines,Personal Injury,Other,-,25000,Settle,Compensate,209
25965,2012061194755,2012-05-21,2011-12-11 09:30:00,ZZX,Non TSA Airport (motor vehicle),-,Personal Injury,Motor Vehicle,Other,16664,Settle,Compensate,161
20698,2011111689101,2011-11-17,2010-08-08 15:00:00,ATL,Hartsfield-Jackson Atlanta International Airport,Delta Air Lines,Personal Injury,Checkpoint,-,10000,Settle,Compensate,465
15738,2011061784438,2011-06-06,2010-10-26 21:00:00,LAX,Los Angeles International Airport,UAL,Passenger Property Loss,Checkpoint,Jewelry & Watches,9660,Settle,Compensate,222
21924,2012020390650,2012-01-04,2011-12-29 17:00:00,MCO,Orlando International Airport,Southwest Airlines,Passenger Property Loss,Checkpoint,Jewelry & Watches,7000,Settle,Compensate,5


In [275]:
X = df.drop(['claim_number', 'airport_name', 'binary_disposition', 'disposition', 
             'date_received', 'incident_date'
            ]
            , axis=1)
y = df['binary_disposition'].apply(lambda x: 1 if x == 'Compensate' else 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [276]:
X.head()

Unnamed: 0,airport_code,airline,claim_type,claim_site,item_category,close_amount,days_until_filed_claim
0,SLC,Delta Air Lines,Property Damage,Checked Baggage,Cosmetics & Grooming,0,0
1,LAX,Southwest Airlines,Passenger Property Loss,Checked Baggage,Other,0,2
2,SEA,Delta Air Lines,Passenger Property Loss,Checked Baggage,Cameras; Cameras,0,1
4,LAS,American Airlines,Passenger Property Loss,Checked Baggage,Travel Accessories,0,2
5,DFW,American Airlines,Passenger Property Loss,Checked Baggage,Travel Accessories,0,1


In [286]:
df.claim_site.unique()

array(['Checked Baggage', 'Checkpoint', 'Other', 'Motor Vehicle',
       'Unknown', 'Bus Station'], dtype=object)

claim_number              48
date_received             48
incident_date             48
airport_code              48
airport_name              48
airline                   48
claim_type                48
claim_site                48
item_category             48
close_amount              48
disposition               48
binary_disposition        48
days_until_filed_claim    48
dtype: int64

In [167]:
ohe = OneHotEncoder()

In [168]:
ohe.fit_transform(X.drop(columns='close_amount'))

<34648x1506 sparse matrix of type '<class 'numpy.float64'>'
	with 173240 stored elements in Compressed Sparse Row format>

OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)

In [169]:
ohe.categories_

[array(['ABE', 'ABI', 'ABQ', 'ABR', 'ABY', 'ACK', 'ACT', 'ACV', 'ACY',
        'ADQ', 'AEX', 'AGS', 'AHN', 'AIA', 'ALB', 'ALO', 'ALS', 'ALW',
        'AMA', 'ANC', 'APN', 'ART', 'ASE', 'ATL', 'ATW', 'ATY', 'AUG',
        'AUS', 'AVL', 'AVP', 'AZO', 'BDL', 'BET', 'BFF', 'BFL', 'BGM',
        'BGR', 'BHB', 'BHM', 'BIL', 'BIS', 'BJI', 'BKG', 'BLI', 'BMI',
        'BNA', 'BOI', 'BOS', 'BPT', 'BQK', 'BQN', 'BRD', 'BRL', 'BRO',
        'BTM', 'BTR', 'BTV', 'BUF', 'BUR', 'BWI', 'BZN', 'CAE', 'CAK',
        'CDC', 'CDR', 'CDV', 'CEC', 'CEZ', 'CHA', 'CHO', 'CHS', 'CID',
        'CIU', 'CKB', 'CLE', 'CLL', 'CLT', 'CMH', 'CMI', 'CMX', 'COD',
        'COS', 'COU', 'CPR', 'CRP', 'CRQ', 'CRW', 'CSG', 'CVG', 'CWA',
        'CYS', 'DAB', 'DAL', 'DAY', 'DBQ', 'DCA', 'DEN', 'DFW', 'DHN',
        'DIK', 'DLG', 'DLH', 'DRO', 'DRT', 'DSM', 'DTW', 'DUJ', 'DVL',
        'EAR', 'EAT', 'EAU', 'ECP', 'EGE', 'EKO', 'ELM', 'ELP', 'ENA',
        'ENV', 'ERI', 'ESC', 'EUG', 'EVV', 'EWN', 'EWR', 'F', 'FAI', 'FAR',
 

In [161]:
mlb = MultiLabelBinarizer()
mlb.fit_transform(X.drop(columns='close_amount'))

array([[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0],
       [0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0],
       [1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0],
       [1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1]])

{'classes': None, 'sparse_output': False}

In [121]:
X.shape

(34648, 8)

In [122]:
y.shape

(34648,)

In [123]:
rf = RandomForestClassifier(random_state=42)

criterions = ['gini'] #, 'entropy']
n_ests = [#100,
          300]
    
param_grid = dict(criterion=criterions, n_estimators=n_ests)

grid_rf = GridSearchCV(rf, param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

grid_rf.fit(X_train, y_train)

print(grid_rf.best_score_)
print(grid_rf.best_params_)
print(grid_rf.best_estimator_)

ValueError: could not convert string to float: 'RSW'