In [34]:
import numpy as np
import pandas as pd

In [71]:
rawdir = 'data/raw'
files = ['claims-2002-2006.xls', 'claims-2007-2009.xls', 'claims-2010-2013.xls',
         'claims-2014.xls', 'claims-2015.xlsx', 'claims-2016.csv', 'claims-2017.csv']
filelist = [f'{rawdir}/{file}' for file in files]

In [52]:
filelist

['data/raw/claims-2002-2006.xls',
 'data/raw/claims-2007-2009.xls',
 'data/raw/claims-2010-2013.xls',
 'data/raw/claims-2014.xls',
 'data/raw/claims-2015.xlsx',
 'data/raw/claims-2016.csv',
 'data/raw/claims-2017.csv']

In [53]:
d = {}
for i, file in enumerate(filelist):
    if 'xls' in file:
        d[i] = pd.read_excel(file)
    if 'csv' in file:
        d[i] = pd.read_csv(file)
        d[i] = d[i].iloc[:, 1:]
    else:
        assert "Error"

In [96]:
df = pd.concat(d, axis=0, sort=False, ignore_index=True)

In [87]:
df.shape

(221088, 14)

### Rename columns

In [97]:
col_names = ['claim_number', 'date_received', 'incident_date', 'airport_code', 'airport_name',
             'airline', 'claim_type', 'claim_site', 'item', 'claim_amount', 'status',
             'close_amount', 'disposition', 'item_category']

col_mapper = dict(zip(df.columns, col_names))
df.rename(columns=col_mapper, inplace=True)

## From this point forward, this is cleaning for 2002-2017 CLASSIFIER

### Clean and drop columns

In [112]:
df[df.close_amount.isna()]

Unnamed: 0,claim_number,date_received,incident_date,airport_code,airline,claim_type,claim_site,claim_amount,status,close_amount,disposition,item_category
9,2004102965797,2002-06-25,2002-05-04,ATL,,Passenger Property Loss,Checkpoint,4457.29,Denied,,Deny,Jewelry - Fine
33,2004050550501,2002-09-11,2002-08-05,BWI,,Passenger Property Loss,Checkpoint,1200.00,Denied,,Deny,Other
36,18193,2002-09-16,2002-07-26,BWI,American Airlines,,Other,428.21,Denied,,Deny,Currency
90,19078,2002-10-17,2002-09-12,SAT,Southwest Airlines,,Other,30.00,Approved,,Approve in Full,Computer - Laptop
108,21051,2002-10-21,2002-10-03,ATL,Delta Air Lines,,Other,75.00,Approved,,Approve in Full,Cell Phones
277,27499,2002-11-16,2002-10-02,PHL,,Passenger Property Loss,Checkpoint,50.00,Approved,,Approve in Full,Jewelry - Fine
299,20856,2002-11-19,2002-10-30,SEA,Southwest Airlines,,Other,23105.00,Denied,,Deny,"Baby - Strollers, car seats, playpen, etc."
454,0616755L,2002-12-03,2002-11-01,PHL,,Property Damage,Checkpoint,53.00,Approved,,Approve in Full,Cell Phones
457,19905,2002-12-03,2002-10-17,COS,,Passenger Property Loss,Checkpoint,125.00,Approved,,Approve in Full,"Clothing - Shoes, belts, accessories, etc."
517,21749,2002-12-09,2002-12-01,ATL,Delta Air Lines,Property Damage,Checkpoint,98.00,Approved,,Approve in Full,"Clothing - Shoes, belts, accessories, etc."


In [108]:
df.close_amount = df.close_amount.apply(lambda x: df.claim_amount if x is np.nan and df.disposition == "Approve in Full" else x) 

In [102]:
df.close_amount.where(df.close_amount is np.nan and df.disposition == "Approve in Full", df.claim_amount)

ValueError: Array conditional must be same shape as self

In [98]:
# combine the Item and Item Category column into just Item Category
df['item_category'].fillna(value=df['item'], inplace=True)
df.drop(columns=['item', 'airport_name'], inplace=True)

# Set DateTime columns
df['incident_date'] = pd.to_datetime(df['incident_date'], errors='coerce')
df['date_received'] = pd.to_datetime(df['date_received'], errors='coerce')
df.dropna(subset=['date_received', 'incident_date'], inplace=True)

# df['date_received'] = pd.to_datetime(df['date_received'])
# df['incident_date'] = pd.to_datetime(df['incident_date'])

# Limit to 3 outcomes of interest (and clean)
df.disposition = df.disposition.str.replace('D eny|Den y|D en y|De ny',
                           'Deny', regex=True)
df.disposition = df.disposition.str.replace('Approv e in Full',
                           'Approve in Full', regex=True)

df = df[(df['disposition'] == "Approve in Full") |
        (df['disposition'] == "Settle") |
        (df['disposition'] == "Deny")]

df.close_amount.where(df.close_amount is np.nan and df.disposition == "Approve in Full", df.claim_amount)

# Set other column types
df['close_amount'] = df['close_amount'].astype('float64')

ValueError: cannot convert float NaN to integer

In [None]:
df.head()

In [21]:
df = df.dropna()


df['binary_disposition'] = df['disposition']
df['binary_disposition'] = df['disposition'].where(df['disposition'] == 'Deny', other='Compensate')

# Change some text to make it more human readable
df.claim_site[df.claim_site == '-'] = 'Unknown'
df.claim_type[df.claim_type == '-'] = 'Unknown'

# Feature Engineering

### Count of items claimed

# TODO: Consider only using this for where claim_type is related to property.
df['num_items_or_incidents_claimed'] = df['item_category'].str.split(pat=';').apply(lambda x: len(x))
df['num_items_or_incidents_claimed'] = df['num_items_or_incidents_claimed'].where(df['claim_type'].str.contains('property', case=False) == True, other= 0)

### Time calculation
wait_period = df.date_received - df.incident_date
df['days_waited_to_file_claim'] = wait_period.dt.days

# Drop days where the 'date_received" was reported before 'incident_date'
df = df[df.days_waited_to_file_claim >= 0]

df.reset_index(inplace=True,drop=True)
df['bin_dispos_onehot'] = df['binary_disposition'].apply(lambda x: 1 if x == 'Compensate' else 0)

In [22]:
df.head()

Unnamed: 0,Claim Number,Date Received,Incident Date,Airport Code,Airline Name,Claim Type,Claim Site,Claim Amount,Status,Close Amount,...,airline,claim_type,claim_site,item_category,close_amount,disposition,binary_disposition,num_items_or_incidents_claimed,days_waited_to_file_claim,bin_dispos_onehot


In [8]:
df['Week/Year_inc_date'] = df['incident_date'].apply(lambda x: "%d/%d" % (x.week, x.year))
df['Week_inc_date'] = df['incident_date'].apply(lambda x: "%d" % (x.week))
df['Month_inc_date'] = df['incident_date'].apply(lambda x: "%d" % (x.month))
df['Year_inc_date'] = df['incident_date'].apply(lambda x: "%d" % (x.year))

df['Week_received'] = df['date_received'].apply(lambda x: "%d" % (x.week))
df['Month_received'] = df['date_received'].apply(lambda x: "%d" % (x.month))
df['Year_received'] = df['date_received'].apply(lambda x: "%d" % (x.year))