### Fraud Detector
#### Data Prep Notebook

This notebook makes use of a data set which can be found here:  
https://github.com/mwitiderrick/insurancedata/blob/master/insurance_claims.csv



In [None]:
import pandas as pd
import numpy as np
import regex
import boto3
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
# S3 destination used by the upload step -- replace both placeholders with your own values
s3bucket, s3prefix = 'your-bucket-name', 'your-prefix'

In [None]:
# Load the raw insurance claims CSV from the local data/ directory
df1 = pd.read_csv(filepath_or_buffer='data/insurance_claims.csv')

In [None]:
# Show the (rows, columns) dimensions of the freshly loaded frame
df1.shape

In [None]:
# Preview the first five rows of the raw data
df1.head(n=5)

In [None]:
# Any cell whose value is exactly '?' is relabelled as an explicit 'Unknown' category
df1 = df1.replace(to_replace='?', value='Unknown')

In [None]:
# The label must be lower case, so map 'Y'/'N' to 'y'/'n'.
# The result is assigned back to the column: calling replace(inplace=True) on
# df1['fraud_reported'] is chained assignment, which recent pandas warns about
# and which leaves df1 unchanged when Copy-on-Write is enabled.
df1['fraud_reported'] = df1['fraud_reported'].replace({'Y': 'y', 'N': 'n'})

### Expand our data set from 1,000 rows to 13,000 rows
This is done in several steps (in the next three cells):
1. Get lists of unique values from several columns
2. Randomize some columns of data in a second dataframe
3. Append this randomized dataframe (df2) to the original dataframe
4. Repeat multiple times

In [None]:
# Two-letter postal codes (the 50 US states plus DC), sorted alphabetically by code
states = (
    'AK AL AR AZ CA CO CT DC DE FL GA '
    'HI IA ID IL IN KS KY LA MA MD ME '
    'MI MN MO MS MT NC ND NE NH NJ NM '
    'NV NY OH OK OR PA RI SC SD TN TX '
    'UT VA VT WA WI WV WY'
).split()

In [None]:
# Pools of the distinct values observed in each source column.

# Vehicle attributes
auto_makes = df1['auto_make'].unique()
auto_models = df1['auto_model'].unique()
auto_years = df1['auto_year'].unique()

# Incident attributes
incident_cities = df1['incident_city'].unique()
incident_locations = df1['incident_location'].unique()
collision_types = df1['collision_type'].unique()
police_report_availables = df1['police_report_available'].unique()

# Incident states come from the hard-coded `states` list above rather than
# from the values present in the data.
incident_states = states

# Insured / policy attributes
insured_zips = df1['insured_zip'].unique()
insured_education_levels = df1['insured_education_level'].unique()
policy_numbers = df1['policy_number'].unique()

In [None]:
# Build the expanded data set: the original rows followed by `n_copies`
# randomized copies (1 original + 12 copies = 13x the original row count).
n_copies = 12

# Seeded generator so that a fresh "Run All" produces the same synthetic data.
rng = np.random.default_rng(42)

# Columns to randomize in every copy, mapped to the pool each value is drawn from.
resample_pools = {
    'incident_state': incident_states,
    'incident_city': incident_cities,
    'insured_education_level': insured_education_levels,
    'collision_type': collision_types,
    'incident_location': incident_locations,
    'police_report_available': police_report_availables,
    'policy_number': policy_numbers,
}

frames = [df1]
for _ in range(n_copies):
    df2 = df1.copy()
    for column, pool in resample_pools.items():
        # Draw one value PER ROW. A choice() call without `size` returns a
        # single scalar, which would fill the whole column of the copy with
        # one constant value instead of randomizing it.
        df2[column] = rng.choice(pool, size=len(df2))
    frames.append(df2)

# Concatenate once at the end instead of re-copying the growing frame on
# every iteration.
df = pd.concat(frames, ignore_index=True, sort=False)

In [None]:
# Show the (rows, columns) dimensions of the expanded data set
df.shape

In [None]:
# this column appears to be empty, so remove it
df = df.drop(columns=['_c39'])

In [None]:
# Column header names can't contain hyphens, so swap them for underscores
hyphenated_to_snake = {
    'capital-gains': 'capital_gains',
    'capital-loss': 'capital_loss',
}
df = df.rename(columns=hyphenated_to_snake)

In [None]:
# Merge the incident date and hour-of-day into a single timestamp string of
# the form '<incident_date>T<HH>:00:00Z', then drop the two source columns.
df['EVENT_TIMESTAMP'] = [
    f'{incident_date}T{incident_hour:02d}:00:00Z'
    for incident_date, incident_hour in zip(df['incident_date'], df['incident_hour_of_the_day'])
]
df = df.drop(columns=['incident_date', 'incident_hour_of_the_day'])

In [None]:
# Rename the label and entity-id columns to upper-case names
# (presumably the names the fraud detection service expects -- confirm).
event_column_names = {
    'fraud_reported': 'EVENT_LABEL',
    'policy_number': 'ENTITY_ID',
}
df = df.rename(columns=event_column_names)

In [None]:
# Bar chart of how many rows carry each EVENT_LABEL value
fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(data=df, x='EVENT_LABEL', ax=ax)

In [None]:
# split data set into train (80%) and test (20%);
# random_state is fixed so the same split is produced on every run
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Write both splits to CSV files, leaving out the DataFrame index column
for split_frame, csv_path in ((train, 'data/train.csv'), (test, 'data/test.csv')):
    split_frame.to_csv(csv_path, index=False)

In [None]:
# upload to s3
!aws s3 cp data/train.csv s3://$s3bucket/$s3prefix/train/
!aws s3 cp data/test.csv s3://$s3bucket/$s3prefix/test/


In [None]:
# Preview the first rows of the training split rather than rendering the whole frame
train.head()