# Auto Insurance ML Model

In [2]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from collections import defaultdict
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

## Data preparation

In [3]:
# Load auto_insurance_churn.csv into a dataframe
input_file = 'Data/auto_insurance_churn.csv'
churn_df = pd.read_csv(input_file, index_col=0)

churn_df.head(10)

Unnamed: 0,income,has_children,length_of_residence,marital_status,home_market_value,home_owner,college_degree,good_credit,curr_ann_amt,days_tenure,age_in_years,latitude,longitude,city,state,county,churned
0,125000.0,1,8,Single,300000 - 349999,1,1,1,949.447656,4767,83,32.492035,-96.361291,Scurry,TX,Kaufman,0
1,42500.0,0,0,Single,,0,0,0,732.323969,1606,55,32.687588,-97.129238,Arlington,TX,Tarrant,0
2,27500.0,0,15,Married,75000 - 99999,1,0,1,1143.329062,4774,83,32.748467,-96.47544,Forney,TX,Kaufman,0
3,80372.176,0,0,Unknown,1000 - 24999,1,0,0,1030.842884,3597,31,33.084836,-96.858913,The Colony,TX,Denton,0
4,125000.0,0,0,Unknown,,0,0,1,1224.092836,723,55,32.687588,-97.129238,Arlington,TX,Tarrant,0
5,70000.0,1,14,Married,100000 - 124999,1,0,1,896.197651,6291,50,32.830436,-96.759775,Dallas,TX,Dallas,0
6,87500.0,1,3,Single,75000 - 99999,1,0,1,1027.144843,1204,55,32.755547,-97.23573,Fort Worth,TX,Tarrant,0
7,62500.0,1,5,Married,50000 - 74999,1,0,1,813.401926,5983,59,33.080878,-96.669351,Allen,TX,Collin,0
8,125000.0,0,3,Married,75000 - 99999,1,1,1,1126.795506,5308,66,32.852216,-97.213551,North Richland Hills,TX,Tarrant,0
9,42500.0,1,5,Unknown,75000 - 99999,0,1,1,796.800046,2673,36,32.661237,-97.398141,Fort Worth,TX,Tarrant,0


In [4]:
# Check number of unique counties
churn_df['county'].nunique()

14

To include a location element as a feature, county will be taken as the cardinality is low compared to city (14 vs. 96) but the two are naturally related to each other. City might be considered in subsequent variations depending on if location appears to an important feature in the model to see if the more granular information it provides improves performance.

In [5]:
# Remove columns that won't be used for model
churn_df = churn_df.drop(columns=['latitude', 'longitude', 'city', 'state'])

Home market value will be transformed to use the value in-between the two numbers; to deal with NaN values without throwing out data from non-home-owners, 0 will be imputted. Additionally the values end with an indefinite 1000000 Plus, which will be converted to just 1000000.

In [6]:
# Fill NaN values in home_market_value with '0 - 0' to match pattern
churn_df['home_market_value'] = churn_df['home_market_value'].fillna('0 - 0')

In [7]:
# Replace 1000000 Plus with 1000000 - 1000000
churn_df.loc[churn_df['home_market_value'] == '1000000 Plus'] = '1000000 - 1000000'

In [8]:
# Split home_market_value
split_hmv = churn_df['home_market_value'].str.split(' - ')

In [9]:
# Change dtype of list elements to integers
split_hmv = split_hmv.apply(lambda x: list(map(int, x)))

In [10]:
# Get mean and reassign to dataframe column
churn_df['home_market_value'] = split_hmv.apply(np.mean)

In [11]:
churn_df.head(10)

Unnamed: 0,income,has_children,length_of_residence,marital_status,home_market_value,home_owner,college_degree,good_credit,curr_ann_amt,days_tenure,age_in_years,county,churned
0,125000.0,1,8,Single,324999.5,1,1,1,949.447656,4767,83,Kaufman,0
1,42500.0,0,0,Single,0.0,0,0,0,732.323969,1606,55,Tarrant,0
2,27500.0,0,15,Married,87499.5,1,0,1,1143.329062,4774,83,Kaufman,0
3,80372.176,0,0,Unknown,12999.5,1,0,0,1030.842884,3597,31,Denton,0
4,125000.0,0,0,Unknown,0.0,0,0,1,1224.092836,723,55,Tarrant,0
5,70000.0,1,14,Married,112499.5,1,0,1,896.197651,6291,50,Dallas,0
6,87500.0,1,3,Single,87499.5,1,0,1,1027.144843,1204,55,Tarrant,0
7,62500.0,1,5,Married,62499.5,1,0,1,813.401926,5983,59,Collin,0
8,125000.0,0,3,Married,87499.5,1,1,1,1126.795506,5308,66,Tarrant,0
9,42500.0,1,5,Unknown,87499.5,0,1,1,796.800046,2673,36,Tarrant,0


In [16]:
# Label encode martial_status and county columns
encoder = defaultdict(LabelEncoder)
cols_to_encode = ['marital_status', 'county']
churn_df[cols_to_encode] = churn_df[cols_to_encode].apply(lambda x: encoder[x.name].fit_transform(x))

churn_df.head()

Unnamed: 0,income,has_children,length_of_residence,marital_status,home_market_value,home_owner,college_degree,good_credit,curr_ann_amt,days_tenure,age_in_years,county,churned
0,125000.0,1,8,2,324999.5,1,1,1,949.447656,4767,83,10,0
1,42500.0,0,0,2,0.0,0,0,0,732.323969,1606,55,14,0
2,27500.0,0,15,1,87499.5,1,0,1,1143.329062,4774,83,10,0
3,80372.176,0,0,3,12999.5,1,0,0,1030.842884,3597,31,4,0
4,125000.0,0,0,3,0.0,0,0,1,1224.092836,723,55,14,0


In [18]:
# Check to ensure encoding can be reversed
test_inverse = churn_df[cols_to_encode].apply(lambda x: encoder[x.name].inverse_transform(x))

test_inverse.head()

Unnamed: 0,marital_status,county
0,Single,Kaufman
1,Single,Tarrant
2,Married,Kaufman
3,Unknown,Denton
4,Unknown,Tarrant
