In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load data

In [None]:
# Location of the claims CSV inside the Kaggle input directory.
data_file = '/kaggle/input/car-insurance-data/Car_Insurance_Claim.csv'
# Read the whole file into a DataFrame.
data = pd.read_csv(filepath_or_buffer=data_file)
# Preview the first ten rows as the cell output.
data.head(n=10)

## Get labels and features

In [None]:
# The OUTCOME column is the target the classifier will learn to predict.
y = data['OUTCOME']

Drop `ID`, `POSTAL_CODE` and `OUTCOME` (which is the label).

In [None]:
# Every column except the label (OUTCOME), the row identifier and the postal code
# is kept as a model feature, in the same order as in `data`.
excluded_columns = {'OUTCOME', 'ID', 'POSTAL_CODE'}
features = [column for column in data.columns if column not in excluded_columns]
X = data[features]

## Split data

In [None]:
# Split data
# Hold out part of the rows for evaluation (train_test_split's default test size is used).
# A fixed random_state makes the shuffle deterministic, so the split -- and every score
# computed from it -- is the same after "Restart & Run All".
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=42)

## Categorical label encoding
Convert the categorical labels to integer codes so they can be used with the Decision Tree Classifier.

### Age encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode into copies so the frames produced by the split stay untouched.
train_X2, test_X2 = train_X.copy(), test_X.copy()
train_y2, test_y2 = train_y.copy(), test_y.copy()

# Learn the AGE categories from the training rows only, then apply the same
# category -> integer mapping to both the training and the test rows.
age_encoder = LabelEncoder()
age_encoder.fit(train_X['AGE'])
train_X2['AGE'] = age_encoder.transform(train_X['AGE'])
test_X2['AGE'] = age_encoder.transform(test_X['AGE'])
# Cell output: the integer code assigned to each age bracket.
age_encoder.transform(['16-25', '26-39', '40-64', '65+'])

### Gender encoding

In [None]:
# Fit on the training rows only; the test rows reuse the learned mapping.
gender_encoder = LabelEncoder().fit(train_X['GENDER'])
train_X2['GENDER'] = gender_encoder.transform(train_X['GENDER'])
test_X2['GENDER'] = gender_encoder.transform(test_X['GENDER'])
# Cell output: the integer code assigned to each gender label.
gender_encoder.transform(['female', 'male'])

### Race encoding

In [None]:
# Fit on the training rows only; the test rows reuse the learned mapping.
race_encoder = LabelEncoder().fit(train_X['RACE'])
train_X2['RACE'] = race_encoder.transform(train_X['RACE'])
test_X2['RACE'] = race_encoder.transform(test_X['RACE'])
# Cell output: the integer code assigned to each race label.
race_encoder.transform(['majority', 'minority'])

### Driving experience encoding

In [None]:
# Fit on the training rows only; the test rows reuse the learned mapping.
exp_encoder = LabelEncoder().fit(train_X['DRIVING_EXPERIENCE'])
train_X2['DRIVING_EXPERIENCE'] = exp_encoder.transform(train_X['DRIVING_EXPERIENCE'])
test_X2['DRIVING_EXPERIENCE'] = exp_encoder.transform(test_X['DRIVING_EXPERIENCE'])
# Cell output: the integer code assigned to each experience bracket.
exp_encoder.transform(['0-9y', '10-19y', '20-29y', '30y+'])

### Education encoding

In [None]:
# Education levels and the digit that replaces each one. The replacements are
# digit *strings*, so the column still holds text after this cell.
education_codes = {'none': '0', 'high school': '1', 'university': '2'}

# Apply the replacements to the training frame first, then to the test frame.
for frame in (train_X2, test_X2):
    for level, code in education_codes.items():
        frame['EDUCATION'] = frame['EDUCATION'].str.replace(level, code)

### Income encoding

In [None]:
# Income brackets and the digit that replaces each one. The replacements are
# digit *strings*, so the column still holds text after this cell.
income_codes = {
    'poverty': '0',
    'working class': '1',
    'middle class': '2',
    'upper class': '3',
}

# Apply the replacements to the training frame first, then to the test frame.
for frame in (train_X2, test_X2):
    for bracket, code in income_codes.items():
        frame['INCOME'] = frame['INCOME'].str.replace(bracket, code)

### Vehicle ownership, married, children encoding

In [None]:
def _to_binary_flag(value):
    """Return 1 when ``value`` is at least 0.5, otherwise 0."""
    if value >= 0.5:
        return 1
    return 0


def _to_child_count(value):
    """Return ``int(value)`` when ``value`` is at least 0.5, otherwise 0."""
    if value >= 0.5:
        return int(value)
    return 0


# Column -> converter, processed in the listed order; within each column the
# training frame is converted before the test frame.
flag_converters = (
    ('VEHICLE_OWNERSHIP', _to_binary_flag),
    ('MARRIED', _to_binary_flag),
    ('CHILDREN', _to_child_count),
)
for column, converter in flag_converters:
    for frame in (train_X2, test_X2):
        frame[column] = frame[column].apply(converter)

### Vehicle year

In [None]:
# Vehicle-year labels and the digit that replaces each one. The replacements
# are digit *strings*, so the column still holds text after this cell.
vehicle_year_codes = {'before 2015': '0', 'after 2015': '1'}

# Apply the replacements to the training frame first, then to the test frame.
for frame in (train_X2, test_X2):
    for label, code in vehicle_year_codes.items():
        frame['VEHICLE_YEAR'] = frame['VEHICLE_YEAR'].str.replace(label, code)

### Vehicle type

In [None]:
# Fit on the training rows only; the test rows reuse the learned mapping.
vht_encoder = LabelEncoder().fit(train_X['VEHICLE_TYPE'])
train_X2['VEHICLE_TYPE'] = vht_encoder.transform(train_X['VEHICLE_TYPE'])
test_X2['VEHICLE_TYPE'] = vht_encoder.transform(test_X['VEHICLE_TYPE'])
# Cell output: the integer code assigned to each vehicle type.
vht_encoder.transform(['sedan', 'sports car'])

### Fill missing values

In [None]:
from sklearn.impute import SimpleImputer

# SimpleImputer with its default settings; the fill values are learned from the
# training rows only and then reused for the test rows.
imputation = SimpleImputer()

# The imputer returns plain arrays. Re-attach the original column labels and row
# index: a bare pd.DataFrame(array) would relabel the columns 0..n-1 and reset the
# index, so the imputed frames would no longer line up with train_y / test_y or be
# readable by feature name.
imputed_X_train = pd.DataFrame(
    imputation.fit_transform(train_X2),
    columns=train_X2.columns,
    index=train_X2.index,
)
imputed_X_test = pd.DataFrame(
    imputation.transform(test_X2),
    columns=test_X2.columns,
    index=test_X2.index,
)

In [None]:
# Preview the first ten rows of the encoded training features (train_X2, i.e. before imputation).
train_X2.head(n=10)

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error

# Fix random_state so the fitted tree is the same on every run; without it the
# estimator's internal randomness can produce a different tree (and a different
# score) from identical training data.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(imputed_X_train, train_y)

# Predict the held-out rows, then map each prediction to a plain int 0/1
# (anything >= 0.5 becomes 1) so it can be compared with the integer labels.
test_predictions = decision_tree.predict(imputed_X_test)
test_predictions = [1 if x >= 0.5 else 0 for x in test_predictions]

In [None]:
# Held-out labels as a plain list of ints, in the same order as test_y.
test_y_cv = [int(label) for label in test_y]

In [None]:
# Pair each prediction with its true label, then report how many pairs agree
# alongside the number of pairs compared.
prediction_pairs = list(zip(test_predictions, test_y_cv))
total = len(prediction_pairs)
count = sum(1 for predicted, actual in prediction_pairs if predicted == actual)  # Correct
print(count, total)