In [None]:
import numpy as np
import pandas as pd
import sklearn
import keras
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import r2_score

%matplotlib inline

Import the libraries we need, then read in the CSV file to see what we have to work with.

In [None]:
# Read the insurance CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../input/insurance/insurance.csv')
df.head(n=5)

It looks like we have a relatively small number of variables, some of which are objects. Let's take a deeper look at what we have to work with.

In [None]:
# Report the (rows, columns) shape, then print the per-column summary.
print('Size of our data: ', tuple(df.shape))
df.info()

It looks like we have 3 object columns: the sex, smoking status, and region of the contractor. Next, we check for any null values that may be detrimental to our model.

In [None]:
# Build the missing-value mask once and reuse it for both summaries:
# the names of columns containing any null, and the null count per column.
null_mask = df.isnull()
print('Columns with null values: ', df.columns[null_mask.any()])
null_mask.sum()

Awesome! Our data looks very clean and really easy to work with. The only thing we have to do to our data is change the dtype of the object columns into something more usable.

Perfect! Now that our data looks good to go when it comes to putting it into our model, let's start to visualize how our variables correlate with the charges.

In [None]:
# Correlation heatmap of the numeric columns.
# At this point `sex`, `smoker` and `region` are still object columns, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0. Selecting the
# numeric columns explicitly works on every pandas version and yields the
# same matrix that older versions produced by silently dropping those columns.
plt.figure(figsize = (8,8))
sns.set(style = 'whitegrid')
numeric_corr = df.select_dtypes(include = 'number').corr()
sns.heatmap(numeric_corr, annot = True, cmap = 'Blues')

Hmm, looking at our correlation map, we see that age has a higher correlation with the charges than the other numerical variables, while bmi is a close second and the number of children looks irrelevant to the charges.

In [None]:
# Bar chart of `charges` grouped by `sex`.
plt.figure(figsize=(10, 8))
plt.title('mean charges for each sex')
sns.barplot(data=df, x='sex', y='charges')

Males pay more on average than women when on insurance.

In [None]:
# Number of rows per sex.
plt.figure(figsize=(8, 8))
sns.countplot(data=df, x='sex')

We also have around the same amount of contractors being male and female

In [None]:
# Row counts per sex, split by smoking status.
sns.catplot(data=df, x='sex', hue='smoker', kind='count')

we also have relatively the same amount of smokers and non-smokers.

In [None]:
# Row counts per region, split by sex.
sns.catplot(data=df, x='region', hue='sex', kind='count')

We also have a balanced amount of male and females in each region. So far, our data looks very balanced and does not have an overflow or underflow of data.

In [None]:
# Age distribution: density-normalised histogram with a KDE curve on top.
# sns.distplot has been deprecated since seaborn 0.11 and is scheduled for
# removal; the deprecation warning is hidden in this notebook because all
# warnings are filtered. histplot(kde=True, stat='density') is the
# replacement for this usage.
plt.figure(figsize = (9,6))
plt.title('Density of age groups')
sns.histplot(df['age'], bins = 40, kde = True, stat = 'density')

In [None]:
# Joint KDE of age against charges.
sns.jointplot(data=df, x='age', y='charges', kind='kde')

Looks like a lot of people pay less than 20k in charges for most age groups but the average payment gets higher for older folks which would make sense.

In [None]:
# Pairwise relationships between the columns, coloured by sex.
sns.pairplot(data=df, hue='sex', palette='Accent')

We have now visualized a lot of our data and will try to separate the dataset into males and females and model their predictions separately. Before that, we need to do some data pre-processing and change our object columns into usable data. We do this by converting the binary categories into integers and by using dummy variables for the region, since we are working on a regression problem.

In [None]:
# Binary-encode `sex` and `smoker` in place. For each column the first label
# of the pair is rewritten to 1 and the second to 0.
binary_labels = {
    'sex': ('male', 'female'),   # males -> 1, females -> 0
    'smoker': ('yes', 'no'),     # smokers -> 1, non-smokers -> 0
}
for column, (one_label, zero_label) in binary_labels.items():
    df.loc[df[column] == one_label, column] = 1
    df.loc[df[column] == zero_label, column] = 0

In [None]:
# One-hot encode `region` and append the indicator columns to df.
# dtype='float32' keeps the indicators numeric: pandas >= 2.0 otherwise
# returns bool columns, and a frame mixing bool and numeric columns converts
# to an object array, which the Keras model cannot train on.
dummy = pd.get_dummies(df['region'], dtype = 'float32')
# Drop indicator columns left over from an earlier run of this cell so that
# re-executing it does not append duplicate columns.
df = pd.concat([df.drop(columns = dummy.columns, errors = 'ignore'), dummy], axis = 1)
df.head(10)

In [None]:
# Build the modelling frame: df without the original string `region` column.
data = df.drop(['region'], axis = 1)
print('Shape of our data: ', data.shape)
data.info()
# The preview is the cell's last expression on purpose: a bare `data.head()`
# in the middle of a cell is evaluated and discarded, so it is never shown.
data.head()

We have 2 objects that we need to convert into floats

In [None]:
# Cast the 0/1-encoded `sex` and `smoker` columns to float32, then re-print
# the column summary to confirm the new dtypes.
data = data.astype({'sex': 'float32', 'smoker': 'float32'})

data.info()

In [None]:

# Partition the rows on the encoded `sex` flag (1 = male, 0 = female).
df_male = data.loc[data['sex'].eq(1)]
df_female = data.loc[data['sex'].eq(0)]

In [None]:
# Preview the first five rows of the male subset.
df_male.head(n=5)

In [None]:
# Preview the first five rows of the female subset.
df_female.head(n=5)

Nice! We have successfully grabbed our two opposite subsets for our models. Let's start splitting our data so we can plug it into the models.

In [None]:
# Features and target for the male subset. `sex` is constant within the
# subset, so it is dropped together with the target column.
Xm = df_male.drop(columns=['charges', 'sex'])
ym = df_male.loc[:, 'charges']

print('Male data shape: ', Xm.shape)
Xm.head(n=5)

In [None]:
# Features and target for the female subset. `sex` is constant within the
# subset, so it is dropped together with the target column.
Xf = df_female.drop(columns=['charges', 'sex'])
yf = df_female.loc[:, 'charges']

print('Female data shape: ', Xf.shape)
Xf.head(n=5)


In [None]:
# Hold out 20% of each subset for testing; the fixed random_state keeps the
# split repeatable between runs.
Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    Xm, ym, test_size=0.2, random_state=10
)
Xf_train, Xf_test, yf_train, yf_test = train_test_split(
    Xf, yf, test_size=0.2, random_state=10
)

In [None]:
def nn(input_dim = 8):
    """Build and compile the feed-forward regression network.

    The network stacks four ReLU hidden layers (64, 64, 40 and 54 units), each
    followed by a Dropout layer (rates 0.15, 0.2, 0.15 and 0.18), and ends in a
    single output unit with no activation.

    Parameters
    ----------
    input_dim : int, default 8
        Number of input features fed to the first Dense layer. The default is
        the value that used to be hard-coded, so existing `nn()` calls behave
        exactly as before; pass another value if the feature set changes.

    Returns
    -------
    Sequential
        A model compiled with the Adam optimizer and mean-squared-error loss,
        tracking 'mse' and 'mae' as metrics.
    """
    # (units, dropout rate) for each hidden block, in stacking order.
    hidden_blocks = ((64, 0.15), (64, 0.2), (40, 0.15), (54, 0.18))

    model = Sequential()
    for position, (units, dropout_rate) in enumerate(hidden_blocks):
        # Only the first layer needs to be told the input width.
        layer_kwargs = {'input_dim': input_dim} if position == 0 else {}
        model.add(Dense(units, activation = 'relu', **layer_kwargs))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1))

    model.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics = ['mse', 'mae'])

    return model

In [None]:
# Train one independent network per subset with identical hyper-parameters;
# 10% of each training set is held back by Keras for validation.
model_male = nn()
model_female = nn()

history_male = model_male.fit(
    Xm_train, ym_train,
    validation_split=0.1, batch_size=3, epochs=55, verbose=0,
)
history_female = model_female.fit(
    Xf_train, yf_train,
    validation_split=0.1, batch_size=3, epochs=55, verbose=0,
)

In [None]:
# Training vs. validation MSE per epoch for the male model.
for history_key in ('mse', 'val_mse'):
    plt.plot(history_male.history[history_key])
plt.title('Mean_Squared_Error for Males')
plt.xlabel('epochs')
plt.ylabel('MSE')
plt.legend(['Training', 'Validation'], loc='upper right')
plt.show()

In [None]:
# Training vs. validation MSE per epoch for the female model.
for history_key in ('mse', 'val_mse'):
    plt.plot(history_female.history[history_key])
plt.title('Mean_Squared_Error for Females')
plt.xlabel('epochs')
plt.ylabel('MSE')
plt.legend(['Training', 'Validation'], loc='upper right')
plt.show()

In [None]:
# Training vs. validation MAE per epoch for the male model.
for history_key in ('mae', 'val_mae'):
    plt.plot(history_male.history[history_key])
plt.title('Mean Absolute Error for Males')
plt.xlabel('Epochs')
plt.ylabel('MAE')
plt.legend(['Training', 'Validation'], loc='upper right')
plt.show()

In [None]:
# Training vs. validation MAE per epoch for the female model.
for history_key in ('mae', 'val_mae'):
    plt.plot(history_female.history[history_key])
plt.title('Mean Absolute Error for Females')
plt.xlabel('Epochs')
plt.ylabel('MAE')
plt.legend(['Training', 'Validation'], loc='upper right')
plt.show()

In [None]:
# Score both models on their held-out test sets. evaluate() returns the loss
# followed by the compiled metrics, so index 1 is 'mse' and index 2 is 'mae'.
scores = model_male.evaluate(Xm_test, ym_test, verbose=0)
scores2 = model_female.evaluate(Xf_test, yf_test, verbose=0)

mse_reports = (
    ('Mean Squared Error of male model: ', scores),
    ('Mean Squared Error of female model: ', scores2),
)
for message, result in mse_reports:
    print(message, result[1])

In [None]:
# Index 2 of each evaluate() result is the 'mae' metric.
mae_reports = (
    ('Mean Absolute Error of male model: ', scores),
    ('Mean Absolute Error of female model: ', scores2),
)
for message, result in mae_reports:
    print(message, result[2])

In [None]:
# Predict on the test sets and collapse each prediction array to 1-D so it
# can be subtracted from the target Series.
ym_pred = model_male.predict(Xm_test).reshape(-1)
yf_pred = model_female.predict(Xf_test).reshape(-1)

# Residuals are defined as prediction minus actual.
error_male = ym_pred - ym_test
error_female = yf_pred - yf_test

In [None]:
# Residual distribution of the male model: density histogram plus KDE.
# sns.distplot has been deprecated since seaborn 0.11 and is scheduled for
# removal; the warning is hidden because this notebook filters all warnings.
# histplot(kde=True, stat='density') is the replacement for this usage.
plt.figure(figsize = (8,6))
plt.title('Distribution of error for male model')
sns.histplot(error_male, kde = True, bins = 50, stat = 'density')

In [None]:
# Residual distribution of the female model: density histogram plus KDE.
# sns.distplot has been deprecated since seaborn 0.11 and is scheduled for
# removal; the warning is hidden because this notebook filters all warnings.
# histplot(kde=True, stat='density') is the replacement for this usage.
plt.figure(figsize = (8,6))
plt.title('Distribution of Error for female model')
sns.histplot(error_female, kde = True, bins = 50, stat = 'density')

In [None]:
# Coefficient of determination of each model on its held-out test set.
print('r2 score for male model: ', r2_score(y_true=ym_test, y_pred=ym_pred))
print('r2 score for female model: ', r2_score(y_true=yf_test, y_pred=yf_pred))

After creating our model, looking at the training history and seeing the results, we can now say that our model is great, as it has an r2_score of >0.69. Given this and the high density of errors around 0, our model can now be used in the real world. The model's accuracy could be greatly increased if we added more neurons, more layers, and a higher percentage in our dropout layers. We could also incorporate normalization, but I chose not to go too in depth with a dataset this small. Thank you, and I hope y'all enjoy this kernel! - Michael Ramirez