In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Read the insurance CSV, then show the first rows followed by the null count per column.
df = pd.read_csv('/kaggle/input/insurance/insurance.csv')
display(df.head(), df.isna().sum())

The dataset does not have null values.

#### Categorical variables

In [None]:
# Names of the columns whose dtype compares equal to 'object', in column order.
cat_col = [name for name, dtype in df.dtypes.items() if dtype == 'object']
cat_col

There are 3 columns that contain categorical data; let's transform these features into ordinal integers.

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode on a copy so `df` keeps its original string columns.
df1 = df.copy()

# Learn the categories of each categorical column, then overwrite those
# columns in the copy with their encoded values.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(df1[cat_col])
df1[cat_col] = ordinal_encoder.transform(df1[cat_col])

df1.head()

Now, in order to see the correlation between all the variables, we will add the encoded categorical variables from 'df1' to 'df'.

In [None]:
# Attach the encoded version of each categorical column to `df` as '<name>_code'.
for source_col in ('sex', 'smoker', 'region'):
    df[source_col + '_code'] = df1[source_col]
df.head()

In [None]:
sns.set(rc={'figure.figsize': (10, 7)})  # Defines the size of seaborn graphs

# `df` still contains the original string columns (sex, smoker, region).
# Recent pandas versions no longer drop non-numeric columns inside corr() and
# raise instead, so restrict the frame to numeric columns explicitly. The
# encoded *_code columns carry the categorical information into the heatmap.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True)

From the heatmap we can see that 'charges' is highly correlated with the fact of a person being a smoker or not 

In [None]:
# let's find out the percentage of males and females in the dataset

# Take labels and sizes from the same value_counts() result so each wedge is
# paired with its own label. unique() returns values in order of appearance
# while value_counts() sorts by count, so mixing the two can swap the labels.
sex_counts = df.sex.value_counts()
labels = list(sex_counts.index)
sizes = list(sex_counts)

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.2f%%', startangle=90)
plt.show()

The percentage is almost the same for the two categories

In [None]:
# Age vs charges, one panel per sex, points coloured by smoker status

grid = sns.FacetGrid(data=df, col='sex', hue='smoker', height=5)
grid.map(sns.scatterplot, 'age', 'charges')
grid.add_legend()

In [None]:
# Bar chart of charges by sex, with separate bars for smokers and non-smokers.
sns.barplot(x='sex', y='charges', hue='smoker', data=df)

Males and females are charged almost the same. Gender does not influence the charges; what really influences them is whether a person smokes.

In [None]:
# Count of smokers and non-smokers, split by sex
sns.catplot(data=df, x="smoker", hue='sex', kind="count", palette="pink")

Now let's see the distribution of the data for smokers and non-smokers.

In [None]:
# Draw both groups' charges histograms on the same axes: non-smokers first
# (yellow), then smokers on top in the default colour.
non_smokers = df[df.smoker == 'no']
smokers = df[df.smoker == 'yes']
sns.histplot(data=non_smokers, x='charges', color='y')
sns.histplot(data=smokers, x='charges')

Smokers spend more on treatment

Does BMI influence the charges?

In [None]:
# Charges against BMI with a separate fitted line for each smoker group.
sns.lmplot(x='bmi', y='charges', hue='smoker', data=df)

BMI does not have a great influence on the price that is paid

In order to better understand the data contained in the BMI variable, we will divide the data into BMI strata and construct side-by-side boxplots of the distribution of charges.

In [None]:
# Bin BMI into strata of width 10 starting at 10. pd.cut assigns NaN to any
# value outside the bin edges, which would silently drop those rows from the
# boxplot, so the top edge is derived from the data (and never below 50)
# instead of being hard-coded.
bmi_upper = max(50, int(np.ceil(df.bmi.max() / 10)) * 10)
bmi_bins = list(range(10, bmi_upper + 1, 10))
df['bmiagre'] = pd.cut(df.bmi, bmi_bins)
sns.boxplot(data=df, x='bmiagre', y='charges', hue='smoker').set(title='outliers')

We can see that for BMI values from 20 to 40 there are a lot of outliers in the non-smoker category.

In [None]:
# Age distribution of smokers and non-smokers

sns.histplot(df, x='age', hue='smoker')

In [None]:
# Charges against age with a separate fitted line for each smoker group.
sns.lmplot(x='age', y='charges', hue='smoker', data=df)

### Random forests

In [None]:
# The random forest is trained on df1, the frame with encoded categorical columns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Predictors and target.
features = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
X = df1[features]
y = df1.charges

# 80/20 hold-out split with a fixed seed.
train_X, val_X, train_y, val_y = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=4
)

# Fit on the training part, predict the held-out part and report its MAE.
forest_model = RandomForestRegressor(n_estimators=100, random_state=0)
forest_model.fit(train_X, train_y)
pred = forest_model.predict(val_X)
val_mae = mean_absolute_error(val_y, pred)
print(val_mae)

In [None]:
# The estimator's built-in score on the held-out split
# (scikit-learn documents this as R^2 for regressors).
forest_model.score(X=val_X, y=val_y)

In [None]:
# Compare the density of the actual validation charges with the density of
# the random-forest predictions for the same rows.
width = 12
height = 10
plt.figure(figsize=(width, height))

# sns.distplot is deprecated; kdeplot draws the same kind of density curve
# that distplot(hist=False) produced.
ax1 = sns.kdeplot(x=val_y, color="r", label="Actual Value")
sns.kdeplot(x=pred, color="b", label="Fitted Values", ax=ax1)

plt.title('Actual vs Fitted Values for Price')
plt.xlabel('Price')
plt.legend()  # the curve labels are only shown once a legend is drawn

plt.show()
plt.close()

### Polynomial Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Expand the predictors into degree-2 polynomial features.
Y = df1.charges
quad = PolynomialFeatures(degree=2)
x_quad = quad.fit_transform(X)

# Separate split (seed 0, default sizes) used only for this model.
X_train, X_test, Y_train, Y_test = train_test_split(x_quad, Y, random_state=0)

# Despite its name, `pred_logis` holds the predictions of a LinearRegression
# fitted on the polynomial features.
plr = LinearRegression()
plr.fit(X_train, Y_train)
pred_logis = plr.predict(X_test)
print(mean_absolute_error(Y_test, pred_logis))

In [None]:
# Compare the density of the actual test charges with the density of the
# polynomial-regression predictions for the same rows.
width = 12
height = 10
plt.figure(figsize=(width, height))

# The predictions in `pred_logis` were made on X_test, so the matching actual
# values are Y_test. `val_y` belongs to the random-forest split, which used a
# different seed and size. kdeplot replaces the deprecated sns.distplot.
ax1 = sns.kdeplot(x=Y_test, color="r", label="Actual Value")
sns.kdeplot(x=pred_logis, color="b", label="Fitted Values", ax=ax1)

plt.title('Actual vs Fitted Values for Price')
plt.xlabel('Price')
plt.legend()  # the curve labels are only shown once a legend is drawn

plt.show()
plt.close()

### Random Forest with pipeline 

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score

# Two-step pipeline: imputation followed by a 50-tree random forest.
imputer_step = ('preprocessor', SimpleImputer())
model_step = ('model', RandomForestRegressor(n_estimators=50, random_state=0))
my_pipeline = Pipeline(steps=[imputer_step, model_step])

# The 'neg_mean_absolute_error' scorer yields *negative* MAE, so flip the sign
neg_mae = cross_val_score(my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
scores = -neg_mae

print("Average MAE score:", scores.mean())

### XGB Regressor 

In [None]:
from xgboost import XGBRegressor

# early_stopping_rounds is configured on the estimator: recent xgboost
# releases no longer accept it as an argument to fit().
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, early_stopping_rounds=3)

# NOTE(review): the validation split is used both to decide when to stop
# boosting and to compute the MAE below, so that MAE is likely optimistic.
my_model.fit(train_X, train_y,
             eval_set=[(val_X, val_y)],
             verbose=False)

predictions = my_model.predict(val_X)
# Arguments in (y_true, y_pred) order, consistent with the other cells.
print("Mean Absolute Error: " + str(mean_absolute_error(val_y, predictions)))

In [None]:
# Compare the density of the actual validation charges with the density of
# the XGBoost predictions for the same rows.
width = 12
height = 10
plt.figure(figsize=(width, height))

# sns.distplot is deprecated; kdeplot draws the same kind of density curve
# that distplot(hist=False) produced.
ax1 = sns.kdeplot(x=val_y, color="r", label="Actual Value")
sns.kdeplot(x=predictions, color="b", label="Fitted Values", ax=ax1)

plt.title('Actual vs Fitted Values for Price')
plt.xlabel('Price')
plt.legend()  # the curve labels are only shown once a legend is drawn

plt.show()
plt.close()