# Hello Guys! 
* Hope you are doing well. This is my notebook, where I have worked on predicting insurance costs based on the features provided in the data set.
* I wanted to test the dataset with different regression models
* I have also done feature engineering initially and then exploratory analysis to build an understanding of the relationship between variables.
* Hope you Enjoy!

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

In [None]:
# Load the insurance data set from the notebook's input directory.
csv_path = '../input/insurance/insurance.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

# Data Cleaning and Feature Engineering 

In [None]:
# Visual overview of missing values in df using missingno's matrix plot.
msno.matrix(df)

In [None]:
# Number of missing values in each column.
# (Indexing the frame with its own isnull() mask keeps only the NaN cells, so
# calling .count() on that result reports 0 for every column no matter how many
# values are missing; summing the boolean mask gives the real counts.)
df.isnull().sum()

In [None]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
df.describe()

In [None]:
# Look at the first five rows again before encoding the categorical columns.
df.head(n=5)

In [None]:
# Indicator encoding of `sex`; drop_first=True drops the first category's
# column so only the remaining indicator column(s) are appended to df.
Male = pd.get_dummies(data=df['sex'], drop_first=True)
df = pd.concat(objs=[df, Male], axis='columns')

In [None]:
# Indicator encoding of `smoker`; drop_first=True drops the first category's
# column, and the remaining indicator column(s) are appended to df.
Smoker = pd.get_dummies(data=df['smoker'], drop_first=True)
df = pd.concat(objs=[df, Smoker], axis='columns')

In [None]:
# Give the 'yes' indicator column a descriptive name.
df = df.rename({'yes': 'Smoker'}, axis='columns')

In [None]:
# Distinct region labels present in the data.
df.loc[:, 'region'].unique()

In [None]:
# One-hot encode `region`, keeping one indicator column per region value.
region = pd.get_dummies(data=df['region'])
df = pd.concat(objs=[df, region], axis='columns')
# The original text columns ('region', 'sex', 'smoker') are deliberately NOT
# dropped here: the exploratory plots below still use them.

In [None]:
# Inspect the frame after the indicator columns have been added.
df.head(n=5)

* We will be keeping the categorical columns for now because we want to perform exploratory analysis on them
* Will drop them before applying predictive models

In [None]:
# Bar chart of how many records there are for each sex.
plt.figure(figsize=(12, 6))
sns.set_style('white')
sns.countplot(data=df, x='sex', palette='GnBu')
sns.despine(left=True)


In [None]:
# Distribution of charges per sex, split by the Smoker indicator.
plt.figure(figsize=(14, 10))
sns.set_style('white')
sns.boxplot(data=df, x='sex', y='charges', hue='Smoker', palette='OrRd')
sns.despine(left=True)


In [None]:
# Age vs. charges, coloured by sex, smoker flag and region (one panel each).
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(14, 6))
age_panels = [('coolwarm', 'sex'), ('GnBu', 'Smoker'), ('magma_r', 'region')]
for panel_ax, (palette_name, hue_col) in zip(ax, age_panels):
    sns.scatterplot(data=df, x='age', y='charges', hue=hue_col,
                    palette=palette_name, ax=panel_ax)
sns.set_style('dark')
sns.despine(left=True)
# plt.legend acts on the current axes only.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Charges per region, split by smoker flag (left) and by sex (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
box_kwargs = dict(x='region', y='charges', data=df)
sns.boxplot(hue='Smoker', palette='GnBu', ax=ax[0], **box_kwargs)
sns.boxplot(hue='sex', palette='coolwarm', ax=ax[1], **box_kwargs)

In [None]:
# BMI vs. charges, coloured by sex (left) and by smoker flag (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
bmi_panels = [('GnBu_r', 'sex'), ('magma', 'Smoker')]
for panel_ax, (palette_name, hue_col) in zip(ax, bmi_panels):
    sns.scatterplot(data=df, x='bmi', y='charges', hue=hue_col,
                    palette=palette_name, ax=panel_ax)
sns.set_style('dark')
sns.despine(left=True)
# plt.legend acts on the current axes only.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Exploration is done: remove the raw text columns, which are already
# represented by indicator columns. 'southwest' is removed as well --
# presumably so one region indicator acts as the baseline; confirm intent.
df = df.drop(columns=['sex', 'region', 'smoker', 'southwest'])

In [None]:
# Check the remaining columns that will feed the models.
df.head(n=5)

In [None]:
# Heatmap of the pairwise correlations between the remaining columns.
plt.figure(figsize=(16, 6))
corr_matrix = df.corr()
sns.heatmap(data=corr_matrix, cmap='OrRd')

# Predictive Analysis

In [None]:
# Target is `charges`; every other column is used as a feature.
y = df['charges']
X = df.drop(columns='charges')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for evaluation. A fixed random_state makes the
# split -- and therefore every metric computed below -- reproducible across
# kernel restarts instead of changing on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
scaled_x_train = scaler.transform(X_train)
scaled_x_test = scaler.transform(X_test)

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 300 trees, trained on the scaled training features.
# random_state pins the forest's internal randomness so the fitted model and
# its error metrics are identical on every run.
rfr = RandomForestRegressor(n_estimators=300, random_state=42)
rfr.fit(scaled_x_train, y_train)
predict = rfr.predict(scaled_x_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Mean absolute error of the random forest on the held-out rows.
mean_absolute_error(y_true=y_test, y_pred=predict)


In [None]:
# Root mean squared error of the random forest on the held-out rows.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=predict))

# Multiple Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regression on the scaled training features.
lr = LinearRegression()
lr = lr.fit(scaled_x_train, y_train)  # fit() returns the estimator itself
predict2 = lr.predict(scaled_x_test)

In [None]:
# Mean absolute error of the linear regression on the held-out rows.
mean_absolute_error(y_true=y_test, y_pred=predict2)

In [None]:
# Root mean squared error of the linear regression on the held-out rows.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=predict2))

# Support Vector Regressor

In [None]:
from sklearn.svm import SVR

# RBF-kernel support vector regressor trained on the scaled features.
# NOTE(review): the target `charges` is left unscaled -- confirm this is intended.
regressor = SVR(kernel='rbf').fit(scaled_x_train, y_train)
predict3 = regressor.predict(scaled_x_test)

In [None]:
# Mean absolute error of the support vector regressor on the held-out rows.
mean_absolute_error(y_true=y_test, y_pred=predict3)

In [None]:
# Root mean squared error of the support vector regressor on the held-out rows.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=predict3))

# Comparing all three models!

In [None]:
# Predicted vs. actual charges for the three models, one panel per model.
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form scatterplot(pred, y_test) raises a
# TypeError there.
fig, ax = plt.subplots(1, 3, figsize=(16, 6))
sns.set_style('dark')
g = sns.scatterplot(x=predict, y=y_test, ax=ax[0], color='red')
g.set_title('Random Forest Regressor')
g.set_xlabel('Predict')

sns.set_style('dark')
h = sns.scatterplot(x=predict2, y=y_test, ax=ax[1], color='green')
h.set_title('Multiple Linear Regression')
h.set_xlabel('Predict')


sns.set_style('dark')
f = sns.scatterplot(x=predict3, y=y_test, ax=ax[2])
f.set_title('Support Vector Regression')
f.set_xlabel('Predict')



* The data set has just 1400 rows, which is why the predicted values drift from the actual values.
* However the best results achieved from the data set were from Random Forest Regressor 

# Please upvote if you like it! Reach out to me to work on projects together!
# Thank You!

Randomly conducted Test

In [None]:
# Spot-check the random forest on a fixed slice of rows (positions 257..476).
# The model was fitted on StandardScaler-transformed features, so the slice has
# to go through the same fitted scaler before prediction; feeding raw values
# makes the reported error meaningless.
# NOTE: this slice may overlap the training rows, so the score is optimistic.
sample_rows = df.iloc[257:477]
entry_1 = sample_rows.drop('charges', axis=1)
pred = rfr.predict(scaler.transform(entry_1))
np.sqrt(mean_squared_error(sample_rows['charges'], pred))

In [None]:
# Spot-check the linear regression on a fixed slice of rows (positions 257..476).
# The model was fitted on StandardScaler-transformed features, so the slice has
# to go through the same fitted scaler before prediction; feeding raw values
# makes the reported error meaningless.
# NOTE: this slice may overlap the training rows, so the score is optimistic.
sample_rows = df.iloc[257:477]
entry_1 = sample_rows.drop('charges', axis=1)
pred = lr.predict(scaler.transform(entry_1))
np.sqrt(mean_squared_error(sample_rows['charges'], pred))

In [None]:
# Spot-check the support vector regressor on a fixed slice of rows
# (positions 257..476). The model was fitted on StandardScaler-transformed
# features, so the slice has to go through the same fitted scaler before
# prediction; feeding raw values makes the reported error meaningless.
# NOTE: this slice may overlap the training rows, so the score is optimistic.
sample_rows = df.iloc[257:477]
entry_1 = sample_rows.drop('charges', axis=1)
pred = regressor.predict(scaler.transform(entry_1))
np.sqrt(mean_squared_error(sample_rows['charges'], pred))