# Importing Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
%matplotlib inline

# Medical Cost Personal Datasets (Kaggle "insurance" dataset)
csv_path = r"/kaggle/input/insurance/insurance.csv"
data = pd.read_csv(csv_path)

# Report (rows, columns) before showing the frame itself.
print(data.shape)
data

In [None]:
# Drop exact duplicate rows; the first occurrence of each row is kept
# (that is the default behaviour of drop_duplicates).
data = data.drop_duplicates()
data.shape

# Exploratory Data Analysis and data preprocessing

In [None]:
# Count the missing values in each column (isna is the same method as isnull).
data.isna().sum()

In [None]:
# Show the distinct values of each low-cardinality column.
for column in ("sex", "children", "smoker", "region"):
    print(data[column].unique())

In [None]:
# Keep an independent (deep) copy of the frame before it is one-hot encoded;
# the categorical plots further down use this un-encoded version.
data_dup = data.copy()

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
# dtype=int keeps the dummy columns numeric on every pandas version: pandas >= 2.0
# would otherwise emit bool columns, and converting a frame that mixes bool with
# int/float columns via to_numpy() yields an object array that Keras cannot consume.
data = pd.get_dummies(data, drop_first=True, dtype=int)

In [None]:
# Preview the first rows of the encoded frame to check the new dummy columns.
data.head()

In [None]:
# Pairwise scatter plots (and per-column distributions) for the encoded frame.
sns.pairplot(data)

In [None]:
# Enable seaborn's default theme with short colour codes, then plot
# charges against age with a fitted regression line.
sns.set(color_codes=True)
sns.lmplot(data=data, x='age', y='charges')

The feature "age" is far from having a purely linear effect on our target variable "charges", which suggests that medical expenses are hardly explained by the age of the person alone!

In [None]:
# Charges against BMI with a fitted regression line.
sns.lmplot(data=data, x='bmi', y='charges')

In [None]:
# Correlation of every column with the target, sorted ascending.
corr_with_charges = data.corr()['charges']
corr_with_charges.sort_values()

In [None]:
# Heatmap of the full correlation matrix.
f, ax = plt.pyplot.subplots(figsize=(10, 8))
corr = data.corr()
# The all-False mask hides no cell. The builtin `bool` is used because the
# `np.bool` alias was deprecated in NumPy 1.20 and is missing in 1.24-1.26.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(240,10,as_cmap=True),
            square=True, ax=ax)

In [None]:
# Swarm plot of charges by number of children, one panel per smoker status.
# Uses the un-encoded copy so that `smoker` is still a single categorical column.
sns.catplot(
    data=data_dup,
    x="children",
    y="charges",
    col="smoker",
    kind="swarm",
)

Charges tend to be higher for smokers than for non-smokers.

In [None]:
# Swarm plot of charges by the sex_male dummy, one panel per smoker_yes value.
sns.catplot(
    data=data,
    x="sex_male",
    y="charges",
    col="smoker_yes",
    kind="swarm",
)

Charges seem to be slightly higher for women than for men in both smoker categories.

# Model Training

In [None]:
# List the column names of the encoded frame before re-ordering them.
data.columns

In [None]:
# Re-order the columns so that the target `charges` comes last; the split
# below relies on the target being the final column.
feature_cols = ['age', 'bmi', 'children', 'sex_male', 'smoker_yes',
                'region_northwest', 'region_southeast', 'region_southwest']
data = data[feature_cols + ['charges']]
data

In [None]:
#TARGET AND INDEPENDENT VARIABLE SPLIT
# Shuffle with a fixed seed so that the split, and therefore every score
# reported below, is reproducible across kernel restarts.
SEED = 42
N_TRAIN = 1000  # the first N_TRAIN shuffled rows train the models, the rest test them
data = data.sample(frac=1, random_state=SEED)

# The last column is the target (`charges`); everything before it is a feature.
features, target = data.iloc[:, :-1], data.iloc[:, -1]
X_train, X_test = features.iloc[:N_TRAIN], features.iloc[N_TRAIN:]
y_train, y_test = target.iloc[:N_TRAIN], target.iloc[N_TRAIN:]

##   MODEL 1

In [None]:
from sklearn.ensemble import RandomForestRegressor

# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable, so the reported metrics do not change between runs.
clf = RandomForestRegressor(random_state=42)

# train the model
clf.fit(X_train, y_train)

# predict on test data
predict = clf.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predicted vs. actual charges for the random forest.
plt.pyplot.scatter(y_test, predict)
plt.pyplot.xlabel('Actual')
plt.pyplot.ylabel('Predicted')

# Root mean squared error and coefficient of determination on the test set.
rf_rmse = np.sqrt(mean_squared_error(y_test, predict))
rf_r2 = r2_score(y_test, predict)
print('RMSE: %.4f' % rf_rmse)
print('r2 score: %.4f' % rf_r2)

In [None]:
# Distribution of the random-forest residuals (actual minus predicted).
rf_residuals = y_test - predict
sns.distplot(rf_residuals)

This shows that the random forest regressor works quite well in predicting the health charges.

## MODEL 2

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

# A float min_samples_leaf is interpreted by scikit-learn as a fraction of the
# training samples, so every leaf must hold at least 13% of them.
tree_params = dict(max_depth=8, min_samples_leaf=0.13, random_state=3)
dtree = DecisionTreeRegressor(**tree_params)

dtree.fit(X_train, y_train)

In [None]:
# NOTE(review): this builds a second, unfitted estimator that is never assigned
# or used; only its repr is displayed. It looks like a pasted output and can
# probably be removed. The criterion name 'mse' was renamed to 'squared_error'
# in scikit-learn 1.0 and removed in 1.2 -- confirm against the installed version.
DecisionTreeRegressor(criterion='mse')

In [None]:
# Predict charges for the held-out rows with the fitted decision tree.
predict2= dtree.predict(X_test)

In [None]:
# mean_squared_error and r2_score are already imported in the Model 1 evaluation cell.
# Predicted vs. actual charges for the decision tree.
plt.pyplot.scatter(y_test, predict2)
plt.pyplot.xlabel('Actual')
plt.pyplot.ylabel('Predicted')

# Root mean squared error and coefficient of determination on the test set.
dtree_rmse = np.sqrt(mean_squared_error(y_test, predict2))
dtree_r2 = r2_score(y_test, predict2)
print('RMSE: %.4f' % dtree_rmse)
print('r2 score: %.4f' % dtree_r2)

In [None]:
# Distribution of the decision-tree residuals (actual minus predicted).
dtree_residuals = y_test - predict2
sns.distplot(dtree_residuals)

Comparing the root mean squared errors and the R² scores of both models, we can conclude that the random forest fits our data better than the decision tree.

## MODEL 3

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Expand the features with all degree-2 terms (bias, originals, squares and
# pairwise products). The expansion is fitted on the training data only and
# then re-applied unchanged to the test data.
quad = PolynomialFeatures (degree = 2)
x_quad = quad.fit_transform(X_train)
x_quad_test = quad.transform(X_test)

# Fit on the expanded features. Fitting on the raw X_train would leave the
# polynomial terms unused and reduce this model to plain linear regression.
plr = LinearRegression().fit(x_quad, y_train)

Y_train_pred = plr.predict(x_quad)
Y_test_pred = plr.predict(x_quad_test)

# R^2 on the held-out rows, scored on the same expanded feature space.
print(plr.score(x_quad_test, y_test))

In [None]:
# Predicted vs. actual charges for the regression model.
plt.pyplot.scatter(y_test, Y_test_pred)
plt.pyplot.xlabel('Actual')
plt.pyplot.ylabel('Predicted')

# Root mean squared error and coefficient of determination on the test set.
plr_rmse = np.sqrt(mean_squared_error(y_test, Y_test_pred))
plr_r2 = r2_score(y_test, Y_test_pred)
print('RMSE: %.4f' % plr_rmse)
print('r2 score: %.4f' % plr_r2)

In [None]:
# Distribution of the regression residuals (actual minus predicted).
plr_residuals = y_test - Y_test_pred
sns.distplot(plr_residuals)

Compared to models 1 and 2, model 3 fits our data the least well.

## MODEL 4

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
# Convert the four splits to plain float arrays for Keras.
# np.asarray with an explicit dtype (a) avoids an object array when the frame
# mixes bool dummy columns with numeric ones, and (b) also accepts arrays, so
# re-running this cell no longer fails on a missing .to_numpy() method.
X_train, y_train, X_test, y_test = (
    np.asarray(split, dtype=float) for split in (X_train, y_train, X_test, y_test)
)

In [None]:
# Two stacked Dense layers: a 10-unit hidden layer feeding a single output unit.
# NOTE(review): both activations are linear, so the network as a whole is still
# a linear function of its inputs.
dense_stack = [
    keras.layers.Dense(units=10, activation="linear"),  # units = number of neurons in the layer
    keras.layers.Dense(1, activation='linear'),
]
nn = tf.keras.Sequential(dense_stack)
nn.compile(optimizer='adam', loss='mean_squared_error')

nn.fit(X_train,y_train, epochs=1000)

In [None]:
# The single-unit output layer yields predictions of shape (n_samples, 1).
# Flatten them to 1-D so that arithmetic against the 1-D y_test stays
# element-wise instead of broadcasting to an (n_samples, n_samples) matrix.
prediction = nn.predict(X_test).ravel()

In [None]:
# Predicted vs. actual charges for the neural network.
plt.pyplot.scatter(y_test, prediction)
plt.pyplot.xlabel('Actual')
plt.pyplot.ylabel('Predicted')

# Root mean squared error and coefficient of determination on the test set.
nn_rmse = np.sqrt(mean_squared_error(y_test, prediction))
nn_r2 = r2_score(y_test, prediction)
print('RMSE: %.4f' % nn_rmse)
print('r2 score: %.4f' % nn_r2)

In [None]:
# Distribution of the neural-network residuals (actual minus predicted).
# The predictions are flattened first: subtracting an (n, 1) array from the
# 1-D y_test would broadcast to an (n, n) matrix rather than n residuals.
sns.distplot(y_test - np.ravel(prediction))

We need to tune the hyperparameters to train our deep-learning model better.