# Medical Insurance Cost Prediction

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

## Importing Dataset

In [None]:
# Read the insurance CSV from the relative input directory.
dataset = pd.read_csv('../input/insurance/insurance.csv')

# Positional split: every column except the last is a feature, and the
# last column is the regression target.
X, y = dataset.iloc[:, :-1].to_numpy(), dataset.iloc[:, -1].to_numpy()

In [None]:
# Preview the first rows of the loaded table.
dataset.head()

In [None]:
# Print the column names, dtypes and non-null counts.
dataset.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
dataset.describe()

In [None]:
# Number of rows for each distinct value of the `region` column.
dataset['region'].value_counts()

## EDA

In [None]:
# Grid of pairwise relationship plots for the variables in `dataset`.
sns.pairplot(dataset)

In [None]:
# Pairwise correlation of the numeric columns only. Selecting them explicitly
# is required on pandas >= 2.0, where DataFrame.corr() no longer drops
# non-numeric columns silently and raises on string data instead.
dataset.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the correlation matrix of the numeric columns. The explicit
# numeric selection keeps this working on pandas >= 2.0, where
# DataFrame.corr() raises on string columns instead of skipping them.
sns.heatmap(data=dataset.select_dtypes(include='number').corr(), cmap='coolwarm')

In [None]:
# Box plot of `charges` grouped by the `children` column.
sns.boxplot(x='children', y='charges', data=dataset)

In [None]:
# Box plot of `charges` grouped by the `smoker` column.
sns.boxplot(x='smoker', y='charges', data=dataset)

In [None]:
# Box plot of `charges` grouped by the `sex` column.
sns.boxplot(x='sex', y='charges', data=dataset)

In [None]:
# Box plot of `charges` grouped by the `region` column.
sns.boxplot(x='region', y='charges', data=dataset)

## Data Preprocessing

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the feature columns at positions 1, 4 and 5; every other
# column is passed through unchanged.
# NOTE(review): presumably these are the categorical columns — verify the
# positions against the CSV column order.
ct = ColumnTransformer(transformers=[('encodes', OneHotEncoder(), [1, 4, 5])], remainder='passthrough')

In [None]:
# Fit the transformer on the full feature matrix and replace X with the
# encoded result as a NumPy array.
X = np.array(ct.fit_transform(X))

In [None]:
# Display the encoded feature matrix.
X

## Splitting Data into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression model created with its default settings.
lin_regressor = LinearRegression()

In [None]:
# Fit the linear model on the training split.
lin_regressor.fit(X_train, y_train)

In [None]:
# Linear-regression predictions for the held-out test features.
y_pred_lr = lin_regressor.predict(X_test)

## Support Vector Regression

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# The SVR is fitted on standardized data, so the features and the target each
# get their own scaler, fitted on the training split only.
scx, scy = StandardScaler(), StandardScaler()
X_train_svr = scx.fit_transform(X_train)
# StandardScaler works on 2-D input, hence the column-vector view of y_train.
y_train_svr = scy.fit_transform(y_train.reshape(-1, 1))

In [None]:
from sklearn.svm import SVR

In [None]:
# Support vector regressor with an RBF kernel; all other hyperparameters are
# left at their defaults.
svr_regressor = SVR(kernel='rbf')

In [None]:
# y_train_svr is an (n, 1) column because the scaler needs 2-D input, but
# SVR.fit expects a 1-D target; flatten it to avoid the implicit conversion
# (and the DataConversionWarning that comes with it).
svr_regressor.fit(X_train_svr, y_train_svr.ravel())

In [None]:
# Scale the test features with the scaler fitted on the training data,
# predict in standardized target space, then map back to the original scale.
# predict() returns a 1-D array while StandardScaler.inverse_transform
# requires 2-D input, so reshape to a column first and flatten the result so
# y_pred_svr has the same 1-D shape as y_test.
y_pred_svr = scy.inverse_transform(
    svr_regressor.predict(scx.transform(X_test)).reshape(-1, 1)
).ravel()

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with 10 trees; random_state is fixed so results are repeatable.
rf_regressor = RandomForestRegressor(n_estimators=10, random_state=0)

In [None]:
# Fit the forest on the training split.
rf_regressor.fit(X_train, y_train)

In [None]:
# Random-forest predictions for the held-out test features.
y_pred_rf = rf_regressor.predict(X_test)

## Multiple Random Forest Models

In [None]:
# Random Forest variants differing in tree count, split criterion, minimum
# split size and maximum depth; all share the same seed for repeatability.
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
# 'absolute_error' is the current name of the mean-absolute-error criterion;
# the old alias 'mae' was deprecated in scikit-learn 1.0 and removed in 1.2.
model_3 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]

In [None]:
from sklearn.metrics import r2_score
def rf_models(model, X_tr=None, X_v=None, y_tr=None, y_v=None):
    """Fit ``model`` and return its R^2 score on the validation data.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on ``(X_tr, y_tr)``.
    X_tr, y_tr : array-like, optional
        Training features and target. Default to the notebook-level
        ``X_train`` / ``y_train``.
    X_v, y_v : array-like, optional
        Validation features and target. Default to the notebook-level
        ``X_test`` / ``y_test``.

    Returns
    -------
    float
        ``r2_score(y_v, model.predict(X_v))``.
    """
    # Look the defaults up at call time rather than binding them in the
    # signature: signature defaults are evaluated once, when the cell defining
    # this function runs, and would go stale if the split is re-created later.
    if X_tr is None:
        X_tr = X_train
    if X_v is None:
        X_v = X_test
    if y_tr is None:
        y_tr = y_train
    if y_v is None:
        y_v = y_test

    model.fit(X_tr, y_tr)
    mod_preds = model.predict(X_v)
    return r2_score(y_v, mod_preds)

## Model Evaluation

In [None]:
from sklearn.metrics import r2_score

# Test-set R^2 of the three individually trained regressors.
print('Linear Regression', r2_score(y_test, y_pred_lr))
print('Support Vector Regression', r2_score(y_test, y_pred_svr))
print('Random Forest Regression', r2_score(y_test, y_pred_rf))
print("")

# Fit every Random Forest variant and report its test-set R^2, numbering
# the variants from 1.
for number, forest in enumerate(models, start=1):
    print("Random Forest model {}:{}".format(number, rf_models(forest)))

In [None]:
# Collect the test-set R^2 of every model under a readable label, starting
# with the Random Forest variants (each is refitted by rf_models).
model_dict = {}
for number, forest in enumerate(models, start=1):
    model_dict["Random Forest model {}".format(number)] = rf_models(forest)

# Add the three individually trained regressors.
model_dict.update({
    'Linear Regression': r2_score(y_test, y_pred_lr),
    'Support Vector Regression': r2_score(y_test, y_pred_svr),
    'Random Forest Regression': r2_score(y_test, y_pred_rf),
})

# Report the labels with the lowest and highest score.
print("The Worst Model so far is:", min(model_dict, key=lambda label: model_dict[label]))
print("The Best Model so far is:", max(model_dict, key=lambda label: model_dict[label]))