## Libraries

In [None]:
# basic Libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# visualization libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

# Data pre-processing libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Modelling Libraries
from sklearn.ensemble import RandomForestRegressor

# evaluation & CV libraries
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV

## Importing The Dataset

In [None]:
# Location of the CSV as mounted by Kaggle; adjust when running elsewhere.
csv_path = '/kaggle/input/medical-insurance-premium-prediction/Medicalpremium.csv'
df = pd.read_csv(csv_path)
# Preview the first rows to confirm the file parsed as expected.
df.head()

In [None]:
# Column names, dtypes and non-null counts: a quick check for missing values.
df.info()

## Visualizations

In [None]:
# Premium Price: number of rows at each distinct PremiumPrice value.
fig, ax = plt.subplots(figsize=(10, 6))
# Tilt the x tick labels so the price values do not overlap.
plt.xticks(rotation=70)
sns.countplot(data=df, x='PremiumPrice', ax=ax)

Most PremiumPrice values are above 23,000.

In [None]:
# Age distribution: 20-bin histogram with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='Age', bins=20, kde=True, color='green', ax=ax)

In [None]:
# Height distribution: 20-bin histogram with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='Height', bins=20, kde=True, color='red', ax=ax)

In [None]:
# Weight distribution: 20-bin histogram with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='Weight', bins=20, kde=True, color='purple', ax=ax)

Most people are taller than 160 and weigh less than 100.

In [None]:
# Count plots of the six categorical health indicators, one panel each on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('categorical feature')

# (column, palette) pairs listed in row-major panel order, so zipping with
# axes.flat puts each plot in the same panel as before.
# NOTE(review): two colours per palette presumes each column has two levels -- confirm.
panels = [
    ('Diabetes', ['#17869E', "#656d4a"]),
    ('BloodPressureProblems', ['#9d6b53', "#9e2a2b"]),
    ('AnyTransplants', ['#8338ec', "#e0afa0"]),
    ('AnyChronicDiseases', ['#c1121f', "#17c3b2"]),
    ('KnownAllergies', ['#1982c4', "#a98467"]),
    ('HistoryOfCancerInFamily', ['#ffcb69', "#fe6d73"]),
]

# One loop replaces six near-identical calls; add or restyle a panel by editing the list.
for ax, (column, colors) in zip(axes.flat, panels):
    sns.countplot(x=column, data=df, ax=ax, palette=colors)

In [None]:
# Pairwise scatter plots of the four numeric columns on a 4x3 grid.
# Row i puts the i-th column on the x axis and each of the remaining three
# columns, in list order, on the y axis.
numeric_cols = ['Age', 'Height', 'Weight', 'PremiumPrice']

fig, axes = plt.subplots(len(numeric_cols), len(numeric_cols) - 1, figsize=(20, 20))
for row_axes, x_col in zip(axes, numeric_cols):
    # Skip the column itself: a variable plotted against itself carries no information.
    y_cols = [col for col in numeric_cols if col != x_col]
    for ax, y_col in zip(row_axes, y_cols):
        sns.scatterplot(x=x_col, y=y_col, data=df, ax=ax)

In [None]:
# The premium is the regression target; every other column is a feature.
y = df['PremiumPrice']
X = df.drop(columns=['PremiumPrice'])

In [None]:
# Pairwise correlations among the features only (the target is not in X).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(X.corr(), annot=True, cmap='copper_r', ax=ax)

## Standardizing The Data

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Fit the scaler on the training rows only, so no test-set statistics leak
# into the transform, then apply the same scaling to both splits.
scaler = StandardScaler().fit(X_train)
# X_train / X_test are rebound to their scaled versions from here on.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## Model creation

In [None]:
# Baseline random forest with default hyperparameters.
# random_state pins the bootstrap samples and feature draws, so the scores and
# error metrics below are the same on every Restart & Run All.
reg_rf = RandomForestRegressor(random_state=42)
reg_rf.fit(X_train, y_train)

In [None]:
# Predictions of the baseline forest on the held-out test rows.
y_pred = reg_rf.predict(X_test)

In [None]:
# R^2 on the training data (a regressor's .score returns the coefficient of determination).
# The model was fitted on these rows, so treat this as an optimistic figure.
reg_rf.score(X_train, y_train)

In [None]:
# R^2 on the held-out test data; compare with the training score to gauge overfitting.
reg_rf.score(X_test, y_test)

In [None]:
# Predicted vs. actual values on the test set.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel("y_test")
ax.set_ylabel("y_pred")
plt.show()

In [None]:
# Test-set error metrics; the MSE is computed once and reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
# R^2 computed from the stored test-set predictions.
metrics.r2_score(y_test, y_pred)

## Hyperparameter Tuning

In [None]:
# Base estimator for the search; seeded so every candidate fit is reproducible.
rf = RandomForestRegressor(random_state=42)

# Candidate values for each hyperparameter.
n_estimators = list(range(100, 1300, 100))  # 100, 200, ..., 1200 trees
# 'auto' was removed from RandomForestRegressor in scikit-learn 1.3 and makes
# the fit fail there. For a regressor it meant "consider all features", which
# is exactly max_features=1.0 and is accepted by old and new releases alike.
max_features = [1.0, 'sqrt']
max_depth = [5, 10, 15, 20, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

# Search space keyed by RandomForestRegressor parameter name.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [None]:
# Randomised search over random_grid: 10 sampled settings, each scored with
# 5-fold cross-validation on negative MSE (scikit-learn maximises the score,
# hence the negated error).
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,  # fixes which parameter combinations are drawn
    n_jobs=1,
    verbose=2,
)

In [None]:
# Run the search: 10 sampled configurations x 5 folds = 50 fits, followed by a
# refit of the best configuration on all training rows (refit defaults to True).
rf_random.fit(X_train,y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validated score.
rf_random.best_params_

In [None]:
# Predict with the refit best estimator.
# NOTE: this overwrites the baseline model's y_pred; the plot and metrics that
# follow describe the tuned model.
y_pred = rf_random.predict(X_test)

In [None]:
# Predicted vs. actual values on the test set, using the current y_pred.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel("y_test")
ax.set_ylabel("y_pred")
plt.show()

In [None]:
# Test-set error metrics for the current y_pred; the MSE is computed once and
# reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))