In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# importing required modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Loading the dataset and analysing the values

In [None]:
# Read the insurance CSV into a DataFrame and preview its first rows.
csv_path = '/kaggle/input/insurance/insurance.csv'
insurance = pd.read_csv(csv_path)
insurance.head()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
insurance.shape

In [None]:
# Show the distinct values present in each categorical column.
for column in ('sex', 'smoker', 'region'):
    print(insurance[column].unique())

In [None]:
# Summary statistics of the frame, using pandas' default describe() settings.
insurance.describe()

# Visualizing the relation of various features

### Applying a pairplot to all the numerical data, since it provides a lot of insight into the relationships in a single plot that is easy to analyse and compare

In [None]:
# Draw the seaborn pairplot for the whole frame; the trailing ';' keeps the
# returned object's repr out of the cell output.
sns.pairplot(insurance);

### Applying scatter plots for all the categorical features individually

In [None]:
# Visualising how each categorical feature relates to the dependent
# variable (charges), starting with sex.

sns.scatterplot(data=insurance, x='sex', y='charges');

In [None]:
# Charges plotted against smoker status.
sns.scatterplot(data=insurance, x='smoker', y='charges');

In [None]:
# Charges plotted against region.
sns.scatterplot(data=insurance, x='region', y='charges');

# Feature Engineering

In [None]:
# Count the missing entries in each column.

insurance.isna().sum()

In [None]:
# Separate the predictors (every column except 'charges') from the label.

X = insurance.drop(columns='charges')
y = insurance['charges']
print(X.shape, y.shape)

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# encoding categorical values
#
# The categorical columns are selected by name instead of by position
# (presumably positions 1, 4 and 5 of the feature frame -- verify against
# X.columns). Name-based selection makes this cell fail loudly, rather than
# silently one-hot encoding the wrong columns, if it is re-run after X has
# already been replaced by the encoded array or if the input column order
# changes.
#
# sparse_threshold=0 forces a dense ndarray result regardless of how sparse
# the one-hot block is; the positional np.delete applied to X afterwards
# needs a dense array.
#
# Output layout: the one-hot columns for sex, smoker and region come first,
# followed by the untouched remaining columns in their original order.

columnTransformer = ColumnTransformer(
    [('encoder', OneHotEncoder(), ['sex', 'smoker', 'region'])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = columnTransformer.fit_transform(X)
X[0:5]

In [None]:
# Avoid the dummy variable trap by removing columns 1, 3 and 4 of the encoded
# array; this also reduces the dimensionality of the data, with negligible
# difference in R2-Score.
# NOTE(review): presumably these positions are one indicator column per
# encoded categorical feature -- confirm against the encoder's categories_.
# NOTE(review): this cell overwrites X, so running it twice removes further
# columns; re-run the encoding cell first.

redundant_dummy_columns = [1, 3, 4]
X = np.delete(X, redundant_dummy_columns, axis=1)
X[:5]

In [None]:
# Hold out 20% of the rows as a test set; random_state=0 fixes the shuffle.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
print(y_train.shape, y_test.shape)

In [None]:
# Feature scaling: the scaler is fitted on the training rows only and the
# same fitted scaler is then applied to both splits.

st = StandardScaler().fit(X_train)
X_train = st.transform(X_train)
X_test = st.transform(X_test)

# Applying RandomForest Regression and fitting training data to model

In [None]:
# finding the best model parameters
#
# Each candidate max_depth is scored with 5-fold cross-validation on the
# training split only, so the held-out test split is never used to choose a
# hyperparameter (scoring candidates on the test split makes the final test
# score optimistically biased). The score is the estimator's default R2.
# A fixed random_state makes the curve reproducible across runs.

depths = list(range(2, 15))
scores = [
    cross_val_score(
        RandomForestRegressor(max_depth=depth, random_state=0),
        X_train,
        y_train,
        cv=5,
    ).mean()
    for depth in depths
]

fig, ax = plt.subplots()
ax.plot(depths, scores, marker='o')
ax.set(
    xlabel='max_depth',
    ylabel='mean 5-fold CV R2 (training split)',
    title='RandomForestRegressor: max_depth vs. cross-validated R2',
);

In [None]:
# making the model with the best parameters
#
# random_state is fixed so that the bootstrap samples and feature draws, and
# therefore every score and importance reported from this model, are
# reproducible from one run to the next.

reg = RandomForestRegressor(max_depth=4, random_state=0)

In [None]:
# 5-fold cross-validation of the configured model on the full X and y
# (not the train/test split).

scores = cross_val_score(reg, X, y, cv=5)
report = 'Cross Validation Scores: {:.3f} {:.3f} {:.3f} {:.3f} {:.3f}'
print(report.format(*scores))

In [None]:
# Fit the model on the training split, then report its score on each split.

reg.fit(X_train, y_train)
train_score = reg.score(X_train, y_train)
test_score = reg.score(X_test, y_test)
print('Training Data Score: {:.2f}'.format(train_score))
print('Testing Data Score: {:.2f}'.format(test_score))

In [None]:
# most influential features
#
# Displays the fitted model's feature_importances_ attribute.
# NOTE(review): presumably one value per column of the training matrix, in
# column order; the columns are positions in the encoded array rather than
# named features, so map them back through the column transformer to
# interpret them -- confirm the column order before drawing conclusions.

reg.feature_importances_

In [None]:
# R2-Score of the model and mean_squared_error
#
# Predictions for the test split are computed once and reused for both
# metrics. Both calls pass the arguments in the (y_true, y_pred) order, which
# keeps the two metric calls consistent with each other; the squared error
# is symmetric in its arguments, so the printed value is unaffected.

y_pred = reg.predict(X_test)
print('R2 Score: {}'.format(r2_score(y_test, y_pred)))
print('Mean Squared Error: {}'.format(mean_squared_error(y_test, y_pred)))