In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression

# Importing Data

In [None]:
# Load the insurance dataset (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv('../input/insurance/insurance.csv')
df.head()

# Basic Exploration

In [None]:
# Column dtypes and non-null counts for the freshly loaded frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

# Data Visualization

* Making pairplots

In [None]:
# Pairwise relationships between the columns of df, drawn as a grid of plots.
sns.pairplot(df)

* Correlation Matrix

In [None]:
# Pearson correlation between the numeric columns only.
# sex / smoker / region are label-encoded further down, so they are presumably
# still text here; pandas >= 2.0 raises on non-numeric columns in DataFrame.corr()
# (older versions silently dropped them). Selecting numeric dtypes first gives
# the same matrix on every pandas version.
corr = df.select_dtypes(include='number').corr()
corr

In [None]:
# Annotated heatmap of the correlation matrix, drawn on an explicit 8x8 axes.
fig, ax = plt.subplots(figsize=(8, 8))
ax = sns.heatmap(
    corr,
    ax=ax,
    cmap='YlGnBu',
    annot=True,
    fmt='.2f',
    linewidths=0.5,
);

In [None]:
# Show the first rows again as a reminder of the column layout before plotting.
df.head()

* Plotting some columns

In [None]:
# Charges aggregated per sex (seaborn's default bar estimator is the mean).
fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(data=df, x="sex", y="charges", ax=ax);

In [None]:
# Charges aggregated per number of children (seaborn's default bar estimator is the mean).
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x="children", y="charges", ax=ax);

In [None]:
# Charges aggregated per smoker status (seaborn's default bar estimator is the mean).
fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(data=df, x="smoker", y="charges", ax=ax);

In [None]:
# Charges aggregated per region (seaborn's default bar estimator is the mean).
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x="region", y="charges", ax=ax);

In [None]:
# One point per row: BMI on the x axis against charges on the y axis.
fig, ax = plt.subplots(figsize=(16, 8))
sns.scatterplot(data=df, x="bmi", y="charges", ax=ax);

In [None]:
# One point per row: age on the x axis against charges on the y axis.
fig, ax = plt.subplots(figsize=(16, 8))
sns.scatterplot(data=df, x="age", y="charges", ax=ax);

* Label Encoding the Categorical columns

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Overwrite each categorical column with integer codes, reusing one encoder
# that is re-fitted per column (so `le` ends up fitted on 'region' only).
# NOTE(review): integer codes put an arbitrary order on 'region'; confirm this
# is acceptable for the linear model trained later.
le = LabelEncoder()
for column in ('sex', 'smoker', 'region'):
    df[column] = le.fit_transform(df[column])

In [None]:
# Preview the frame after the sex / smoker / region columns were overwritten with encoded values.
df.head()

In [None]:
# Re-check column dtypes now that the categorical columns have been encoded.
df.info()

* Splitting The data into training and testing sets

In [None]:
# Target is the 'charges' column; features are every remaining column.
y = df['charges']
x = df.drop(columns='charges')

In [None]:
# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# therefore every score and plot computed from it, is identical on each
# Restart & Run All (the forest below already uses random_state=42).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Model Building

# RandomForest

* Building Model

In [None]:
%%time
# Fit a 1000-tree random forest on the training split; the cell magic above
# reports how long the fit takes. random_state pins the forest's randomness.
model = RandomForestRegressor(n_estimators = 1000,random_state = 42)
model.fit(x_train,y_train)

* Scores

In [None]:
# Random forest score on the training split, then on the held-out split
# (one value per line).
print(model.score(x_train, y_train), model.score(x_test, y_test), sep='\n')

In [None]:
# Random forest predictions for the held-out rows.
rf_preds = model.predict(x_test)

In [None]:
# Side-by-side view of actual vs. random-forest-predicted charges;
# 'Difference' is predicted minus actual.
preds = pd.DataFrame({'Actual Charges': y_test, 'Predicted Charges': rf_preds})
preds['Difference'] = preds['Predicted Charges'] - preds['Actual Charges']
preds.head()

* Plotting feature importance

In [None]:
# Raw importance array of the fitted forest (one value per feature column of x).
print(model.feature_importances_)

In [None]:
# Label the forest's importances with the feature names and plot the five
# largest as horizontal bars on an explicit axes.
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
fig, ax = plt.subplots()
feat_importances.nlargest(5).plot(kind='barh', ax=ax)
ax.set_title('RandomForest Feature importance')  # was misspelled "impotance"
ax.set_xlabel('Importance')
plt.show()

# XGBoost

* Building Model

In [None]:
# Fit an XGBoost regressor with its default hyper-parameters on the training split.
xg_model = xgb.XGBRegressor()
xg_model.fit(x_train,y_train)

* Scores

In [None]:
# XGBoost score on the training split, then on the held-out split
# (one value per line).
print(xg_model.score(x_train, y_train), xg_model.score(x_test, y_test), sep='\n')


In [None]:
# XGBoost predictions for the held-out rows.
xg_preds = xg_model.predict(x_test)

In [None]:
# Side-by-side view of actual vs. XGBoost-predicted charges;
# 'Difference' is predicted minus actual.
xpreds = pd.DataFrame({'Actual Charges': y_test, 'Predicted Charges': xg_preds})
xpreds['Difference'] = xpreds['Predicted Charges'] - xpreds['Actual Charges']
xpreds.head()

In [None]:
# Raw importance array of the fitted XGBoost model (one value per feature column of x).
print(xg_model.feature_importances_)

* Plotting feature importance

In [None]:
# Label the XGBoost importances with the feature names and plot the five
# largest as horizontal bars on an explicit axes.
xfeat_importances = pd.Series(xg_model.feature_importances_, index=x.columns)
fig, ax = plt.subplots()
xfeat_importances.nlargest(5).plot(kind='barh', ax=ax)
ax.set_title('XGBoost Feature importance')  # was misspelled "impotance"
ax.set_xlabel('Importance')
plt.show()

# LinearRegression

* Building Model

In [None]:
# Fit an ordinary least-squares linear regression on the training split.
reg = LinearRegression()
reg.fit(x_train,y_train)

* Scores

In [None]:
# Linear regression score on the training split, then on the held-out split
# (one value per line).
print(reg.score(x_train, y_train), reg.score(x_test, y_test), sep='\n')

In [None]:
# Linear regression predictions for the held-out rows.
reg_preds = reg.predict(x_test)

In [None]:
# Side-by-side view of actual vs. linear-regression-predicted charges;
# 'Difference' is predicted minus actual.
rpreds = pd.DataFrame({'Actual Charges': y_test, 'Predicted Charges': reg_preds})
rpreds['Difference'] = rpreds['Predicted Charges'] - rpreds['Actual Charges']
rpreds.head()

# Plotting The Scores of all the Models

In [None]:
# One-row table holding each fitted model's score on the held-out split,
# with one column per model.
scores = pd.DataFrame(
    {
        label: estimator.score(x_test, y_test)
        for label, estimator in (
            ('RandomForest', model),
            ('XGBoost', xg_model),
            ('LinearRegression', reg),
        )
    },
    index=[0],
)
scores

In [None]:
# Bar chart comparing the held-out scores of the three models.
# Transposing puts one model per bar; the single resulting column is named 0,
# so the legend would only show a meaningless "0" entry and is turned off.
scores_ax = scores.T.plot(kind='bar', figsize=(10, 10), legend=False)
scores_ax.set_title('Scores of all Models')
scores_ax.set_xlabel('Model Name')
scores_ax.set_ylabel('Scores');

# Please upvote if you like this kernel!