In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        file_path = os.path.join(folder, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings, so API changes in
# pandas / seaborn / scikit-learn will surface only as hard errors later.
warnings.filterwarnings("ignore")

In [None]:
# Load the insurance dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../input/insurance/insurance.csv")

In [None]:
# Preview the first rows of the raw data.
df.head()

### Columns

- **age:** age of primary beneficiary
- **sex:** insurance contractor gender, female, male
- **bmi:** Body mass index, an objective measure of whether body weight is high or low relative to height,
computed as weight divided by height squared (kg / m ^ 2), ideally 18.5 to 24.9
- **children:** Number of children covered by health insurance / Number of dependents
- **smoker:** whether the beneficiary smokes
- **region:** the beneficiary's residential area in the US, northeast, southeast, southwest, northwest.
- **charges:** Individual medical costs billed by health insurance

In [None]:
# Column dtypes, non-null counts and memory usage of the raw frame.
df.info()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still holds string columns at this point, and pandas >= 2.0 raises
# when DataFrame.corr() meets them, so the numeric columns are selected
# explicitly (this matches what older pandas did silently).
plt.figure(figsize=(10,8))
sns.heatmap(df.select_dtypes(include="number").corr())
plt.show()

# EDA

In [None]:
# Histograms of the frame's numeric columns in a single figure.
df.hist(figsize=(15,10))
plt.show()

In [None]:
# Kernel density estimate of BMI, one filled curve per sex.
# `fill=True` replaces the deprecated `shade=True`, which newer seaborn
# releases no longer accept.
plt.figure(figsize=(9,6))
sns.kdeplot(data=df, x="bmi", hue="sex", fill=True, palette="mako")
plt.show()

In [None]:
plt.style.use("fivethirtyeight")

# Side-by-side box plots of charges: by sex (left) and by smoker status (right).
# x / y are passed by keyword because seaborn >= 0.12 rejects positional data
# vectors, and each plot is drawn explicitly on the subplot created for it.
fig=plt.figure(figsize=(15,7))
ax=fig.add_subplot(121)
ax=sns.boxplot(x=df.sex, y=df.charges, palette="Pastel1", ax=ax)

ax=fig.add_subplot(122)
ax=sns.boxplot(x=df.smoker, y=df.charges, palette="Set2", ax=ax)
# The y axis label is dropped on the right panel; the left panel already shows it.
ax.set_ylabel("")
plt.show()

In [None]:
# Charges by number of children, split by sex.
fig, ax = plt.subplots(figsize=(15, 7))
sns.boxplot(
    data=df,
    x="children",
    y="charges",
    hue="sex",
    palette="Pastel1",
    ax=ax,
)
plt.show()

In [None]:
# Distribution of charges per region, split by smoker status.
fig, ax = plt.subplots(figsize=(15, 7))
sns.violinplot(
    data=df,
    x="region",
    y="charges",
    hue="smoker",
    palette="crest",
    ax=ax,
)
plt.show()

In [None]:
# Charges against age, coloured by smoker status.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x="age",
    y="charges",
    hue="smoker",
    palette="inferno_r",
    ax=ax,
)
plt.show()

In [None]:
# Charges against BMI, coloured by smoker status.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x="bmi",
    y="charges",
    hue="smoker",
    palette="mako_r",
    ax=ax,
)
plt.show()

# Data Preprocessing

## Null-Duplicated Values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
# NOTE(review): duplicates are only counted here; no visible cell removes them.
df.duplicated().sum()

In [None]:
# Re-check the column dtypes before changing any of them.
df.info()

In [None]:
# The children column is stored as int64, but it is handled as a categorical
# feature below (it is cast to object in the next cell).
# List its distinct values.
df.children.unique()

In [None]:
# Store children as object dtype so that it is picked up as a categorical column.
df["children"] = df["children"].astype("object")

## Encoding

In [None]:
# Column labels split by dtype: object columns are categorical, the rest numeric.
cat_cols = df.select_dtypes(include=["object"]).columns
num_cols = df.select_dtypes(exclude=["object"]).columns

In [None]:
# Labels of the object-dtype (categorical) columns.
cat_cols

In [None]:
# Labels of the non-object (numeric) columns.
num_cols

### Categorical Columns Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every categorical column and append the indicator columns to df.
ohe = OneHotEncoder()
ohe_data = ohe.fit_transform(df[cat_cols]).toarray()

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name only on old releases.
feature_names = getattr(ohe, "get_feature_names_out", None) or ohe.get_feature_names
ohe_cols = feature_names(cat_cols)

# Reuse df's index so the join below aligns row by row whatever index df carries.
ohe_df = pd.DataFrame(data=ohe_data, columns=ohe_cols, index=df.index)

df = df.join(ohe_df)

In [None]:
# Remove the original categorical columns now that their encoded versions exist.
df.drop(columns=cat_cols, inplace=True)
df.head()

### Numerical Columns 
    Outliers - Skewness - Normalization

    - The age column does not need any of this processing, so I will skip it

In [None]:
# Numeric columns that are candidates for the processing steps below.
num_cols 

In [None]:
# Exclude age from the columns that get outlier / skewness treatment.
# Dropping by label (instead of slicing off the first position) does not depend
# on column order and is safe to re-run: a second execution is a no-op rather
# than silently removing the next column.
num_cols = num_cols.drop("age", errors="ignore")

In [None]:
# Outliers - zscore
# Rows lying more than 3 standard deviations from a column's mean are removed.
# Columns are processed one after another, so each column's mean / std is
# computed on the rows that survived the previous columns.

before = df.shape[0]

for col in num_cols:
    center, spread = df[col].mean(), df[col].std()
    lower = center - 3*spread
    upper = center + 3*spread

    is_outlier = (df[col] > upper) | (df[col] < lower)
    df.drop(index=df.loc[is_outlier].index, inplace=True)

after = df.shape[0]

print("Total Number of Outleirs :",(before-after))

In [None]:
# Skewness
# A column whose skewness exceeds 0.5 is treated as highly skewed and is
# replaced by log(1 + x).

from scipy.stats import skew

skewness = df[num_cols].apply(skew).sort_values(ascending=False)
skew_cols = skewness[skewness > 0.5].index

df[skew_cols] = np.log1p(df[skew_cols])

In [None]:
# Skewness of the processed columns after the log transform.
df[num_cols].skew()

In [None]:
# Normalization
# Every column is rescaled with MinMaxScaler and the frame is rebuilt with a
# fresh default index.
# NOTE(review): the scaler is fitted on all rows and on every column, including
# the target `charges`, before the train/test split further down. Reported
# errors are therefore in scaled units - confirm that this is intended.

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df = pd.DataFrame(data=scaler.fit_transform(df), columns=df.columns)

df.head()

# Models

In [None]:
# Target is the charges column; every other column is a feature.
y = df["charges"]
X = df.drop(columns="charges")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=9,
)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the training split, then score the predictions on the hold-out split.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

lr_mse = mean_squared_error(y_test, y_pred)
lr_rmse = np.sqrt(lr_mse)

# R^2 rounded to two decimals and expressed as a percentage.
lr_r2 = r2_score(y_test, y_pred)
lr_acc = round(lr_r2, 2) * 100

print("RMSE of Linear Regression:", lr_rmse)
print(f"Accuracy of Linear Regression {lr_acc} %")

In [None]:
# Predicted vs. true values on the hold-out split, with a fitted trend line.
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(
    x=y_test,
    y=y_pred,
    scatter_kws={"color": "#4e75b5"},
    line_kws={"color": "#b05862", "linewidth": 3},
    ax=ax,
)
ax.set_xlabel("True")
ax.set_ylabel("Pred")
ax.set_title("Linear Regression")
plt.show()

## Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth-limited tree; fit on the training split and score on the hold-out split.
dt_reg = DecisionTreeRegressor(max_depth=4, random_state=9).fit(X_train, y_train)
y_pred = dt_reg.predict(X_test)

dt_reg_mse = mean_squared_error(y_test, y_pred)
dt_reg_rmse = np.sqrt(dt_reg_mse)

# R^2 rounded to two decimals and expressed as a percentage.
dt_reg_r2 = r2_score(y_test, y_pred)
dt_reg_acc = round(dt_reg_r2, 2) * 100

print("RMSE of DT Regressor:", dt_reg_rmse)
print(f"Accuracy of DT Regressor {dt_reg_acc} %")

In [None]:
# Predicted vs. true values on the hold-out split, with a fitted trend line.
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(
    x=y_test,
    y=y_pred,
    scatter_kws={"color": "#4e75b5"},
    line_kws={"color": "#b05862", "linewidth": 3},
    ax=ax,
)
ax.set_xlabel("True")
ax.set_ylabel("Pred")
ax.set_title("DT Regressor")
plt.show()

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Depth-limited forest; fit on the training split and score on the hold-out split.
rf_reg = RandomForestRegressor(max_depth=4, random_state=9).fit(X_train, y_train)
y_pred = rf_reg.predict(X_test)

rf_reg_mse = mean_squared_error(y_test, y_pred)
rf_reg_rmse = np.sqrt(rf_reg_mse)

# R^2 rounded to two decimals and expressed as a percentage.
rf_reg_r2 = r2_score(y_test, y_pred)
rf_reg_acc = round(rf_reg_r2, 2) * 100

print("RMSE of RF Regressor:", rf_reg_rmse)
print(f"Accuracy of RF Regressor {rf_reg_acc} %")

In [None]:
# Predicted vs. true values on the hold-out split, with a fitted trend line.
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(
    x=y_test,
    y=y_pred,
    scatter_kws={"color": "#4e75b5"},
    line_kws={"color": "#b05862", "linewidth": 3},
    ax=ax,
)
ax.set_xlabel("True")
ax.set_ylabel("Pred")
ax.set_title("RF Regressor")
plt.show()

## Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyper-parameters; fit on the training split
# and score on the hold-out split.
gb_reg = GradientBoostingRegressor(random_state=9)

gb_reg.fit(X_train, y_train)

y_pred = gb_reg.predict(X_test)

gb_reg_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# Round R^2 to two decimals like every other model in this notebook; one
# decimal quantised this score to 10 % steps and skewed the final comparison.
gb_reg_acc = round(r2_score(y_test, y_pred),2)*100

print("RMSE of GB Regressor:",gb_reg_rmse)
print(f"Accuracy of GB Regressor {gb_reg_acc} %")

In [None]:
# Predicted vs. true values on the hold-out split, with a fitted trend line.
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(
    x=y_test,
    y=y_pred,
    scatter_kws={"color": "#4e75b5"},
    line_kws={"color": "#b05862", "linewidth": 3},
    ax=ax,
)
ax.set_xlabel("True")
ax.set_ylabel("Pred")
ax.set_title("GB Regressor")
plt.show()

## Adaboost Regressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost with default hyper-parameters; fit on the training split and score
# on the hold-out split.
ada_reg = AdaBoostRegressor(random_state=9)
ada_reg.fit(X_train, y_train)
y_pred = ada_reg.predict(X_test)

ada_reg_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# R^2 rounded to two decimals and expressed as a percentage.
ada_reg_acc = round(r2_score(y_test, y_pred),2)*100

# The labels name this model; they previously read "GB Regressor".
print("RMSE of AdaBoost Regressor:",ada_reg_rmse)
print(f"Accuracy of AdaBoost Regressor {ada_reg_acc} %")

In [None]:
# Predicted vs. true values on the hold-out split, with a fitted trend line.
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(
    x=y_test,
    y=y_pred,
    scatter_kws={"color": "#4e75b5"},
    line_kws={"color": "#b05862", "linewidth": 3},
    ax=ax,
)
ax.set_xlabel("True")
ax.set_ylabel("Pred")
ax.set_title("AdaBoost Regressor")
plt.show()

## XGBoost Regressor

In [None]:
from xgboost import XGBRegressor

# XGBoost with default hyper-parameters; fit on the training split and score
# on the hold-out split.
xgb_reg = XGBRegressor(random_state=9)
xgb_reg.fit(X_train, y_train)

y_pred = xgb_reg.predict(X_test)

xgb_reg_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# R^2 rounded to two decimals and expressed as a percentage.
xgb_reg_acc = round(r2_score(y_test, y_pred),2)*100

# The labels name this model; they previously read "GB Regressor".
print("RMSE of XGBoost Regressor:",xgb_reg_rmse)
print(f"Accuracy of XGBoost Regressor {xgb_reg_acc} %")

In [None]:
# Predicted vs. true values on the hold-out split, with a fitted trend line.
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(
    x=y_test,
    y=y_pred,
    scatter_kws={"color": "#4e75b5"},
    line_kws={"color": "#b05862", "linewidth": 3},
    ax=ax,
)
ax.set_xlabel("True")
ax.set_ylabel("Pred")
ax.set_title("XGBoost Regressor")
plt.show()

# Evaluating All Models

In [None]:
# One-row frame holding each model's RMSE, one column per model.
rmse_scores = pd.DataFrame(
    {
        "Linear Regression": [lr_rmse],
        "Decision Tree Regressor": [dt_reg_rmse],
        "Random Forest": [rf_reg_rmse],
        "Gradient Boosting Regressor": [gb_reg_rmse],
        "Ada Boost Regressor": [ada_reg_rmse],
        "XGBRegressor": [xgb_reg_rmse],
    }
)

In [None]:
# Bar chart comparing the RMSE of every model (lower is better).
# x / y are passed by keyword because seaborn >= 0.12 rejects positional data
# vectors; plain list / array inputs avoid any index alignment between them.
plt.figure(figsize=(20,8))
sns.barplot(x=list(rmse_scores.columns), y=rmse_scores.iloc[0].to_numpy(), palette="Set2")
plt.ylabel("RMSE")
plt.title("RMSE of All Models")
plt.show()

In [None]:
# One-row frame holding each model's R^2 percentage, one column per model.
r2_acc = pd.DataFrame(
    {
        "Linear Regression": [lr_acc],
        "Decision Tree Regressor": [dt_reg_acc],
        "Random Forest": [rf_reg_acc],
        "Gradient Boosting Regressor": [gb_reg_acc],
        "Ada Boost Regressor": [ada_reg_acc],
        "XGBRegressor": [xgb_reg_acc],
    }
)

In [None]:
# Bar chart comparing the R^2 percentage of every model (higher is better).
# x / y are passed by keyword because seaborn >= 0.12 rejects positional data
# vectors; plain list / array inputs avoid any index alignment between them.
plt.figure(figsize=(20,8))
sns.barplot(x=list(r2_acc.columns), y=r2_acc.iloc[0].to_numpy(), palette="Set2")
plt.ylabel("R2 (%)")
plt.title("R2 Accuracy Scores %")
plt.show()