In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the insurance CSV (path is relative to the notebook's working directory)
# into the main DataFrame used by the label-encoding experiment.
df = pd.read_csv("../input/insurance/insurance.csv")

In [None]:
# Preview the first rows to check the columns and the raw value formats.
df.head()

In [None]:
# Summary statistics of the frame's numeric columns.
df.describe()

In [None]:
# Distribution of the target with a KDE overlay.
# displot is a figure-level function: it always creates its own figure, so a preceding
# plt.figure(figsize=...) call only leaves an empty extra figure behind and its size is
# ignored. The size is therefore set through height/aspect (6 in tall, 2:1 -> 12x6 in).
sns.displot(data=df, x='charges', kde=True, height=6, aspect=2);

In [None]:
# Distinct values present in the 'region' column.
pd.unique(df['region'])

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Number of rows in the dataset.
df.shape[0]

# LET'S TRY LABEL ENCODING FIRST; THEN WE CAN TRY ONE-HOT ENCODING

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Single encoder instance that is re-fitted for each categorical column in the cells
# below; after every re-fit it only holds the classes of the most recently encoded column.
le = LabelEncoder()

In [None]:
# Fit the encoder on the 'sex' labels, then overwrite the column with the integer codes.
df['sex'] = le.fit(df['sex']).transform(df['sex'])

In [None]:
# Check the frame after 'sex' was replaced by integer codes.
df.head()

# female = 0
# male = 1

In [None]:
# Re-fit the shared encoder on the 'smoker' labels and store the integer codes in place.
df['smoker'] = le.fit(df['smoker']).transform(df['smoker'])

In [None]:
# Check the frame after 'smoker' was replaced by integer codes.
df.head()

# smoker:
# Yes = 1
# No = 0

In [None]:
# Re-fit the shared encoder on the 'region' labels and store the integer codes in place.
df['region'] = le.fit(df['region']).transform(df['region'])

In [None]:
# Check the frame after 'region' was replaced by integer codes.
df.head()

# Region: Southwest=3   ; Northwest=1   ; Southeast=2   ; Northeast=0

In [None]:
# Preview of the fully label-encoded frame (df has not changed since the previous preview).
df.head()

In [None]:
# Column dtypes and non-null counts after the three columns were label-encoded.
df.info()

In [None]:
# Correlation of every column with the target, in ascending order. The last entry is
# the target's correlation with itself, so it is sliced off.
df.corr().loc[:, 'charges'].sort_values().iloc[:-1]

In [None]:
# Annotated correlation matrix of all columns, drawn on an explicit 12x6 in axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df.corr(), annot=True, cmap='plasma', ax=ax)

In [None]:
# Target: the 'charges' column; features: every remaining column.
y = df['charges']
X = df.drop(columns=['charges'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler used to standardise the feature columns.
sc = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then transform both splits with that
# fitted scaler so the test split does not influence the scaling parameters.
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Inspect the scaled training features (the output of sc.fit_transform).
X_train

# 1) LINEAR REGRESSION

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression baseline with an intercept term.
lr = LinearRegression(fit_intercept=True)

In [None]:
# Train the linear model on the scaled training split.
lr.fit(X_train,y_train)

In [None]:
# Dimensions of the training features.
np.shape(X_train)

In [None]:
# Dimensions of the test features.
np.shape(X_test)

In [None]:
# Predict charges for the held-out test features.
lr_pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score

In [None]:
# Root mean squared error (square root of the MSE) of the linear model on the test split.
(mean_squared_error(y_test,lr_pred))**0.5

# Bad Results with LR

# 2) CatBoost Regression

In [None]:
from catboost import CatBoostRegressor

In [None]:
# CatBoost regressor with a fixed seed so repeated runs give the same model.
cb = CatBoostRegressor(random_state=42)

In [None]:
# Early stopping and best-iteration selection (use_best_model=True) tune the model on
# whatever is passed as eval_set. Passing the test split there lets the model selection
# see the data it is later scored on, which makes the reported test error optimistic.
# A validation split is therefore carved out of the training data and used instead;
# the test split stays untouched until evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
cb.fit(X_fit, y_fit, use_best_model=True, eval_set=(X_val, y_val), early_stopping_rounds=30)

In [None]:
# Predict charges for the held-out test features with the CatBoost model.
cb_pred = cb.predict(X_test)

In [None]:
# Root mean squared error of the CatBoost predictions on the test split.
(mean_squared_error(y_test,cb_pred))**0.5

In [None]:
# Coefficient of determination (R^2) of the CatBoost predictions on the test split.
r2_score(y_test,cb_pred)

# Relatively better than LR

# 3) KNN Regression

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# k-nearest-neighbours regressor using the 5 closest training points per prediction.
knn = KNeighborsRegressor(n_neighbors=5)

In [None]:
# Fit KNN on the scaled training split.
knn.fit(X_train,y_train)

In [None]:
# Predict charges for the held-out test features with KNN.
knn_pred = knn.predict(X_test)

In [None]:
# Root mean squared error of the KNN predictions on the test split.
(mean_squared_error(y_test,knn_pred))**0.5

# KNN better than LR but worse than CB

# 4) SVR

In [None]:
from sklearn.svm import SVR

In [None]:
# Support vector regressor with the library's default hyperparameters (none are overridden).
svr = SVR()

In [None]:
# Fit the SVR on the scaled training split.
svr.fit(X_train,y_train)

In [None]:
# Predict charges for the held-out test features with the SVR.
svr_pred = svr.predict(X_test)

In [None]:
# Root mean squared error of the SVR predictions on the test split.
(mean_squared_error(y_test,svr_pred))**0.5

# SVR worst

# 5) Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest of 100 trees with a fixed seed for reproducible results.
rf = RandomForestRegressor(random_state=42,n_estimators=100)

In [None]:
# Fit the random forest on the scaled training split.
rf.fit(X_train,y_train)

In [None]:
# Predict charges for the held-out test features with the random forest.
rf_pred = rf.predict(X_test)

In [None]:
# Root mean squared error of the random forest predictions on the test split.
(mean_squared_error(y_test,rf_pred))**0.5

# RF 2nd best after CB

# So the best method was CatBoost Regression

# NOW LET'S TRY IMPLEMENTING ONE-HOT ENCODING

In [None]:
# Reload the raw CSV into a separate frame so the one-hot experiment can start from
# the original, un-encoded values.
df2 = pd.read_csv("../input/insurance/insurance.csv")

In [None]:
# Preview the freshly loaded frame.
df2.head()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the feature columns at positions 1, 3, 4 and 5; every other column is
# passed through unchanged.
# NOTE(review): these positions presumably correspond to sex, children, smoker and
# region in the insurance.csv layout -- confirm against the frame's columns.
# handle_unknown='ignore': a category that appears only in the test split is encoded
# as all zeros instead of making ct.transform raise.
# sparse_threshold=0: always return a dense array, so wrapping the result in
# np.array(...) can never produce a 0-d object array around a sparse matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [1, 3, 4, 5])],
    remainder='passthrough',
    sparse_threshold=0,
)

In [None]:
# Build the one-hot experiment from df2, the freshly loaded and still un-encoded frame.
# The split was previously taken from df, which has already been label-encoded in place,
# so this section silently depended on those earlier mutations and df2 was never used.
X2 = df2.drop('charges', axis=1)
y2 = df2['charges']
# Same 70/30 split and seed as the label-encoding experiment, so both use the same rows.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X2, y2, test_size=0.3, random_state=42)

In [None]:
# Fit the column transformer on the training features only, then apply the same fitted
# encoding to the test features; both results are converted to NumPy arrays.
# NOTE: the inputs are overwritten, so re-running this cell would feed the
# already-encoded arrays back into ct.
X_train_2 = np.array(ct.fit_transform(X_train_2))
X_test_2 = np.array(ct.transform(X_test_2))

In [None]:
# Use a dedicated scaler for the one-hot features. Re-fitting the shared `sc` here would
# overwrite the statistics it learned for the label-encoded features, leaving `sc`
# unusable for transforming data for the first set of models.
sc2 = StandardScaler()
# Fit on the training split only, then apply the same scaling to the test split.
X_train_2 = sc2.fit_transform(X_train_2)
X_test_2 = sc2.transform(X_test_2)

In [None]:
# Inspect the encoded and scaled training features.
X_train_2

In [None]:
# Dimensions of the encoded training features.
np.shape(X_train_2)

In [None]:
# Dimensions of the encoded test features.
np.shape(X_test_2)

# 1) Linear Regression

In [None]:
# Linear regression model for the one-hot encoded features.
lr2 = LinearRegression()

In [None]:
# Train the linear model on the one-hot encoded training split.
lr2.fit(X_train_2,y_train_2)

In [None]:
# Predict charges for the one-hot encoded test features.
lr2_pred = lr2.predict(X_test_2)

In [None]:
# Root mean squared error of the linear model on the one-hot test split.
(mean_squared_error(y_test_2,lr2_pred))**0.5

# 2) CatBoost Regression

In [None]:
# CatBoost regressor for the one-hot encoded features, seeded like the first one.
cb2 = CatBoostRegressor(random_state=42)

In [None]:
# Early stopping and best-iteration selection (use_best_model=True) tune the model on
# whatever is passed as eval_set. Passing the test split there lets the model selection
# see the data it is later scored on, which makes the reported test error optimistic.
# A validation split is therefore carved out of the training data and used instead;
# the test split stays untouched until evaluation.
X_fit_2, X_val_2, y_fit_2, y_val_2 = train_test_split(
    X_train_2, y_train_2, test_size=0.2, random_state=42
)
cb2.fit(
    X_fit_2,
    y_fit_2,
    use_best_model=True,
    eval_set=(X_val_2, y_val_2),
    early_stopping_rounds=30,
    plot=True,
)

In [None]:
# Predict charges for the one-hot encoded test features with CatBoost.
cb2_pred = cb2.predict(X_test_2)

In [None]:
# Root mean squared error of the CatBoost predictions on the one-hot test split.
(mean_squared_error(y_test_2,cb2_pred))**0.5

# ONE-HOT ENCODING MADE THE MODEL SLIGHTLY WORSE

In [None]:
# Summary statistics of df (the label-encoded frame from the first experiment, not df2).
df.describe()

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Coefficient of determination (R^2) of the CatBoost model trained on the one-hot features.
r2_score(y_test_2,cb2_pred)

# BEST PRACTICE FOR THIS CASE WOULD BE TO USE CATBOOST (CB) WITH LABEL ENCODING