In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')
from scipy import stats
import numpy as np

In [None]:
# Load the insurance dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/insurance/insurance.csv')

In [None]:
# Summarise columns: names, dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Preview the first 7 rows.
df.head(7)

In [None]:
# Total number of missing cells: per-column null counts summed into one scalar.
df.isnull().sum().sum()

# EDA

In [None]:
# Keep an independent copy for the EDA plots; `df` itself is transformed
# (log target, bmi filter, dummy encoding) in later cells.
data=df.copy()

In [None]:
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and later
# removed; newer releases ship the same sheet as 'seaborn-v0_8'. Pick whichever
# one this matplotlib provides so the cell runs on old and new versions alike.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

In [None]:
# Share of each sex in the dataset, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(5, 3))
sex_counts = data['sex'].value_counts()
sex_counts.plot.pie(autopct='%1.1f%%', ax=ax)
plt.show()

In [None]:
# DataFrame.plot opens its own figure when no axes is supplied, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind and the requested
# size is never applied. Create the axes explicitly and hand it to pandas.
fig, ax = plt.subplots(figsize=(5, 3))
# Rows: sex, columns: smoker (yes/no), values: number of records.
# Uses the EDA copy `data`, like the other exploratory plots.
smoker_by_sex = data.groupby('smoker')['sex'].value_counts().unstack(0)
smoker_by_sex.plot.bar(ax=ax, rot=0)
ax.set_title('Smoking cases between genders')
plt.show()

In [None]:
# DataFrame.plot opens its own figure when no axes is supplied, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind and the requested
# size is never applied. Create the axes explicitly and hand it to pandas.
fig, ax = plt.subplots(figsize=(6, 4))
# Rows: (sex, smoker) pairs, columns: region, values: number of records.
smoker_by_region = data.groupby(['region', 'sex'])['smoker'].value_counts().unstack(0)
smoker_by_region.plot.bar(ax=ax, rot=0)
ax.set_title('Smoking cases between genders and regions')
plt.show()

In [None]:
# Mean charges per region as a bar chart.
fig, ax = plt.subplots(figsize=(6, 4))
mean_charges_by_region = data.groupby('region')['charges'].mean()
mean_charges_by_region.plot.bar(ax=ax)
ax.set_title('Average charges by regions')
ax.tick_params(axis='x', labelrotation=0)
plt.show()

In [None]:
# Mean charges for every distinct age, drawn as a line.
fig, ax = plt.subplots(figsize=(6, 4))
mean_charges_by_age = data.groupby('age')['charges'].mean()
mean_charges_by_age.plot(ax=ax)
ax.set_title('Average charges by age')
plt.show()

In [None]:
# Scatter of charges against age with a fitted regression line.
# x and y must be passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the old positional form raises TypeError there.
fig, ax = plt.subplots(figsize=(6, 4))
sns.regplot(x='age', y='charges', data=data, ax=ax)
plt.show()

# Feature Engineering

In [None]:
# Distribution of the target. sns.distplot is deprecated and slated for
# removal; histplot with a density-scaled histogram and a KDE curve is its
# documented replacement.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['charges'], kde=True, stat='density', ax=ax)
plt.show()

In [None]:
# Distribution of the target with a fitted normal curve on top.
# sns.distplot (and its `fit=` option) is deprecated and slated for removal,
# so the histogram/KDE comes from histplot and the normal fit is drawn by hand.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['charges'], kde=True, stat='density', ax=ax)
norm_mu, norm_sigma = stats.norm.fit(df['charges'])
charge_grid = np.linspace(df['charges'].min(), df['charges'].max(), 200)
ax.plot(charge_grid, stats.norm.pdf(charge_grid, norm_mu, norm_sigma), color='black')
plt.show()

In [None]:
# Shape statistics of the target: asymmetry (skew) and tail weight (kurtosis).
charges_skew = df['charges'].skew()
charges_kurt = df['charges'].kurt()
print("Skewness coeff. is: %f" % charges_skew)
print("Kurtosis coeff. is: %f" % charges_kurt)

In [None]:
# Replace the target with log(1 + charges).
# The transform is computed from the untouched EDA copy `data` rather than from
# `df` itself, so re-running this cell cannot apply the log a second time.
# Assignment aligns on the index, so it also works after rows were filtered out of `df`.
df['charges'] = np.log1p(data['charges'])

In [None]:
# Distribution of the (now log-transformed) target with a fitted normal curve.
# sns.distplot (and its `fit=` option) is deprecated and slated for removal,
# so the histogram/KDE comes from histplot and the normal fit is drawn by hand.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['charges'], kde=True, stat='density', ax=ax)
norm_mu, norm_sigma = stats.norm.fit(df['charges'])
charge_grid = np.linspace(df['charges'].min(), df['charges'].max(), 200)
ax.plot(charge_grid, stats.norm.pdf(charge_grid, norm_mu, norm_sigma), color='black')
plt.show()

# Outlier Analysis

In [None]:
# Summary statistics of the numeric columns, transposed so each column is a row.
df.describe().T

In [None]:
# One horizontal box plot per numeric column on a 2x2 grid, to spot outliers.
plt.style.use('default')
# select_dtypes is the public way to pick numeric columns (the original relied
# on the private DataFrame._get_numeric_data).
numeric_columns = df.select_dtypes(include='number').columns
fig, ax = plt.subplots(2, 2, figsize=(15, 8))
# Walk the flattened grid instead of tracking row/column counters by hand.
for panel, column in zip(ax.ravel(), numeric_columns):
    # Pass the series as `x=`: a positional series is interpreted differently
    # across seaborn versions (x in 0.11, `data` from 0.12 on).
    sns.boxplot(x=df[column], ax=panel)
plt.show()

In [None]:
# (rows, columns) before the bmi filter below is applied.
df.shape

In [None]:
# Keep only the rows whose bmi is strictly below 45.
bmi_below_cutoff = df['bmi'] < 45
df = df.loc[bmi_below_cutoff]

In [None]:
# (rows, columns) after dropping rows with bmi >= 45.
df.shape

In [None]:
# Same 2x2 box-plot grid as before, redrawn on the bmi-filtered frame.
plt.style.use('default')
# select_dtypes is the public way to pick numeric columns (the original relied
# on the private DataFrame._get_numeric_data).
numeric_columns = df.select_dtypes(include='number').columns
fig, ax = plt.subplots(2, 2, figsize=(15, 8))
# Walk the flattened grid instead of tracking row/column counters by hand.
for panel, column in zip(ax.ravel(), numeric_columns):
    # Pass the series as `x=`: a positional series is interpreted differently
    # across seaborn versions (x in 0.11, `data` from 0.12 on).
    sns.boxplot(x=df[column], ax=panel)
plt.show()

# Encoding

In [None]:
# One-hot encode the non-numeric columns; drop_first=True omits the first level
# of each so the remaining indicator columns are not redundant.
df=pd.get_dummies(df, drop_first=True)

In [None]:
# Separate the target column from the feature matrix.
target_column = 'charges'
X = df.drop(columns=[target_column])
Y = df[target_column]
print(Y.shape, X.shape)

# Model 

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Column-wise accumulator for the baseline comparison: one entry per fitted model.
results={'model': [], 'r2_score': [], 'Rmse': []}

In [None]:
# Baseline regressors, each with its default hyper-parameters
# (CatBoost only has its training log silenced).
models = [
    LinearRegression(),
    Lasso(),
    Ridge(),
    ElasticNet(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
    LGBMRegressor(),
    CatBoostRegressor(silent=True),
]

In [None]:
# Start from empty score lists so that re-running this cell does not append a
# second copy of every row to `results`.
for collected in results.values():
    collected.clear()

# Fit every baseline on the training split and score it on the held-out split.
# The target is log1p(charges), so the RMSE is expressed in log units.
for model in models:
    model.fit(X_train, Y_train)
    Y_test_pred = model.predict(X_test)
    results['model'].append(type(model).__name__)
    results['r2_score'].append(r2_score(Y_test, Y_test_pred))
    results['Rmse'].append(np.sqrt(mean_squared_error(Y_test, Y_test_pred)))

In [None]:
# Tabulate the baseline scores, best R^2 first.
results_frame = pd.DataFrame(data=results)
results_frame.sort_values(by='r2_score', ascending=False)

# Model Tuning

In [None]:
# Search grids handed to GridSearchCV, one per model family.

# Regularisation strengths tried for both linear models (order kept as authored).
alpha_grid = [0.1, 0.01, 0.005, 0.05, 0.001, 0.2, 0.3, 0.5, 0.8, 0.9, 1]

ridge_params = {
    'alpha': list(alpha_grid),
    'solver': ['auto', 'svd', 'cholesky'],
}

lasso_params = {
    'alpha': list(alpha_grid),
}

lgbm_params = {
    'learning_rate': [0.1, 0.2],
    'max_depth': [-1, 0, 1],
    'n_estimators': [97, 98, 99, 100, 101, 102],
}

catboost_params = {
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [30, 50, 100],
}

In [None]:
def tuning(x, y, model, model_params, test_size=0.25, random_state=42, cv=5):
    """Grid-search ``model`` on a training split and score the result on a held-out split.

    The data is split with ``train_test_split``, ``GridSearchCV`` is fitted on
    the training part, and the fitted search object predicts the test part.
    The model name, RMSE and R^2 are printed as a side effect.

    Parameters
    ----------
    x, y
        Features and target, forwarded to ``train_test_split``.
    model
        Estimator instance to tune.
    model_params : dict
        Parameter grid passed to ``GridSearchCV``.
    test_size : float, default 0.25
        Fraction of rows held out for the final evaluation.
    random_state : int, default 42
        Seed for the train/test split.
    cv : int, default 5
        Number of cross-validation folds used by the grid search.

    Returns
    -------
    list of dict
        A single-element list whose dict has the keys ``'model'``, ``'RMSE'``,
        ``'R2 Score'``, ``'best_score'`` and ``'best_params'``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    # Train scores were requested by the original code but never read, and the
    # search object does not leave this function, so they are no longer computed.
    search = GridSearchCV(model, model_params, cv=cv)
    search.fit(X_train, Y_train)

    Y_test_pred = search.predict(X_test)
    test_error = np.sqrt(mean_squared_error(Y_test, Y_test_pred))
    r2score = r2_score(Y_test, Y_test_pred)

    model_name = type(model).__name__
    print(model_name)
    print('Rmse: {}'.format(test_error))
    print('r2_score: {}'.format(r2score))

    # Wrapped in a list so the caller can feed it straight into pd.DataFrame.
    return [{'model': model_name,
             'RMSE': test_error,
             'R2 Score': r2score,
             'best_score': search.best_score_,
             'best_params': search.best_params_}]

In [None]:
# Grid-search Ridge over ridge_params; prints test RMSE / R^2 and keeps the summary.
result1=tuning(X, Y, Ridge(), ridge_params)

In [None]:
# Grid-search Lasso over lasso_params; prints test RMSE / R^2 and keeps the summary.
result2=tuning(X, Y, Lasso(), lasso_params)

In [None]:
# Grid-search LightGBM over lgbm_params.
# `silent` is no longer an argument of LightGBM's scikit-learn estimators in
# current releases; verbose=-1 is the supported way to mute the training log.
result3 = tuning(X, Y, LGBMRegressor(verbose=-1), lgbm_params)

In [None]:
# Grid-search CatBoost over catboost_params (training log silenced); keeps the summary.
result4=tuning(X, Y, CatBoostRegressor(silent=True), catboost_params)

In [None]:
# Turn each one-row tuning summary into its own DataFrame.
res1, res2, res3, res4 = (
    pd.DataFrame(tuned) for tuned in (result1, result2, result3, result4)
)

In [None]:
# Stack the four one-row summaries into a single table.
# ignore_index=True renumbers the rows; without it every row keeps label 0,
# which makes label-based lookups on `res` ambiguous.
res = pd.concat([res1, res2, res3, res4], ignore_index=True)

In [None]:
# Rank the tuned models by held-out R^2, best first, and display the table.
sorted_results = res.sort_values(by='R2 Score', ascending=False)
sorted_results

# Final Model

In [None]:
# Refit the best tuned model on the training split and report its held-out scores.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

# The top row of `sorted_results` can belong to any of the four tuned models, so
# build the estimator that row actually names instead of always feeding its
# parameters to CatBoost. Columns are read by name rather than by position.
best_row = sorted_results.iloc[0]
final_model_builders = {
    'Ridge': lambda params: Ridge(**params),
    'Lasso': lambda params: Lasso(**params),
    'LGBMRegressor': lambda params: LGBMRegressor(**params, verbose=-1),
    'CatBoostRegressor': lambda params: CatBoostRegressor(**params, silent=True),
}
lg = final_model_builders[best_row['model']](best_row['best_params'])
lg.fit(X_train, Y_train)

Y_test_pred = lg.predict(X_test)
# The target is log1p(charges), so this RMSE is expressed in log units.
test_error = np.sqrt(mean_squared_error(Y_test, Y_test_pred))
r2score = r2_score(Y_test, Y_test_pred)

# Report the class of the model fitted here, not the variable left over from
# the baseline loop.
print(type(lg).__name__)
# Format the percentage directly; rounding first and then multiplying by 100
# can print float noise such as 88.10000000000001.
print('r2_score: {:.1f}%'.format(r2score * 100))
print('Rmse: {}'.format(test_error))