In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.stats as stat
import warnings
warnings.simplefilter("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the insurance dataset from the Kaggle input directory.
insurance_csv = "../input/insurance/insurance.csv"
data = pd.read_csv(insurance_csv)

# First look: preview the first ten rows.
data.head(10)

In [None]:
# Print pandas' concise summary of the frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Data overview: split the columns by dtype, then report the total number of
# missing values and how many columns fall into each group.
categoric = data.select_dtypes(include="object")
numeric = data.select_dtypes(include=["int", "float"])

missing_per_column = data.isna().sum()
na_total = missing_per_column.sum()
n_categoric = len(categoric.columns)
n_numeric = len(numeric.columns)

print("Number of NA observations in the data : {}".format(na_total))
print("Number of categorical column : {}".format(n_categoric))
print("Number of numeric column : {}".format(n_numeric))

In [None]:
# Summary statistics of the numeric columns, transposed so that each
# variable is a row and each statistic is a column.
summary_stats = data.describe()
summary_stats.transpose()

## Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plot

In [None]:
# Pairwise relationships between the columns of the frame.
sns.pairplot(data=data)

In [None]:
# Distribution of BMI: density histogram with a KDE overlay, followed by a
# Shapiro-Wilk normality test and the sample skewness.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` is the
# replacement for the histogram + KDE figure it used to draw.
sns.histplot(x=data.bmi, color="red", kde=True, stat="density")
bmi_p_value = stat.shapiro(data.bmi)[1]  # index 1 of the result is the p-value
print("P Value for Shapiro-Wilks test : {}".format(bmi_p_value))
print("Skewness : {}".format(stat.skew(data.bmi)))
# NOTE(review): the conclusion below is hard-coded; confirm it against the
# p-value printed above whenever the data changes.
print("BMI is not normal.")

In [None]:
# Charges vs. BMI scatter plot.
# x and y are passed by keyword: seaborn >= 0.12 makes them keyword-only, so
# the positional form raises a TypeError there.
sns.scatterplot(x=data.charges, y=data.bmi, alpha=1)

* We discard observations with a BMI greater than 50.

In [None]:
# Keep only observations with BMI <= 50. `.copy()` makes the result an
# independent frame, so later column operations on `data` do not act on a
# slice of the original (the source of pandas' SettingWithCopyWarning).
data = data[data["bmi"]<=50].copy()

# Re-check the BMI distribution after removing the extreme values.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` replaces it.
sns.histplot(x=data.bmi, color="red", kde=True, stat="density")
bmi_p_value = stat.shapiro(data.bmi)[1]  # index 1 of the result is the p-value
print("P Value for Shapiro-Wilks test : {}".format(bmi_p_value))
print("Skewness : {}".format(stat.skew(data.bmi)))
# NOTE(review): hard-coded conclusion; confirm against the values printed above.
print("BMI still not normal but now closer to normal.")

In [None]:
# Distribution of the target (charges): density histogram with KDE overlay,
# Shapiro-Wilk normality test and sample skewness.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` replaces it.
sns.histplot(x=data.charges, color="red", kde=True, stat="density")
charges_p_value = stat.shapiro(data.charges)[1]  # index 1 is the p-value
print("P Value for Shapiro-Wilks test : {}".format(charges_p_value))
print("Skewness : {}".format(stat.skew(data.charges)))
# NOTE(review): hard-coded conclusion; confirm against the p-value printed above.
print("Charges is not normal.")

In [None]:
# Since the target variable is not normal, try transformations of it.
# Log transform: log1p(x) = log(1 + x).
log_trans = np.log1p(data.charges)
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` replaces it.
sns.histplot(x=log_trans, color="red", kde=True, stat="density")
log_p_value = stat.shapiro(log_trans)[1]  # index 1 of the result is the p-value
print("P Value for Shapiro-Wilks test : {}".format(log_p_value))
print("Skewness : {}".format(stat.skew(log_trans)))
# NOTE(review): this message is hard-coded and refers to the log-transformed
# charges; confirm it against the p-value printed above.
print("Charges is not normal.")

In [None]:
# Square-root transform of the target.
sqrt_trans = np.sqrt(data.charges)
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` replaces it.
sns.histplot(x=sqrt_trans, color="red", kde=True, stat="density")
sqrt_p_value = stat.shapiro(sqrt_trans)[1]  # index 1 of the result is the p-value
print("P Value for Shapiro-Wilks test : {}".format(sqrt_p_value))
print("Skewness : {}".format(stat.skew(sqrt_trans)))
# NOTE(review): this message is hard-coded and refers to the sqrt-transformed
# charges; confirm it against the p-value printed above.
print("Charges is not normal.")

* The transformed target is still not normally distributed, but it is closer to normal than before. I will apply the square-root transform.

## Encoding

In [None]:
# One-hot encode the categorical (object) columns. `drop_first=True` drops one
# level per variable so the dummy columns are not perfectly collinear.
categoric = data.select_dtypes("object")
numeric = data.select_dtypes(["int","float"])
dummy = pd.get_dummies(categoric,drop_first=True)

# Build the encoded frame without mutating `data` in place: the dummy columns
# come first, followed by the remaining original columns (same column order
# as before). Avoiding `inplace=True` keeps the step free of
# SettingWithCopyWarning when `data` is a filtered slice.
data = pd.concat([dummy, data.drop(columns=categoric.columns)], axis=1)

In [None]:
# Feature matrix: every column except the target.
X = data.drop(columns="charges")
# Target: square root of the charges (the transform chosen in the analysis above).
y = np.sqrt(data["charges"])

# Model

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.model_selection import KFold,cross_val_predict
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from scipy.special import inv_boxcox
# Shared cross-validation splitter: 5 shuffled folds with a fixed seed so the
# splits are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Shared feature scaler used as a pipeline step by the model cells.
scale = StandardScaler()

In [None]:
# Simple Linear Regression: fit on each training fold of `kf`, score on the
# held-out fold, and plot actual vs. predicted values for that fold.
scores = []
lr = LinearRegression()
for train_index,test_index in kf.split(X):
    X_train,X_test,y_train,y_test = (X.iloc[train_index,:],X.iloc[test_index,:],
                                     y.iloc[train_index],y.iloc[test_index])

    model = lr.fit(X_train,y_train)
    pred = model.predict(X_test)
    # r2_score expects (y_true, y_pred). R2 is not symmetric, so swapping the
    # arguments reports a different (wrong) score.
    scores.append(r2_score(y_test,pred))
    plot.scatter(y_test.values,pred)
    plot.xlabel("Actual sqrt(charges)")
    plot.ylabel("Predicted sqrt(charges)")
    plot.show()
print(scores)

In [None]:
# Lasso: sweep alpha over a log-spaced grid and record the cross-validated R2
# obtained for each value.
alphas = np.geomspace(0.00001, 10, num=20)
scores = []
for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=100000)
    # Scaling is a pipeline step, so it is fitted together with the model.
    steps = [("scaler", scale), ("lasso_regression", lasso)]
    estimator = Pipeline(steps)
    predictions = cross_val_predict(estimator, X, y, cv=kf)
    scores.append(r2_score(y, predictions))

# R2 as a function of alpha (log-scaled x axis), plus the same numbers as a table.
plot.semilogx(alphas, scores, "-*")
r2_lasso = pd.DataFrame({"Alpha": alphas, "R2_Score": scores})

In [None]:
# Display the alpha / R2 table from the Lasso sweep.
r2_lasso

* As alpha increases, the model trades variance for bias: it becomes more biased and the R2 score decreases. I am going to keep alpha as small as possible to keep this bias–variance trade-off in balance.

In [None]:
# Lasso with degree-2 polynomial features: sweep alpha over a log-spaced grid,
# printing the cross-validated RMSE and recording the R2 for each value.
pf = PolynomialFeatures(degree=2)
alphas = np.geomspace(0.00001, 1, num=10)
scores = []

for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=100000)
    steps = [("polynomial_feature", pf), ("scaler", scale), ("lasso_regression", lasso)]
    estimator = Pipeline(steps)
    predictions = cross_val_predict(estimator, X, y, cv=kf)
    scores.append(r2_score(y, predictions))
    rmse = np.sqrt(mean_squared_error(y, predictions))
    print("For Alpha :: {}".format(alpha), "----> Root Mean Squared Error : {}".format(rmse))

# R2 as a function of alpha (log-scaled x axis), plus the same numbers as a table.
plot.semilogx(alphas, scores, "-*")
pf_lasso_r2 = pd.DataFrame({"Alpha": alphas, "R2_Score": scores})

In [None]:
# Display the alpha / R2 table from the polynomial-feature Lasso sweep.
pf_lasso_r2

In [None]:
# Ridge with degree-2 polynomial features: sweep alpha over a log-spaced grid,
# printing the cross-validated RMSE and recording the R2 for each value.
pf = PolynomialFeatures(degree=2)
alphas = np.geomspace(0.0001, 20, num=15)
scores = []

for alpha in alphas:
    ridge = Ridge(alpha=alpha, max_iter=100000)
    steps = [("polynomial_feature", pf), ("scaler", scale), ("ridge_regression", ridge)]
    estimator = Pipeline(steps)
    predictions = cross_val_predict(estimator, X, y, cv=kf)
    rmse = np.sqrt(mean_squared_error(y, predictions))
    print("For Alpha :: {}".format(alpha), "----> Root Mean Squared Error : {}".format(rmse))
    scores.append(r2_score(y, predictions))

# R2 as a function of alpha (log-scaled x axis), plus the same numbers as a table.
plot.semilogx(alphas, scores, "-o")
ridge_r2 = pd.DataFrame({"Alpha": alphas, "R2_Score": scores})

In [None]:
# Display the alpha / R2 table from the Ridge sweep.
ridge_r2

In [None]:
from sklearn.model_selection import GridSearchCV
# Joint grid search over the polynomial degree and the Ridge penalty, using
# the shared `kf` splitter for cross-validation.
estimator = Pipeline(steps=[
    ("polynomial", PolynomialFeatures(include_bias=False)),
    ("scale", scale),
    ("ridge_regression", Ridge()),
])
# Keys follow the "<step name>__<parameter>" convention of sklearn pipelines.
params = {
    "polynomial__degree": [1, 2, 3],
    "ridge_regression__alpha": np.geomspace(0.001, 10, num=20),
}
grid = GridSearchCV(estimator=estimator, param_grid=params, cv=kf)
grid.fit(X, y)
# Best mean cross-validated score found by the search.
grid.best_score_

# Main Model

In [None]:
# Hyperparameter combination selected by the grid search.
grid.best_params_

In [None]:
# Main model: polynomial features -> standardisation -> Ridge, using the
# hyperparameters selected by the grid search instead of values copied by hand.
best_params = grid.best_params_
estimator = Pipeline([("polynomial",PolynomialFeatures(degree = best_params["polynomial__degree"],include_bias=False)),
                      ("scale",scale),
                      ("ridge_regression",Ridge(alpha=best_params["ridge_regression__alpha"]))])

# Build the hold-out split explicitly instead of relying on X_train / X_test
# left over from an earlier loop. `kf` has a fixed random_state, so its last
# fold is the same split on every call.
train_index,test_index = list(kf.split(X))[-1]
X_train,X_test = X.iloc[train_index,:],X.iloc[test_index,:]
y_train,y_test = y.iloc[train_index],y.iloc[test_index]

estimator.fit(X_train,y_train)
predict = estimator.predict(X_test)

# NOTE(review): the hyperparameters were tuned on all of X, including this
# test fold, so these scores are optimistic. All metrics are on the
# sqrt(charges) scale, because y is the square root of the charges.
# r2_score expects (y_true, y_pred); R2 is not symmetric in its arguments.
print("R2 Score for Ridge Regression : {}".format(r2_score(y_test,predict)))
print("Root Mean Squared Error : {}".format(np.sqrt(mean_squared_error(y_test,predict))))
print("Mean Absolute Error : {}".format(mean_absolute_error(y_test,predict)))

* Conclusion ::
Although Lasso gave a higher R2 score than Ridge, I built the main model with Ridge because Lasso runs very slowly. The optimal parameter for Ridge was found to be alpha ≈ 1.43845, and the model fitted with the optimal parameters explains about 83% of the variance in the square-root-transformed charges. If you find this notebook useful, please don't hesitate to upvote it, and feel free to leave comments to guide me.
Thanks :)