In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
insurance= pd.read_csv("../input/insurance/insurance.csv")
df= insurance.copy()
df.head(10)

In [None]:
df.describe().T

In [None]:
df.isnull().values.any()

In [None]:
df.info()

In [None]:
dms = pd.get_dummies(df[['sex', 'smoker', 'region']])
dms.head()

In [None]:
x= df.drop(["charges",'sex', 'smoker', 'region'], axis= 1)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression

In [None]:
x= pd.concat([x,dms.drop(["sex_male","smoker_yes","region_southwest"], axis=1)], axis=1 )
x.head()

In [None]:
y= df["charges"]
y.head()

In [None]:
df= pd.concat([x,y],axis=1)
df.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.2, random_state=28)

# Report the shape of each of the four pieces.
split_shapes = [part.shape for part in (x_tr, x_te, y_tr, y_te)]
print("x_tr: {} \nx_te: {} \ny_tr: {} \ny_te: {}".format(*split_shapes))

In [None]:
linModel= LinearRegression().fit(x_tr, y_tr)

In [None]:
linModel.coef_

In [None]:
linModel.intercept_

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
linModel.score(x_tr, y_tr)

In [None]:
y_pred= linModel.predict(x_te)
y_pred[0:5]

In [None]:
print("Linear Model RMSE: {}".format( np.sqrt(mean_squared_error(y_te, y_pred)))) 

In [None]:
# 10-fold cross-validation of the plain linear model on the training split.
# scikit-learn reports negated MSE for this scorer, so flip the sign of the mean.
neg_mse_per_fold = cross_val_score(linModel,
                                   x_tr,
                                   y_tr,
                                   cv=10,
                                   scoring="neg_mean_squared_error")
cv_linModel = -neg_mse_per_fold.mean()  # mean cross-validated MSE

# No hyperparameters were tuned here, so label the number as what it is:
# the cross-validated RMSE of the linear model.
print("Linear Model CV RMSE: {}".format(np.sqrt(cv_linModel)))

In [None]:
y_pred= cross_val_predict(linModel, x_te, y_te, cv=10)

In [None]:
np.sqrt(mean_squared_error(y_te, y_pred))

# Principal Component Regression (PCR)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale 
pca = PCA()

In [None]:
x_tr_reduced= pca.fit_transform(scale(x_tr))

In [None]:
np.cumsum(np.round(pca.explained_variance_ratio_, decimals = 4)*100)

In [None]:
pcr_model = LinearRegression().fit(x_tr_reduced[:,0:6],y_tr)

In [None]:
pcr_model.score(x_tr_reduced[:,0:6], y_tr)

In [None]:
y_pred= pcr_model.predict(x_tr_reduced[:,0:6])

In [None]:
np.sqrt(mean_squared_error(y_tr, y_pred))

In [None]:
x_te_reduced = PCA().fit_transform(scale(x_te))

In [None]:
y_pred= pcr_model.predict(x_te_reduced[:,0:6])

In [None]:
np.sqrt(mean_squared_error(y_te, y_pred))

## Model Tuning

In [None]:
from sklearn import model_selection
cv_10 = model_selection.KFold(n_splits = 10,
                             shuffle = True,
                             random_state = 1)

In [None]:
lm = LinearRegression()
RMSE = []

In [None]:
for i in np.arange(1, x_tr_reduced.shape[1] + 1):
    
    score = np.sqrt(-1*model_selection.cross_val_score(lm, 
                                                       x_tr_reduced[:,:i], 
                                                       y_tr.ravel(), 
                                                       cv=cv_10, 
                                                       scoring='neg_mean_squared_error').mean())
    RMSE.append(score)
    
RMSE

# Partial Least Squares (PLS) Regression

In [None]:
from sklearn.cross_decomposition import PLSRegression, PLSSVD

In [None]:
x_tr.head()

In [None]:
y_tr.head()

In [None]:
pls_model= PLSRegression().fit(x_tr, y_tr)

In [None]:
y_pred= pls_model.predict(x_te)

In [None]:
np.sqrt(mean_squared_error(y_te, y_pred))

In [None]:
r2_score(y_te, y_pred)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Cross-validated RMSE of PLS regression for every possible number of components.
component_counts = np.arange(1, x_tr.shape[1] + 1)
RMSE = []

for i in component_counts:
    pls = PLSRegression(n_components=i)
    # The scorer returns negated MSE; flip the sign before taking the root.
    neg_mse = cross_val_score(pls, x_tr, y_tr, cv=cv_10, scoring='neg_mean_squared_error')
    score = np.sqrt(-1 * neg_mse.mean())
    RMSE.append(score)


plt.plot(component_counts, np.array(RMSE), '-v', c="r")
plt.xlabel('Count of Components')
plt.ylabel('RMSE')
# The target of this notebook is the insurance `charges` column, not a salary.
plt.title('PLS: Cross-Validated RMSE for Insurance Charges');

In [None]:
pls_model = PLSRegression(n_components = 7).fit(x_tr, y_tr)

In [None]:
y_pred= pls_model.predict(x_te)

In [None]:
np.sqrt(mean_squared_error(y_te, y_pred))

# Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

In [None]:
ridge_model= Ridge().fit(x_tr, y_tr)

In [None]:
y_pred= ridge_model.predict(x_te)

In [None]:
np.sqrt(mean_squared_error(y_te, y_pred))

## Model Tuning

In [None]:
from sklearn.linear_model import RidgeCV

lambdas= 10**np.linspace(-10,10,500)*0.5

ridge_cv = RidgeCV(alphas = lambdas, 
                   scoring = "neg_mean_squared_error",
                   normalize = True)

In [None]:
ridge_cv.fit(x_tr, y_tr)

In [None]:
ridge_cv.alpha_

In [None]:
ridge_tuned = Ridge(alpha = ridge_cv.alpha_, normalize = True).fit(x_tr,y_tr)

In [None]:
np.sqrt(mean_squared_error(y_te, ridge_tuned.predict(x_te)))

In [None]:
r2_score(y_te, ridge_tuned.predict(x_te))

# Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

In [None]:
lasso_model= Lasso().fit(x_tr, y_tr)

In [None]:
np.sqrt(mean_squared_error(y_te, lasso_model.predict(x_te)))

## Model Tuning

In [None]:
from sklearn.linear_model import LassoCV

In [None]:
lasso_cv_model= LassoCV(alphas= None, cv=10, max_iter= 1000, normalize= True).fit(x_tr, y_tr)

In [None]:
lasso_cv_model.alpha_

In [None]:
lasso_tuned= Lasso(alpha= lasso_cv_model.alpha_).fit(x_tr, y_tr)

In [None]:
np.sqrt(mean_squared_error(y_te, lasso_tuned.predict(x_te)))

In [None]:
r2_score(y_te, lasso_tuned.predict(x_te))

# ElasticNet Regression

In [None]:
from sklearn.linear_model import ElasticNet, ElasticNetCV

In [None]:
enet_model= ElasticNet().fit(x_tr, y_tr)

In [None]:
enet_model.get_params()

In [None]:
np.sqrt(mean_squared_error(y_te, enet_model.predict(x_te)))

In [None]:
r2_score(y_te, enet_model.predict(x_te))

## Model Tuning

In [None]:
enet_cv= ElasticNetCV(cv= 10, random_state= 28).fit(x_tr, y_tr)

In [None]:
enet_cv.alpha_, enet_cv.l1_ratio

In [None]:
enet_tuned= ElasticNet(alpha= enet_cv.alpha_, l1_ratio= enet_cv.l1_ratio).fit(x_tr, y_tr)

In [None]:
np.sqrt(mean_squared_error(y_te, enet_tuned.predict(x_te)))

In [None]:
r2_score(y_te, enet_tuned.predict(x_te))