In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw insurance dataset; this frame is kept untouched for the EDA below.
INSURANCE_CSV_PATH = "/kaggle/input/insurance/insurance.csv"
insurance_raw_df = pd.read_csv(INSURANCE_CSV_PATH)

In [None]:
# Preview the first rows of the raw data to sanity-check the load.
insurance_raw_df.head()

**Check for the size**

In [None]:
# (number of rows, number of columns) of the raw frame.
insurance_raw_df.shape

**Check for Missing Records**

In [None]:
# info() reports each column's dtype and non-null count, which exposes missing values.
insurance_raw_df.info()

* There are no missing values in any columns.
* We have the numeric columns - age, bmi, children, charges
* The text columns are - sex, smoker, region

## EDA

In [None]:
# Summary statistics for the numeric columns.
insurance_raw_df.describe()

**Check the numeric columns**

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Distribution of the target variable; labelled so the figure stands on its own.
fig, ax = plt.subplots()
ax.hist(insurance_raw_df["charges"])
ax.set(title="Distribution of charges", xlabel="charges", ylabel="count")
plt.show()

* The target variable is Right Skewed

In [None]:
# Distribution of policy-holder age; labelled so the figure stands on its own.
fig, ax = plt.subplots()
ax.hist(insurance_raw_df["age"])
ax.set(title="Distribution of age", xlabel="age", ylabel="count")
plt.show()

In [None]:
# Distribution of BMI; labelled so the figure stands on its own.
fig, ax = plt.subplots()
ax.hist(insurance_raw_df["bmi"])
ax.set(title="Distribution of bmi", xlabel="bmi", ylabel="count")
plt.show()

* The bmi column looks approximately normally distributed

In [None]:
# Distribution of the number of children; labelled so the figure stands on its own.
fig, ax = plt.subplots()
ax.hist(insurance_raw_df["children"])
ax.set(title="Distribution of children", xlabel="children", ylabel="count")
plt.show()

* Although `children` is a numeric column, its values are discrete and ordinal

### How does each numeric variable contribute to the Target variable?

In [None]:
# Charges against age, with labelled axes.
fig, ax = plt.subplots()
ax.scatter(x=insurance_raw_df["age"], y=insurance_raw_df["charges"])
ax.set(title="Charges vs. age", xlabel="age", ylabel="charges")
plt.show()

* Although the charges appear to increase with age, there seem to be 3 different segments in how the charges vary with age.

In [None]:
# Charges against BMI, with labelled axes.
fig, ax = plt.subplots()
ax.scatter(x=insurance_raw_df["bmi"], y=insurance_raw_df["charges"])
ax.set(title="Charges vs. bmi", xlabel="bmi", ylabel="charges")
plt.show()

* With bmi as well, though there doesn't seem to be a linear relation, there are visible clusters.

In [None]:
# Charges against number of children, with labelled axes.
fig, ax = plt.subplots()
ax.scatter(x=insurance_raw_df["children"], y=insurance_raw_df["charges"])
ax.set(title="Charges vs. children", xlabel="children", ylabel="charges")
plt.show()

**Check the Categorical Columns**

In [None]:
# sex, smoker, region

import seaborn as sns

In [None]:
# Record count per sex. The column is passed by keyword because recent seaborn
# releases no longer accept the variable as the first positional argument.
sns.countplot(x="sex", data=insurance_raw_df)
plt.show()

* The records are evenly distributed between male and female

In [None]:
# Record count per smoker status. The column is passed by keyword because recent
# seaborn releases no longer accept the variable as the first positional argument.
sns.countplot(x="smoker", data=insurance_raw_df)
plt.show()

* There are fewer smokers than non-smokers.

In [None]:
# Record count per region. The column is passed by keyword because recent seaborn
# releases no longer accept the variable as the first positional argument.
sns.countplot(x="region", data=insurance_raw_df)
plt.show()

* The records are evenly distributed across the regions as well.

## Feature Engineering

In [None]:
# Work on an independent (deep) copy so the raw frame used for EDA stays unchanged.
insurance_df = insurance_raw_df.copy(deep=True)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(
    insurance_df,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Separate the training predictors from the target column ("charges").
X_train = train_set.drop(columns="charges")
y_train = train_set["charges"].copy()

In [None]:
# Text-valued columns that are one-hot encoded before modelling.
cat_features = ["sex", "smoker", "region"]

In [None]:
# One-hot encode the categorical columns; drop_first removes one level per
# feature so the dummy columns are not perfectly collinear.
ohe_features = pd.get_dummies(
    data=X_train.loc[:, cat_features],
    drop_first=True,
)

In [None]:
# Inspect the generated dummy columns.
ohe_features.head()

In [None]:
# Append the dummy columns to the training predictors (aligned on the row index).
X_train = pd.concat(objs=[X_train, ohe_features], axis="columns")

In [None]:
# At this point the frame holds both the original text columns and their dummies.
X_train.head()

In [None]:
# Remove the original text columns now that their dummy encodings are present.
# Reassigning (rather than inplace=True) follows pandas guidance and keeps the
# cell's effect visible as an explicit new binding.
X_train = X_train.drop(columns=cat_features)

In [None]:
# Confirm that only numeric and dummy columns remain.
X_train.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler mapping every feature to the [0, 1] range (the default range, spelled out).
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Fit the scaler on the training predictors and scale them.
# The original row index is kept so X_train stays label-aligned with y_train;
# without index= the frame would be renumbered 0..n-1 while y_train keeps the
# labels from the split.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

In [None]:
# Check the scaled training features.
X_train.head()

In [None]:
from  sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline fitted on the scaled training features.
lin_reg = LinearRegression()
lin_reg.fit(X=X_train, y=y_train)

In [None]:
# Fitted intercept and per-feature coefficients of the linear model.
lin_reg.intercept_, lin_reg.coef_

In [None]:
# Tabulate the coefficients by feature name, then add the intercept as an extra
# row in the same (single, unnamed -> 0) column.
lr_coef = pd.DataFrame(lin_reg.coef_, index=X_train.columns)
lr_coef.loc["intercept", 0] = lin_reg.intercept_
lr_coef

In [None]:
# Spot-check the model on the first five training rows.
some_data, some_label = X_train.head(5), y_train.head(5)

print("Predictions:", lin_reg.predict(some_data))

In [None]:
# True charges for the same five rows, for comparison with the predictions printed earlier.
print("Actuals:", list(some_label))

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Training-set RMSE of the linear model (square root of the mean squared error).
lin_predictions_train = lin_reg.predict(X_train)
lin_mse_train = mean_squared_error(y_true=y_train, y_pred=lin_predictions_train)
lin_rmse_train = np.sqrt(lin_mse_train)
lin_rmse_train

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Training-set R^2 (coefficient of determination) of the linear model.
r2_train = r2_score(y_true=y_train, y_pred=lin_predictions_train)
r2_train

In [None]:
# Actual vs. predicted charges on the training set; points on the diagonal are
# perfect predictions. Axes are labelled so the figure stands on its own.
fig, ax = plt.subplots()
ax.scatter(y_train, lin_predictions_train)
ax.set(
    title="Linear regression: actual vs. predicted (train)",
    xlabel="actual charges",
    ylabel="predicted charges",
)
plt.show()

In [None]:
# Distribution of training residuals, computed as prediction minus actual.
fig, ax = plt.subplots()
ax.hist(lin_predictions_train - y_train)
ax.set(
    title="Linear regression: residuals (train)",
    xlabel="predicted - actual charges",
    ylabel="count",
)
plt.show()

### Predicting on the test data

In [None]:
# Separate the held-out predictors from the target column ("charges").
X_test = test_set.drop(columns="charges")
y_test = test_set["charges"].copy()

In [None]:
# Encode the test categoricals with exactly the dummy columns learned on the
# training set. Encoding the test set independently with drop_first=True would
# silently produce different columns if any category were absent from the test
# split, so every level is encoded and then aligned to the training columns
# (levels not seen in training are dropped, missing ones are filled with 0).
ohe_features_test = pd.get_dummies(X_test[cat_features]).reindex(
    columns=ohe_features.columns,
    fill_value=0,
)

In [None]:
# Append the dummy columns to the test predictors (aligned on the row index).
X_test = pd.concat(objs=[X_test, ohe_features_test], axis="columns")

In [None]:
# Remove the original text columns now that their dummy encodings are present.
# Reassigning (rather than inplace=True) follows pandas guidance and keeps the
# cell's effect visible as an explicit new binding.
X_test = X_test.drop(columns=cat_features)

In [None]:
# Scale the test predictors with the scaler already fitted on the training data.
# Calling fit_transform here would re-learn min/max from the test set, so the
# test features would be on a different scale from the one the models were
# trained on (and test-set statistics would leak into preprocessing).
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# Test-set RMSE of the linear model (square root of the mean squared error).
lin_predictions_test = lin_reg.predict(X_test)
lin_mse_test = mean_squared_error(y_true=y_test, y_pred=lin_predictions_test)
lin_rmse_test = np.sqrt(lin_mse_test)
lin_rmse_test

In [None]:
# Test-set R^2 (coefficient of determination) of the linear model.
r2_test = r2_score(y_true=y_test, y_pred=lin_predictions_test)
r2_test

## Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# Sweep the Ridge regularisation strength and report the training RMSE for each value.
# alpha multiplies the L2 penalty and is documented by scikit-learn as
# non-negative; the negative candidates were removed because they are invalid
# (recent scikit-learn versions reject them outright).
alpha = [1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 0.5, 1, 1.5, 2, 3, 4, 5, 10, 20, 30, 40]

for i in alpha:
    ridge_reg = Ridge(alpha=i)
    ridge_reg.fit(X_train, y_train)

    # Score the raw predictions: taking their absolute value first would
    # understate or distort the error and make this RMSE incomparable with the
    # RMSE reported for the other models.
    ridge_reg_predictions_train = ridge_reg.predict(X_train)
    ridge_reg_mse = mean_squared_error(y_train, ridge_reg_predictions_train)
    ridge_reg_rmse = np.sqrt(ridge_reg_mse)
    print(i, ridge_reg_rmse)

In [None]:
# Sweep the Ridge regularisation strength and report the training RMSE for each value.
# alpha multiplies the L2 penalty and is documented by scikit-learn as
# non-negative; the negative candidates (-40 .. -1) were removed because they
# are invalid (recent scikit-learn versions reject them outright).
alpha = [1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 0.5, 1, 1.5, 2, 3, 4, 5, 10, 20, 30, 40]

for i in alpha:
    ridge_reg = Ridge(alpha=i)
    ridge_reg.fit(X_train, y_train)

    # Score the raw predictions: taking their absolute value first would
    # understate or distort the error and make this RMSE incomparable with the
    # RMSE reported for the other models.
    ridge_reg_predictions_train = ridge_reg.predict(X_train)
    ridge_reg_mse = mean_squared_error(y_train, ridge_reg_predictions_train)
    ridge_reg_rmse = np.sqrt(ridge_reg_mse)
    print(i, ridge_reg_rmse)

### Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Sweep the Lasso regularisation strength and report the training RMSE for each value.
# alpha multiplies the L1 penalty and is documented by scikit-learn as
# non-negative; the negative candidates (-40 .. -1) were removed because they
# are invalid (recent scikit-learn versions reject them outright).
alpha = [1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 0.5, 1, 1.5, 2, 3, 4, 5, 10, 20, 30, 40]

for i in alpha:
    lasso_reg = Lasso(alpha=i)
    lasso_reg.fit(X_train, y_train)

    # Score the raw predictions: taking their absolute value first would
    # understate or distort the error and make this RMSE incomparable with the
    # RMSE reported for the other models.
    lasso_reg_predictions_train = lasso_reg.predict(X_train)
    lasso_reg_mse = mean_squared_error(y_train, lasso_reg_predictions_train)
    lasso_reg_rmse = np.sqrt(lasso_reg_mse)
    print(i, lasso_reg_rmse)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Unconstrained decision tree fitted on the training data.
# A fixed random_state makes tie-breaking between equally good splits
# deterministic, so the scores below are reproducible across runs (42 matches
# the seed used for the train/test split).
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)

In [None]:
# Training-set RMSE of the decision tree (square root of the mean squared error).
tree_reg_predictions = tree_reg.predict(X_train)
tree_mse = mean_squared_error(y_true=y_train, y_pred=tree_reg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
# Training-set R^2 of the decision tree.
tree_r2_score = r2_score(y_true=y_train, y_pred=tree_reg_predictions)
tree_r2_score

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree on the training data.
# The scorer returns negative MSE (higher is better), so the sign is flipped
# before taking the square root to obtain one RMSE per fold.
tree_scores = cross_val_score(
    tree_reg,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_scores = np.sqrt(-tree_scores)

def display_scores(scores):
    """Print a set of scores together with their mean and standard deviation.

    Parameters
    ----------
    scores : array-like exposing ``.mean()`` and ``.std()``
        For example the NumPy array of per-fold RMSE values.
    """
    print("Scores:", scores)
    print("Mean:", scores.mean())
    # Label corrected from the misspelled "Standar deviationi:".
    print("Standard deviation:", scores.std())
    
# Report the per-fold RMSE values of the decision tree with their mean and spread.
display_scores(tree_rmse_scores)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyper-parameters fitted on the training data.
# Bootstrap sampling and feature selection are random, so random_state is fixed
# to make the reported scores reproducible (42 matches the seed used for the
# train/test split).
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(X_train, y_train)

In [None]:
# Training-set RMSE of the random forest (square root of the mean squared error).
forest_reg_predictions = forest_reg.predict(X_train)
forest_reg_mse = mean_squared_error(y_true=y_train, y_pred=forest_reg_predictions)
forest_reg_rmse = np.sqrt(forest_reg_mse)
forest_reg_rmse

In [None]:
# Training-set R^2 of the random forest.
forest_r2_score = r2_score(y_true=y_train, y_pred=forest_reg_predictions)
forest_r2_score

## Applying Random Forest on the Test data

In [None]:
# Test-set RMSE of the random forest.
# NOTE: this rebinds the forest_reg_* names that previously held training-set values.
forest_reg_predictions = forest_reg.predict(X_test)
forest_reg_mse = mean_squared_error(y_true=y_test, y_pred=forest_reg_predictions)
forest_reg_rmse = np.sqrt(forest_reg_mse)
forest_reg_rmse

In [None]:
# Test-set R^2 of the random forest (rebinds the name used for the training score).
forest_r2_score = r2_score(y_true=y_test, y_pred=forest_reg_predictions)
forest_r2_score