In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively list every file under the Kaggle input directory, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# A Simple Practical Guide to Linear Regression
This notebook is a practical guide to implementing linear regression. It walks through the model-building lifecycle: EDA, feature engineering, model implementation, and model evaluation. See the article "[A Practical Guide to Linear Regression](https://towardsdatascience.com/a-practical-guide-to-linear-regression-3b1cb9e501a6)" for a step-by-step guide, or visit [my website](http://www.visual-design.net) for more articles like this.
![Linear Regression Cheatsheet](https://miro.medium.com/max/1400/1*_xszvgfP2xIQz7krzbJMOA.png)

# Load Dataset

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_string_dtype, is_numeric_dtype

# Read the insurance dataset from the Kaggle input directory and preview the first rows.
data_path = '../input/insurance-premium-prediction/insurance.csv'
df = pd.read_csv(data_path)
df.head(5)

In [None]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Descriptive statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

In [None]:
# Classify each column as numeric or categorical and draw its distribution
# in its own figure (labelled and titled with the column name).
num_list, cat_list = [], []

for column in df.columns:
    series = df[column]
    plt.figure(column, figsize=(5, 5))
    plt.title(column)
    if is_numeric_dtype(series):
        # numeric column -> histogram
        num_list.append(column)
        series.plot.hist()
    elif is_string_dtype(series):
        # categorical column -> bar chart of its 10 most frequent values
        cat_list.append(column)
        series.value_counts().head(10).plot.bar()

print(num_list)
print(cat_list)

# EDA
- correlation analysis
- pairplot
- pairplot with hue

In [None]:
# Pearson correlation between the numeric columns only.
# At this point the frame still holds string columns; since pandas 2.0
# DataFrame.corr() no longer drops them silently and raises ValueError,
# so restrict the frame to numeric dtypes explicitly (works on older pandas too).
correlation = df.select_dtypes(include='number').corr()
sns.heatmap(correlation, cmap = "GnBu", annot = True)

In [None]:
# Pairwise scatter plots (with per-variable distributions on the diagonal).
sns.pairplot(data=df, height=2.5)

In [None]:
# One pairplot per categorical column, colouring the points by that column's values.
for hue_cat in cat_list:
    sns.pairplot(data=df, hue=hue_cat)

# Feature Engineering
- encode categorical data
- log transformation

In [None]:
# Categorical Data Encoding using One-Hot vs. Label Encoding

# One Hot Encoding using get dummies
# df = pd.get_dummies(df, columns = cat_list)

# Label Encoding
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder


# Replace every categorical column with integer labels.
# A fresh encoder is fitted per column, so label ids are independent across columns.
for cat_col in cat_list:
    encoder = LabelEncoder()
    df[cat_col] = encoder.fit_transform(df[cat_col])

df.head()

In [None]:
# log transformation
# Target becomes log2(expenses + 1); the +1 offset keeps the log finite when expenses is 0.
# Any inverse transform must therefore be 2**value - 1.
df['log_expenses'] = np.log2(df['expenses'] + 1)

# Distribution before the transformation.
plt.figure()
df['expenses'].plot(kind = 'hist', title = 'expenses (original scale)')

# Distribution after the transformation.
plt.figure()
df['log_expenses'].plot(kind = 'hist', title = 'log2(expenses + 1)')

# Preview a few rows rather than dumping the whole DataFrame into the output.
print(df.head())

# The raw target is dropped so it cannot leak into the feature matrix later.
df = df.drop(columns = ['expenses'])

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Target vector: the log-transformed expenses.
y = df["log_expenses"]

# Feature matrix: every remaining column.
X = df.drop(columns=["log_expenses"])

# Hold out 33% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Pairwise scatter plots of the frame in its current (feature-engineered) state.
sns.pairplot(data=df, height=1.5)

In [None]:
# Correlation heatmap of the frame in its current state.
# Open a fresh, unlabelled figure: the figure must not be keyed on a loop
# variable left over from another cell, which breaks when cells are run in
# isolation and attaches an unrelated label to this plot.
plt.figure(figsize = (10,10))
correlation = df.corr()
sns.heatmap(correlation, cmap = "GnBu", annot = True)

# Linear Regression Model
- split dataset into train, test
- build the model
- transform expenses predictions to original scale

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares fit on the training split.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2,
# where passing it raises TypeError, so it is omitted here. If feature scaling is
# wanted, apply a StandardScaler before the estimator instead.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions are on the same scale as y_train.
y_pred = model.predict(X_test)
coef = model.coef_
intercept = model.intercept_

# One bar per feature showing its fitted coefficient.
plt.figure(figsize = (18,6))
sns.barplot(x = X_train.columns, y = coef, palette = "GnBu")

In [None]:
# transform expenses predictions to original scale
# The target was built as log2(expenses + 1), so the inverse is 2**y - 1;
# omitting the "- 1" overstates every predicted expense by one unit.
expenses_pred = 2**y_pred - 1

# Predictions on the log2 scale.
plt.figure()
sns.histplot(y_pred, bins = 50)

# Predictions on the original expenses scale.
plt.figure()
sns.histplot(expenses_pred, bins = 50)

# Model Evaluation
- error distribution
- MAE, MSE, RMSE
- R Squared

In [None]:
import sklearn.metrics as metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Error distribution: histogram of the residuals (actual minus predicted),
# on the same scale as y_test.
residuals = y_test - y_pred
plt.figure()
sns.histplot(residuals, bins=50)

# MAE, MSE, RMSE -- the MSE is computed once and reused for the RMSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

# R Squared - coefficient of determination on the test split, rounded to 2 decimals.
r_squared = model.score(X_test, y_test)
print('R Squared:', round(r_squared, 2))

# Compare with Stochastic Gradient Descent (SGD)

In [None]:
# Stochastic Gradient Descent (SGD)

from sklearn.linear_model import SGDRegressor

# random_state pins the shuffling order so the stochastic fit is reproducible
# across "Restart & Run All".
# NOTE(review): SGD is sensitive to feature scale and the features are fed in
# unscaled here -- consider standardizing them first; confirm against results.
sgd_model = SGDRegressor(eta0=0.01, max_iter= 10000, learning_rate = 'adaptive', random_state = 42)
sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)


# model evaluation
# Residual distribution (actual minus predicted) for the SGD model.
plt.figure()
sns.histplot((y_test - y_pred), bins = 50)

# Round R^2 to two decimals: round() without ndigits collapses the score to an
# integer, which hides the actual fit quality.
print("R Squared:", round(sgd_model.score(X_test, y_test), 2))
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))