In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.metrics import r2_score , mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures , LabelEncoder
from sklearn.tree import DecisionTreeRegressor , plot_tree
from sklearn.ensemble import RandomForestRegressor


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Dataset

## Columns

**age:** age of primary beneficiary

**sex:** insurance contractor gender, female, male

**bmi:** Body mass index, an objective index of body weight relative to height (kg / m ^ 2), computed as weight divided by height squared.
It indicates whether a person's weight is relatively high or low for their height; the ideal range is 18.5 to 24.9

**children:** Number of children covered by health insurance / Number of dependents

**smoker:** Whether the beneficiary smokes (yes / no)

**region:** the beneficiary's residential area in the US, northeast, southeast, southwest, northwest.

**charges**: Individual medical costs billed by health insurance

In [None]:
# Read the insurance dataset from the Kaggle input directory and preview the first five rows.
data = pd.read_csv(filepath_or_buffer="/kaggle/input/insurance/insurance.csv")
data.head(n=5)

## Describe

In [None]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
data.info()

In [None]:
# Descriptive statistics, transposed so that each column of `data` becomes a row.
data.describe().transpose()

## Preprocessing

### Transforming non-numerical labels into numerical labels
*    Smoker :
        - Yes = 1
        - No = 0
*    Sex :
        - Male = 1
        - Female = 0
*    Region :
        - Northeast = 0
        - Northwest = 1
        - Southeast = 2
        - Southwest = 3

In [None]:
# Replace the `sex` labels with integer codes, then show the distinct codes
# that ended up in the column.
le = LabelEncoder()
le.fit(data["sex"])
data["sex"] = le.transform(data["sex"])
pd.unique(data["sex"])

In [None]:
# Re-fit the shared encoder on `smoker` and overwrite the column with its integer codes,
# then show the distinct codes.
le.fit(data["smoker"])
data["smoker"] = le.transform(data["smoker"])
pd.unique(data["smoker"])

In [None]:
# Re-fit the shared encoder on `region` and overwrite the column with its integer codes,
# then show the distinct codes.
# NOTE(review): region is a nominal category; a single integer code implies an
# ordering between regions -- consider one-hot encoding before fitting linear models.
le.fit(data["region"])
data["region"] = le.transform(data["region"])
pd.unique(data["region"])

## Visualize

In [None]:
# Pairwise relationships between the columns, coloured by smoker status.
# The trailing semicolon suppresses the returned grid object's repr.
sns.pairplot(data=data, hue="smoker");

In [None]:
# Correlation matrix of the columns, drawn as a heatmap with each cell's value annotated.
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True);

# Multiple Linear Regression

## Data Preprocessing

In [None]:
# Build the feature matrix and the target for the linear model.
#
# `region` is a nominal category (it has no natural order), so it is expanded
# into 0/1 indicator columns here instead of being passed as one integer-coded
# column: a single coded column would make the regression fit one slope across
# an arbitrary ordering of the regions.
#   - drop_first=True leaves one region out as the baseline level, so the
#     remaining indicator coefficients read as offsets from that baseline.
#   - dtype=int keeps the indicator columns numeric (0/1) rather than boolean.
# `data` itself is not modified; only X gets the expanded columns.
X = pd.get_dummies(
    data.drop(columns=["charges"]),
    columns=["region"],
    drop_first=True,
    dtype=int,
)
# Target reshaped to a single-column 2-D array via reshape(-1, 1).
y = data["charges"].values.reshape(-1, 1)

In [None]:
# Hold out 33% of the rows as a test set; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

## Create Model & Fit

In [None]:
# Fit an ordinary least-squares linear regression on the training split.
# fit() returns the estimator itself, so the last line displays the fitted model.
model = LinearRegression().fit(X_train, y_train)
model

## Get Results

In [None]:
# Print the fitted parameters: the intercept first, then the coefficients.
for label, fitted_value in (
    ("intercept : ", model.intercept_),
    ("slope : ", model.coef_),
):
    print(label, fitted_value)

## Predict

In [None]:
# Predict on the held-out features, then show the true and predicted charges
# side by side in one table.
y_pred = model.predict(X_test)
y_test_df = pd.DataFrame(data=y_test, columns=["Real Values"])
y_pred_df = pd.DataFrame(data=y_pred, columns=["Predicted Response"])
pd.concat(objs=[y_test_df, y_pred_df], axis="columns")

## Metrics

In [None]:
# Coefficient of determination (R^2) of the predictions against the test targets.
score = r2_score(y_true=y_test, y_pred=y_pred)
r2_report = "R2 Score : {}".format(score)
print(r2_report)

In [None]:
# Mean squared error of the predictions against the test targets.
MSE = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse_report = "MSE: {}".format(MSE)
print(mse_report)