In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

In [None]:
# Read the insurance CSV into a DataFrame and preview its first five rows.
medical_data = pd.read_csv(filepath_or_buffer='/kaggle/input/insurance/insurance.csv')
medical_data.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded frame.
medical_data.shape

In [None]:
# Print the column names, dtypes and non-null counts of the frame.
medical_data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the frame.
medical_data.describe()

**Univariate Exploratory Data Analysis**

In [None]:
# Summary statistics for the `age` column.
medical_data['age'].describe()

In [None]:
# Histogram of age with a box plot in the margin.
# NOTE(review): nbins=47 presumably targets one bin per distinct age - confirm
# against the age range reported by `medical_data.age.describe()`.
fig = px.histogram(
    data_frame=medical_data,
    x='age',
    marginal='box',
    nbins=47,
    title='Distribution of Age',
).update_layout(bargap=0.1)  # small gap so neighbouring bars are distinguishable
fig.show()

In [None]:
# Summary statistics for the `bmi` column.
medical_data['bmi'].describe()

In [None]:
# Histogram of BMI with a box plot in the margin.
fig = px.histogram(
    data_frame=medical_data,
    x='bmi',
    marginal='box',
    title='Distribution of BMI',
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Summary statistics for the `children` column.
medical_data['children'].describe()

In [None]:
# Histogram of the number of children with a box plot in the margin.
fig = px.histogram(
    data_frame=medical_data,
    x='children',
    marginal='box',
    title='Distribution of children',
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Summary statistics for the `charges` column (the value modelled later on).
medical_data['charges'].describe()

In [None]:
# Histogram of charges with a box plot in the margin.
fig = px.histogram(
    data_frame=medical_data,
    x='charges',
    marginal='box',
    title='Distribution of charges',
).update_layout(bargap=0.1)
fig.show()

**Distribution of charges with respect to smokers**

In [None]:
# Histogram of charges split by smoker status, one colour per group,
# with a box plot per group in the margin.
fig = px.histogram(
    data_frame=medical_data,
    x='charges',
    color='smoker',
    color_discrete_sequence=['green', 'grey'],
    marginal='box',
    title='Distribution of charges with respect to smoker',
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Number of rows per value of `sex`.
medical_data['sex'].value_counts()

In [None]:
# Bar-style histogram counting rows per value of `sex`.
fig = px.histogram(
    data_frame=medical_data,
    x='sex',
    title='Distribution for sex',
)
fig.show()

In [None]:
# Row counts per value of `sex`, coloured by smoker status.
fig = px.histogram(
    data_frame=medical_data,
    x='sex',
    color='smoker',
    title='Distribution of sex with respect to smoker',
)
fig.show()

In [None]:
# Number of rows per value of `region`.
medical_data['region'].value_counts()

In [None]:
# Bar-style histogram counting rows per region.
fig = px.histogram(
    data_frame=medical_data,
    x='region',
    title='Distribution for region',
)
fig.show()

In [None]:
# Row counts per region, coloured by smoker status.
fig = px.histogram(
    data_frame=medical_data,
    x='region',
    color='smoker',
    title='Distribution of region with respect to smoker',
)
fig.show()

In [None]:
# Row counts per region, coloured by sex.
fig = px.histogram(
    data_frame=medical_data,
    x='region',
    color='sex',
    title='Distribution of region with respect to sex',
)
fig.show()

In [None]:
# Number of rows per value of `smoker`.
medical_data['smoker'].value_counts()

In [None]:
# Bar-style histogram counting rows per smoker status.
fig = px.histogram(
    data_frame=medical_data,
    x='smoker',
    title='Distribution of smoker',
)
fig.show()

In [None]:
# Row counts per smoker status, coloured by region.
# The title names the colour grouping so this figure is distinguishable from
# the plain smoker histogram and matches the other "with respect to" plots.
fig = px.histogram(
    medical_data,
    x='smoker',
    color='region',
    title='Distribution of smoker with respect to region',
)
fig.show()

In [None]:
# Histogram of age, coloured by smoker status.
fig = px.histogram(
    data_frame=medical_data,
    x='age',
    color='smoker',
    title='Distribution of age with smoker',
)
fig.show()

**Bivariate Analysis**

In [None]:
# Scatter of charges against age, coloured by smoker status.
# Markers are shrunk and slightly transparent so overlapping points stay readable.
fig = px.scatter(
    data_frame=medical_data,
    x='age',
    y='charges',
    color='smoker',
    opacity=0.8,
    title='Age vs Charges',
).update_traces(marker_size=5)
fig.show()

In [None]:
# Scatter of charges against BMI, coloured by smoker status.
fig = px.scatter(
    data_frame=medical_data,
    x='bmi',
    y='charges',
    color='smoker',
    opacity=0.8,
    title='BMI vs Charges',
).update_traces(marker_size=5)
fig.show()

In [None]:
# Violin plot of charges for each number of children, split by smoker status.
fig = px.violin(
    data_frame=medical_data,
    x='children',
    y='charges',
    color='smoker',
    title='Children vs Charges',
)
fig.show()

**Correlation for the given data**

In [None]:
# Correlation coefficient between charges and age.
medical_data['charges'].corr(medical_data['age'])

In [None]:
# Correlation coefficient between charges and BMI.
medical_data['charges'].corr(medical_data['bmi'])

In [None]:
# Correlation coefficient between charges and number of children.
medical_data['charges'].corr(medical_data['children'])

In [None]:
# Encode smoker status as 0 (no) / 1 (yes) so it can be correlated with charges.
smoker_values = dict(no=0, yes=1)
smoker_numeric = medical_data['smoker'].map(smoker_values)
smoker_numeric

In [None]:
# Correlation coefficient between charges and the 0/1 smoker encoding.
medical_data['charges'].corr(smoker_numeric)

In [None]:
# Encode sex as 0 (female) / 1 (male) so it can be correlated with charges.
sex_values = dict(female=0, male=1)
sex_numeric = medical_data['sex'].map(sex_values)
sex_numeric

In [None]:
# Correlation coefficient between charges and the 0/1 sex encoding.
medical_data['charges'].corr(sex_numeric)

In [None]:
# Map each region name to an integer code 0..3.
# NOTE(review): the codes impose an arbitrary order on an unordered category,
# so a correlation computed from them should be interpreted with care.
region_values = {
    region: code
    for code, region in enumerate(['northeast', 'northwest', 'southeast', 'southwest'])
}
region_numeric = medical_data['region'].map(region_values)
region_numeric

In [None]:
# Correlation coefficient between charges and the integer region codes.
medical_data['charges'].corr(region_numeric)

In [None]:
# Pairwise correlation matrix of the numeric columns.
# The frame still contains the string columns sex, smoker and region; recent
# pandas versions raise on them instead of silently dropping them, so the
# numeric columns are selected explicitly before calling corr().
medical_data.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
# Non-numeric columns (sex, smoker, region) are excluded explicitly because
# recent pandas versions raise when corr() meets string data.
numeric_correlations = medical_data.select_dtypes(include='number').corr()
sns.heatmap(numeric_correlations, cmap='Reds', annot=True)
plt.title('Correlation of numeric values')

In [None]:
# Add a 0/1 `smoker_code` column (no -> 0, yes -> 1) for use as a model input.
smoker_codes = dict(no=0, yes=1)
medical_data['smoker_code'] = medical_data['smoker'].map(smoker_codes)
medical_data.head()

In [None]:
# Add a 0/1 `sex_code` column (female -> 0, male -> 1) for use as a model input.
sex_codes = dict(female=0, male=1)
medical_data['sex_code'] = medical_data['sex'].map(sex_codes)
medical_data.head()

In [None]:
from sklearn import preprocessing
# Fit a one-hot encoder on the region column (fit() returns the encoder itself)
# and show the categories it learned.
encoding = preprocessing.OneHotEncoder().fit(medical_data[['region']])
encoding.categories_

In [None]:
# One-hot encode the region column and convert the encoder output to a dense array.
one_hot = encoding.transform(medical_data.loc[:, ['region']]).toarray()

In [None]:
# Attach the one-hot columns to the frame, naming them after the categories
# the fitted encoder reports. Taking the names from `encoding.categories_`
# keeps each label tied to the matching column of `one_hot`, instead of
# relying on a hand-typed list being in the same order as the encoder output.
region_columns = list(encoding.categories_[0])
medical_data[region_columns] = one_hot
medical_data.head()

**Scaling the numerical values**

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit a standard scaler on the numeric input columns.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in the notebook, so test rows contribute to the
# scaling statistics - consider fitting on the training split only.
numerical_columns = ['age', 'bmi', 'children']
scaler = StandardScaler().fit(medical_data[numerical_columns])
scaler

In [None]:
# Per-column means learned by the scaler, in `numerical_columns` order.
scaler.mean_

In [None]:
# Per-column variances learned by the scaler, in `numerical_columns` order.
scaler.var_

In [None]:
# Standardise the numeric columns with the fitted scaler.
scaled_inputs = scaler.transform(medical_data.loc[:, numerical_columns])
scaled_inputs

In [None]:
# Encoded categorical inputs: the two binary codes plus the four region indicators.
categorical_columns = [
    'smoker_code',
    'sex_code',
    'northeast',
    'northwest',
    'southeast',
    'southwest',
]
categorical_inputs = medical_data[categorical_columns]
categorical_inputs.to_numpy()

**Creating a model**

In [None]:
# Model input matrix: scaled numeric columns first, then the categorical columns,
# joined side by side (column-wise).
inputs = np.hstack((scaled_inputs, categorical_inputs))
inputs

In [None]:
# Regression target: the charges column.
targets = medical_data.loc[:, 'charges']
targets

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
(inputs_train, inputs_test,
 targets_train, targets_test) = train_test_split(
    inputs,
    targets,
    test_size=0.3,
    random_state=3,
)

In [None]:
# Ordinary linear regression with scikit-learn's default settings.
model = LinearRegression()

In [None]:
# Fit the regression on the training split.
model.fit(X=inputs_train, y=targets_train)

In [None]:
# Predict charges for the held-out test rows.
predictions_test = model.predict(X=inputs_test)

In [None]:
def rmse(target, prediction):
    """Return the root-mean-square error between targets and predictions.

    Both arguments are converted to NumPy float arrays before subtracting, so
    values are compared position by position. This lets plain lists be passed
    and prevents two pandas Series with different indexes from being aligned
    by label (which would silently pair the wrong rows).

    Parameters
    ----------
    target : array-like
        Observed values.
    prediction : array-like
        Predicted values, broadcastable against ``target``.

    Returns
    -------
    numpy.float64
        Square root of the mean squared difference.
    """
    target_values = np.asarray(target, dtype=float)
    predicted_values = np.asarray(prediction, dtype=float)
    return np.sqrt(np.mean(np.square(target_values - predicted_values)))

In [None]:
# Root-mean-square error of the linear model on the held-out test rows.
loss = rmse(target=targets_test, prediction=predictions_test)
print(f'The Root Mean Square Loss for this Linear Model is {loss}.')

In [None]:
# Fitted coefficients, one per column of `inputs`.
model.coef_

In [None]:
# Fitted intercept (bias term) of the linear model.
model.intercept_

**Feature Weights**

In [None]:
# Column names of the model inputs, in the same order in which `inputs` was
# assembled: the scaled numeric columns followed by the categorical columns.
inputs_columns = numerical_columns + categorical_columns

In [None]:
# Table pairing each input column with its fitted coefficient; the intercept
# is appended as a final 'Bias' row.
feature_names = [*inputs_columns, 'Bias']
feature_weights = [*model.coef_, model.intercept_]
weight_df = pd.DataFrame({'feature': feature_names, 'weights': feature_weights})

In [None]:
# Display the feature/weight table.
weight_df

**Predicting for new customers**

In [None]:
# One new customer, in model-input column order:
# age, bmi, children, smoker_code, sex_code, northeast, northwest, southeast, southwest.
new_customer = [[28, 30, 2, 1, 0, 0, 1, 0, 0]]

# Standardise the three numeric values (age, bmi, children) with the fitted scaler.
scaler.transform([new_customer[0][:3]])

In [None]:
# Predict charges for `new_customer`.
# The numeric values are scaled with the fitted scaler at prediction time
# rather than pasted in as pre-computed constants, so the prediction stays
# correct if the data, the scaler or the customer values change.
n_numeric = len(numerical_columns)
new_customer_numeric = pd.DataFrame(
    [new_customer[0][:n_numeric]], columns=numerical_columns
)
new_customer_inputs = np.concatenate(
    (scaler.transform(new_customer_numeric), [new_customer[0][n_numeric:]]),
    axis=1,
)
predictions_new_customer = model.predict(new_customer_inputs)

In [None]:
# Predicted charges for the new customer.
predictions_new_customer

In [None]:
# Save the feature/weight table to output.csv in the working directory, without the row index.
weight_df.to_csv(path_or_buf='output.csv', index=False)