<h1 style="margin:auto;width:70%">Medical Cost- EDA and SVR vs Polynomial vs Linear Regression </h1>

<img style="width:90%" src='https://qtxasset.com/2017-04/healthcare_costs.jpg?uGsn4AUuFsDmwZ7S1weCkchfYzttb7Ra'/>




<br>

# Columns

- **age: age of primary beneficiary**


- **sex: insurance contractor gender, female, male**


- **bmi: body mass index (kg / m ^ 2), an objective index of body weight relative to height (weight divided by height squared) that indicates whether weight is relatively high or low; ideally 18.5 to 24.9**


- **children: Number of children covered by health insurance / Number of dependents**


- **smoker: whether the beneficiary smokes**


- **region: the beneficiary's residential area in the US, northeast, southeast, southwest, northwest.**


- **charges: Individual medical costs billed by health insurance.**
_________

<h3 style='color:purple' >If you liked this notebook, you can Vote for it. Thanks :)</h3>


_____

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.linear_model    import LinearRegression
from sklearn.svm             import SVR
from sklearn.metrics         import mean_absolute_error, mean_squared_error, median_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing   import StandardScaler, PolynomialFeatures

In [None]:
# Load the insurance data; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/insurance/insurance.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

------------
# EDA

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of each fully duplicated row.
df = df.drop_duplicates()

In [None]:
# Re-count duplicated rows to check the current state of the frame.
df.duplicated().sum()

In [None]:
# Summary statistics, transposed so that each column of the data becomes a row.
df.describe().T

In [None]:
# Two hex colors shared by the plotting cells (box palettes and KDE fill color).
pa_color = [
    '#343A40',
    '#7F8B52',
]

In [None]:
# Age distribution for each sex.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='sex', y='age', palette=pa_color, ax=ax)

ax.set_title('Age in male & female', fontsize=20)
ax.set_xlabel('Sex', fontsize=20)
ax.set_ylabel('Age', fontsize=20)
ax.tick_params(axis='both', labelsize=15)

plt.show()

In [None]:
# Charges distribution for each sex.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='sex', y='charges', palette=pa_color, ax=ax)

ax.set_title('Charges & Sex', fontsize=20)
ax.set_xlabel('Sex', fontsize=20)
ax.set_ylabel('charges', fontsize=20)
ax.tick_params(axis='both', labelsize=15)

plt.show()

In [None]:
# Charges distribution for each number of covered children.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='children', y='charges', palette=pa_color, ax=ax)

ax.set_title('Charges & Children', fontsize=20)
ax.set_xlabel('children', fontsize=20)
ax.set_ylabel('charges', fontsize=20)
ax.tick_params(axis='both', labelsize=15)

plt.show()

In [None]:
# Charges distribution for each residential region.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='region', y='charges', palette=pa_color, ax=ax)

ax.set_title('Charges & Region', fontsize=20)
ax.set_xlabel('region', fontsize=20)
ax.set_ylabel('charges', fontsize=20)
ax.tick_params(axis='both', labelsize=15)

plt.show()

In [None]:
# Charges distribution split by smoking status.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='smoker', y='charges', palette=pa_color, ax=ax)

ax.set_title('Charges & Smoker', fontsize=20)
ax.set_xlabel('Smoker', fontsize=20)
ax.set_ylabel('charges', fontsize=20)
ax.tick_params(axis='both', labelsize=15)

plt.show()

In [None]:
# Kernel density estimate of the charges column.
grid = sns.displot(data=df, x='charges', kind='kde', fill=True, color=pa_color[0], aspect=2)

# displot is figure-level; style the single axes it created.
kde_ax = grid.ax
kde_ax.set_title('Charges', fontsize=20)
kde_ax.set_xlabel('charges', fontsize=20)
kde_ax.set_ylabel('Density', fontsize=20)
kde_ax.tick_params(axis='both', labelsize=15)

plt.show()

In [None]:
# Kernel density estimate of the bmi column.
grid = sns.displot(data=df, x='bmi', kind='kde', fill=True, color=pa_color[0], aspect=2)

# displot is figure-level; style the single axes it created.
kde_ax = grid.ax
kde_ax.set_title('BMI', fontsize=20)
kde_ax.set_xlabel('bmi', fontsize=20)
kde_ax.set_ylabel('Density', fontsize=20)
kde_ax.tick_params(axis='both', labelsize=15)

plt.show()

In [None]:
# Kernel density estimate of the age column.
grid = sns.displot(data=df, x='age', kind='kde', fill=True, color=pa_color[0], aspect=2)

# displot is figure-level; style the single axes it created.
kde_ax = grid.ax
kde_ax.set_title('Age', fontsize=20)
kde_ax.set_xlabel('age', fontsize=20)
kde_ax.set_ylabel('Density', fontsize=20)
kde_ax.tick_params(axis='both', labelsize=15)

plt.show()

In [None]:
plt.figure(figsize=(9,6))

# Correlate numeric columns only. The categorical columns (e.g. sex, smoker,
# region) have not been encoded yet at this point, and recent pandas versions
# raise on DataFrame.corr() when non-numeric columns are present instead of
# silently skipping them.
numeric_corr = df.select_dtypes(include='number').corr()

# The list of colors is used as a discrete colormap for the correlation values.
sns.heatmap(numeric_corr, annot=True, cmap=['#393E46','#334443','#34656D','#A35709','#C6FFC1'])

plt.title('Correlations between columns.', fontsize=20)


plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

plt.show()

_____

# Handling Outliers

In [None]:
# Keep rows whose BMI lies strictly between the 1st and 99th percentiles.
down, up = df['bmi'].quantile([.01, .99])

inside_bounds = df['bmi'].gt(down) & df['bmi'].lt(up)
df = df[inside_bounds]

# df is already restricted to the bounds, so its bmi column is the trimmed series.
new_bmi = df['bmi']

sns.boxplot(y=new_bmi, palette=pa_color)

In [None]:
# Keep rows whose charges lie strictly between the 1st and 84.1th percentiles.
# NOTE(review): the upper bound discards roughly the top 16% of charges, which
# is the prediction target, before any modelling - confirm this is intended.
down, up = df['charges'].quantile([.01, .841])

inside_bounds = df['charges'].gt(down) & df['charges'].lt(up)
df = df[inside_bounds]

# df is already restricted to the bounds, so its charges column is the trimmed series.
new_charges = df['charges']

sns.boxplot(y=new_charges, palette=pa_color)

_______

**Use dummy (one-hot) variables for the categorical columns.**

In [None]:
# One-hot encode the categorical columns; drop_first removes the first level
# of each so the remaining indicator columns are not redundant.
df = pd.get_dummies(df, drop_first=True)

In [None]:
# One-hot encode the number of children. Level 0 is the omitted baseline
# (all indicator columns are 0), mirroring the drop_first=True encoding used
# for the other categorical columns.
# Comparing against each level yields 0 for any unexpected value, whereas a
# hand-written mapping table would silently produce NaN for it.
for n_children in range(1, 6):
    df[f'children_{n_children}'] = (df['children'] == n_children).astype('int64')

In [None]:
# The raw count is now represented by the children_* indicator columns.
df = df.drop(columns='children')

____

# Split train and test data

In [None]:
# Target is the billed charges; every remaining column is a feature.
y = df['charges']
X = df.drop(columns='charges')

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

___

# Standard Scaler

In [None]:
# Scaler that standardizes each feature (zero mean, unit variance) once fitted.
sc = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only and reuse them
# for the test split. Refitting on the test data would scale it with different
# statistics than the model was trained on and leak test-set information into
# preprocessing.
X_train = sc.fit_transform(X_train)
X_test  = sc.transform(X_test)

_____

# Build SVR Model

In [None]:
# Support vector regressor with a linear kernel and regularization parameter C=20.
SVR_Model = SVR(C=20, kernel='linear')

In [None]:
# Fit the SVR on the scaled training split.
SVR_Model.fit(X_train, y_train)

In [None]:
# SVR predictions for the held-out split; used by the error metrics below.
y_predict = SVR_Model.predict(X_test)

In [None]:
# R^2 of the SVR on the training split, expressed as a percentage.
train_score = 100 * SVR_Model.score(X_train, y_train)

print('Train Score :', train_score, '%') # 53.92

In [None]:
# R^2 of the SVR on the held-out split, expressed as a percentage.
test_score = 100 * SVR_Model.score(X_test, y_test)

print('Test Score :', test_score, '%') # 51.95

In [None]:
# Mean absolute error of the SVR on the held-out split.
# scikit-learn's signature is (y_true, y_pred): ground truth goes first.
mean_absolute_error(y_test, y_predict)

In [None]:
# Mean squared error of the SVR on the held-out split.
# scikit-learn's signature is (y_true, y_pred): ground truth goes first.
mean_squared_error(y_test, y_predict)

In [None]:
# Median absolute error of the SVR on the held-out split.
# scikit-learn's signature is (y_true, y_pred): ground truth goes first.
median_absolute_error(y_test, y_predict)

_______________

# Build Linear Regression Model

In [None]:
# Ordinary least-squares linear regression with default settings.
linear_Model = LinearRegression()

In [None]:
# Fit on the same scaled training split used for the SVR.
linear_Model.fit(X_train, y_train)

In [None]:
# Linear-regression predictions for the held-out split.
y_linear_predict = linear_Model.predict(X_test)

In [None]:
# R^2 on the training split, as a percentage.
linear_Model.score(X_train, y_train) * 100

In [None]:
# R^2 on the held-out split, as a percentage.
linear_Model.score(X_test, y_test) * 100

_________________
# Build Polynomial Regression Model

In [None]:
# Degree-2 expansion: adds squared terms and pairwise products of the features.
poly = PolynomialFeatures(degree=2)

In [None]:
# Expand the full (unscaled) feature frame X; splitting and scaling happen afterwards.
x_poly = poly.fit_transform(X)

In [None]:
# Use the same test_size and random_state as the split used for the SVR and
# linear models, so that all three models are scored on the same held-out rows
# and their scores are directly comparable.
X_train, X_test, y_train, y_test = train_test_split(x_poly, y, test_size=.3, random_state=42)

In [None]:
# Refit the scaler on the polynomial training features only, then apply the
# same statistics to the test features. Refitting on the test data would scale
# it differently from the data the model is trained on and leak test-set
# information into preprocessing.
X_train = sc.fit_transform(X_train)
X_test  = sc.transform(X_test)

In [None]:
# Linear regression fitted on polynomial features, i.e. polynomial regression.
poly_Model = LinearRegression()

In [None]:
# Fit on the scaled polynomial training features.
poly_Model.fit(X_train, y_train)

In [None]:
# Polynomial-regression predictions for the held-out split.
y_poly_predict = poly_Model.predict(X_test)

In [None]:
# R^2 on the training split, as a percentage (trailing number is the author's recorded result).
poly_Model.score(X_train, y_train) * 100 # 84.81

In [None]:
# R^2 on the held-out split, as a percentage (trailing number is the author's recorded result).
poly_Model.score(X_test, y_test) * 100 # 82.96