# Source of this dataset

https://www.kaggle.com/mirichoi0218/insurance

# About the data

This dataset was inspired by the book *Machine Learning with R* by Brett Lantz. The data contains medical information and the costs billed by health insurance companies. It contains 1338 rows of data and the following columns: age, gender (`sex`), BMI (`bmi`), children, smoker, region, and insurance charges (`charges`).


# Importing Modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Loading the data

In [None]:
# Location of the Kaggle insurance CSV, relative to the notebook.
DATA_PATH = "../input/insurance/insurance.csv"
med_insurance_df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first two rows to check that the file was parsed as expected.
med_insurance_df.head(2)

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
med_insurance_df.info()

# Exploratory Data Analysis

In [None]:
# Distinct values present in the 'region' column.
med_insurance_df['region'].unique()

In [None]:
# Highest charge billed in each region (pick the column first, then aggregate).
med_insurance_df.groupby('region')['charges'].max()

In [None]:
# Average charge per region.
# Select 'charges' before aggregating: calling .mean() on the whole grouped
# frame also tries to average the string columns ('sex', 'smoker'), which
# raises a TypeError on pandas >= 2.0.
med_insurance_df.groupby('region')['charges'].mean()

In [None]:
# Preview the one-hot encoding of 'region' (one indicator column per region).
# Only the first rows are shown; displaying the full frame adds nothing here.
pd.get_dummies(med_insurance_df['region']).head()

- We will keep only two region indicator columns: northeast and southeast.
- Northwest and southwest have average charges in the same range, so they are not given separate columns.

# Encoding Categorical Data

In [None]:
# Encode the categorical columns as 0/1 integer indicators.
# Direct comparisons are used instead of repeated pd.get_dummies calls so the
# resulting dtype is the same (int) regardless of the pandas version.
med_insurance_df['northeast_region'] = (med_insurance_df['region'] == 'northeast').astype(int)
med_insurance_df['southeast_region'] = (med_insurance_df['region'] == 'southeast').astype(int)
med_insurance_df['male'] = (med_insurance_df['sex'] == 'male').astype(int)

# 'smoker' is overwritten in place, so only encode it while it still holds the
# raw 'yes'/'no' strings; this keeps the cell safe to run more than once.
if not pd.api.types.is_numeric_dtype(med_insurance_df['smoker']):
    med_insurance_df['smoker'] = (med_insurance_df['smoker'] == 'yes').astype(int)

med_insurance_df.head(2)

In [None]:
# Compare BMI between females (male == 0) and males (male == 1).
# The original note expected females to show a lower BMI than males; check the
# printed numbers rather than assuming it.
# The column is selected before aggregating because .mean() on the whole
# grouped frame fails on the remaining string columns with pandas >= 2.0.
bmi_by_sex = med_insurance_df.groupby('male')['bmi']

print("Maximun ", bmi_by_sex.max())
print("Mean ", bmi_by_sex.mean())

In [None]:
# Compare BMI between non-smokers (smoker == 0) and smokers (smoker == 1).
# Select 'bmi' first so the aggregation never touches the string columns,
# which would make .mean() raise on pandas >= 2.0.
bmi_by_smoker = med_insurance_df.groupby('smoker')['bmi']

print("Maximun ", bmi_by_smoker.max())
print("Mean ", bmi_by_smoker.mean())

In [None]:
# Compare charges between females (male == 0) and males (male == 1).
# Select 'charges' first so the aggregation never touches the string columns,
# which would make .mean() raise on pandas >= 2.0.
charges_by_sex = med_insurance_df.groupby('male')['charges']

print("Maximun ", charges_by_sex.max())
print("Mean ", charges_by_sex.mean())

In [None]:
# Compare charges between non-smokers (smoker == 0) and smokers (smoker == 1);
# the original note reads this output as smokers having higher charges.
# Select 'charges' first so the aggregation never touches the string columns,
# which would make .mean() raise on pandas >= 2.0.
charges_by_smoker = med_insurance_df.groupby('smoker')['charges']

print("Maximun ", charges_by_smoker.max())
print("Mean ", charges_by_smoker.mean())

# Visualizing Data

In [None]:
# Correlation heatmap of the numeric features.
# 'sex' and 'region' are still string columns at this point, and
# DataFrame.corr() raises on non-numeric data with pandas >= 2.0, so restrict
# the frame to numeric/boolean columns explicitly.
numeric_features = med_insurance_df.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(10,6))
sns.heatmap(numeric_features.corr(), annot=True, cmap='viridis')

In [None]:
# Charges against age, coloured by smoking status.
sns.scatterplot(data=med_insurance_df, x='age', y='charges', hue='smoker')

**Observation:** Smokers have higher medical charges than non-smokers.

In [None]:
# Charges against age, coloured by the 'male' indicator.
sns.scatterplot(data=med_insurance_df, x='age', y='charges', hue='male')

**Observation:** Males and females have almost the same medical charges.

In [None]:
# Number of rows per smoking status, split by the 'male' indicator.
sns.countplot(data=med_insurance_df, x='smoker', hue='male')

In [None]:
# Number of rows per smoking status, split by region.
sns.countplot(data=med_insurance_df, x='smoker', hue='region')

In [None]:
# Number of rows for each value of the 'male' indicator.
sns.countplot(data=med_insurance_df, x='male')

**Observation:** The dataset is balanced with respect to sex (the counts of males and females are similar).

# Drop Unnecessary Features 

In [None]:
# Show the current column labels before deciding which ones to drop.
med_insurance_df.columns

- We will drop the columns "male" and "children" because they have little correlation with medical charges.
- We will drop the columns "sex" and "region" as they are already encoded.


In [None]:
# Remove the raw categorical columns and the features judged unhelpful above.
# errors='ignore' skips labels that are already gone, so the cell can be run
# more than once; rebinding the name avoids mutating the frame in place.
med_insurance_df = med_insurance_df.drop(
    columns=['sex', 'children', 'region', 'male'],
    errors='ignore',
)

# Splitting the data for training and testing

In [None]:
from sklearn.model_selection import train_test_split

# 'charges' is the regression target; every remaining column is a predictor.
y = med_insurance_df['charges']
X = med_insurance_df.drop(columns='charges')

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split (fit() returns the estimator).
lm = LinearRegression().fit(X_train, y_train)

# Predicted charges for the held-out rows.
pred = lm.predict(X_test)

# Model Evaluation

In [None]:
# Intercept term of the fitted linear model.
print(lm.intercept_)

In [None]:
# One row per feature, holding the coefficient the model learned for it.
df = pd.DataFrame({'Coefficient': lm.coef_}, index=X.columns)
df

In [None]:
# True charges on the x-axis against the model's predictions on the y-axis.
fig, ax = plt.subplots()
ax.scatter(y_test, pred)
ax.set_xlabel("y_test")
ax.set_ylabel("pred")
ax.set_title("True value vs. Predicted")

In [None]:
from sklearn import metrics

# Error metrics on the held-out split; RMSE is the square root of the MSE.
mae = metrics.mean_absolute_error(y_test, pred)
mse = metrics.mean_squared_error(y_test, pred)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))