# ****Medical Cost Prediction****

In [None]:
import pandas as pd
# Load the insurance dataset from the Kaggle input directory.
# (The original bound the name twice: `data = data = ...`.)
data = pd.read_csv('/kaggle/input/insurance/insurance.csv')
data.shape

In [None]:
# Preview the first rows to see the column names and sample values.
data.head()

In [None]:
# Check each feature's datatype and null status.
data.info()

#### We have 4 numerical features (age, bmi, children, charges) and 3 categorical features (sex, smoker, region), with no null data

## Univariate Analysis
Examine only one feature at a time

In [None]:
# Summary statistics for the numerical columns (age, bmi, children, charges).
data.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE: this hides every FutureWarning for the rest of the session, including
# deprecation notices raised by the libraries used in the cells that follow.
warnings.simplefilter(action="ignore", category=FutureWarning)

In [None]:
# Check Gender feature: number of persons per gender.
# The column is selected by keyword because seaborn 0.12+ treats the first
# positional argument as `data`, not as the x variable.

sns.countplot(x='sex', data=data)

plt.title('Person by Gender', fontsize='16', fontweight='bold')
plt.xlabel('Gender', fontsize='14')
plt.ylabel('Total Persons', fontsize='14')
plt.show()

## Decision: Males slightly outnumber females

In [None]:
# Check region feature: number of persons per region.
# The column is selected by keyword because seaborn 0.12+ treats the first
# positional argument as `data`, not as the x variable.

sns.countplot(x='region', data=data)

plt.title('Person by Region', fontsize='16', fontweight='bold')
plt.xlabel('Region Name', fontsize='14')
plt.ylabel('Total Persons', fontsize='14')
plt.show()

## Decision: More people live in the "southeast" region than in any other region

In [None]:
# Check "smoker" feature: number of persons per smoking status.
# The column is selected by keyword because seaborn 0.12+ treats the first
# positional argument as `data`, not as the x variable.

sns.countplot(x='smoker', data=data)

plt.title('Person by Smoking Status', fontsize='16', fontweight='bold')
plt.xlabel('Smoking Status', fontsize='14')
plt.ylabel('Total Persons', fontsize='14')
plt.show()

## Decision: Most people are non-smokers

In [None]:
## Check "children" feature: number of persons per number of children.
# The column is selected by keyword because seaborn 0.12+ treats the first
# positional argument as `data`, not as the x variable.
sns.countplot(x='children', data=data)

plt.title('Person by Having Children', fontsize='16', fontweight='bold')
plt.xlabel('Number of Children', fontsize='14')
plt.ylabel('Total Persons', fontsize='14')
plt.show()

## Decision: Almost 50% of the people do not have any children.

### Distribution Analysis

In [None]:
## Age distribution of the persons in the dataset
sns.histplot(data=data, x='age', color='orange')

plt.xlabel('Age', fontsize='14')
plt.ylabel('Total Persons', fontsize='14')
plt.title('Person Age Distributions', fontsize='16', fontweight='bold')

plt.show()

## Almost 200 peoples age in between 1 to 20. And rest of them age around 30 to 35 

In [None]:
## BMI distribution of the persons in the dataset
sns.histplot(data=data, x='bmi', color='blue')

plt.xlabel('BMI', fontsize='14')
plt.ylabel('Total Persons', fontsize='14')
plt.title('Persons BMI Distributions', fontsize='16', fontweight='bold')
plt.show()

## Decision: Only a small number of people have a normal BMI (18.5 ~ 24.9),
## and most people are overweight or obese

In [None]:
## Distribution of the persons' expenses (the `charges` column)
sns.histplot(data=data, x='charges', color='gray')
plt.xlabel('Expenses', fontsize='14')
plt.ylabel('Total Persons', fontsize='14')
plt.title('Distribution of Expense', fontsize='16', fontweight='bold')
plt.show()

## Decision: For the majority of people, health expenses are around 15k.

## Bivariate Analysis
Examines the relationship between two variables

In [None]:
import plotly.express as px

In [None]:
## Comparison of Age with expenses
# Scatter of charges against age, with an OLS trend line and a violin plot
# of the charges on the y margin. Left as the cell's final expression so the
# notebook renders the figure.
px.scatter(data,
           x = 'age',
           y = 'charges', 
          marginal_y = 'violin',
          trendline='ols')

## Decision: The trend line rises from left to right, so as people's "Age" increases, their health "expenses" increase.

In [None]:
# Comparison of expenses with BMI
# Scatter of charges against BMI with an OLS trend line.
px.scatter(data, 
           y = "charges", 
           x = "bmi", trendline='ols'
          )

## Decision: The trend line rises from left to right, so as people's "BMI" increases, their health "expenses" increase.

In [None]:
# Charges for smokers versus non-smokers, one box per smoking status.
sns.boxplot(data=data, x='smoker', y='charges')
plt.show()

## Decision: A smoker's health expenses are 2-3 times higher than a non-smoker's.

In [None]:
# Charges by number of children, drawn as a bar chart.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally, so catplot("children", "charges", ...) raises a TypeError there.
sns.catplot(x="children", y="charges", data=data, kind="bar")
plt.show()

## Decision: Those who have 2-4 children have higher expenses than others

## Multivariate Analysis

In [None]:
# Age against charges, one facet per (children, region) pair, coloured by
# smoking status, with an OLS trend line.
# NOTE(review): charges is on the x-axis here (the bivariate plot above puts
# it on y), so the trend line presumably models age as a function of charges
# — confirm this orientation is intended.
px.scatter(data,
          x='charges',
          y='age',
          facet_col = 'region',
          facet_row = 'children', 
          color= 'smoker',
          trendline='ols')

In [None]:
# BMI against charges, one facet per (children, region) pair, coloured by
# smoking status, with an OLS trend line.
px.scatter(data,
          x='charges',
          y='bmi',
          facet_row = 'children', 
          facet_col = 'region', 
          color= 'smoker',
          trendline='ols')

In [None]:
# Relation between BMI, age and smoking status with expenses.
# Marker size encodes age (capped by size_max), colour encodes smoking
# status, and the hover title shows the row's charges.

px.scatter(data,
          x='charges',
          y='bmi',
         size='age',
          color= 'smoker',
          hover_name = 'charges',
          size_max = 12)

## Decision: Whatever the BMI is, if a person does not smoke then their medical cost is moderate (around 15K).
## But if a smoker's BMI increases, then their cost increases dramatically.

In [None]:
## Relation between gender and region with expenses
# NOTE(review): rows are passed without aggregation, so each bar presumably
# shows stacked totals per region/sex rather than averages — confirm before
# comparing groups of different sizes.
px.bar_polar(data, 
             r='charges',
             theta='region', 
             color='sex',
            template = 'plotly_dark')

## Females' expenses are lower than males' expenses. The expenses are about the same in all regions
## except "southeast", where males' expenses are two times higher than females' expenses.

In [None]:
# Minimum, maximum and mean charges per region, shaded with the 'ocean' colormap.
region_stats = data[['charges', 'region']].groupby(['region']).agg(['min', 'max', 'mean'])
region_stats.style.background_gradient(cmap='ocean')

### Summary of Decisions
1. As people's age increases, their medical expenses increase dramatically. 
2. Males' medical expenses are larger than females'. 
3. The majority of people have a BMI below or above the normal range, so they are underweight, overweight or obese. People in each of these groups have higher expenses than people with a normal BMI. 
4. People in the southeast region have higher expenses than people in other regions. 
5. Those who have 2-4 children have higher expenses than others. 
6. If a person is not a smoker, then their medical cost will be normal. 
7. If a smoker's BMI increases, then their cost increases dramatically.

# Data Preparation for ML model
All categorical features should be converted to numerical ones

In [None]:
# Show the first three rows before encoding the categorical columns.
data.head(3)

In [None]:
# Check the data type of every column.
data.dtypes

# All object-dtype features (sex, smoker, region) are categorical features. 
# We have to encode these features as numerical data. 

In [None]:
# Encoding sex.
# Males have higher expenses than females, so let male = 2 and female = 1.
# The dict keeps each label next to its code. The explicit cast guarantees an
# integer column instead of relying on replace() to downcast the object dtype
# (that implicit downcast is deprecated since pandas 2.2). Re-running the cell
# is harmless: already-encoded values are left unchanged.
data['sex'] = data['sex'].replace({'female': 1, 'male': 2}).astype(int)

In [None]:
# Encoding 'smoker'.
# Smokers have higher expenses, so let yes = 2 and no = 1.
# The explicit cast guarantees an integer column instead of relying on
# replace() to downcast the object dtype (deprecated since pandas 2.2).
data['smoker'] = data['smoker'].replace({'yes': 2, 'no': 1}).astype(int)

In [None]:

# Encoding 'region'.
# The southeast region has the highest expenses, so let southeast = 2 and
# every other region = 1 (the three remaining regions are merged on purpose).
# The explicit cast guarantees an integer column instead of relying on
# replace() to downcast the object dtype (deprecated since pandas 2.2).
region_codes = {'southeast': 2, 'southwest': 1, 'northwest': 1, 'northeast': 1}
data['region'] = data['region'].replace(region_codes).astype(int)

In [None]:
# Re-check the column types after encoding.
data.dtypes
# Now all of the features hold numerical data.

In [None]:
# First three rows of the fully encoded data.
data.head(3)

 ## Dependent & Independent Features 
 Separating the dependent feature (charges) from the independent features (the rest of the features) 

In [None]:
# Dependent variable (the prediction target): the `charges` column.
y = data['charges']

In [None]:
# Independent variables: every column except the target.
x = data.drop(columns=['charges'])
for summary in (x.shape, x.columns):
    print(summary)

## Splitting Train & Test Data 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=.2, random_state=0)

# Report the shape of each split.
split_captions = {
    'Size of x_train = ': x_train,
    'Size of x_test  = ': x_test,
    'Size of y_train = ': y_train,
    'Size of y_test  = ': y_test,
}
for caption, part in split_captions.items():
    print(caption, part.shape)

## Feature Scaling
Used to normalize the range of the independent variables. 

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler shared by the training and test splits.
sc = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then apply the same fitted
# scaling to the test split so no test data influences the fit.
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Applying Linear Regression Model
Finds the relationship between a dependent variable and one or more independent variables.

To check whether a linear model is appropriate, you have to test the following four assumptions. 
Assumptions:
1. Linearity: The relationship between the dependent and independent variables must be linear, which means the trend between the two variables must be a steadily increasing or decreasing straight line. 
2. Homoscedasticity: In statistical terms, the variance of the residuals (prediction errors) must be the same across all values of the independent variables. 
3. Independence: All the observations must be independent of each other. 
4. Normality: The residuals (prediction errors) must follow a normal distribution.

In [None]:
from sklearn.linear_model import LinearRegression 

In [None]:
# Fit a linear regression on the scaled training data, then predict the
# charges of the test split.
model = LinearRegression().fit(x_train, y_train)
y_predict = model.predict(x_test)

### R square (r2) Score: 
It is generally used to determine how good the model is: the proportion of the variance in the dependent (target) feature that is explained by the independent features.

### Root Mean Squared Error(RMSE)
It is the square root of the mean of the squared differences between the actual and the predicted values. 

In [None]:
# Score the linear regression on the held-out test split.
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np 

mse = mean_squared_error(y_true=y_test, y_pred=y_predict)
rmse = np.sqrt(mse)
lr_r2_score = r2_score(y_true=y_test, y_pred=y_predict)

print('R square Score = ', round(lr_r2_score, 3))
print('Root Mean Squared Error = ', round(rmse, 3))

So this model's predictions are typically off by about 5663 (the RMSE), and it explains around 79% of the variance in charges (R² ≈ 0.79). 

# Random Forest 

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a random forest on the scaled training data and predict the test split.
# random_state pins the forest's randomness so the scores are repeatable
# from run to run, matching the seeded train/test split.
rf_model = RandomForestRegressor(random_state=0)
rf_model.fit(x_train, y_train)

y_predict_rf = rf_model.predict(x_test)

In [None]:
# Score the random forest on the held-out test split.
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np 
rf_mse = mean_squared_error(y_true=y_test, y_pred=y_predict_rf)
rf_rmse = np.sqrt(rf_mse)
rf_r2_score = r2_score(y_true=y_test, y_pred=y_predict_rf)

print('R square Score = ', round(rf_r2_score, 3))
print('Root Mean Squared Error = ', round(rf_rmse, 3))

So this model's predictions are typically off by about 4448 (the RMSE), and it explains around 87% of the variance in charges (R² ≈ 0.87). 

# Gradient Boosting Regressor Model


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Fit a gradient boosting regressor (100 trees of depth 2, learning rate 0.1)
# on the scaled training data and predict the test split.
# random_state makes the fitted model, and therefore the scores, repeatable.
gb_model = GradientBoostingRegressor(max_depth=2, n_estimators=100,
                                     learning_rate=.1, random_state=0)
gb_model.fit(x_train, y_train)

y_predict_gb = gb_model.predict(x_test)

In [None]:
# Score the gradient boosting model on the held-out test split.
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np 

gb_mse = mean_squared_error(y_true=y_test, y_pred=y_predict_gb)
gb_rmse = np.sqrt(gb_mse)
gb_r2_score = r2_score(y_true=y_test, y_pred=y_predict_gb)

print('Root Mean Squared Error = ', round(gb_rmse, 3))
print('R square Score = ', round(gb_r2_score, 3))

So this model's predictions are typically off by about 4063 (the RMSE), and it explains around 89% of the variance in charges (R² ≈ 0.89). 

In [None]:
# ## Comparison of actual and predicted results 
# df = pd.DataFrame({'Actual': y_test, 'Predicted': y_predict_gb})
# df.to_csv('Data/gradient_boosting_result.csv', index=False)
# print('Your predicted data saved ')

# Ensembles of Model
It is a process where multiple diverse models are combined to obtain better predictive performance. 

In [None]:
# Simple ensemble: element-wise mean of the three models' test-set
# predictions (Linear Regression, Random Forest and Gradient Boosting).
# Despite its name, `avg_model` holds predictions, not a fitted estimator.
avg_model = (y_predict + y_predict_rf + y_predict_gb) /3

In [None]:
# Score the simple-average ensemble on the held-out test split.
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np 

# Ensemble-specific names keep the linear regression's mse/rmse intact.
avg_mse = mean_squared_error(y_test, avg_model)
avg_rmse = np.sqrt(avg_mse)
print('Root Mean Squared Error = ', round(avg_rmse, 3))

# Store the score under its own name: assigning it to `r2_score` would
# rebind the imported metric function to a float and break any later call.
avg_r2_score = r2_score(y_test, avg_model)
print('R square Score = ', round(avg_r2_score, 3))

So this ensemble's predictions are typically off by about 4282 (the RMSE), and it explains around 88% of the variance in charges (R² ≈ 0.88). 

In [None]:
# GBM performs best of the three models, then RF, and Linear Regression last.
# Build a weighted average of the predictions: GBM 50%, RF 40% and LR 10%.
# NOTE(review): the weights follow the test-set ranking above, so the score of
# this ensemble on the same test split is likely optimistic.

weighted_avg_model = ((.1 * y_predict) + (.4 * y_predict_rf) + (.5 * y_predict_gb))

In [None]:
# Score the weighted-average ensemble on the held-out test split.
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np 

# Ensemble-specific names keep the earlier models' metrics intact.
weighted_mse = mean_squared_error(y_test, weighted_avg_model)
weighted_rmse = np.sqrt(weighted_mse)
print('Root Mean Squared Error = ', round(weighted_rmse, 3))

# Store the score under its own name: assigning it to `r2_score` would
# rebind the imported metric function to a float and break any later call.
weighted_r2_score = r2_score(y_test, weighted_avg_model)
print('R square Score = ', round(weighted_r2_score, 3))

With this weighting, the R² score is about 89%. Even so, GBM on its own is again the winner. 

# Cross Validation
It is a resampling procedure which is used to evaluate machine learning models on limited data samples. Its goal is to estimate how well the model predicts new data that it has not seen before. It has a single parameter called K, the number of folds the data is split into. 

In [None]:
from sklearn.model_selection import cross_val_score 
# 5-fold cross-validation of the gradient boosting model on the full,
# unscaled feature frame `x` and target `y`; prints one score per fold.
score = cross_val_score(gb_model, x, y, cv=5)
print(score)

# If the cross-validation scores do not differ much from one another, then the model is pretty good (its performance is consistent across folds). 

### Model Accuracy Comparison

In [None]:
import numpy as np
import matplotlib.pyplot as plt


# R² of each model as a percentage, one bar per model.
model_names = ['LR', 'RF', 'GBM']
r2_scores = [r2 * 100 for r2 in (lr_r2_score, rf_r2_score, gb_r2_score)]

color = ['#9edd1d', '#3edd1d', '#f7c851']
total_bar = np.arange(len(model_names))

# Draw through the Axes object directly; it is the current axes, so this is
# the same figure the pyplot calls would have produced.
fig, ax = plt.subplots(figsize=(10, 3))
bar = ax.bar(model_names, r2_scores, align='center', alpha=.75, color=color)
ax.set_xticks(total_bar)
ax.set_xticklabels(model_names)
ax.set_ylabel('Accuracy', fontsize=14, color='black')
ax.set_xlabel('Model Name', fontsize=14, color='black')
ax.set_title('Model (LR, RF, GB) Performance Comparison',
             fontsize=16, color='black', fontweight='bold')

# This function sets the data labels on the bars.
def autolabel(bar, axes=None):
    """Write each bar's height, truncated to an integer, inside the bar.

    Args:
        bar: Iterable of bar patches (for example the container returned by
            a ``bar`` call); each must provide ``get_x()``, ``get_width()``
            and ``get_height()``.
        axes: Axes-like object whose ``text`` method draws the labels.
            Defaults to the module-level ``ax``.
    """
    target = ax if axes is None else axes
    for rect in bar:
        height = int(rect.get_height())
        # Centre the label on the bar's real width rather than assuming a
        # fixed 0.8-wide bar, and place it halfway up the bar.
        target.text(rect.get_x() + rect.get_width() / 2, .5 * height,
                    height, ha='center', va='bottom',
                    fontsize=14, color='black')
        
# Label the bars, then render the figure.
autolabel(bar)

plt.show()