In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor


In [3]:
# Read the insurance dataset; the location is named once so it is easy to repoint.
data_path = r"C:\Dataset\Medical_insurance.csv"
med_data = pd.read_csv(data_path)
med_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Print the frame summary: row count plus each column's non-null count and dtype.
med_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2772 entries, 0 to 2771
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       2772 non-null   int64  
 1   sex       2772 non-null   object 
 2   bmi       2772 non-null   float64
 3   children  2772 non-null   int64  
 4   smoker    2772 non-null   object 
 5   region    2772 non-null   object 
 6   charges   2772 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 151.7+ KB


# Problem Statements

## 1. What are the most important factors that affect medical expenses?

#### A. Does BMI affect medical expenses?

In [5]:
# BMI categories:
#   Underweight   = below 18.5
#   Normal weight = 18.5 - 24.9
#   Overweight    = 25 - 29.9
#   Obesity       = 30 or greater
weights = ["Underweight", "Normal", "Overweight", "Obese"]


def bmi_to_category(bmi):
    """Return the label from `weights` that a single BMI value falls into.

    Each band includes its lower edge, so a BMI of exactly 30 is "Obese"
    (the earlier `bmi > 30` test labelled it "Overweight") and exactly 25 is
    "Overweight". A value that fails every comparison (e.g. NaN) falls
    through to "Normal".
    """
    if bmi < 18.5:
        return weights[0]
    if bmi >= 30:
        return weights[3]
    if bmi >= 25:
        return weights[2]
    return weights[1]


bmi_categories = [bmi_to_category(bmi) for bmi in med_data["bmi"]]

# Overwrite the column when it already exists so re-running this cell does not
# raise "cannot insert bmi_category, already exists".
if "bmi_category" in med_data.columns:
    med_data["bmi_category"] = bmi_categories
else:
    med_data.insert(3, "bmi_category", bmi_categories)
med_data.head()

Unnamed: 0,age,sex,bmi,bmi_category,children,smoker,region,charges
0,19,female,27.9,Overweight,0,yes,southwest,16884.924
1,18,male,33.77,Obese,1,no,southeast,1725.5523
2,28,male,33.0,Obese,3,no,southeast,4449.462
3,33,male,22.705,Normal,0,no,northwest,21984.47061
4,32,male,28.88,Overweight,0,no,northwest,3866.8552


In [6]:
# For every BMI category, report its mean charge and how many people fall in it.
for weight in weights:
    in_category = med_data[med_data["bmi_category"] == weight]
    avg_charge = round(in_category["charges"].mean(), 2)
    head_count = in_category["bmi_category"].count()
    print(f'Peopel who are {weight} have average medical expense     {avg_charge}')
    print(f'There are {head_count} people who are {weight}.')
    print()

Peopel who are Underweight have average medical expense     8852.2
There are 40 people who are Underweight.

Peopel who are Normal have average medical expense     10218.52
There are 472 people who are Normal.

Peopel who are Overweight have average medical expense     11023.41
There are 794 people who are Overweight.

Peopel who are Obese have average medical expense     15573.47
There are 1466 people who are Obese.



If `bmi_category` is used as a feature in the model, the imbalance between categories will need to be addressed. Otherwise, the model could become very good at predicting medical expenses for people who are obese and noticeably worse for everyone else.

#### B. Does age affect medical expenses?

In [7]:
# Quick look at the spread of ages before bucketing them.
ages = med_data["age"]
unique_age_count = len(ages.value_counts())
print(f'Age column has {unique_age_count} unique ages.')
print(f'The oldest persion is {ages.max()} and the youngest person is {ages.min()}.')


Age column has 47 unique ages.
The oldest persion is 64 and the youngest person is 18.


Next, add a categorical variable for age.

In [8]:
# Age bands (with the default thresholds):
#   Young_Adult - age at or below ya (25)
#   Adult       - above ya, up to and including aup (39)
#   Middle_Age  - above aup and below s
#   Senior      - age at or above s (62)
# ya = young adult upper bound, aup = adult upper bound, s = senior lower bound

age_categories = ["Young_Adult", "Adult", "Middle_Age", "Senior"]
def add_age_col(df, column_name, ya=25, s=62, aup=39):
    """Label each row of `df` with an age band in an "age_category" column.

    df: frame to label; it is modified in place.
    column_name: name of the column holding the ages.
    ya, s, aup: band thresholds (see the comment above `age_categories`).

    The column is inserted at position 1 when missing and overwritten when
    already present. Returns the same `df` object that was passed in.
    """
    def band_for(age):
        # Checks run in the same order as the band list above, seniors second.
        if age <= ya:
            return age_categories[0]
        if age >= s:
            return age_categories[3]
        if ya < age <= aup:
            return age_categories[1]
        return age_categories[2]

    age_list = [band_for(age) for age in df[column_name]]

    if "age_category" not in df.columns:
        df.insert(1, "age_category", age_list)
    else:
        df["age_category"] = age_list
    return df

# add_age_col labels med_data in place and returns that same frame, so it can be shown directly.
add_age_col(med_data, "age")
display(med_data)

Unnamed: 0,age,age_category,sex,bmi,bmi_category,children,smoker,region,charges
0,19,Young_Adult,female,27.900,Overweight,0,yes,southwest,16884.92400
1,18,Young_Adult,male,33.770,Obese,1,no,southeast,1725.55230
2,28,Adult,male,33.000,Obese,3,no,southeast,4449.46200
3,33,Adult,male,22.705,Normal,0,no,northwest,21984.47061
4,32,Adult,male,28.880,Overweight,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...,...,...
2767,47,Middle_Age,female,45.320,Obese,1,no,southeast,8569.86180
2768,21,Young_Adult,female,34.600,Obese,0,no,southwest,2020.17700
2769,19,Young_Adult,male,26.030,Overweight,1,yes,northwest,16450.89470
2770,23,Young_Adult,male,18.715,Normal,0,no,northwest,21595.38229


In [9]:
def category_info(df, column_name, category_name):
    """Print the row count and mean "charges" for each label in `category_name`.

    df: frame containing `column_name` and a "charges" column.
    column_name: column whose values are compared against each label.
    category_name: iterable of labels to report on, in print order.
    """
    for label in category_name:
        matching = df[df[column_name] == label]
        n_rows = matching[column_name].count()
        avg_charge = round(matching["charges"].mean(), 2)
        print(f'There are {n_rows} who are {label}.')
        print(f'The average medical expense for {label} is {avg_charge}.')
        print()

# Head count and mean charge for each age band in med_data.
category_info(df=med_data, column_name="age_category", category_name=age_categories)

There are 644 who are Young_Adult.
The average medical expense for Young_Adult is 9027.09.

There are 760 who are Adult.
The average medical expense for Adult is 11126.37.

There are 1228 who are Middle_Age.
The average medical expense for Middle_Age is 15915.77.

There are 140 who are Senior.
The average medical expense for Senior is 21046.2.



In [10]:
# Sensitivity check: lower the senior threshold from 62 to 59.
# The relabelling is done on a copy, so med_data keeps its original age bands.
med_copy = add_age_col(med_data.copy(), "age", s=59)
category_info(med_copy, "age_category", age_categories)

There are 644 who are Young_Adult.
The average medical expense for Young_Adult is 9027.09.

There are 760 who are Adult.
The average medical expense for Adult is 11126.37.

There are 1084 who are Middle_Age.
The average medical expense for Middle_Age is 15268.57.

There are 284 who are Senior.
The average medical expense for Senior is 20915.17.



In [11]:
# Mean charge for each distinct number of children, in order of first appearance.
children_num = list(med_data["children"].unique())
for num in children_num:
    avg_charge = round(med_data.loc[med_data["children"] == num, "charges"].mean(), 2)
    print(f'The average medical expense for people with {num} children is     {avg_charge}')
    print()

The average medical expense for people with 0 children is     12317.92

The average medical expense for people with 1 children is     12722.65

The average medical expense for people with 3 children is     15304.07

The average medical expense for people with 2 children is     15268.18

The average medical expense for people with 5 children is     8706.04

The average medical expense for people with 4 children is     13550.98



In [12]:
# Pairwise correlations between the numeric columns.
# The numeric columns are selected explicitly because, from pandas 2.0 on,
# corr() no longer drops text columns by default and raises on them instead.
med_data.select_dtypes(include="number").corr()

Unnamed: 0,age,bmi,children,charges
age,1.0,0.113048,0.037574,0.298624
bmi,0.113048,1.0,-0.001492,0.199846
children,0.037574,-0.001492,1.0,0.066442
charges,0.298624,0.199846,0.066442,1.0


In [13]:
# Flag whether each person has at least one child.
children_list = ["Yes" if child > 0 else "No" for child in med_data["children"]]

# Overwrite the column when it already exists so re-running this cell does not
# raise "cannot insert Have_children?, already exists".
if "Have_children?" in med_data.columns:
    med_data["Have_children?"] = children_list
else:
    med_data.insert(6, "Have_children?", children_list)

In [14]:
# Column means for each sex. numeric_only=True restricts the mean to numeric
# columns; without it pandas >= 2.0 raises on the text columns.
print(med_data[med_data["sex"] == "male"].mean(numeric_only=True))
print(med_data[med_data["sex"] == "female"].mean(numeric_only=True))

age            38.748222
bmi            30.960633
children        1.120910
charges     14013.872721
dtype: float64
age            39.481698
bmi            30.434473
children        1.081991
charges     12486.831977
dtype: float64


In [15]:
# Split once by smoking status, then report column means and group sizes.
smokers = med_data[med_data["smoker"] == "yes"]
non_smokers = med_data[med_data["smoker"] == "no"]

# numeric_only=True restricts the mean to numeric columns; without it
# pandas >= 2.0 raises on the text columns.
print(smokers.mean(numeric_only=True))
print()
print(non_smokers.mean(numeric_only=True))
print()
print(f'There are {len(smokers)} smoker in this data.')
print()
print(f'There are {len(non_smokers)} non-smoker in this data.')

age            38.460993
bmi            30.840656
children        1.117021
charges     32223.139764
dtype: float64

age           39.275362
bmi           30.665765
children       1.097826
charges     8417.874411
dtype: float64

There are 564 smoker in this data.

There are 2208 non-smoker in this data.


## 2. How well can machine learning models predict medical expenses?

In [16]:
# Preview the first rows of med_data before building the model inputs.
med_data.head()

Unnamed: 0,age,age_category,sex,bmi,bmi_category,children,Have_children?,smoker,region,charges
0,19,Young_Adult,female,27.9,Overweight,0,No,yes,southwest,16884.924
1,18,Young_Adult,male,33.77,Obese,1,Yes,no,southeast,1725.5523
2,28,Adult,male,33.0,Obese,3,Yes,no,southeast,4449.462
3,33,Adult,male,22.705,Normal,0,No,no,northwest,21984.47061
4,32,Adult,male,28.88,Overweight,0,No,no,northwest,3866.8552


In [17]:
# The model only sees the remaining categorical columns: the raw age, bmi and
# children columns, region, and the target (charges) are all dropped here.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

TRAIN_ROWS = 1800  # rows before this position train the model; the rest are held out

dropped_columns = ["age", "bmi", "children", "region", "charges"]
condense_data = med_data.drop(columns=dropped_columns)

# cols_trans turns every object-dtype column into dummy variables and passes
# any other column through unchanged.
is_object_column = condense_data.dtypes == object
variables_to_encode = list(condense_data.columns[is_object_column])
cols_trans = make_column_transformer(
    (OneHotEncoder(), variables_to_encode),
    remainder="passthrough",
)

x_train, x_test = condense_data.iloc[:TRAIN_ROWS, :], condense_data.iloc[TRAIN_ROWS:, :]

# Charges are rounded to the nearest 100 because predictions are only educated guesses.
charges = med_data["charges"]
target_train = round(charges[:TRAIN_ROWS], -2)
target_test = round(charges[TRAIN_ROWS:], -2)

print(len(x_test), len(target_test))

972 972


In [18]:
from sklearn.pipeline import make_pipeline
rf_regressor = RandomForestRegressor(
    n_estimators=500,
    min_samples_leaf=10,
    oob_score=True,
    random_state=0,
)

pipe = make_pipeline(cols_trans, rf_regressor)
pipe.fit(x_train, target_train)

# Predictions are rounded to the nearest hundred, the same way the targets were.
y_pred = np.round(pipe.predict(x_test), -2)

# Table of predicted vs. actual charges with the squared difference per row.
charge = pd.DataFrame({"predicted_charge": y_pred})
charge["Actual_charge"] = target_test.reset_index(drop=True)
charge["difference_squared"] = (charge["Actual_charge"] - charge["predicted_charge"]) ** 2

# NOTE: despite its name, mean_sqrt_error holds the mean *squared* error.
mean_sqrt_error = charge["difference_squared"].sum() / len(charge)
print(mean_sqrt_error)

# Cross-check the hand-computed value against scikit-learn's implementation.
from sklearn.metrics import mean_squared_error
mean_squared_error(target_test, y_pred)

22555432.098765433


22555432.098765433

## 3. How can machine learning models be used to improve the efficiency and profitability of health insurance companies?