## U.S. Medical Insurance Costs
For my Data Scientist + ML professional certification course on Codecademy

Robert Hall 12/23/2023

In [2]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import statistics as st

Step 1: Gather Basic Summary Statistics on the data

1) Total number of patients (rows)
2) Summary statistics on age, BMI, number of children, cost
3) Proportions of patients who are male, female
4) Proportions of patients who are smokers, non-smokers
5) Proportions of patients who are from each region

In [3]:
# Load the insurance dataset into a pandas DataFrame and preview the first rows.

# pd.read_csv accepts a path directly, manages the file handle itself, and
# already returns a DataFrame, so no open() / pd.DataFrame() wrapper is needed.
DATA_PATH = "insurance.csv"
df = pd.read_csv(DATA_PATH)

# Fail early, with a clear message, if a column the analysis relies on is absent.
_required_columns = {"sex", "smoker", "charges"}
_missing_columns = _required_columns - set(df.columns)
assert not _missing_columns, f"{DATA_PATH} is missing columns: {sorted(_missing_columns)}"

# Last expression of the cell: the notebook renders the first five rows.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# 1. Total number of patients (rows)

# len() of a DataFrame is its number of rows
rows = len(df)

print(f"Number of Rows: {rows}")

Number of Rows: 1338


In [5]:
# 2. Summary statistics on age, BMI, number of children, cost
# describe() is left as the cell's last expression so the notebook renders
# the resulting summary table (count, mean, std, min, quartiles, max).

df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [6]:
# 3. Proportions of patients who are male, female

# count the rows recorded under each sex (summing a boolean mask counts its True entries)
n_male = int((df['sex'] == 'male').sum())
n_female = int((df['sex'] == 'female').sum())

# turn the counts into fractions of all rows
p_male = n_male / rows
p_female = n_female / rows

print(f"Proportion of Males: {round(p_male, 2)}")
print(f"Proportion of Females: {round(p_female, 2)}")

print('\n')

# sanity check: every row must be counted as either male or female,
# i.e. the two counts together account for all rows
print(f"No null values: {n_male + n_female == rows}")


Proportion of Males: 0.51
Proportion of Females: 0.49


No null values: True


In [7]:
# 4. Proportions of patients who are smokers, non-smokers

# count the rows in each smoking category (summing a boolean mask counts its True entries)
n_smoker = int((df['smoker'] == 'yes').sum())
n_nsmoker = int((df['smoker'] == 'no').sum())

# turn the counts into fractions of all rows
p_smoker = n_smoker / rows
p_nsmoker = n_nsmoker / rows

print(f"Proportion of Smokers: {round(p_smoker, 2)}")
print(f"Proportion of Nonsmokers: {round(p_nsmoker, 2)}")

print('\n')

# sanity check: the 'yes' and 'no' counts together must account for all rows
print(f"No null values: {n_smoker + n_nsmoker == rows}")

Proportion of Smokers: 0.2
Proportion of Nonsmokers: 0.8


No null values: True


Step 2: Compare insurance costs for smokers and non-smokers

1) Find the difference between the average insurance cost for smokers and the average insurance cost for non-smokers

In [8]:
# 1. Find the difference between the average insurance cost for smokers and for non-smokers

# split the patients into smokers and non-smokers (all columns kept)
df_smokers = df[df['smoker'] == 'yes']
df_nsmokers = df[df['smoker'] == 'no']

# single-column DataFrames holding only the charges of each group
df_smoker_charges = df_smokers[['charges']]
df_nsmoker_charges = df_nsmokers[['charges']]

# Series.mean() yields each group's average directly; no detour through
# .values.tolist() / np.array / statistics.mean is needed. float() converts
# the NumPy scalar to a plain Python float.
exact_mean_smokers = float(df_smokers['charges'].mean())
exact_mean_nsmokers = float(df_nsmokers['charges'].mean())

# rounded to cents for reporting
mean_cost_smokers = round(exact_mean_smokers, 2)
mean_cost_nsmokers = round(exact_mean_nsmokers, 2)

# Take the difference of the unrounded means and round once, so the result
# carries neither a double-rounding error nor a floating-point artifact.
mean_difference_nsmoke = round(exact_mean_smokers - exact_mean_nsmokers, 2)

# print each cost, and print the difference in average cost
# (:.2f always shows two decimals, e.g. 8434.20 rather than 8434.2)
print(f"Average insurance cost for smokers:          ${mean_cost_smokers:.2f}")
print(f"Average insurance cost for non-smokers:      ${mean_cost_nsmokers:.2f}")
print('\n')
# NOTE: this is a plain difference between two group averages; nothing here
# adjusts for other columns, so it describes the groups rather than
# isolating the effect of smoking.
print(f"One could save an average of ${mean_difference_nsmoke:.2f} in insurance costs by simply not smoking!")

Average insurance cost for smokers:          $32050.23
Average insurance cost for non-smokers:      $8434.27


One could save an average of $23615.96 in insurance costs by simply not smoking!
