# U.S. Medical Insurance Costs

In [1]:
import csv

In [2]:
# Column-oriented copy of the CSV: one list of values per header name.
dataset = {}
with open("insurance.csv", newline='') as insurance_csv:
    # DictReader yields one dictionary per data row, keyed by the header names
    dict_reader = csv.DictReader(insurance_csv)
    column_names = dict_reader.fieldnames
    # give every column an empty list to start with
    for column_name in column_names:
        dataset.setdefault(column_name, [])
    # append each cell of every row to the list of its column
    for row in dict_reader:
        for key in row:
            value = row[key]
            dataset[key].append(value)

In [3]:
dataset.keys()

dict_keys(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'])

In [4]:
# Print the number of values stored for each column
for key in dataset:
    print(len(dataset[key]))

1338
1338
1338
1338
1338
1338
1338


### Count how often each value occurs in any column of the dataset

In [5]:
def variable_count(data, variable_key):
    """Count how many times each distinct value occurs in one column.

    Args:
        data: Mapping of column name to a list of values.
        variable_key: Name of the column whose values are counted.

    Returns:
        dict mapping each distinct value to its number of occurrences,
        in first-seen order.

    Raises:
        KeyError: If ``variable_key`` is not a key of ``data``.
    """
    count_dict = {}
    # Read from the ``data`` argument rather than the notebook-level
    # ``dataset`` so the function works for whatever mapping is passed in.
    for value in data[variable_key]:
        count_dict[value] = count_dict.get(value, 0) + 1
    return count_dict

In [6]:
# Number of rows for each distinct value of the 'smoker' column
smoker = variable_count(dataset, 'smoker')
smoker

{'yes': 274, 'no': 1064}

In [7]:
# Number of rows for each distinct value of the 'children' column.
# The keys are strings ('0'..'5'): the values were read with the csv module, which does not convert them.
child = variable_count(dataset, 'children')
child

{'0': 574, '1': 324, '3': 157, '2': 240, '5': 18, '4': 25}

### Zip the two columns into a list of (children, smoker) tuples and count the smokers for each number of children (0 to 5)

In [8]:
# TODO: turn this into a function like variable_count above, if possible
# Pair each row's children value with its smoker value
data = list(zip(dataset['children'], dataset['smoker']))
child_smoker = {}
for row in data:
    # only rows whose smoker value is 'yes' are tallied
    if row[1] != 'yes':
        continue
    child_smoker[row[0]] = child_smoker.get(row[0], 0) + 1
child_smoker

{'0': 115, '1': 61, '2': 55, '3': 39, '4': 3, '5': 1}

In [9]:
# Share of smokers within each children group.
# The groups are taken from `child` itself instead of a hard-coded range(6),
# and a group with no smokers counts as 0 rather than raising KeyError.
for i in sorted(int(group) for group in child):
    percentage = child_smoker.get(str(i), 0) / child[str(i)]
    print(f"Percentage of people with {i} children who smoke is: {percentage: 0.2%}")

Percentage of people with 0 children who smoke is:  20.03%
Percentage of people with 1 children who smoke is:  18.83%
Percentage of people with 2 children who smoke is:  22.92%
Percentage of people with 3 children who smoke is:  24.84%
Percentage of people with 4 children who smoke is:  12.00%
Percentage of people with 5 children who smoke is:  5.56%


### Some testing on making classes (not sure yet whether this is useful)

In [23]:
class Person:
    """One insured person, i.e. a single record of the insurance data.

    The constructor stores every argument exactly as given; no type
    conversion or validation is performed.
    """

    def __init__(self, age, sex, bmi, children, smoker, region, charges):
        self.age = age
        self.sex = sex
        self.bmi = bmi
        self.children = children
        self.smoker = smoker
        self.region = region
        self.charges = charges

    def __str__(self):
        """Return a multi-line summary with one "Label: value" line per attribute."""
        # Joining separate pieces replaces the backslash continuation inside
        # the f-string, which embedded the next source line's indentation in
        # the text (trailing spaces after the smoker value).
        return "\n".join((
            f"Age: {self.age}",
            f"Sex: {self.sex}",
            f"BMI: {self.bmi}",
            f"Children: {self.children}",
            f"Smoker: {self.smoker}",
            f"Region: {self.region}",
            f"Charges: {self.charges}",
        ))


class Insurance:
    """Collection of person records plus one parallel list per attribute.

    ``insurance_list`` holds the added objects themselves; the ``*_list``
    attributes hold the matching attribute values in the same order.
    """

    def __init__(self):
        self.insurance_list = []
        self.age_list = []
        self.sex_list = []
        self.bmi_list = []
        self.children_list = []
        self.smoker_list = []
        self.region_list = []
        self.charges_list = []
        # No `self.size = ...` here: size is derived from insurance_list by
        # the property below, so storing a snapshot would only be dead state.

    @property
    def size(self):
        """Number of people added so far, computed from ``insurance_list``."""
        return len(self.insurance_list)

    @size.setter
    def size(self, value):
        # Kept so that existing `obj.size = ...` assignments do not raise.
        # The getter never reads the stored value.
        self._size = value

    def add_person(self, Person):
        """Append a person and each of its attributes to the matching lists.

        Args:
            Person: Object exposing ``age``, ``sex``, ``bmi``, ``children``,
                ``smoker``, ``region`` and ``charges`` attributes. The
                parameter name shadows the ``Person`` class inside this
                method; it is kept unchanged for keyword-call compatibility.
        """
        self.insurance_list.append(Person)
        self.age_list.append(Person.age)
        self.sex_list.append(Person.sex)
        self.bmi_list.append(Person.bmi)
        self.children_list.append(Person.children)
        self.smoker_list.append(Person.smoker)
        self.region_list.append(Person.region)
        self.charges_list.append(Person.charges)

    def get_person(self, index):
        """Return the person stored at position ``index``."""
        return self.insurance_list[index]

    def get_list(self, name):
        """Return the list stored in the attribute ``<name>_list``.

        Args:
            name: Attribute prefix, e.g. ``"sex"`` for ``sex_list``.

        Raises:
            ValueError: If the instance has no ``<name>_list`` attribute.
        """
        attr_name = f"{name}_list"
        if not hasattr(self, attr_name):
            raise ValueError(f"No such name exists: {name}")
        return getattr(self, attr_name)

    def count(self, name):
        """Count the occurrences of each distinct value in ``<name>_list``.

        Returns:
            dict mapping each value to its count, in first-seen order.

        Raises:
            ValueError: If the instance has no ``<name>_list`` attribute.
        """
        count_dict = {}
        for value in self.get_list(name):
            count_dict[value] = count_dict.get(value, 0) + 1
        return count_dict

In [24]:
# Smoke-test the Person class with one hand-written record
person_data = dict(
    age=30,
    sex="male",
    bmi=25.5,
    children=2,
    smoker=False,
    region="north",
    charges=5000.0,
)

person_insurance = Person(**person_data)

# Show every attribute on its own labelled line
print(f"Age: {person_insurance.age}")
print(f"Sex: {person_insurance.sex}")
print(f"BMI: {person_insurance.bmi}")
print(f"Children: {person_insurance.children}")
print(f"Smoker: {person_insurance.smoker}")
print(f"Region: {person_insurance.region}")
print(f"Charges: {person_insurance.charges}")

Age: 30
Sex: male
BMI: 25.5
Children: 2
Smoker: False
Region: north
Charges: 5000.0


In [25]:
# Build an Insurance collection holding one Person per row of the CSV file.
insurance_info = Insurance()
# size before anything has been added
print(insurance_info.size)
with open("insurance.csv", newline='') as insurance_csv:
    dict_reader = csv.DictReader(insurance_csv) 
    for row in dict_reader:  
        # each row dict is unpacked into Person's keyword arguments,
        # so the CSV header names must match Person's parameter names
        insurance_person = Person(**row)
        insurance_info.add_person(insurance_person)    
# size after loading every row
insurance_info.size

0


1338

In [26]:
print(insurance_info.get_person(3))

Age: 33
Sex: male
BMI: 22.705
Children: 0
Smoker: no        
Region: northwest
Charges: 21984.47061


In [39]:
insurance_info.get_list("sex")[:3]

['female', 'male', 'male']

In [40]:
insurance_info.count("sex")

{'female': 662, 'male': 676}

### Using Pandas

In [15]:
import pandas as pd

In [16]:
# NOTE(review): this rebinds `data`, which an earlier cell used for the zipped (children, smoker) list
data = pd.read_csv('insurance.csv')

In [17]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [19]:
data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [20]:
data.region.value_counts(), data.sex.value_counts(), data.smoker.value_counts()

(southeast    364
 southwest    325
 northwest    325
 northeast    324
 Name: region, dtype: int64,
 male      676
 female    662
 Name: sex, dtype: int64,
 no     1064
 yes     274
 Name: smoker, dtype: int64)

In [21]:
data.charges.agg(['max','mean','median', 'min'])

max       63770.428010
mean      13270.422265
median     9382.033000
min        1121.873900
Name: charges, dtype: float64

In [22]:
def dollars(x):
    """Return ``x`` rounded to two decimal places."""
    # A named def instead of a lambda bound to a name (PEP 8), so the
    # function has a real __name__ and can carry a docstring.
    return round(x, 2)

In [23]:
# Reuse the `dollars` helper defined in the previous cell instead of repeating
# the same rounding lambda inline; 'cost' is 'charges' rounded to 2 decimals.
data['cost'] = data.charges.apply(dollars)

In [24]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,cost
0,19,female,27.9,0,yes,southwest,16884.924,16884.92
1,18,male,33.77,1,no,southeast,1725.5523,1725.55
2,28,male,33.0,3,no,southeast,4449.462,4449.46
3,33,male,22.705,0,no,northwest,21984.47061,21984.47
4,32,male,28.88,0,no,northwest,3866.8552,3866.86


In [25]:
filt = ['age', 'children', 'bmi', "cost"]

In [26]:
data.groupby('sex')[filt].agg(['max','median', 'min'])

Unnamed: 0_level_0,age,age,age,children,children,children,bmi,bmi,bmi,cost,cost,cost
Unnamed: 0_level_1,max,median,min,max,median,min,max,median,min,max,median,min
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
female,64,40.0,18,5,1.0,0,48.07,30.1075,16.815,63770.43,9412.96,1607.51
male,64,39.0,18,5,1.0,0,53.13,30.6875,15.96,62592.87,9369.615,1121.87


In [27]:
data.groupby('sex')[filt].mean()

Unnamed: 0_level_0,age,children,bmi,cost
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,39.503021,1.074018,30.377749,12569.578731
male,38.91716,1.115385,30.943129,13956.751317
