# U.S. Medical Insurance Costs

### Plan

Explore how the `smoker` variable relates to each of the other variables in the Medical Insurance dataset by comparing simple averages and percentages for smokers against those of the dataset as a whole.

### Loading dataset

In [1]:
import csv
def _parse_csv_value(value):
    """Return ``value`` as a float when it parses as one, otherwise unchanged.

    Letting ``float()`` decide (instead of inspecting only the first character)
    means negative numbers and values such as '.5' are converted, while empty
    cells, missing cells (``None``) and text that merely starts with a digit
    are kept as-is rather than raising.

    NOTE(review): ``float()`` also accepts the words 'nan', 'inf' and
    'infinity'; a text column holding exactly those words would be converted.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # TypeError: value is not a string (e.g. None for a short row).
        # ValueError: value is text that is not a number (e.g. 'female').
        return value


# Load every CSV row as a dict keyed by column name, with numeric cells as floats.
medical_insurance_data = []
with open('insurance.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        fixed_row = {key: _parse_csv_value(value) for key, value in row.items()}
        medical_insurance_data.append(fixed_row)
dataset_size = len(medical_insurance_data)
print('Dataset size = {}'.format(dataset_size))
print('The records look like this one')
medical_insurance_data[:1]

Dataset size = 1338
The records look like this one


[{'age': 19.0,
  'sex': 'female',
  'bmi': 27.9,
  'children': 0.0,
  'smoker': 'yes',
  'region': 'southwest',
  'charges': 16884.924}]

### Common functions

In [2]:
def get_average_for_column(column_name, records=None):
    """Return the arithmetic mean of a numeric column.

    Args:
        column_name: Key of the numeric field to average.
        records: Optional sequence of record dicts to average over. When
            omitted, the notebook-level ``medical_insurance_data`` is used.

    Returns:
        The sum of ``record[column_name]`` over all records divided by the
        number of records.

    Raises:
        KeyError: If a record has no ``column_name`` key.
        ZeroDivisionError: If there are no records.
    """
    if records is None:
        records = medical_insurance_data
    # Divide by the length of the data actually summed, so the result cannot
    # drift from a separately maintained size variable.
    return sum(record[column_name] for record in records) / len(records)

def get_binary_value_percentage(column_name, value, records=None):
    """Return the percentage of records whose column equals ``value``.

    Args:
        column_name: Key of the field to inspect.
        value: Value that counts as a match (compared with ``==``).
        records: Optional sequence of record dicts to inspect. When omitted,
            the notebook-level ``medical_insurance_data`` is used.

    Returns:
        ``100 * matches / number_of_records`` as a float in the range 0-100.

    Raises:
        KeyError: If a record has no ``column_name`` key.
        ZeroDivisionError: If there are no records.
    """
    if records is None:
        records = medical_insurance_data
    matches = sum(1 for record in records if record[column_name] == value)
    # Use the length of the data actually counted rather than a separate
    # size variable that could be out of date.
    return 100 * matches / len(records)

def is_smoker(medical_record):
    """Tell whether a record is flagged as a smoker.

    A record counts as a smoker only when its 'smoker' field equals the
    exact string 'yes'.
    """
    smoker_flag = medical_record['smoker']
    return smoker_flag == 'yes'

def get_field_average_for_smokers(numeric_field_name):
    """Average a numeric field separately for smokers and non-smokers.

    Walks ``medical_insurance_data`` once, sorting each record's value into
    the smoker or non-smoker group according to ``is_smoker``.

    Returns:
        A ``(smoker_average, non_smoker_average)`` tuple.

    Raises:
        ZeroDivisionError: If either group is empty.
    """
    values_smokers, values_others = [], []
    for entry in medical_insurance_data:
        field_value = entry[numeric_field_name]
        target_group = values_smokers if is_smoker(entry) else values_others
        target_group.append(field_value)
    smoker_mean = sum(values_smokers) / len(values_smokers)
    non_smoker_mean = sum(values_others) / len(values_others)
    return smoker_mean, non_smoker_mean

def get_binary_value_percentage_for_smokers(column_name, value):
    """Return the percentage of smokers whose column equals ``value``.

    Only records accepted by ``is_smoker`` are considered; the result is
    ``100 * matching_smokers / all_smokers``.

    Raises:
        ZeroDivisionError: If the dataset contains no smokers.
    """
    smokers_seen = 0
    smokers_matching = 0
    for entry in medical_insurance_data:
        if not is_smoker(entry):
            continue  # non-smokers affect neither count
        smokers_seen += 1
        smokers_matching += 1 if entry[column_name] == value else 0
    return 100 * smokers_matching / smokers_seen


### The smoker variable

In [3]:
# Share of smokers across the whole dataset.
smoker_percentage = get_binary_value_percentage('smoker', 'yes')
'{:.2f}% of the people in the dataset are smokers'.format(smoker_percentage)

'20.48% of the people in the dataset are smokers'

### Smokers and age

In [10]:
# Overall mean age first, then the smoker / non-smoker breakdown.
overall_average_age = get_average_for_column('age')
print('The average age in the dataset is {:.2f}'.format(overall_average_age))

# average1 is the smokers' mean, average2 the non-smokers' mean.
average1, average2 = get_field_average_for_smokers('age')
age_split_template = 'The average age in the dataset for smokers is {:.2f}, for non-smokers {:.2f}'
age_split_template.format(average1, average2)

The average age in the dataset is 39.21


'The average age in the dataset for smokers is 38.51, for non-smokers 39.39'

### Smokers and sex

In [11]:
# Share of females in the whole dataset, then among smokers only.
female_percentage = get_binary_value_percentage('sex', 'female')
print('{:.2f}% of the people in the dataset are females'.format(female_percentage))

female_smoker_percentage = get_binary_value_percentage_for_smokers('sex', 'female')
'{:.2f}% of the smokers in the dataset are females'.format(female_smoker_percentage)

49.48% of the people in the dataset are females


'41.97% of the smokers in the dataset are females'

### Smokers and BMI

In [13]:
# Overall mean BMI first, then the smoker / non-smoker breakdown.
overall_average_bmi = get_average_for_column('bmi')
print('The average BMI in the dataset is {:.2f}'.format(overall_average_bmi))

# average1 is the smokers' mean, average2 the non-smokers' mean.
average1, average2 = get_field_average_for_smokers('bmi')
bmi_split_template = 'The average BMI in the dataset for smokers is {:.2f}, for non-smokers {:.2f}'
bmi_split_template.format(average1, average2)

The average BMI in the dataset is 30.66


'The average BMI in the dataset for smokers is 30.71, for non-smokers 30.65'

### Playground

In [8]:
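# Scratch experiments, left commented out. The recorded output shows that
# str.isdecimal(), str.isdigit() and str.isnumeric() all return False for
# '11.2', and that float() accepts a single argument only, so it cannot be
# given a fallback default.
# print('11.2'.isdecimal())
# print('11.2'.isdigit())
# print('11.2'.isnumeric())
# float('11.2a', 0.0)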


False
False
False


TypeError: float expected at most 1 argument, got 2