# Lecture 18: Why the mean matters

## 8.2: Module 8 Notebook 2

In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

## Chebyshev's Bounds

In [None]:
# load the births data from baby.csv into a table
births = Table.read_table('baby.csv')
# preview the first 3 rows
births.show(3)

In [None]:
# let's visualize the columns of births in separate histograms
# (overlay = False); the "Maternal Smoker" column is dropped first,
# so it is the only column not plotted
b = births.drop("Maternal Smoker")
b.hist(overlay = False)

In [None]:
# Summarize the maternal pregnancy weight column:
# pull it out of the table as an array, then take its mean and SD.
# The resulting pair is the cell's displayed output.
mpw = births.column('Maternal Pregnancy Weight')
mean, sd = np.mean(mpw), np.std(mpw)
mean, sd

In [None]:
# let's test Chebyshev's theory

# Ex. 1: keep only the rows whose maternal pregnancy weight lies
# within 3 SDs of the mean (uses the mean and sd computed above)
within_3_SDs = births.where('Maternal Pregnancy Weight', 
                            are.between(mean - 3*sd, mean + 3*sd))

In [None]:
# Ex. 1: proportion within 3 SDs of the mean, i.e. the number of rows
# kept in within_3_SDs divided by the number of rows in births

within_3_SDs.num_rows / births.num_rows

In [None]:
# Chebyshev's bound:
# the proportion within z SDs of the mean is at least 1 - 1/z**2.
# For z = 3, the proportion we calculated above should be at least

1 - 1/(3**2)

In [None]:
# Now, let's check whether Chebyshev's bounds extend to other columns

# first, we display all the column names of the births table
births.labels

In [None]:
# See if Chebyshev's bounds work for distributions with various shapes
#
# Remember the histograms above have different distributions,
# i.e., each column's distribution has a different shape.
#
# For every column of births, print the percentage of rows whose value
# lies in the range mean +/- z SDs, for z = 2, 3, 4, 5.
# Chebyshev's bound says this should be at least 1 - 1/z**2 of the data.

total_rows = births.num_rows  # same for every column, so look it up once

for feature in births.labels:
    values = births.column(feature)
    # Use loop-specific names here: the notebook-level `mean` and `sd`
    # hold the maternal pregnancy weight statistics used by the
    # within_3_SDs cell, and must not be overwritten by this loop.
    feature_mean = np.mean(values)
    feature_sd = np.std(values)
    print()
    print(feature)
    for z in make_array(2, 3, 4, 5):
        chosen = births.where(
            feature,
            are.between(feature_mean - z*feature_sd,
                        feature_mean + z*feature_sd),
        )
        proportion = chosen.num_rows / total_rows
        percent = round(proportion * 100, 2)
        print('Average plus or minus', z, 'SDs:', percent, '% of the data')

## Standard Units ##

In [None]:
# since we will use standard units often, 
# let's define a function to compute them
def standard_units(x):
    """Return the values of array x expressed in standard units.

    Each value is replaced by its deviation from the mean of x,
    divided by the SD of x (as computed by np.std).
    """
    deviations = x - np.mean(x)
    spread = np.std(x)
    return deviations / spread

In [None]:
# pull the 'Maternal Age' column out of births as an array
ages = births.column('Maternal Age')

In [None]:
# convert the ages to standard units with the function defined above
ages_standard_units = standard_units(ages)

In [None]:
# When values are in standard units: average = 0, SD = 1
# (the displayed numbers may differ from 0 and 1 by floating-point rounding)
np.mean(ages_standard_units), np.std(ages_standard_units)

In [None]:
# Put the ages as recorded and the same ages in standard units
# side by side in one two-column table, then display it
both = (
    Table()
    .with_columns('Age in Years', ages)
    .with_columns('Age in Standard Units', ages_standard_units)
)
both

In [None]:
# let's compute the mean and SD of age (in the original units,
# before conversion to standard units)
np.mean(ages), np.std(ages)

In [None]:
# now, let's visualize the data
# first, age as is; the bin edges are 15, 17, ..., 45
both.hist('Age in Years', bins = np.arange(15, 46, 2))

In [None]:
# then, age in standard units; bins are 0.35 wide, starting at -2.2
both.hist('Age in Standard Units', bins = np.arange(-2.2, 3.4, 0.35))
# restrict the horizontal axis to the range -2 to 3.1
plots.xlim(-2, 3.1);

## The SD and Bell-Shaped Curves

In [None]:
# the mothers' heights are distributed in a bell-shaped curve
# let's visualize it; bins are 1 unit wide, starting at 56.5
births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))

In [None]:
# now, let's compute the mean and SD of the mothers' heights
heights = births.column('Maternal Height')
np.mean(heights), np.std(heights)

In [None]:
# let's obtain the endpoints of the range within 1 SD of the mean:
# first mean + SD (upper end), then mean - SD (lower end)
np.mean(heights) + np.std(heights), np.mean(heights) - np.std(heights)