In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Overlaid Histograms

**Please run all cells before this cell, including the import cell at the top of the notebook.**

In [None]:
# Read 'temperatures.csv' into a Table; the bare name on the last line
# makes the table the cell's displayed output.
temps = Table.read_table('temperatures.csv')
temps

In [None]:
temps.hist('TMAX', unit='degree')

In [None]:
temps.select('TMIN', 'TMAX').hist(unit='degree')

## Functions

**Please run all cells before this cell, including the previous example cells and the import cell at the top of the notebook.**

In [None]:
def double(x):
    """Return twice the value of x."""
    doubled = 2 * x
    return doubled

In [None]:
double(3)

In [None]:
double(-4)

In [None]:
y = 5
double(y/4)

In [None]:
y

In [None]:
double(y)

In [None]:
y

In [None]:
z = double(y)

In [None]:
z

In [None]:
# Worked example: express each count as a percentage of the total,
# rounded to 2 decimal places. The last line is the cell's displayed output.
counts = make_array(1, 2, 3)
total = sum(counts)
np.round((counts/total)*100, 2)

In [None]:
def percents(s):
    """Express each entry of s as a percentage of the sum of s.

    s is divided element-wise by its total, so it must support that
    operation (the notebook passes arrays built with make_array).
    The resulting percentages are rounded to 2 decimal places.
    """
    shares = s / sum(s)
    return np.round(shares * 100, 2)

In [None]:
percents(counts)

In [None]:
percents(make_array(1, 1, 1, 1))

In [None]:
counts

In [None]:
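# percents(2)  # uncomment to try it: raises a TypeError because sum() needs a collection, not the single number 2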

In [None]:
# sum(2)  # uncomment to try it: raises a TypeError because the number 2 is not iterable

In [None]:
def percents(counts, decimal_places=2):
    """Express each entry of counts as a percentage of their sum.

    counts is divided element-wise by its total, so it must support that
    operation (the notebook passes arrays built with make_array).
    decimal_places is the number of decimal places the percentages are
    rounded to; it defaults to 2.
    """
    shares = counts / sum(counts)
    return np.round(shares * 100, decimal_places)

# decimal_places can be given positionally or by keyword: both calls in the
# first print pass 1. Leaving it out falls back to the default of 2.
parts = make_array(2, 1, 4)
print("Rounded to 1 decimal place:", percents(parts, 1), "or", percents(parts, decimal_places=1))
print("Rounded to the default number of decimal places:", percents(parts))

## Apply

**Please run all cells before this cell, including the previous example cells and the import cell at the top of the notebook.**

In [None]:
def cut_off_at_a_billion(x):
    """Cap x at one billion: x itself unless it exceeds 1,000,000,000."""
    cap = 1e9
    if cap < x:
        return cap
    return x

In [None]:
cut_off_at_a_billion(12)

In [None]:
cut_off_at_a_billion(123456)

In [None]:
cut_off_at_a_billion(1234567890)

In [None]:
# Read 'top_movies_2017.csv' and keep only the rows whose 'Studio' is 'Fox'.
top = Table.read_table('top_movies_2017.csv').where('Studio', 'Fox')
top

In [None]:
# Call cut_off_at_a_billion on each value of the 'Gross (Adjusted)' column.
cut_off = top.apply(cut_off_at_a_billion, 'Gross (Adjusted)')
# The table with the extra column is only displayed here; it is not assigned back to `top`.
top.with_column('Adjusted but cut', cut_off)

In [None]:
cut_off_at_a_billion

In [None]:
type(cut_off_at_a_billion)

In [None]:
help(cut_off_at_a_billion)

## Prediction

**Please run all cells before this cell, including the previous example cells and the import cell at the top of the notebook.**

In [None]:
families = Table.read_table('family_heights.csv')
families

In [None]:
# Average of the 'father' and 'mother' columns, computed on the whole columns at once.
parent_avgs = (families.column('father') + families.column('mother'))/2

In [None]:
# Build the table used for the rest of this section: the parents' average,
# the 'child' column, and the 'm/f' column, each under a new label.
heights = Table().with_columns(
    'Parent Average', parent_avgs,
    'Child', families.column('child'),
    'M/F', families.column('m/f')
)
heights

In [None]:
heights.scatter('Parent Average', 'Child')

In [None]:
# Same scatter plot, with two vertical red lines at x = 67.5 and x = 68.5
# (each drawn from y = 50 to y = 85) marking a band of parent averages around 68.
heights.scatter('Parent Average', 'Child')
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2);

In [None]:
# Keep the rows whose 'Parent Average' falls in the band from 67.5 to 68.5,
# then average their 'Child' values.
# NOTE(review): whether are.between includes its endpoints is not visible here — confirm in the datascience docs.
nearby = heights.where('Parent Average', are.between(67.5, 68.5))
nearby_mean = np.average(nearby.column('Child'))
nearby_mean

In [None]:
# Scatter plot with the 67.5-68.5 band drawn as two vertical red lines, plus a
# red dot at (68, nearby_mean): the average 'Child' value for that band.
heights.scatter('Parent Average', 'Child')
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2)
plots.scatter(68, nearby_mean, color='red', s=50);

In [None]:
def predict(h):
    """Predict a 'Child' value for a parent average of h.

    Selects the rows of the notebook-level `heights` table whose
    'Parent Average' lies in the band from h - 1/2 to h + 1/2 (as decided
    by are.between) and returns the average of their 'Child' column.
    """
    band = are.between(h - 1/2, h + 1/2)
    close_rows = heights.where('Parent Average', band)
    child_values = close_rows.column('Child')
    return np.average(child_values)

In [None]:
predict(68)

In [None]:
predict(70)

In [None]:
predict(73)

In [None]:
predicted_heights = heights.apply(predict, 'Parent Average')

In [None]:
heights = heights.with_column('Prediction', predicted_heights)

In [None]:
heights.select('Parent Average', 'Child', 'Prediction').scatter('Parent Average')

## Prediction Accuracy ##

In [None]:
def difference(x, y):
    """Return x minus y (positive when x is the larger of the two)."""
    gap = x - y
    return gap

In [None]:
# difference(x, y) returns x - y, so each error here is Prediction - Child.
pred_errs = heights.apply(difference, 'Prediction', 'Child')
heights = heights.with_column('errors',pred_errs)
heights

In [None]:
heights.hist('errors')

In [None]:
heights.hist('errors', group='M/F')

## Discussion Question

How could the prediction be improved by also using the `M/F` column?

In [None]:
def predict_smarter(h, s):
    """Predict a 'Child' value from a parent average h and an 'M/F' label s.

    Selects the rows of the notebook-level `heights` table whose
    'Parent Average' lies in the band from h - 1/2 to h + 1/2 (as decided
    by are.between), keeps only those whose 'M/F' value is s, and returns
    the average of their 'Child' column.
    """
    band = are.between(h - 1/2, h + 1/2)
    matching_rows = heights.where('Parent Average', band).where('M/F', s)
    return np.average(matching_rows.column('Child'))

In [None]:
predict_smarter(68, 'female')

In [None]:
predict_smarter(68, 'male')

In [None]:
# Call predict_smarter once per row, passing that row's 'Parent Average' and 'M/F'
# values, and store the results in a new 'Smarter Prediction' column.
smarter_predicted_heights = heights.apply(predict_smarter, 'Parent Average', 'M/F')
heights = heights.with_column('Smarter Prediction', smarter_predicted_heights)

In [None]:
# Errors of the sex-aware prediction. The columns are passed in the same order
# as for the 'errors' column (prediction first, child second), so both error
# columns share one sign convention -- prediction minus child -- and their
# histograms can be compared directly instead of being mirror images.
smarter_pred_errs = heights.apply(difference, 'Smarter Prediction', 'Child')
heights = heights.with_column('Smarter Errors', smarter_pred_errs)

In [None]:
heights.hist('Smarter Errors', group='M/F')