In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
import sys
# Diagnostic cell: print the interpreter's module search path.
# Nothing else in the visible notebook uses `sys`.
print(sys.path)

## Overlaid Histograms

In [None]:
# Load the temperature data from 'temperatures.csv' into a Table.
temps = Table.read_table('temperatures.csv')
# Preview only the first 3 rows rather than the whole table.
temps.show(3)

In [None]:
# Histogram of the 'TMIN' column; `unit` supplies the unit name used in the axis labels.
temps.hist('TMIN', unit='degree')

In [None]:
# Overlaid histograms of the two temperature columns. The columns are
# relabeled first so the plot legend shows readable names instead of the
# raw 'TMAX' / 'TMIN' labels.
(
    temps
    .select('TMAX', 'TMIN')
    .relabeled('TMAX', 'High Temperature')
    .relabeled('TMIN', 'Low Temperature')
    .hist(unit='degree')
)

## Prediction

In [None]:
# Load the family heights data from 'family_heights.csv' into a Table.
families = Table.read_table('family_heights.csv')
# Bare last expression: display the table.
families

In [None]:
# Overlaid histograms for the rows whose 'm/f' value is 'male'.
# NOTE(review): columns are picked by position (1, 2, 3); presumably these are
# the father / mother / child height columns -- confirm against the CSV header.
(
    families
    .where('m/f', 'male')
    .select(1, 2, 3)
    .hist(unit='inch')
)

In [None]:
# Element-wise mean of the 'father' and 'mother' columns: one value per row.
parent_avgs = (
    families.column('father') + families.column('mother')
) / 2

In [None]:
# Build the working table one column at a time: the parents' average height,
# the child's height, and the child's sex label taken from 'm/f'.
heights = (
    Table()
    .with_column('Parent Average', parent_avgs)
    .with_column('Child', families.column('child'))
    .with_column('M/F', families.column('m/f'))
)
heights

In [None]:
# Scatter plot by column position: column 0 ('Parent Average') on the x-axis,
# column 1 ('Child') on the y-axis, per the column order used to build `heights`.
heights.scatter(0, 1)

In [None]:
heights.scatter('Parent Average', 'Child')
# Mark the strip of parent averages from 67.5 to 68.5 with two vertical red
# lines, each drawn from y = 50 to y = 85.
for edge in (67.5, 68.5):
    plots.plot([edge, edge], [50, 85], color='red', lw=2)

In [None]:
# Keep only the rows whose 'Parent Average' falls in the 67.5-68.5 strip.
nearby = heights.where('Parent Average', are.between(67.5, 68.5))
# The mean 'Child' value in that strip is the prediction for a parent average near 68.
nearby_mean = np.average(nearby.column('Child'))
nearby_mean

In [None]:
def predict(h, half_width=0.5):
    """Predict a child's height from the parents' average height.

    The prediction is the mean of the 'Child' column over the rows of the
    global `heights` table whose 'Parent Average' lies in the window
    `are.between(h - half_width, h + half_width)`.

    Parameters
    ----------
    h : number
        Parent average height to predict for.
    half_width : number, optional
        Half the width of the 'Parent Average' window centred on `h`.
        The default of 0.5 reproduces the previously hard-coded window
        from `h - 0.5` to `h + 0.5`.

    Returns
    -------
    float
        Mean 'Child' value of the rows inside the window.

    Notes
    -----
    NOTE(review): per the datascience docs `are.between` is presumably
    inclusive of the lower bound and exclusive of the upper -- confirm.
    NOTE(review): if no row falls inside the window the mean is taken over
    an empty column (presumably NaN with a NumPy warning) -- confirm.
    """
    nearby = heights.where(
        'Parent Average', are.between(h - half_width, h + half_width)
    )
    return np.average(nearby.column('Child'))

In [None]:
# Prediction for a parent average of 68.
predict(68)

In [None]:
# Prediction for a parent average of 70.
predict(70)

In [None]:
# Prediction for a parent average of 73.
predict(73)

In [None]:
# Prediction for a parent average of 62.
predict(62)

In [None]:
heights.scatter('Parent Average', 'Child')
# Vertical red lines at 69.5 and 70.5 mark the window that predict(70)
# averages over; each line is drawn from y = 50 to y = 85.
for edge in (69.5, 70.5):
    plots.plot([edge, edge], [50, 85], color='red', lw=2)
# Red dot at the predicted child height for a parent average of 70.
plots.scatter(70, predict(70), color='red', s=50);

In [None]:
# Call predict once per row, passing that row's 'Parent Average' value.
predicted_heights = heights.apply(predict, 'Parent Average')

In [None]:
# Rebind `heights` to a table that also carries the 'Prediction' column.
heights = heights.with_column('Prediction', predicted_heights)

In [None]:
# Plot 'Child' and 'Prediction' against 'Parent Average' to compare actual and predicted heights.
heights.select('Parent Average', 'Child', 'Prediction').scatter('Parent Average')

## Prediction Accuracy ##

In [None]:
def difference(x, y):
    """Return `x` minus `y`.

    The operands only need to support the binary `-` operator, so plain
    numbers and NumPy arrays both work.
    """
    gap = x - y
    return gap

In [None]:
# Per-row prediction error. Because difference(x, y) returns x - y, each error
# is 'Prediction' minus 'Child': positive means the prediction was too high.
pred_errs = heights.apply(difference, 'Prediction', 'Child')
# Rebind `heights` to a table that also carries the 'errors' column.
heights = heights.with_column('errors', pred_errs)
heights

In [None]:
# Distribution of the prediction errors stored in the 'errors' column.
heights.hist('errors')

## Discussion Question

In [None]:
def predict_smarter(h, s):
    """Predict a child's height from the parents' average height and the child's sex.

    The prediction is the mean of the 'Child' column over the rows of the
    global `heights` table whose 'Parent Average' lies in the window
    `are.between(h - 1/2, h + 1/2)` and whose 'M/F' value equals `s`.

    Parameters
    ----------
    h : number
        Parent average height to predict for.
    s : str
        Sex label to match against the 'M/F' column (the notebook calls
        this with 'male' and 'female').

    Returns
    -------
    float
        Mean 'Child' value of the matching rows.

    Notes
    -----
    NOTE(review): if no row matches both conditions the mean is taken over
    an empty column (presumably NaN with a NumPy warning) -- confirm.
    """
    nearby = heights.where('Parent Average', are.between(h - 1/2, h + 1/2))
    # Restrict the window to children of the requested sex before averaging.
    nearby_same_sex = nearby.where('M/F', s)
    return np.average(nearby_same_sex.column('Child'))

In [None]:
# Call predict_smarter for a parent average of 68 and the sex label 'female'.
predict_smarter(68, 'female')

In [None]:
# Call predict_smarter for a parent average of 68 and the sex label 'male'.
predict_smarter(68, 'male')

In [None]:
# Call predict_smarter once per row with that row's 'Parent Average' and 'M/F' values.
smarter_predicted_heights = heights.apply(predict_smarter, 'Parent Average', 'M/F')
# Rebind `heights` to a table that also carries the 'Smarter Prediction' column.
heights = heights.with_column('Smarter Prediction', smarter_predicted_heights)

In [None]:
# Use the same sign convention as the 'errors' column (prediction minus actual
# child height) so the two error distributions can be compared directly;
# positive means the prediction was too high.
smarter_pred_errs = heights.apply(difference, 'Smarter Prediction', 'Child')
# Rebind `heights` to a table that also carries the 'Smarter Errors' column.
heights = heights.with_column('Smarter Errors', smarter_pred_errs)

In [None]:
# Histogram of 'Smarter Errors', split by the 'M/F' column via `group`.
heights.hist('Smarter Errors', group='M/F')