# Lecture 16의 데모

### Lecture 16의 데모에 대한 코드

참고자료: 

https://github.com/data-8/materials-sp22/blob/main/lec/lec29.ipynb

https://github.com/data-8/materials-sp22/blob/main/lec/lec30.ipynb


In [None]:
import os
from google.colab import drive

# Mount Google Drive inside the Colab runtime at /content/gdrive.
drive.mount('/content/gdrive')

# Switch the working directory to the lecture folder on the mounted drive;
# later cells read CSV files by bare file name.
%cd /content/gdrive/MyDrive/ITEC419-fa22/lec

Mounted at /content/gdrive
/content/gdrive/MyDrive/ITEC419-fa22/lec


In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
# Some functions for plotting. You don't have to understand how any
# of the functions in this cell work, since they use things we 
# haven't learned about in Data 8.


def resize_window(lim=3.5):
    """Set both axes of the current plot to the symmetric range [-lim, lim]."""
    for set_limits in (plots.xlim, plots.ylim):
        set_limits(-lim, lim)
    
def draw_line(slope=0, intercept=0, x=make_array(-4, 4), color='#1e90ff'):
    """Draw the line y = slope*x + intercept on the current plot.

    The line is evaluated at the points in x (by default the two endpoints
    -4 and 4) and drawn with line width 3 in the given color.
    """
    plots.plot(x, x*slope + intercept, color=color, lw=3)
    
def draw_vertical_line(x_position, color='black'):
    """Draw a vertical segment at x = x_position spanning y = -4 to y = 4."""
    xs = make_array(x_position, x_position)
    ys = make_array(-4, 4)
    plots.plot(xs, ys, color=color, lw=3)
    
def make_correlated_data(r, n=1000):
    """Draw two random samples whose correlation is approximately r.

    x and z are independent standard-normal samples of length n, and
    y = r*x + sqrt(1 - r**2)*z, so in theory y also has unit variance and
    its correlation with x is r (the sample correlation will vary a little).

    Args:
        r: Target correlation; must lie in [-1, 1].
        n: Number of points to draw. Defaults to 1000.

    Returns:
        A tuple (x, y) of NumPy arrays, each of length n.

    Raises:
        ValueError: If r is outside [-1, 1]; the square root below would
            otherwise be taken of a negative number and y would be all NaN.
    """
    if abs(r) > 1:
        raise ValueError('r must be between -1 and 1, got {!r}'.format(r))
    # Draw x first, then z, so results under a fixed seed are reproducible.
    x = np.random.normal(0, 1, n)
    z = np.random.normal(0, 1, n)
    y = r*x + (np.sqrt(1-r**2))*z
    return x, y

def r_scatter(r):
    """Show a 5x5 scatter plot of random points with correlation roughly r.

    Both axes are fixed to the range [-4, 4].
    """
    plots.figure(figsize=(5,5))
    xs, ys = make_correlated_data(r)
    plots.scatter(xs, ys, color='darkblue', s=20)
    for set_limits in (plots.xlim, plots.ylim):
        set_limits(-4, 4)
    
def r_table(r):
    """Build a two-column table ('x', 'y') of random points with correlation
    approximately r, as produced by make_correlated_data.

    The global NumPy random seed is reset to 8 on every call, so calling
    this twice with the same r yields the same table.
    """
    np.random.seed(8)
    xs, ys = make_correlated_data(r)
    points = Table().with_columns('x', xs, 'y', ys)
    return points

## **Approach to Prediction** (Lecture 7)

In [None]:
# Note: Child heights are the **adult** heights of children in a family
# Load the family heights data from a CSV file in the working directory.
families = Table.read_table('family_heights.csv')
# A bare name on the last line of a cell displays the table.
families

In [None]:
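# Convert inches to cm (1 in = 2.54 cm).
# TODO: the conversion code is missing from this cell. The cells below use
# centimeter-scale values (e.g. are.between(171.0, 173.0)), so the 'father',
# 'mother' and 'child' columns of `families` presumably need to be in cm
# before `parent_avgs` is computed -- confirm the units of family_heights.csv.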


In [None]:
# Element-wise mean of the 'father' and 'mother' columns: one average
# parent height per row of `families`.
parent_avgs = (families.column('father') + families.column('mother'))/2

In [None]:
# Two-column table pairing each row's average parent height with the
# corresponding child's height.
heights = Table().with_columns(
    'Parent Average', parent_avgs,
    'Child', families.column('child'),
)
heights

In [None]:
# Scatter plot: parent average on the x-axis, child height on the y-axis.
heights.scatter('Parent Average', 'Child')

In [None]:
# Same scatter plot, with two red vertical lines at x = 171.0 and x = 173.0
# marking the band of parent averages examined in the next cell.
heights.scatter('Parent Average', 'Child')
plots.plot([171.0, 171.0], [135.0, 205.0], color='red', lw=2)
# The trailing semicolon suppresses the notebook's text output for this call.
plots.plot([173.0, 173.0], [135.0, 205.0], color='red', lw=2);

In [None]:
# Keep only the rows whose parent average falls in the 171.0-173.0 band,
# then average the child heights of those rows.
nearby = heights.where('Parent Average', are.between(171.0, 173.0))
nearby_mean = np.average(nearby.column('Child'))
nearby_mean

In [None]:
# Scatter plot with the 171.0-173.0 band marked, plus a red dot at the
# band's midpoint (172.0) showing the mean child height within the band.
heights.scatter('Parent Average', 'Child')
plots.plot([171.0, 171.0], [135.0, 205.0], color='red', lw=2)
plots.plot([173.0, 173.0], [135.0, 205.0], color='red', lw=2)
plots.scatter(172.0, nearby_mean, color='red', s=50);

In [None]:
def predict_child(h):
    """Predict a child's height from a parent average height h.

    The prediction is the mean 'Child' value over the rows of the global
    `heights` table whose 'Parent Average' lies in the window from h - 1.0
    to h + 1.0, as selected by are.between.
    """
    lower, upper = h - 1.0, h + 1.0
    close_rows = heights.where('Parent Average', are.between(lower, upper))
    child_heights = close_rows.column('Child')
    return np.average(child_heights)

In [None]:
# Apply predict_child to every 'Parent Average' value and attach the results
# as a new 'Prediction' column.
heights_with_predictions = heights.with_columns(
    'Prediction', heights.apply(predict_child, 'Parent Average'))

In [None]:
# With only an x column given, the remaining columns ('Child' and
# 'Prediction') are plotted against 'Parent Average'.
heights_with_predictions.scatter('Parent Average')

## **Association**

In [None]:
# Load the hybrid car data set from CSV and display it.
hybrid = Table.read_table('hybrid.csv')
hybrid

In [None]:
# Display the rows ordered by 'msrp', largest first.
hybrid.sort('msrp', descending=True)

In [None]:
# Scatter plot of 'msrp' (y) against 'mpg' (x) for all rows.
hybrid.scatter('mpg', 'msrp')

In [None]:
# Scatter plot of 'msrp' (y) against 'acceleration' (x) for all rows.
hybrid.scatter('acceleration', 'msrp')

In [None]:
# Restrict to rows whose 'class' is 'SUV' and show how many there are.
suv = hybrid.where('class', 'SUV')
suv.num_rows

In [None]:
# SUVs only: 'msrp' (y) against 'acceleration' (x).
suv.scatter('acceleration', 'msrp')

In [None]:
# SUVs only: 'msrp' (y) against 'mpg' (x).
suv.scatter('mpg', 'msrp')

In [None]:
# Add a boolean 'SUV' column (True where 'class' equals 'SUV') and use it to
# group the points in the mpg-vs-msrp scatter plot.
hybrid.with_column('SUV', hybrid.column('class') == 'SUV').scatter('mpg', 'msrp', group='SUV')

In [None]:
def standard_units(x):
    """Convert an array of numbers to standard units.

    Each value is expressed as its distance from the array's mean, measured
    in multiples of the array's standard deviation (np.std).
    """
    center = np.average(x)
    spread = np.std(x)
    return (x - center) / spread

In [None]:
# Re-plot SUV mpg vs msrp with both variables converted to standard units;
# scatter(0, 1) refers to the two columns by index.
Table().with_columns(
    'mpg (standard units)',  standard_units(suv.column('mpg')), 
    'msrp (standard units)', standard_units(suv.column('msrp'))
).scatter(0, 1)
# Fix both axes to [-3, 3].
plots.xlim(-3, 3)
plots.ylim(-3, 3);

In [None]:
# SUVs only: 'msrp' against 'acceleration' in the original units.
suv.scatter('acceleration', 'msrp')

In [None]:
# The same acceleration-vs-msrp plot with both variables in standard units;
# scatter(0, 1) refers to the two columns by index.
Table().with_columns(
    'acceleration (standard units)', standard_units(suv.column('acceleration')), 
    'msrp (standard units)',         standard_units(suv.column('msrp'))
).scatter(0, 1)
# Fix both axes to [-3, 3].
plots.xlim(-3, 3)
plots.ylim(-3, 3);

## **Correlation**

In [None]:
# Scatter plot of random points with correlation approximately -0.5.
r_scatter(-0.5)

In [None]:
# Small hand-made data set: x = 1..6 paired with six y values.
x = np.arange(1, 7, 1)
y = make_array(2, 3, 1, 5, 2, 7)
t = Table().with_columns(
        'x', x,
        'y', y
    )
t

In [None]:
# Plot the six points in red with marker size 30.
t.scatter('x', 'y', s=30, color='red')

In [None]:
# Append standard-unit versions of x and y as two further columns of t.
t = t.with_columns(
        'x (standard units)', standard_units(x),
        'y (standard units)', standard_units(y)
    )
t

In [None]:
# Column indices 2 and 3 are the standard-unit columns added in the previous cell.
t.scatter(2, 3, s=30, color='red')

In [None]:
# Row-by-row product of the two standard-unit columns (indices 2 and 3).
t = t.with_columns(
    'product of standard units', t.column(2) * t.column(3))
t

In [None]:
# r is the average of the products of the standard units

# Columns 2 and 3 of t hold x and y in standard units.
r = np.average(t.column(2) * t.column(3))
r

In [None]:
def correlation(t, x, y):
    """Return the correlation coefficient of columns x and y of table t.

    t is a table; x and y are column labels. Both columns are converted to
    standard units and the result is the average of their pairwise products.
    """
    x_su, y_su = (standard_units(t.column(label)) for label in (x, y))
    return np.average(x_su * y_su)

In [None]:
# Should match the value of r computed by hand above.
correlation(t, 'x', 'y')

In [None]:
# SUVs: 'msrp' against 'mpg', shown before computing their correlation.
suv.scatter('mpg', 'msrp')

In [None]:
# Correlation between 'mpg' and 'msrp' among SUVs.
correlation(suv, 'mpg', 'msrp')

In [None]:
# SUVs: 'msrp' against 'acceleration', shown before computing their correlation.
suv.scatter('acceleration', 'msrp')

In [None]:
# Correlation between 'acceleration' and 'msrp' among SUVs.
correlation(suv, 'acceleration', 'msrp')

### **Switching Axes**

In [None]:
# Correlation with x as the first variable and y as the second.
correlation(t, 'x', 'y')

In [None]:
# x on the horizontal axis, y on the vertical axis.
t.scatter('x', 'y', s=30, color='red')

In [None]:
# Axes switched: y on the horizontal axis, x on the vertical axis.
t.scatter('y', 'x', s=30, color='red')

In [None]:
# Same computation with the column labels swapped; the product of standard
# units is symmetric, so the result equals correlation(t, 'x', 'y').
correlation(t, 'y', 'x')

### **Nonlinearity**

In [None]:
# x values from -4 upward in steps of 0.5 (stop value 4.1), with y = x squared:
# a relationship that is exact but not linear.
new_x = np.arange(-4, 4.1, 0.5)
nonlinear = Table().with_columns(
        'x', new_x,
        'y', new_x**2
    )
nonlinear.scatter('x', 'y', s=30, color='r')

In [None]:
# Correlation measures linear association only; compute it for the parabola.
correlation(nonlinear, 'x', 'y')

### **Outliers**

In [None]:
# Four points lying exactly on the line y = x.
line = Table().with_columns(
        'x', make_array(1, 2, 3, 4),
        'y', make_array(1, 2, 3, 4)
    )
line.scatter('x', 'y', s=30, color='r')

In [None]:
# Correlation for the four perfectly collinear points.
correlation(line, 'x', 'y')

In [None]:
# The same four collinear points plus one extra point (5, 0) off the line.
outlier = Table().with_columns(
        'x', make_array(1, 2, 3, 4, 5),
        'y', make_array(1, 2, 3, 4, 0)
    )
outlier.scatter('x', 'y', s=30, color='r')

In [None]:
# Correlation after adding the single off-line point; compare with the
# value for `line`.
correlation(outlier, 'x', 'y')

### **Ecological Correlations**

In [None]:
# Load the SAT 2014 data from CSV, sorted by the 'State' column.
sat2014 = Table.read_table('sat2014.csv').sort('State')
sat2014

In [None]:
# 'Math' (y) against 'Critical Reading' (x), one point per table row.
sat2014.scatter('Critical Reading', 'Math')

In [None]:
# Correlation between the 'Critical Reading' and 'Math' columns.
correlation(sat2014, 'Critical Reading', 'Math')

In [None]:
def rate_code(x):
    """Classify a numeric rate x into a label.

    Returns 'low' when x is at most 25, 'medium' when x is above 25 and at
    most 75, and 'high' otherwise.
    """
    if x <= 25:
        return 'low'
    if x <= 75:
        return 'medium'
    return 'high'

In [None]:
# Label every row's 'Participation Rate' as 'low', 'medium' or 'high'.
rate_codes = sat2014.apply(rate_code, 'Participation Rate')

In [None]:
# Attach the labels to the table as a 'Rate Code' column.
sat2014 = sat2014.with_columns('Rate Code', rate_codes)
sat2014

In [None]:
# Same scatter plot, with points grouped by their 'Rate Code' label.
sat2014.scatter('Critical Reading', 'Math', group='Rate Code')

In [None]:
# Display the rows labelled 'low'.
sat2014.where('Rate Code', 'low').show()

## **Prediction Lines**

### **r = 0.99**

In [None]:
# Random data with correlation about 0.99; show the first 3 rows.
example = r_table(0.99)
example.show(3)

In [None]:
# Scatter plot of the example data, with both axes set to [-3.5, 3.5]
# (the default limit of resize_window).
example.scatter('x', 'y')
resize_window()

In [None]:
def nn_prediction_example(x_val):
    """Predict y at x_val by averaging nearby points of the `example` table.

    Uses the rows of the global `example` table (looked up at call time)
    whose 'x' lies in the window from x_val - 0.25 to x_val + 0.25, as
    selected by are.between, and returns the mean of their 'y' values.
    """
    half_width = .25
    window = are.between(x_val - half_width, x_val + half_width)
    close_rows = example.where('x', window)
    return np.mean(close_rows.column('y'))

In [None]:
# Nearest-neighbor prediction of y at x = -2.25.
nn_prediction_example(-2.25)

In [None]:
# Compute a nearest-neighbor prediction for every x in the table and store
# the results in a 'Predicted y' column.
example = example.with_columns(
    'Predicted y', 
    example.apply(nn_prediction_example, 'x'))

In [None]:
# Plot both 'y' and 'Predicted y' against 'x'.
example.scatter('x')
resize_window()

In [None]:
# Same plot with the line y = x (slope 1, intercept 0) overlaid.
example.scatter('x')
draw_line(slope=1)
resize_window()

### **r = 0**

In [None]:
# Replace `example` with random data whose correlation is about 0.
example = r_table(0)
example.scatter('x', 'y')
resize_window()

In [None]:
# nn_prediction_example reads the global `example`, so it now predicts from
# the r = 0 data.
example = example.with_columns(
    'Predicted y', 
    example.apply(nn_prediction_example, 'x'))

In [None]:
# Plot data and predictions, with the horizontal line y = 0 overlaid.
example.scatter('x')
draw_line(slope = 0)
resize_window()

### **r = 0.5**

In [None]:
# Replace `example` with random data whose correlation is about 0.5.
example = r_table(0.5)
example.scatter('x', 'y')
resize_window()

In [None]:
# r_table reseeds the random generator on every call, so this regenerates
# the same r = 0.5 data as the previous cell.
example = r_table(0.5)
example.scatter('x', 'y')
resize_window()
# Black vertical line at x = 1.5 and the red line y = x for reference.
draw_vertical_line(1.5)
draw_line(slope=1, intercept=0, color='red')

In [None]:
# Add nearest-neighbor predictions, then plot them with the data, the red
# line y = x and the vertical line at x = 1.5.
example = example.with_column('Predicted y', example.apply(nn_prediction_example, 'x'))
example.scatter('x')
draw_line(slope=1, color='red')
draw_vertical_line(1.5)
resize_window()

In [None]:
# Compare two candidate lines through the origin: slope 1 (red) and slope
# 0.5, the value of r used to generate the data (default line color).
example.scatter('x')
draw_line(slope=1, intercept=0, color='red')
draw_line(slope=0.5, intercept=0)
resize_window()

### **r = 0.7**

In [None]:
# Repeat the experiment with correlation about 0.7: generate the data, add
# nearest-neighbor predictions, and plot.
example = r_table(0.7)
example = example.with_column('Predicted y', example.apply(nn_prediction_example, 'x'))
example.scatter('x')
# Red line: slope 1. Second line: slope 0.7, the value of r used above.
draw_line(slope=1, intercept=0, color='red')
draw_line(slope=0.7, intercept=0, color='dodgerblue')
resize_window()