# Lecture 20의 데모

### Lecture 20의 데모에 대한 코드

참고자료: 

https://github.com/data-8/materials-sp22/blob/main/lec/lec37.ipynb


In [None]:
import os
from google.colab import drive

# Mount Google Drive into the Colab runtime at /content/gdrive.
drive.mount('/content/gdrive')

%cd /content/gdrive/MyDrive/ITEC419-fa22/lec

In [None]:
from datascience import *
import numpy as np
import matplotlib

from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

In [None]:
def standard_units(x):
    """Convert an array of numbers to standard units.

    The array's mean is subtracted from every value and the result is
    divided by the array's standard deviation (``np.std``).
    """
    centered = x - np.mean(x)
    spread = np.std(x)
    return centered / spread

In [None]:
def distance(point1, point2):
    """Return the Euclidean distance between two arrays of numbers."""
    difference = point1 - point2
    squared_total = np.sum(np.square(difference))
    return np.sqrt(squared_total)

def all_distances(training, point):
    """Return the distance from `point` to every row of `training`.

    The 'Class' column is dropped first, so only the attribute columns take
    part in the distance computation. `point` is an array of numbers with
    one entry per attribute column.
    """
    features = training.drop('Class')
    return features.apply(lambda row: distance(point, np.array(row)))

def table_with_distances(training, point):
    """Return a copy of `training` with an added 'Distance' column.

    For each row, the new column holds that row's distance to the array
    `point`, as computed by `all_distances`.
    """
    distances = all_distances(training, point)
    return training.with_column('Distance', distances)

def closest(training, point, k):
    """Return a table of the k rows of `training` nearest to the array `point`.

    The result keeps the 'Distance' column and is sorted by it.
    """
    return (
        table_with_distances(training, point)
        .sort('Distance')
        .take(np.arange(k))
    )

def majority(topkclasses):
    """Return 1 if rows with 'Class' 1 strictly outnumber rows with 'Class' 0.

    Returns 0 otherwise, which includes ties.
    """
    def count_of(label):
        # Number of rows whose 'Class' value equals `label`.
        return topkclasses.where('Class', are.equal_to(label)).num_rows

    return 1 if count_of(1) > count_of(0) else 0

def classify(training, p, k):
    """Classify the example with attributes `p` by k-nearest-neighbor voting.

    The k rows of `training` closest to `p` are found, and the majority of
    their 'Class' values (1 or 0) is returned.
    """
    neighbors = closest(training, p, k)
    return majority(neighbors.select('Class'))

## **Standardize if Necessary**

In [None]:
# Load the CKD data and shorten the 'Blood Glucose Random' label to 'Glucose'.
ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')

In [None]:
# Keep the 'Class' label and add three attributes converted to standard
# units (suffix '_su'), so that the attributes are on a comparable scale
# when distances between rows are computed.
ckd_new = ckd.select('Class').with_columns(
    'Glucose_su', standard_units(ckd.column('Glucose')),
    'Hemoglobin_su', standard_units(ckd.column('Hemoglobin')),
    'WBC_su', standard_units(ckd.column('White Blood Cell Count'))
)

In [None]:
ckd_new

In [None]:
# Shuffle the standardized table (sampling without replacement), then split
# by position: rows 0-73 form the training set and rows 74-147 the test set.
# NOTE(review): the 74/148 cut-offs are hard-coded — confirm they match the
# number of rows in ckd.csv.
shuffled = ckd_new.sample(with_replacement=False) 
training_set = shuffled.take(np.arange(74))
test_set  = shuffled.take(np.arange(74, 148))

In [None]:
def evaluate_accuracy(training, test, k):
    """Return the proportion of rows in `test` that are classified correctly.

    Each test row's attributes (every column except 'Class') are classified
    against `training` using k nearest neighbors, and the prediction is
    compared with that row's actual 'Class' value.
    """
    features = test.drop('Class')
    total = test.num_rows
    hits = 0
    for index in np.arange(total):
        predicted = classify(training, features.row(index), k)
        actual = test.column('Class').item(index)
        hits += predicted == actual
    return hits / total

In [None]:
# Accuracy of 3-nearest-neighbor classification on the standardized attributes.
evaluate_accuracy(training_set, test_set, 3)

In [None]:
# The class label plus the same three attributes, left in their original units.
ckd_ori = ckd.select('Class', 'Glucose', 'Hemoglobin', 'White Blood Cell Count')

In [None]:
ckd_ori

Class,Glucose,Hemoglobin,White Blood Cell Count
1,117,11.2,6700
1,70,9.5,12100
1,380,10.8,4500
1,157,5.6,11000
1,173,7.7,9200
1,95,9.8,6900
1,264,12.5,9600
1,70,10.0,18900
1,253,10.5,7200
1,163,9.8,14600


In [None]:
# Shuffle the unstandardized table and split it by position as before:
# rows 0-73 for training, rows 74-147 for testing. This rebinds the
# `shuffled`, `training_set` and `test_set` globals.
# NOTE(review): the 74/148 cut-offs are hard-coded — confirm they match the
# number of rows in ckd.csv.
shuffled = ckd_ori.sample(with_replacement=False) 
training_set = shuffled.take(np.arange(74))
test_set  = shuffled.take(np.arange(74, 148))

In [None]:
# Accuracy of the same 3-nearest-neighbor classifier on the attributes in
# their original units, for comparison with the standardized run.
evaluate_accuracy(training_set, test_set, 3)

**.**

## **More Likely Than Not**

In [None]:
# np.array(list) converts a list to an array;
# the elements of the list should all be of the same type.

n = 100  # presumably the number of students in the table — confirm

# NOTE(review): `year` and `major` are left as `...` placeholders in this copy
# and must be replaced with label arrays before the table below is usable.
# The "Verify" cells that follow suggest 60% Second / 40% Third years, with
# 50% of Second years and 80% of Third years Declared — confirm.
year = ...
major = ...
                 
students = Table().with_columns(
    'Year', year,
    'Major', major
)

In [None]:
students.show(3)

In [None]:
# Cross-tabulate the students: one row per 'Year' value, one column per
# 'Major' value, each cell counting the matching students.
students.pivot('Major', 'Year')

In [None]:
# Verify: 60% of students are Second years, 40% are Third years


In [None]:
# Verify: 50% of Second years have Declared


In [None]:
# Verify: 80% of Third years have Declared


In [None]:
# Chance of second year, given that they have declared
# P(second year | declared)


In [None]:
# P(third year | declared)


**.**

## **Tree Diagram Calculation**

In [None]:
# P(second year | declared), from tree diagram


**.**

## **Decisions** ##

In [None]:
def create_population(prior_disease_prob, n):
    """Return a pivot table of test results by disease status.

    prior_disease_prob: proportion of the population that has the disease.
    n: size of the population.

    The returned table is the pivot of a two-column table ('Status',
    'Test Result'), with 'Test Result' values as columns and 'Status'
    values as rows.

    NOTE(review): `status` and `result` are `...` placeholders in this copy;
    they must be replaced with label arrays before the function produces a
    meaningful table. The computed `disease` / `no_disease` counts are
    presumably meant to size those arrays — confirm.
    """
    # Group sizes, rounded to whole numbers.
    disease = round(n * prior_disease_prob)
    no_disease = round(n * (1 - prior_disease_prob))

    status = ...
    result = ...
                 
    t = Table().with_columns(
    'Status', status,
    'Test Result', result
    )
    return t.pivot('Test Result', 'Status')

In [None]:
# Population of 10,000 with a prior disease probability of 1/1000.
create_population(1/1000, 10000)

In [None]:
# NOTE(review): presumably P(disease | tested +) read off the preceding
# table — 10 diseased people among 510 positive results; confirm.
10 / 510

In [None]:
# P(disease | tested +)
#   = P(disease & tested +) / P(tested +)

# if prior probability of disease is 1/10


In [None]:
# Population of 10,000 with a prior disease probability of 1/10.
create_population(1/10, 10000)

**.**

In [None]:
# P(disease | tested +)
# if prior probability of disease is 1/10


In [None]:
# Population of 10,000 with a prior disease probability of 1/10.
create_population(1/10, 10000)

In [None]:
# P(disease | tested +)
# if prior probability of disease is 0.5


In [None]:
# Population of 10,000 with a prior disease probability of 0.5.
create_population(0.5, 10000)