In [2]:
import math
import statistics
from collections import Counter

import numpy as np
import pandas as pd
from scipy import spatial

In [1]:
def knn(data, query, k, choice_fn):
    """Predict a label for ``query`` from its ``k`` nearest examples in ``data``.

    Every example is read as ``example[0]`` (the feature compared against
    ``query``) and ``example[1]`` (the label handed to ``choice_fn``).

    Args:
        data: Indexable sequence of examples; each example exposes its feature
            at position 0 and its label at position 1.
        query: Feature value(s) to search for. A scalar or a 1-D sequence is
            accepted; it must have the same length as the example feature.
        k: Number of neighbours to keep. Must be at least 1. When ``k`` is
            larger than ``len(data)`` every example is used.
        choice_fn: Callable reducing the list of selected labels to a single
            prediction (e.g. a mean for regression, a mode for classification).

    Returns:
        A tuple ``(neighbors, prediction)``. ``neighbors`` is a list of
        ``(distance, index)`` pairs sorted by ascending distance (ties are
        ordered by index); ``prediction`` is ``choice_fn`` applied to the
        labels of those neighbours.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    # A non-positive k would otherwise slice the wrong number of neighbours
    # (negative k) or pass an empty label list to choice_fn (k == 0).
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # scipy's distance functions expect 1-D vectors, so scalar features and
    # scalar queries are promoted to length-1 arrays before comparing.
    query_vector = np.atleast_1d(query)

    # Distance from the query to the feature (column 0) of every example,
    # paired with the example's position and ordered from nearest to farthest.
    distances_and_indices = sorted(
        (spatial.distance.euclidean(np.atleast_1d(example[0]), query_vector), index)
        for index, example in enumerate(data)
    )

    # Keep the k closest examples and collect their labels (column 1).
    k_nearest_distances_and_indices = distances_and_indices[:k]
    k_nearest_labels = [data[index][1] for _, index in k_nearest_distances_and_indices]

    # Regression callers pass a mean-like choice_fn, classification callers a
    # mode-like one; either way it reduces the labels to the prediction.
    return k_nearest_distances_and_indices, choice_fn(k_nearest_labels)

In [3]:
'''
# Regression Data
# 
# Column 0: height (inches)
# Column 1: weight (pounds)
'''
# Each row is one person: [height, weight].
reg_data = [
    [65.75, 112.99],
    [71.52, 136.49],
    [69.40, 153.03],
    [68.22, 142.34],
    [67.79, 144.30],
    [68.70, 123.30],
    [69.80, 141.49],
    [70.01, 136.46],
    [67.90, 112.37],
    [66.49, 127.45],
]

# Question:
# Given the data we have, what's the best-guess at someone's weight if they are 60 inches tall?
reg_query = [60]

# Regression: the prediction is the mean weight of the 4 nearest heights.
reg_k_nearest_neighbors, reg_prediction = knn(
    data=reg_data,
    query=reg_query,
    k=4,
    choice_fn=np.mean,
)

print(f"Height: {reg_query[0]}, Prediction: {reg_prediction}")

Height: 60, Prediction: 124.2775


In [4]:
'''
# Classification Data
# 
# Column 0: age
# Column 1: likes pineapple
'''
# Each row is one person: [age, likes pineapple (1 = yes, 0 = no)].
clf_data = [
    [22, 1],
    [23, 1],
    [21, 1],
    [18, 1],
    [19, 1],
    [25, 0],
    [27, 0],
    [29, 0],
    [31, 0],
    [45, 0],
]
# Question:
# Given the data we have, does a 33 year old like pineapples on their pizza?
clf_query = [33]

# Classification: the prediction is the most common label among the 3
# people closest in age.
clf_k_nearest_neighbors, clf_prediction = knn(
    clf_data, clf_query, k=3, choice_fn=statistics.mode
)
print(f"Does a {clf_query[0]} y.o. like pineapples?\t{'Yes' if clf_prediction else 'No'}")

Does a 33 y.o. like pineapples?	No


In [5]:
# movie recommendation system
# Read the tab-separated IMDb ratings table and preview its first rows.
ratings = pd.read_csv(
    './IMDb/title.ratings.tsv',
    sep='\t',
)
ratings.head(5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.6,1656
1,tt0000002,6.1,201
2,tt0000003,6.5,1369
3,tt0000004,6.2,122
4,tt0000005,6.2,2151


In [6]:
# Read the tab-separated IMDb title metadata and preview its first rows.
# NOTE(review): the preview shows the literal string \N in endYear; presumably
# that is the file's missing-value marker — confirm before treating the column
# as numeric.
basic = pd.read_csv(
    './IMDb/title.basics.tsv',
    sep='\t',
)
basic.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [None]:
# One indicator column per individual genre. The genres column holds
# comma-separated strings (e.g. "Documentary,Short"), while
# Series.str.get_dummies splits on '|' by default, so the separator has to be
# passed explicitly; otherwise every distinct combination of genres would
# become its own column.
genres = basic.genres.str.get_dummies(sep=',')
genres.head(5)

In [None]:
basic_cut = pd.DataFrame([basic['originalTitle'], 
data = pd.concat()