In [3]:
import csv
import math
from collections import Counter


def knn(data, query, k, distance_fn, choice_fn):
    """Find the k examples nearest to ``query`` and combine their labels.

    Args:
        data: Indexable collection of rows. In every row the last element is
            the label and all preceding elements are the features.
        query: Feature values to compare against each row's features.
        k: Number of nearest neighbours to keep.
        distance_fn: Callable ``(features, query) -> distance``.
        choice_fn: Callable that reduces the list of the k nearest labels to a
            single prediction (e.g. ``mean`` for regression, ``mode`` for
            classification).

    Returns:
        A tuple ``(neighbors, prediction)`` where ``neighbors`` is a list of
        ``(distance, index)`` pairs sorted by increasing distance (ties are
        ordered by row index) and ``prediction`` is ``choice_fn`` applied to
        the labels of those rows.
    """
    # Measure the distance from the query to every row, remembering the
    # row's position so the label can be looked up afterwards.
    neighbor_distances_and_indices = [
        (distance_fn(example[:-1], query), index)
        for index, example in enumerate(data)
    ]

    # Nearest first, then keep only the k closest rows.
    k_nearest_distances_and_indices = sorted(neighbor_distances_and_indices)[:k]

    # The label is the LAST column, matching the ``example[:-1]`` feature
    # slice above (a fixed index of 1 is only right for two-column data).
    k_nearest_labels = [
        data[index][-1] for _, index in k_nearest_distances_and_indices
    ]

    return k_nearest_distances_and_indices, choice_fn(k_nearest_labels)


def mean(labels):
    """Return the arithmetic mean of ``labels``.

    Raises ``ZeroDivisionError`` when ``labels`` is empty.
    """
    total = sum(labels)
    count = len(labels)
    return total / count

def mode(labels):
    """Return the most frequent value in ``labels``.

    When several values share the highest count, the one seen first wins.
    Raises ``IndexError`` when ``labels`` is empty.
    """
    tally = Counter(labels)
    top_entries = tally.most_common(1)
    winner, _occurrences = top_entries[0]
    return winner


def euclidean_distance(point1, point2):
    """Return the Euclidean distance between two points.

    Only the coordinates present in ``point1`` are compared; ``point2`` must
    have at least as many coordinates, otherwise ``IndexError`` is raised.
    """
    squared_total = 0
    for axis, coordinate in enumerate(point1):
        difference = coordinate - point2[axis]
        squared_total = squared_total + math.pow(difference, 2)
    return math.sqrt(squared_total)


def main():
    """Run the KNN demo twice: once as regression, once as classification.

    For each run this prints the list of ``(distance, index)`` pairs of the
    three nearest rows, followed by the prediction made from their labels.
    """
    # ---- Regression data ------------------------------------------------
    # Column 0: height (inches)
    # Column 1: weight (pounds)
    height_weight_rows = [
        [65.75, 112.99],
        [71.52, 136.49],
        [69.40, 153.03],
        [68.22, 142.34],
        [67.79, 144.30],
        [68.70, 123.30],
        [69.80, 141.49],
        [70.01, 136.46],
        [67.90, 112.37],
        [66.49, 127.45],
    ]

    # Question: what is the weight for a height of 60 inches?
    height_query = [60]

    reg_neighbors, reg_estimate = knn(
        height_weight_rows,
        height_query,
        k=3,
        distance_fn=euclidean_distance,
        choice_fn=mean,
    )
    print(reg_neighbors)
    print(reg_estimate)

    # ---- Classification data --------------------------------------------
    # Column 0: age
    # Column 1: likes pineapple
    age_preference_rows = [
        [22, 1],
        [23, 1],
        [21, 1],
        [18, 1],
        [19, 1],
        [25, 0],
        [27, 0],
        [29, 0],
        [31, 0],
        [45, 0],
    ]

    # Question: would a 33-year-old want pineapple as a pizza topping?
    age_query = [33]

    clf_neighbors, clf_answer = knn(
        age_preference_rows,
        age_query,
        k=3,
        distance_fn=euclidean_distance,
        choice_fn=mode,
    )
    print(clf_neighbors)
    print(clf_answer)
    
# Run the regression/classification demo only when this code is executed
# directly (script or notebook cell), not when it is imported.
if __name__ == '__main__':
    main()

[(5.75, 0), (6.489999999999995, 9), (7.790000000000006, 4)]
128.24666666666667
[(2.0, 8), (4.0, 7), (6.0, 6)]
0


In [5]:
def recommend_movies(movie_query, k_recommendations,
                     data_path='./ml_data/movies_recommendation_data.csv'):
    """Return the movies whose feature vectors are closest to ``movie_query``.

    The CSV file is read with the ``csv`` module so quoted fields that contain
    commas (e.g. titles) stay in one column. The first line is treated as a
    header and skipped; blank lines are ignored.

    Args:
        movie_query: Numeric feature values of the movie to match against.
        k_recommendations: Number of movies to return.
        data_path: Location of the CSV file. Columns 0 and 1 are kept as text
            (callers read column 1 as the title); columns 2 onward are
            converted to ``float`` and handed to ``knn``, which compares all
            but the last of them with ``movie_query``.

    Returns:
        A list of raw CSV rows (lists of strings), nearest movie first.

    Raises:
        ValueError: If a column from index 2 onward is not numeric.
        OSError: If the file cannot be opened.
    """
    raw_movies_data = []
    # newline='' is what the csv module expects so that it can handle line
    # endings inside quoted fields itself.
    with open(data_path, 'r', newline='') as md:
        reader = csv.reader(md)

        # Drop the header row; the default keeps an empty file from raising.
        next(reader, None)

        for data_row in reader:
            # A blank line would otherwise become an empty feature vector with
            # distance 0 to every query and be ranked as the best match.
            if not any(field.strip() for field in data_row):
                continue
            raw_movies_data.append(data_row)

    # Shape the data for knn: keep only the numeric columns, as floats.
    movies_recommendation_data = [
        list(map(float, row[2:])) for row in raw_movies_data
    ]

    # Only the neighbour indices are needed, so the label reducer is a no-op.
    recommendation_indices, _ = knn(
        movies_recommendation_data, movie_query, k=k_recommendations,
        distance_fn=euclidean_distance, choice_fn=lambda x: None
    )

    # Map the neighbour indices back to the original (text) rows.
    return [raw_movies_data[index] for _, index in recommendation_indices]

if __name__ == '__main__':
    the_post = [7.2, 1, 1, 0, 0, 0, 0, 1, 0] # Feature data for "The Post"
    recommended_movies = recommend_movies(movie_query=the_post, k_recommendations=5)

    # Print the titles of the recommended movies
    for recommendation in recommended_movies:
        print(recommendation[1])

12 Years a Slave
Hacksaw Ridge
Queen of Katwe
The Wind Rises
A Beautiful Mind
