<a href="https://colab.research.google.com/github/rojong00/colab_test/blob/master/movielens_knn_ipynb%EC%9D%98_%EC%82%AC%EB%B3%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime so its files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# CSV locations on the mounted Drive: file1 is read as the ratings table, file2 as the movie table.
# NOTE(review): the folder is spelled 'My drive' here; Colab mounts usually expose
# 'MyDrive' or 'My Drive' -- confirm that this path actually resolves.
file1 = '/content/drive/My drive/ratings.csv'
file2 = '/content/drive/My drive/movies.csv'

In [None]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import time

In [None]:
# Read the ratings CSV (file1) and the movies CSV (file2) into DataFrames, in that order.
df_ratings, df_movies = (pd.read_csv(csv_path) for csv_path in (file1, file2))

In [None]:
# Long-format (movieId, userId, rating) table covering every movie in either input.
# The outer join keeps movies that have no ratings; their missing userId/rating
# cells are filled with 0 by the trailing fillna.
df = (
    df_ratings.drop(columns='timestamp')
    .merge(df_movies.drop(columns='genres'), how='outer', on='movieId')
    [['movieId', 'userId', 'rating']]
    .sort_values(by=['movieId'])
    .fillna(0)
)

In [None]:
# Genre vocabulary. The position of each label fixes its slot in the per-movie
# 0/1 genre vector built later in the notebook, so the order must not change.
genre_list = [
    'Adventure', 'Animation', 'Children', 'Comedy',
    'Fantasy', 'Romance', 'Drama', 'Action',
    'Crime', 'Thriller', 'Horror', 'Mystery',
    'Sci-Fi', 'IMAX', 'Documentary', 'War',
    'Musical', 'Western', 'Film-Noir', '(no genres listed)',
]

In [None]:
# Preview the first rows of the merged (movieId, userId, rating) table.
df.head()

Unnamed: 0,movieId,userId,rating
12440,1,1745.0,4.0
12262,1,1244.0,4.0
12261,1,1242.0,4.5
12260,1,1240.0,5.0
12259,1,1236.0,3.0


In [None]:
from scipy.sparse import csr_matrix
# Reshape the long ratings table into a movie-by-user grid: one row per movieId,
# one column per userId, each cell holding the rating (0 where there is none).
df_movie_features = (
    df_ratings
    .pivot(values='rating', index='movieId', columns='userId')
    .fillna(0)
)
# Wrap the grid's underlying array in a SciPy CSR sparse matrix.
mat_movie_features = csr_matrix(df_movie_features.values)

In [None]:
from sklearn.neighbors import NearestNeighbors
# Brute-force nearest-neighbour searcher using cosine distance, 20 neighbours
# per query, on all available CPU cores (n_jobs=-1).
model_knn = NearestNeighbors(
    n_neighbors=20,
    algorithm='brute',
    metric='cosine',
    n_jobs=-1,
)

In [None]:
# Per-movie rating statistics: the number of rating rows ('size') and their mean ('mean').
# The string reducers select pandas' built-in group-by implementations and keep the same
# column labels; recent pandas releases emit a FutureWarning for NumPy callables such as np.mean.
# NOTE(review): `df` comes from an outer merge followed by fillna(0), so a movie without any
# ratings appears as a single row with rating 0 and therefore gets size 1 / mean 0 here.
movieProperties = df.groupby('movieId').agg({'rating': ['size', 'mean']})

# One-column frame of rating counts, then min-max scaled to the [0, 1] range
# as a popularity score (the scaling is applied column-wise by `apply`).
movieNumRatings = pd.DataFrame(movieProperties['rating']['size'])
movieNormalizedNumRatings = movieNumRatings.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))

In [None]:
# Preview the first rows of the movie table (movieId, title, genres).
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Build movieDict: movieId -> (title, genre indicator vector, normalized rating count, mean rating).
# The two per-movie statistics are pulled out as Series once, so the loop does cheap
# scalar lookups instead of slicing whole DataFrames for every movie.
normalized_counts = movieNormalizedNumRatings['size']
mean_ratings = movieProperties['rating']['mean']

movieDict = {}
# Iterate over the columns by label rather than iterrows() with a positional slice,
# so the result does not depend on the column order of df_movies.
for raw_id, name, genre_field in zip(df_movies['movieId'], df_movies['title'], df_movies['genres']):
  movieID = int(raw_id)
  # Genres are stored as one '|'-separated string, e.g. 'Comedy|Drama|Romance'.
  genres = genre_field.split('|')
  # 1 where the movie carries the genre, 0 otherwise, following genre_list order.
  temp = [1 if gen in genres else 0 for gen in genre_list]
  movieDict[movieID] = (name, np.array(temp), normalized_counts.loc[movieID], mean_ratings.loc[movieID])

In [None]:
from scipy import spatial
 
# Distance between two movies: cosine distance of their genre vectors plus
# the absolute gap between their popularity scores.
def ComputeDistance(a, b):
    """Return the combined genre + popularity distance between two movie records.

    Each record is indexed positionally: element 1 is the genre indicator
    vector and element 2 is the popularity score.
    """
    genre_gap = spatial.distance.cosine(a[1], b[1])
    popularity_gap = abs(a[2] - b[2])
    return genre_gap + popularity_gap
 
# Sanity check: show the distance between the records for movieId 1 and movieId 4.
ComputeDistance(movieDict[1], movieDict[4])

1.4080668949792368

In [None]:
import operator
 
# Find the nearest neighbours of a movie
def getNeighbors(movieID, K):
    """Return the ids of up to K movies closest to `movieID`, nearest first.

    Distances come from ComputeDistance over every other entry of movieDict.
    If fewer than K other movies exist, all of them are returned instead of
    raising IndexError; a non-positive K yields an empty list.

    Raises:
        KeyError: if `movieID` is not a key of movieDict.
    """
    # Look the query movie up once rather than on every loop iteration.
    target = movieDict[movieID]
    distances = [
        (movie, ComputeDistance(target, candidate))
        for movie, candidate in movieDict.items()
        # The query movie is never its own neighbour.
        if movie != movieID
    ]
    # Ascending distance: the most similar movies come first.
    distances.sort(key=operator.itemgetter(1))
    # Slicing tolerates K larger than the list; max() keeps a negative K
    # returning [] rather than "all but the last |K|".
    return [movie for movie, _ in distances[:max(K, 0)]]
 
 
# Final recommendation
def recommend(movieID,K):
    """Print the record of `movieID`, its K nearest movies, and their average rating.

    Each neighbour is printed as "<title> <mean rating>". The closing average is
    taken over the neighbours actually returned by getNeighbors, so it stays
    correct if fewer than K come back, and is reported as nan (instead of
    raising ZeroDivisionError) when there are none. Nothing is returned.
    """
    print(movieDict[movieID], '\n')
    neighbors = getNeighbors(movieID, K)
    avgRating = 0
    for neighbor in neighbors:
        entry = movieDict[neighbor]
        # entry[0] is the title, entry[3] the movie's mean rating.
        avgRating += entry[3]
        print (entry[0] + " " + str(entry[3]))
    # Divide by the real neighbour count rather than the requested K.
    avgRating = avgRating / len(neighbors) if neighbors else float('nan')
    print("평균 Rating: ",avgRating)
 
# Demo: print the ten nearest neighbours of movieId 1 and their average rating.
recommend(1,10)

('Toy Story (1995)', array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.6867107636800962, 3.910761154855643) 

Shrek (2001) 3.7559456398640996
Monsters, Inc. (2001) 3.873502994011976
Aladdin (1992) 3.680957683741648
Toy Story 2 (1999) 3.825
Finding Nemo (2003) 3.8204379562043798
Lord of the Rings: The Fellowship of the Ring, The (2001) 4.052607502287283
Monty Python and the Holy Grail (1975) 4.166453265044814
Lord of the Rings: The Two Towers, The (2002) 4.002994011976048
Bug's Life, A (1998) 3.557446808510638
Incredibles, The (2004) 3.8423645320197046
평균 Rating:  3.8577710393660594
