In [1]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds
from datetime import datetime

import matplotlib.pyplot as plt 
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
df_ratings = pd.read_csv('ratings.csv')
df_movies = pd.read_csv('movies.csv')

# Matrix Factorization using SVD
## with Feature scaling using Standardization

In [3]:
def MatrixFactorizaion(df_ratings, df_movies):
    #df_ratings = pd.read_csv('ratings.csv')
    #df_movies = pd.read_csv('movies.csv')
    
    #df_ratings=df_ratings[:2000000]
    df_movie_features = df_ratings.pivot(
        index='userId',
        columns='movieId',
        values='rating'
    ).fillna(0)
    
    ## Data normalizaiion ##
    # pivot table을 matrix로 변환
    #R = df_movie_features.as_matrix()
    #R=df_movie_features.values
    R = df_movie_features.to_numpy()
    
    # 각 user들의 rating 평균을 구함
    user_ratings_mean = np.mean(R, axis = 1)
    # R_demeaned : 사용자-영화에 대해 사용자 평균 평점을 뺀 것.
    R_demeaned = R - user_ratings_mean.reshape(-1, 1)
    
    M_demeaned = pd.DataFrame(R_demeaned, columns = df_movie_features.columns)
    
    
    # U 행렬, sigma 행렬, V 전치 행렬을 반환.
    ############
    U, sigma, Vt = svds(M_demeaned, k = 50)
    # 0이 포함된 대칭행렬로 변환
    sigma = np.diag(sigma)
    
    # U, Sigma, Vt의 내적을 수행하면, 다시 원본 행렬로 복원 + 사용자 평균 rating을 적용 
    svd_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
    all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
    
    preds_df = pd.DataFrame(all_user_predicted_ratings, columns = df_movie_features.columns)
    
    return preds_df

In [4]:
preds_df = MatrixFactorizaion(df_ratings, df_movies)
preds_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,-0.054239,0.04513,-0.004835,-0.019817,-0.011284,0.041373,-0.007822,-0.017188,0.012246,0.03767,...,-0.005258,-0.005453,0.012369,-0.004991,-0.004639,-0.019055,0.021402,-0.006365,-0.006098,-0.004819
1,0.419835,1.40644,-0.188807,0.156658,0.268032,0.414698,0.052172,0.044728,-0.020198,2.220256,...,-0.005909,-0.003974,-0.012555,-0.003555,-0.002711,-0.071621,-0.016212,0.001047,-0.001468,-0.006577
2,1.345619,0.266505,-0.011962,0.012278,0.079508,0.09096,-0.122094,0.031327,-0.018023,0.141176,...,-0.002647,-0.002364,-0.010153,0.000277,-0.000116,-0.018063,-0.015761,0.010611,0.006792,-0.006357
3,1.133455,1.046982,0.141275,0.081841,-0.339675,-1.484659,-0.263096,-0.16975,-0.021862,1.611664,...,0.020805,0.00041,0.05604,-0.002817,-0.000767,0.159159,0.087519,-0.030854,-0.021279,0.048529
4,1.389578,1.466495,0.605557,-0.029647,0.72938,-0.118539,-0.026017,0.065577,-0.156655,0.307926,...,-0.007422,-0.01181,0.006644,-0.005159,-0.001249,-0.034658,0.016456,0.00171,-0.004166,-0.001864


# All user prediction system

In [5]:
def AllUserPrediction(preds_df, userID, df_movies, df_ratings, num_recommendations=5):
    df_ratings = pd.read_csv('ratings.csv')
    df_movies = pd.read_csv('movies.csv')

    user_row_number = userID - 1 # UserID starts at 1, not 0
    
    # 최종적으로 만든 pred_df에서 사용자 index에 따라 영화 데이터 정렬 -> 영화 평점이 높은 순으로 정렬
    sorted_user_predictions = preds_df.iloc[user_row_number].sort_values(ascending=False) # UserID starts at 1
    
    # 원본 평점 데이터에서 userId에 해당하는 데이터를 뽑아냄
    user_data = df_ratings[df_ratings.userId == (userID)]
    
    # 위에서 뽑은 user_data와 원본 영화 데이터를 합침
    user_full = (user_data.merge(df_movies, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False))
    user_full = user_data.merge(df_movies, on = 'movieId').sort_values(['rating'], ascending=False)
    
    # 원본 영화 데이터에서 사용자가 본 영화 데이터를 제외한 데이터를 추출
    recommendations = df_movies[~df_movies['movieId'].isin(user_full['movieId'])]
    # 사용자의 영화 평점이 높은 순으로 정렬된 데이터와 위 recommendations을 합친다. 
    recommendations = recommendations.merge( pd.DataFrame(sorted_user_predictions).reset_index(), on = 'movieId')
    # 컬럼 이름 바꾸고 정렬해서 return
    recommendations = recommendations.rename(columns = {user_row_number: 'Predictions'}).sort_values('Predictions', ascending = False).iloc[:num_recommendations, :]

    return recommendations

# 330번 user에게 추천해줄 movie list

In [6]:
predictions = AllUserPrediction(preds_df, 330, df_movies, df_ratings, 10)
predictions.head()

Unnamed: 0,movieId,title,genres,Predictions
246,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,2.190762
487,590,Dances with Wolves (1990),Adventure|Drama|Western,1.924099
294,356,Forrest Gump (1994),Comedy|Drama|Romance|War,1.73299
40,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1.579084
316,380,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller,1.345957


# Generate 'user - time(6hour term)' Matrix

In [7]:
def UserTimeMatrix(df_ratings):
    #df_ratings = pd.read_csv('ratings.csv')
    #df_movies = pd.read_csv('movies.csv')
    
    # original dataframe에서 moiveId, rating, drop,
    rd = df_ratings
    rd.drop('movieId', axis = 1, inplace = True)
    rd.drop('rating', axis = 1, inplace = True)
    # timestamp2datetime
    rd['Hour_6']=[datetime.fromtimestamp(x).strftime('%H') for x in rd['timestamp']]
    rd.drop('timestamp', axis = 1, inplace = True)
    rd = rd.reindex(columns=['Hour_6','userId'])
    # datetime 기준으로 정렬
    rd = rd.sort_values(by='Hour_6')
    
    # datetime2Hour_6term
    rd.loc[(rd['Hour_6']=="00") | (rd["Hour_6"]=="01") | 
           (rd["Hour_6"]=="02") | (rd["Hour_6"]=="03"),"Hour_6"] = "h1"
    rd.loc[(rd['Hour_6']=="04") | (rd["Hour_6"]=="05") | 
           (rd["Hour_6"]=="06") | (rd["Hour_6"]=="07"),"Hour_6"] = "h2"
    rd.loc[(rd['Hour_6']=="08") | (rd["Hour_6"]=="09") | 
           (rd["Hour_6"]=="10") | (rd["Hour_6"]=="11"),"Hour_6"] = "h3"
    rd.loc[(rd['Hour_6']=="12") | (rd["Hour_6"]=="13") | 
           (rd["Hour_6"]=="14") | (rd["Hour_6"]=="15"),"Hour_6"] = "h4"
    rd.loc[(rd['Hour_6']=="16") | (rd["Hour_6"]=="17") | 
           (rd["Hour_6"]=="18") | (rd["Hour_6"]=="19"),"Hour_6"] = "h5"
    rd.loc[(rd['Hour_6']=="20") | (rd["Hour_6"]=="21") | 
           (rd["Hour_6"]=="22") | (rd["Hour_6"]=="23"),"Hour_6"] = "h6"
    
    return rd

In [8]:
M1 = UserTimeMatrix(df_ratings)
M1.head()

Unnamed: 0,Hour_6,userId
75933,h1,528
60987,h1,442
60986,h1,442
60985,h1,442
60984,h1,442


# Generate 'user - prediction(top 5)' Matrix

In [9]:
def UserPredictionMatrix():
    df_ratings = pd.read_csv('ratings.csv')
    df_movies = pd.read_csv('movies.csv') 
    
    usern=max(df_ratings.userId)
    user_title = {}
    for i in range(1, usern+1):
        predictions = AllUserPrediction(preds_df, i, df_movies, df_ratings, 10)
        df = predictions[:5]
        title = df[['title']]
        #title.drop(index='userId')
        title.reset_index(drop=True)
        title.rename(columns = {"title": i}, inplace = True)
        dict_from_df = title.to_dict('list')
        user_title.update(dict_from_df)
    df = pd.DataFrame(user_title)
    df = pd.DataFrame(list(user_title.items()),columns =['userId','title'])
    
    return df

In [10]:
M2 = UserPredictionMatrix()
M2.head()

Unnamed: 0,userId,title
0,1,"[Star Trek II: The Wrath of Khan (1982), Rocky..."
1,2,"[True Lies (1994), Shawshank Redemption, The (..."
2,3,"[Matrix, The (1999), Lord of the Rings: The Fe..."
3,4,"[Beauty and the Beast (1991), Raising Arizona ..."
4,5,"[Men in Black (a.k.a. MIB) (1997), Sleepless i..."


In [11]:
def main(M1,M2):
    # Hour_6 변수 선언
    h1 = M1.loc[M1["Hour_6"] == "h1", :]  # h1 : 00 ~ 03
    h2 = M1.loc[M1["Hour_6"] == "h2", :]  # h2 : 04 ~ 07
    h3 = M1.loc[M1["Hour_6"] == "h3", :]  # h3 : 08 ~ 11
    h4 = M1.loc[M1["Hour_6"] == "h4", :]  # h4 : 12 ~ 15
    h5 = M1.loc[M1["Hour_6"] == "h5", :]  # h5 : 16  ~19
    h6 = M1.loc[M1["Hour_6"] == "h6", :]  # h6 : 20 ~ 23
    
    h1 = pd.merge(h1, M2, on="userId")
    h1 = M2.drop_duplicates(["userId"])
    
    lst = ["Toy Story (1995)"] # filter list
    m = h1["title"].apply(lambda s: len(set(s) & set(lst)) > 0)
    
    return h1[m]

In [12]:
pd.set_option('display.max.colwidth', 200)
main(M1,M2)

Unnamed: 0,userId,title
2,3,"[Matrix, The (1999), Lord of the Rings: The Fellowship of the Ring, The (2001), Lord of the Rings: The Two Towers, The (2002), Star Wars: Episode IV - A New Hope (1977), Toy Story (1995)]"
13,14,"[Star Wars: Episode IV - A New Hope (1977), Who Framed Roger Rabbit? (1988), Toy Story (1995), Star Wars: Episode VI - Return of the Jedi (1983), Shakespeare in Love (1998)]"
17,18,"[Willy Wonka & the Chocolate Factory (1971), Toy Story (1995), Star Trek: First Contact (1996), Grumpier Old Men (1995), Jerry Maguire (1996)]"
24,25,"[Rock, The (1996), Twister (1996), Mission: Impossible (1996), Toy Story (1995), Mr. Holland's Opus (1995)]"
41,42,"[Silence of the Lambs, The (1991), Back to the Future (1985), Toy Story (1995), Gladiator (2000), Terminator, The (1984)]"
60,61,"[Toy Story (1995), Good Will Hunting (1997), Eternal Sunshine of the Spotless Mind (2004), Truman Show, The (1998), Apollo 13 (1995)]"
136,137,"[Jurassic Park (1993), Forrest Gump (1994), Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981), Toy Story (1995), Princess Bride, The (1987)]"
158,159,"[One Flew Over the Cuckoo's Nest (1975), Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964), Toy Story (1995), Dances with Wolves (1990), Twelve Monkeys (a.k.a. 12 Monkeys..."
165,166,"[Shrek (2001), Sixth Sense, The (1999), Toy Story (1995), Finding Nemo (2003), Incredibles, The (2004)]"
189,190,"[Fight Club (1999), Blair Witch Project, The (1999), Toy Story (1995), Bug's Life, A (1998), Casino (1995)]"
