In [13]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

#### Data load for feature scaling

In [129]:
df_ratings = pd.read_csv('data/ml-latest-small/ratings.csv')
df_movies = pd.read_csv('data/ml-latest-small/movies.csv')

## Feature scaling using Standardization
 Standardization이 (X - X의 평균값) / (X의 표준편차) 이렇게 하는건데 여기서는 표준편차를 안 나눴네..? 나중에 해보자

In [132]:
def UserMoiveMatirx_demeaned(df_ratings, df_movies):
    df_ratings=df_ratings[:2000000]
    df_movie_features = df_ratings.pivot(
        index='userId',
        columns='movieId',
        values='rating'
    ).fillna(0)
    
    ## Data normalizaiion ##
    # pivot table을 matrix로 변환
    R = df_movie_features.as_matrix()
    # 각 user들의 rating 평균을 구함
    user_ratings_mean = np.mean(R, axis = 1)
    # R_demeaned : 사용자-영화에 대해 사용자 평균 평점을 뺀 것.
    R_demeaned = R - user_ratings_mean.reshape(-1, 1)
    
    M_demeaned = pd.DataFrame(R_demeaned, columns = df_movie_features.columns)
    
    return M_demeaned

In [134]:
Md = UserMoiveMatirx_demeaned(df_ratings, df_movies)
Md

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,...,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625
1,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,3.970770,...,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230
2,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,...,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075
3,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,3.902162,...,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838
4,-0.043128,-0.043128,3.956872,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,...,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128
5,-0.015828,-0.015828,-0.015828,-0.015828,-0.015828,-0.015828,-0.015828,-0.015828,-0.015828,-0.015828,...,-0.015828,-0.015828,-0.015828,-0.015828,-0.015828,-0.015828,-0.015828,-0.015828,-0.015828,-0.015828
6,2.966358,-0.033642,-0.033642,-0.033642,-0.033642,-0.033642,-0.033642,-0.033642,-0.033642,2.966358,...,-0.033642,-0.033642,-0.033642,-0.033642,-0.033642,-0.033642,-0.033642,-0.033642,-0.033642,-0.033642
7,-0.049471,-0.049471,-0.049471,-0.049471,-0.049471,-0.049471,-0.049471,-0.049471,-0.049471,-0.049471,...,-0.049471,-0.049471,-0.049471,-0.049471,-0.049471,-0.049471,-0.049471,-0.049471,-0.049471,-0.049471
8,3.981359,-0.018641,-0.018641,-0.018641,-0.018641,-0.018641,-0.018641,-0.018641,-0.018641,-0.018641,...,-0.018641,-0.018641,-0.018641,-0.018641,-0.018641,-0.018641,-0.018641,-0.018641,-0.018641,-0.018641
9,-0.018751,-0.018751,-0.018751,-0.018751,-0.018751,-0.018751,-0.018751,-0.018751,-0.018751,-0.018751,...,-0.018751,-0.018751,-0.018751,-0.018751,-0.018751,-0.018751,-0.018751,-0.018751,-0.018751,-0.018751


#### Data load for matrix factorization

In [154]:
df_ratings = pd.read_csv('data/ml-latest-small/ratings.csv')
df_movies = pd.read_csv('data/ml-latest-small/movies.csv')

## Matrix Factorization using SVD

In [155]:
def MatrixFactorizaion(R_demeaned):
    # U 행렬, sigma 행렬, V 전치 행렬을 반환.
    U, sigma, Vt = svds(R_demeaned, k = 50)
    # 0이 포함된 대칭행렬로 변환
    sigma = np.diag(sigma)
    
    # U, Sigma, Vt의 내적을 수행하면, 다시 원본 행렬로 복원 + 사용자 평균 rating을 적용 
    svd_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
    all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
    
    preds_df = pd.DataFrame(all_user_predicted_ratings, columns = df_movie_features.columns)
    
    return preds_df

In [156]:
preds_df = MatrixFactorizaion(R_demeaned)
preds_df

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,-0.054239,0.045130,-0.004835,-0.019817,-0.011284,0.041373,-0.007822,-0.017188,0.012246,0.037670,...,-0.005258,-0.005453,0.012369,-0.004991,-0.004639,-0.019055,0.021402,-0.006365,-0.006098,-0.004819
1,0.419835,1.406440,-0.188807,0.156658,0.268032,0.414698,0.052172,0.044728,-0.020198,2.220256,...,-0.005909,-0.003974,-0.012555,-0.003555,-0.002711,-0.071621,-0.016212,0.001047,-0.001468,-0.006577
2,1.345619,0.266505,-0.011962,0.012278,0.079508,0.090960,-0.122094,0.031327,-0.018023,0.141176,...,-0.002647,-0.002364,-0.010153,0.000277,-0.000116,-0.018063,-0.015761,0.010611,0.006792,-0.006357
3,1.133455,1.046982,0.141275,0.081841,-0.339675,-1.484659,-0.263096,-0.169750,-0.021862,1.611664,...,0.020805,0.000410,0.056040,-0.002817,-0.000767,0.159159,0.087519,-0.030854,-0.021279,0.048529
4,1.389578,1.466495,0.605557,-0.029647,0.729380,-0.118539,-0.026017,0.065577,-0.156655,0.307926,...,-0.007422,-0.011810,0.006644,-0.005159,-0.001249,-0.034658,0.016456,0.001710,-0.004166,-0.001864
5,0.351379,0.147783,-0.226190,0.024425,-0.028854,0.052569,-0.095954,-0.013454,-0.050000,0.020672,...,-0.008031,-0.004661,0.003584,-0.004411,-0.003628,0.006245,0.008364,-0.013672,-0.010594,-0.010085
6,2.710704,0.684846,0.702097,0.141979,0.025992,0.079456,0.386194,-0.038723,0.126546,1.075512,...,0.008354,0.003333,0.004745,0.004281,0.003670,0.064590,0.004365,0.016075,0.011847,0.011205
7,1.286794,-0.263419,-0.147525,0.056464,0.612119,0.048448,0.055058,0.004062,-0.090489,0.294803,...,0.009358,0.012997,0.028965,0.009099,0.006378,-0.012434,0.036176,0.020553,0.018148,0.004175
8,1.803813,-0.024808,-0.113694,-0.026134,0.188041,0.660766,0.039552,-0.021128,-0.016371,-0.331059,...,0.007091,0.002046,0.014177,-0.000965,-0.003194,-0.046815,0.019518,0.012264,0.008756,0.010689
9,0.498709,-0.216453,-0.262796,0.001221,-0.233903,-0.085897,-0.181951,-0.047679,-0.105678,0.104707,...,0.016209,0.003547,0.000557,-0.004506,-0.009630,0.042599,-0.002036,0.007153,0.006589,0.026675


#### Data load for M_recommend_movies

In [186]:
df_ratings = pd.read_csv('data/ml-latest-small/ratings.csv')
df_movies = pd.read_csv('data/ml-latest-small/movies.csv')

## All user prediction system

In [187]:
def AllUserPrediction(preds_df, userID, df_movies, df_ratings, num_recommendations=5):
    
    user_row_number = userID - 1 # UserID starts at 1, not 0
    
    # 최종적으로 만든 pred_df에서 사용자 index에 따라 영화 데이터 정렬 -> 영화 평점이 높은 순으로 정렬
    sorted_user_predictions = preds_df.iloc[user_row_number].sort_values(ascending=False) # UserID starts at 1
    
    # 원본 평점 데이터에서 userId에 해당하는 데이터를 뽑아냄
    user_data = df_ratings[df_ratings.userId == (userID)]
    
    # 위에서 뽑은 user_data와 원본 영화 데이터를 합침
    user_full = (user_data.merge(df_movies, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False))
    user_full = user_data.merge(df_movies, on = 'movieId').sort_values(['rating'], ascending=False)
    
    # 원본 영화 데이터에서 사용자가 본 영화 데이터를 제외한 데이터를 추출
    recommendations = df_movies[~df_movies['movieId'].isin(user_full['movieId'])]
    # 사용자의 영화 평점이 높은 순으로 정렬된 데이터와 위 recommendations을 합친다. 
    recommendations = recommendations.merge( pd.DataFrame(sorted_user_predictions).reset_index(), on = 'movieId')
    # 컬럼 이름 바꾸고 정렬해서 return
    recommendations = recommendations.rename(columns = {user_row_number: 'Predictions'}).sort_values('Predictions', ascending = False).iloc[:num_recommendations, :]

    return recommendations

#### 330번 user에게 추천해줄 movie list

In [222]:
predictions = AllUserPrediction(preds_df, 330, df_movies, df_ratings, 10)
predictions

Unnamed: 0,movieId,title,genres,Predictions
246,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,2.190762
487,590,Dances with Wolves (1990),Adventure|Drama|Western,1.924099
294,356,Forrest Gump (1994),Comedy|Drama|Romance|War,1.73299
40,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1.579084
316,380,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller,1.345957
1977,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,1.340938
421,509,"Piano, The (1993)",Drama|Romance,1.334513
313,377,Speed (1994),Action|Romance|Thriller,1.208254
28,34,Babe (1995),Children|Drama,1.189291
497,608,Fargo (1996),Comedy|Crime|Drama|Thriller,1.186194


#### Data load for M1

In [82]:
df_ratings = pd.read_csv('data/ml-latest-small/ratings.csv')
df_movies = pd.read_csv('data/ml-latest-small/movies.csv')

In [78]:
df_ratings.shape

(100004, 4)

## Generate 'user - time(6hour term)' Matrix 

In [83]:
def UserTimeMatrix(df_ratings):
    
    # original dataframe에서 moiveId, rating, drop,
    rd = df_ratings
    rd.drop('movieId', axis = 1, inplace = True)
    rd.drop('rating', axis = 1, inplace = True)
    # timestamp2datetime
    rd['Hour_6']=[datetime.fromtimestamp(x).strftime('%H') for x in rd['timestamp']]
    rd.drop('timestamp', axis = 1, inplace = True)
    rd = rd.reindex(columns=['Hour_6','userId'])
    # datetime 기준으로 정렬
    rd = rd.sort_values(by='Hour_6')
    
    # datetime2Hour_6term
    rd.loc[(rd['Hour_6']=="00") | (rd["Hour_6"]=="01") | 
           (rd["Hour_6"]=="02") | (rd["Hour_6"]=="03"),"Hour_6"] = "h1"
    rd.loc[(rd['Hour_6']=="04") | (rd["Hour_6"]=="05") | 
           (rd["Hour_6"]=="06") | (rd["Hour_6"]=="07"),"Hour_6"] = "h2"
    rd.loc[(rd['Hour_6']=="08") | (rd["Hour_6"]=="09") | 
           (rd["Hour_6"]=="10") | (rd["Hour_6"]=="11"),"Hour_6"] = "h3"
    rd.loc[(rd['Hour_6']=="12") | (rd["Hour_6"]=="13") | 
           (rd["Hour_6"]=="14") | (rd["Hour_6"]=="15"),"Hour_6"] = "h4"
    rd.loc[(rd['Hour_6']=="16") | (rd["Hour_6"]=="17") | 
           (rd["Hour_6"]=="18") | (rd["Hour_6"]=="19"),"Hour_6"] = "h5"
    rd.loc[(rd['Hour_6']=="20") | (rd["Hour_6"]=="21") | 
           (rd["Hour_6"]=="22") | (rd["Hour_6"]=="23"),"Hour_6"] = "h6"
    
   # Hour_6 변수 선언
    h1 = M1.loc[M1["Hour_6"] == "h1", :]  # h1 : 00 ~ 03
    h2 = M1.loc[M1["Hour_6"] == "h2", :]  # h2 : 04 ~ 07
    h3 = M1.loc[M1["Hour_6"] == "h3", :]  # h3 : 08 ~ 11
    h4 = M1.loc[M1["Hour_6"] == "h4", :]  # h4 : 12 ~ 15
    h5 = M1.loc[M1["Hour_6"] == "h5", :]  # h5 : 16  ~19
    h6 = M1.loc[M1["Hour_6"] == "h6", :]  # h6 : 20 ~ 23
    
    return rd

In [84]:
M1 = UserTimeMatrix(df_ratings)

In [85]:
M1

Unnamed: 0,Hour_6,userId
100003,h1,671
43614,h1,311
85514,h1,574
85513,h1,574
85512,h1,574
41699,h1,299
52319,h1,382
85507,h1,574
17093,h1,111
85506,h1,574


In [93]:
print("h1 =", h1.shape, "\nh2 =", h2.shape, "\nh3 =", h3.shape, "\nh4=", h4.shape, "\nh5 =", h5.shape,  "\nh6 =", h6.shape)

h1 = (19018, 2) 
h2 = (24608, 2) 
h3 = (16910, 2) 
h4= (15725, 2) 
h5 = (11413, 2) 
h6 = (12330, 2)


#### Data load for M2

In [221]:
df_ratings = pd.read_csv('data/ml-latest-small/ratings.csv')
df_movies = pd.read_csv('data/ml-latest-small/movies.csv')

## Generate 'user - prediction(top 5)' Matrix

In [236]:
def UserPredictionMatrix(predictions):
    pred5 = predictions[:5]
    title = pred5[['title']]
    title.reset_index(drop=True)
    title.rename(columns = {"title": 330}, inplace = True)
    dict_from_df = title.to_dict('list')
    
    user_title = {}
    for i in range(1, 672):
        predictions = AllUserPrediction(preds_df, i, df_movies, df_ratings, 10)
        df = predictions[:5]
        title = df[['title']]
        #title.drop(index='userId')
        title.reset_index(drop=True)
        title.rename(columns = {"title": i}, inplace = True)
        dict_from_df = title.to_dict('list')
        user_title.update(dict_from_df)
        df = pd.DataFrame(user_title)
        df = pd.DataFrame(list(user_title.items()),columns =['userId','title'])
    
    return df

In [237]:
M2 = UserPredictionMatrix(predictions)

In [238]:
M2

Unnamed: 0,userId,title
0,1,"[Star Trek II: The Wrath of Khan (1982), Rocky..."
1,2,"[True Lies (1994), Shawshank Redemption, The (..."
2,3,"[Matrix, The (1999), Lord of the Rings: The Fe..."
3,4,"[Beauty and the Beast (1991), Raising Arizona ..."
4,5,"[Men in Black (a.k.a. MIB) (1997), Sleepless i..."
5,6,[Lord of the Rings: The Fellowship of the Ring...
6,7,"[Fugitive, The (1993), Aliens (1986), Men in B..."
7,8,"[Shrek (2001), Pirates of the Caribbean: The C..."
8,9,"[Pulp Fiction (1994), Usual Suspects, The (199..."
9,10,"[Star Wars: Episode IV - A New Hope (1977), Pu..."


-----

In [287]:
_h1 = pd.merge(h1, M2, on="userId")
_h1 = df.drop_duplicates(["userId"])
_h1.head()

Unnamed: 0,userId,title
0,1,"[Star Trek II: The Wrath of Khan (1982), Rocky..."
1,2,"[True Lies (1994), Shawshank Redemption, The (..."
2,3,"[Matrix, The (1999), Lord of the Rings: The Fe..."
3,4,"[Beauty and the Beast (1991), Raising Arizona ..."
4,5,"[Men in Black (a.k.a. MIB) (1997), Sleepless i..."


### 00 ~ 04시간에 자주 시청하는 유저들 중  "Toy Story (1995)" 와 비슷한 취향을 가지고 있어(predict) 초대장을 보낼 유저들

In [290]:
lst = ["Toy Story (1995)"] # filter list
m = _h1["title"].apply(lambda s: len(set(s) & set(lst)) > 0)
_h1[m]

Unnamed: 0,userId,title
2,3,"[Matrix, The (1999), Lord of the Rings: The Fe..."
13,14,"[Star Wars: Episode IV - A New Hope (1977), Wh..."
17,18,"[Willy Wonka & the Chocolate Factory (1971), T..."
24,25,"[Rock, The (1996), Twister (1996), Mission: Im..."
41,42,"[Silence of the Lambs, The (1991), Back to the..."
60,61,"[Toy Story (1995), Good Will Hunting (1997), E..."
136,137,"[Jurassic Park (1993), Forrest Gump (1994), Ra..."
158,159,"[One Flew Over the Cuckoo's Nest (1975), Dr. S..."
165,166,"[Shrek (2001), Sixth Sense, The (1999), Toy St..."
189,190,"[Fight Club (1999), Blair Witch Project, The (..."
