In [1]:
import numpy as np
import codecs, operator, sys, os
import pandas as pd
from numpy import linalg as LA

In [10]:
ratings_file = './res/small/ratings.csv'
movies_file = './res/small/movieInfo.csv'
movies_pickle = './res/pickles/movieInfo.pickle'
num_features = 8

In [11]:
def makeMovieDF(movies_file):
    #df = pd.read_csv(movies_file)
    
    df = pd.read_pickle(movies_file)
    movieDict = {}
    length = len(df['movieId'])
    for i in range(length):
        movieDict[int(df['movieId'][i])] = i
    return movieDict
    #print(df['movieId'].to_dict())

In [12]:
movieIdDict = makeMovieDF(movies_pickle)
#makeMovieDF(movies_pickle)

In [13]:
def makeNPArray(ratings_file, movieIdDict):
    num_users = 500
    num_movies = 3883
    
    userMovieMatrix = np.zeros( shape = (num_users, num_movies) )
    
    df = pd.read_csv(ratings_file)
    length = len(df['rating'])
    
    for i in range(length):
        user = int(df['userId'][i])
        movie = int(df['movieId'][i])
        rating = float(df['rating'][i])
        userMovieMatrix[user-1, movieIdDict[movie]] = rating
    
    return userMovieMatrix

In [14]:
userMovieRatingMatrix = makeNPArray(ratings_file, movieIdDict)

In [8]:
def matrix_factorization(X,P,Q,K,steps,alpha,beta):
    Q = Q.T
    for step in range(steps):
        print(step)
        #for each user
        for i in range(X.shape[0]):
            #for each item/movie
            for j in range(X.shape[1]):
                if X[i][j] > 0:
                    #calculate the error of the element
                    eij = X[i][j] - np.dot(P[i,:],Q[:,j])
                    #second norm of P and Q for regularilization
                    sum_of_norms = 0
                    for k in range(K):
                        sum_of_norms += LA.norm(P[:,k]) + LA.norm(Q[k,:])
                    #added regularized term to the error
                    #sum_of_norms += LA.norm(P) + LA.norm(Q)
                    #print sum_of_norms
                    eij += ((beta/2) * sum_of_norms)
                    #print eij
                    #compute the gradient from the error
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * ( 2 * eij * Q[k][j] - (beta * P[i][k]))
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - (beta * Q[k][j]))

        #compute total error
        error = 0
        num = 0
        #for each user
        for i in range(X.shape[0]):
            #for each item
            for j in range(X.shape[1]):
                if X[i][j] > 0:
                    error += np.power(X[i][j] - np.dot(P[i,:],Q[:,j]),2)
                    num+=1
        print(error, error/num)
        if error < 0.001:
            break
    return P, Q.T

In [35]:
def main(X,K):
    #no of users
    N= X.shape[0]
    #no of movies
    M = X.shape[1]
    #P: an initial matrix of dimension N x K, where is n is no of users \
    # and k is hidden latent features
    P = np.random.rand(N,K)
    #Q : an initial matrix of dimension M x K, where M is no of movies  \
    # and K is hidden latent features
    Q = np.random.rand(M,K)
    #steps : the maximum number of steps to perform the optimisation,   \
    # hardcoding the values
    #alpha : the learning rate, hardcoding the values
    #beta  : the regularization parameter, hardcoding the values
    steps = 5000
    alpha = 0.001
    beta = float(0.05)
    estimated_P, estimated_Q = matrix_factorization(X,P,Q,K,steps,alpha,beta)
    #Predicted numpy array of users and movie ratings
    modeled_X = np.dot(estimated_P,estimated_Q.T)
    np.savetxt('mf_result.txt', modeled_X, delimiter=',')

In [16]:
num_users = userMovieRatingMatrix.shape[0]
num_movies = userMovieRatingMatrix.shape[1]
#userFeatureMat = np.zeros(shape=(num_users, num_features))
userFeatureMat = np.random.rand(num_users, num_features)
#movieFeatureMat = np.zeros(shape=(num_movies, num_features))
movieFeatureMat = np.random.rand(num_movies, num_features)
#userFeatureMat += 2
#movieFeatureMat += 1.2
steps = 10
alpha = 0.0005   # Learning Rate
beta = 0.01    # Regularisation parameter

#print(userFeatureMat, movieFeatureMat)

In [34]:
estimateP, estimateQ = matrix_factorization(userMovieRatingMatrix, userFeatureMat, movieFeatureMat, num_features, steps, alpha, beta)

0


859611.85226
1


388543.85958
2


314373.807147
3


310516.918354
4


313104.387309
5


315080.837746
6


316935.845461
7


318956.219777
8


321137.231264
9


323429.345244


In [35]:
estimateP

array([[ 0.54649705,  0.74024534,  0.86167009, ...,  1.1050368 ,
         1.12608817,  0.70832583],
       [ 1.07955233,  0.96727255,  1.15056881, ...,  0.52789258,
         0.79763495,  0.3586398 ],
       [ 0.56543662,  1.02861268,  0.39897508, ...,  0.71653602,
         0.82991141,  1.16407388],
       ..., 
       [ 0.44140189,  0.79461946,  0.97953711, ...,  0.83819181,
         0.76105779,  0.41689438],
       [ 0.88841874,  0.77424878,  0.90841306, ...,  0.6823243 ,
         0.7540109 ,  0.64647482],
       [ 0.63659668,  0.51263531,  0.79262265, ...,  1.01976551,
         0.47258844,  0.95807274]])

In [36]:
estimateQ

array([[ 0.47287855,  0.61726183,  0.86590843, ...,  0.68338419,
         0.86631641,  0.42488132],
       [ 0.48581313,  0.24827958,  0.7728637 , ...,  0.29908098,
         0.2972337 ,  0.95659145],
       [ 0.72199455,  0.21032063,  1.03920081, ...,  0.84354828,
         0.33838513,  0.38205916],
       ..., 
       [ 0.75861156,  0.94540718,  0.42283614, ...,  0.81372962,
         0.34979636,  0.4008655 ],
       [ 0.89267317,  0.14601466,  0.19114451, ...,  0.84835715,
         0.59798486,  0.86710714],
       [ 0.38983834,  0.69276953,  0.83515621, ...,  0.41398026,
         0.573996  ,  0.93865979]])

In [37]:
ep = pd.DataFrame(estimateP)
eq = pd.DataFrame(estimateQ)

In [39]:
ep.to_csv('res/small/estimateP.csv')
eq.to_csv('res/small/estimateQ.csv')

In [320]:
model = np.dot(estimateP, estimateQ.T)

In [321]:
model

array([[ 5.09493251,  4.17105576,  4.11653395, ...,  3.81665939,
         4.85730177,  4.62776334],
       [ 4.71718558,  3.51228907,  3.59094142, ...,  4.14188096,
         4.6868414 ,  4.5213138 ],
       [ 4.5875312 ,  4.03868586,  3.79269198, ...,  4.3857373 ,
         5.17371226,  4.5510186 ],
       ..., 
       [ 4.34935281,  3.81915111,  3.38191325, ...,  3.68129748,
         4.50592731,  4.11652327],
       [ 5.00295859,  4.11305461,  4.17190253, ...,  4.40248939,
         5.09790512,  4.77951244],
       [ 4.52207949,  3.28528552,  3.85323358, ...,  3.43176193,
         4.32942977,  4.18671814]])

In [322]:
modelDF = pd.DataFrame(model)

In [16]:
modelDF.to_csv('res/small/model_2.csv')

NameError: name 'modelDF' is not defined

In [17]:
model_file = 'res/small/model_2.csv'
ratings_file = './res/small/ratings.csv'
movies_file = './res/small/movieInfo.csv'
movies_pickle = './res/pickles/movieInfo.pickle'

num_users = 500
num_movies = 3883

In [18]:
def makeMovieIdAndTitleDF(movies_file):
    #df = pd.read_csv(movies_file)
    df = pd.read_pickle(movies_file)
    movieIdDict = {}
    movieTitleDict = {}
    length = len(df['movieId'])
    for i in range(length):
        movieIdDict[int(df['movieId'][i])] = i
        movieTitleDict[int(df['movieId'][i])] = df['movieTitle'][i]
    return movieIdDict, movieTitleDict

In [19]:
def getUnratedMovies(ratings_file, movieIdDict, num_users, num_movies):
    userMovieMatrix = np.zeros( shape = (num_users, num_movies) )
    
    df = pd.read_csv(ratings_file)
    length = len(df['rating'])
    
    for i in range(length):
        user = int(df['userId'][i])
        movie = int(df['movieId'][i])
        rating = float(df['rating'][i])
        userMovieMatrix[user-1, movieIdDict[movie]] = rating
    
    unratedMovieIdDict = {}
    
    for i in range(userMovieMatrix.shape[0]):
        unratedMovieIds = np.nonzero( userMovieMatrix[i]==0 )
        #print(unratedMovieIds)
        unratedMovieIds = list(unratedMovieIds[0])
        #print(unratedMovieIds)
        unratedMovieIds = map(lambda x: x+1,unratedMovieIds)
        #print(unratedMovieIds)
        unratedMovieIdDict[i+1] = unratedMovieIds
    
    return unratedMovieIdDict

In [20]:
unrated = getUnratedMovies(ratings_file, movieIdDict, num_users, num_movies)

In [243]:
for i in unrated[1]:
    print(i)

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
2


794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
904
905
906
907
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1011
1012
1013
1014
1015
1018
1019
1020
1021
1022
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
10

2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941


In [21]:
def getPredictedNPArray(model_file, num_users, num_movies):
    userMovieMatrix = np.zeros( shape = (num_users, num_movies) )
    
    df = pd.read_csv(model_file)
    #length = len(df[0])

    for i in range(num_users):
        rating = df.loc[i]
        rating = [k for k in rating[1:]]
        for mId in range(num_movies):
            userMovieMatrix[i][mId] = rating[mId]
    
    return userMovieMatrix

In [230]:
predicted_ratings = getPredictedNPArray(model_file, num_users, num_movies)

In [22]:
currentUser = 10

In [23]:
def recommend(ratings_file,model_file,movies_file,user):
    num_users = 500
    num_movies = 3883
    
    movieIdDict, movieTitleDict = makeMovieIdAndTitleDF(movies_file)
    predicted_ratings = getPredictedNPArray(model_file, num_users, num_movies)
    unratedDict = getUnratedMovies(ratings_file, movieIdDict, num_users, num_movies)
    
    revMovieIdDict = {}
    
    for key, val in movieIdDict.items():
        revMovieIdDict[val] = key
    
    possibleRatings = {}
    
    unratedMovies = unratedDict[user]
    
    for movie in unratedMovies:
        possibleRatings[int(movie)] = predicted_ratings[user-1][int(movie)-1]
    
    sortedMovies = sorted(possibleRatings.items(), key=operator.itemgetter(1), reverse=True)
    
    num_recommend = 10
    
    for i in range(num_recommend):
        mId, rating = sortedMovies[i]
        movieId = revMovieIdDict[mId]
        print(movieTitleDict[movieId], rating)

In [24]:
#recommend(ratings_file, model_file, movies_pickle, currentUser)
recommend(ratings_file, model_file, movies_pickle, 20)

Bringing Up Baby (1938) 5.87215974371
Best in Show (2000) 5.81376762904
Dersu Uzala (1974) 5.75430752798
It Came from Hollywood (1982) 5.66390519613
Koyaanisqatsi (1983) 5.63220785872
Dear Jesse (1997) 5.6214102747
Orgazmo (1997) 5.61198983546
Hippie Revolution, The (1996) 5.59651037187
Roseanna's Grave (For Roseanna) (1997) 5.59330594787
JLG/JLG - autoportrait de décembre (1994) 5.57364059641


In [51]:
#ep = pd.read_csv('res/small/estimateP.csv').as_matrix()
#eq = pd.read_csv('res/small/estimateQ.csv').as_matrix()

ep = np.zeros()

In [85]:
#ep = ep[:,:]
#print(ep.shape[0], ep.shape[1])
ep

array([[ 0.5483045 ,  1.14182126,  0.28525745, ...,  0.94351605,
         0.89005035,  0.27526693],
       [ 1.16603597,  0.79442467,  1.15363245, ...,  0.64118483,
         0.03504948,  0.18914346],
       [ 0.51457882,  1.02177654,  0.18303403, ...,  0.88197035,
         0.42630002,  0.53568213],
       ..., 
       [ 0.09149655,  0.14278155,  0.63078413, ...,  0.6125388 ,
         0.79939824,  0.5452111 ],
       [ 0.32786595,  1.20476316,  0.41310296, ...,  0.85137918,
         0.71407236,  0.27286768],
       [-0.33529873,  0.67974395,  1.41461795, ...,  1.13425754,
         0.29724092,  1.13684847]])

In [84]:
#eq = eq[:,:]
#print(eq.shape[0], eq.shape[1])
eq

array([[ 0.93324093,  1.05069367,  0.26045974, ...,  1.01150577,
         0.83843392,  0.96890505],
       [ 1.00095718,  1.18852354,  0.34263049, ...,  0.74287968,
         0.40142949,  1.72696739],
       [ 1.23127553,  0.09465379,  0.7881101 , ...,  1.10413655,
         0.6779749 ,  0.28691316],
       ..., 
       [ 1.11161944,  1.30197156,  0.9494499 , ...,  1.25504028,
         0.70095536,  0.8146347 ],
       [ 1.41717182,  0.53226005,  0.72919692, ...,  1.25443313,
         1.12957211,  1.39218297],
       [ 0.19484237,  1.11581822,  0.41363897, ...,  1.0765235 ,
         1.06169444,  1.40743687]])

In [86]:
estimateP, estimateQ = matrix_factorization(userMovieRatingMatrix, ep, eq, num_features, 50, 0.0002, beta/50)

0


46975.0452929 0.635906448984
1


46941.9330571 0.63545820494
2


46908.8923222 0.635010928811
3


46875.9233989 0.634564624804
4


46843.0265885 0.634119296998
5


46810.2021832 0.633674949346
6


46777.4504657 0.63323158568
7


46744.7717101 0.632789209704
8


46712.1661809 0.632347825005
9


46679.6341343 0.631907435046
10


46647.1758172 0.631468043172
11


46614.7914678 0.631029652608
12


46582.4813158 0.630592266462
13


46550.2455822 0.630155887726
14


46518.0844795 0.629720519277
15


46485.9982119 0.629286163879
16


46453.9869752 0.628852824182
17


46422.0509569 0.628420502726
18


46390.1903365 0.62798920194
19


46358.4052854 0.627558924144
20


46326.6959673 0.627129671553
21


46295.0625376 0.626701446272
22


46263.5051444 0.626274250306
23


46232.0239279 0.625848085554
24


46200.619021 0.625422953811
25


46169.2905488 0.624998856774
26


46138.0386294 0.624575796041
27


46106.8633734 0.624153773111
28


46075.7648846 0.623732789384
29


46044.7432594 0.623312846169
30


46013.7985873 0.622893944678
31


45982.9309513 0.622476086032
32


45952.1404273 0.62205927126
33


45921.4270847 0.621643501302
34


45890.7909863 0.621228777008
35


45860.2321887 0.620815099142
36


45829.7507419 0.620402468382
37


45799.3466897 0.619990885323
38


45769.02007 0.619580350476
39


45738.7709144 0.619170864268
40


45708.5992487 0.618762427051
41


45678.5050928 0.618355039093
42


45648.4884611 0.617948700587
43


45618.549362 0.61754341165
44


45588.6877987 0.617139172324
45


45558.9037688 0.616735982575
46


45529.1972646 0.616333842301
47


45499.5682731 0.615932751325
48


45470.0167763 0.615532709403
49


45440.542751 0.615133716222


In [164]:
estimateP

array([[ 0.54725017,  0.7427698 ,  0.85758487, ...,  1.10753736,
         1.12821352,  0.71261738],
       [ 1.08316127,  0.97387342,  1.16461773, ...,  0.53402968,
         0.7947899 ,  0.36286959],
       [ 0.57190194,  1.03018156,  0.39540384, ...,  0.7244104 ,
         0.82974005,  1.16330683],
       ..., 
       [ 0.44555392,  0.78965336,  0.97471363, ...,  0.83516442,
         0.76094738,  0.42335713],
       [ 0.88133491,  0.78741246,  0.89883571, ...,  0.68541351,
         0.75702658,  0.6415537 ],
       [ 0.63131652,  0.51469638,  0.80021552, ...,  1.01722638,
         0.46543375,  0.94925505]])

In [165]:
estimateQ

array([[ 0.47235894,  0.61612991,  0.86392994, ...,  0.68291042,
         0.87005965,  0.43032863],
       [ 0.49138329,  0.25751226,  0.77074145, ...,  0.30244703,
         0.30157479,  0.9618421 ],
       [ 0.72108059,  0.2080146 ,  1.04007509, ...,  0.84450863,
         0.33791057,  0.37845919],
       ..., 
       [ 0.75972125,  0.94666285,  0.42448386, ...,  0.81528765,
         0.35067733,  0.40191971],
       [ 0.89907576,  0.15090558,  0.19739598, ...,  0.85316457,
         0.60354549,  0.87362172],
       [ 0.38559683,  0.69294408,  0.8282527 , ...,  0.41730614,
         0.57376712,  0.9341523 ]])

In [87]:
model3 = np.dot(estimateP, estimateQ.T)

In [88]:
model3

array([[ 4.48619061,  3.37543001,  3.65064835, ...,  3.9904843 ,
         4.41655598,  3.96317278],
       [ 3.6170846 ,  3.30227466,  3.84425928, ...,  4.03460639,
         3.84205824,  3.35777768],
       [ 4.46039192,  3.66011166,  2.98406446, ...,  3.91376045,
         4.57525878,  4.00987934],
       ..., 
       [ 3.6857478 ,  2.69535107,  3.14990823, ...,  3.07134414,
         3.94959177,  3.460928  ],
       [ 4.52496219,  3.4058259 ,  3.417538  , ...,  4.22377388,
         4.24239511,  4.15568173],
       [ 3.88222505,  3.08919226,  3.77885473, ...,  3.5960822 ,
         3.73576571,  3.62104353]])

In [89]:
ep = estimateP
eq = estimateQ

In [91]:
np.max(model3)

7.8231386878816362

In [92]:
mod = pd.DataFrame(model3)
mod.to_csv('res/small/models/model14.csv')

In [43]:
epd = pd.DataFrame(ep)
eqd = pd.DataFrame(eq)

epd.to_csv('res/small/models/ep8.csv')
eqd.to_csv('res/small/models/eq8.csv')

In [2]:
ep1 = pd.read_csv('res/small/models/ep5.csv').as_matrix()[:,1:]
eq1 = pd.read_csv('res/small/models/eq5.csv').as_matrix()[:,1:]

print(ep1.shape[0], ep1.shape[1], eq1.shape[0], eq1.shape[1])

500 8 3883 8


In [3]:
ep = ep1
eq = eq1

In [6]:
np.dot(ep, eq.T)

array([[ 4.58845777,  3.33942047,  3.69592771, ...,  3.55855812,
         3.60586455,  3.91393824],
       [ 4.1913    ,  3.15313564,  3.5599004 , ...,  3.50997203,
         3.19035651,  3.71596676],
       [ 4.18216671,  3.44425206,  3.09791878, ...,  3.49920065,
         3.66424039,  4.00040947],
       ..., 
       [ 4.06491628,  3.07902782,  3.2349767 , ...,  3.19140403,
         3.25709458,  3.70000513],
       [ 4.48807293,  3.60426838,  3.62033597, ...,  3.57212376,
         3.77493617,  4.08124283],
       [ 3.98736955,  3.37148173,  3.47856672, ...,  3.22318914,
         3.48606248,  3.64978782]])