In [1]:
import numpy as np
import pandas as pd

## Read Data

In [4]:
ratings = pd.read_csv("https://liangfgithub.github.io/MovieData/ratings.dat", sep="::", engine='python', header=None)
ratings.columns = ['UserID','MovieID','Rating','Timestamp']
ratings['UserID'] = ratings['UserID'].apply(lambda x:'u' + str(x))
ratings['MovieID'] = ratings['MovieID'].apply(lambda x:'m' + str(x))

In [5]:
ratings

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,u1,m1193,5,978300760
1,u1,m661,3,978302109
2,u1,m914,3,978301968
3,u1,m3408,4,978300275
4,u1,m2355,5,978824291
...,...,...,...,...
1000204,u6040,m1091,1,956716541
1000205,u6040,m1094,5,956704887
1000206,u6040,m562,5,956704746
1000207,u6040,m1096,4,956715648


In [8]:
users = pd.read_csv("https://liangfgithub.github.io/MovieData/users.dat",sep="::",engine='python', encoding="ISO-8859-1", header = None)
users.columns = ['UserID','Gender', 'Age', 'Occupation', 'Zip-code']
users['UserID'] = users['UserID'].apply(lambda x:'m' + str(x))

In [9]:
users

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,m1,F,1,10,48067
1,m2,M,56,16,70072
2,m3,M,25,15,55117
3,m4,M,45,7,02460
4,m5,M,25,20,55455
...,...,...,...,...,...
6035,m6036,F,25,15,32603
6036,m6037,F,45,1,76006
6037,m6038,F,56,1,14706
6038,m6039,F,45,0,01060


In [54]:
movies = pd.read_csv("https://liangfgithub.github.io/MovieData/movies.dat",sep="::",engine='python', encoding="ISO-8859-1", header = None)
movies.columns = ['MovieID','Title','Genres']
movies['MovieID'] = movies['MovieID'].apply(lambda x:'m' + str(x))

In [55]:
movies

Unnamed: 0,MovieID,Title,Genres
0,m1,Toy Story (1995),Animation|Children's|Comedy
1,m2,Jumanji (1995),Adventure|Children's|Fantasy
2,m3,Grumpier Old Men (1995),Comedy|Romance
3,m4,Waiting to Exhale (1995),Comedy|Drama
4,m5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,m3948,Meet the Parents (2000),Comedy
3879,m3949,Requiem for a Dream (2000),Drama
3880,m3950,Tigerland (2000),Drama
3881,m3951,Two Family House (2000),Drama


## System I: Recommendation Based on Genres
Imagine you know the user’s favorite movie genre. How would you recommend movies to them?

Propose a recommendation scheme along with all the technical details necessary to implement it. For example, you can recommend the **`top five most popular movies`** in that genre, but you need to define what you mean by “most popular.” Similarly, you can recommend the **`top five highly-rated movies`** in that genre, but you must specify how you define “highly-rated.” (Will the movie that receives only one 5-point review be considered highly rated?)

In [70]:
grouped_ratings = ratings.groupby('MovieID').agg({'Rating': ['count', 'mean'] })
grouped_ratings.columns = grouped_ratings.columns.droplevel(0)
grouped_ratings.reset_index(inplace=True)
grouped_ratings.rename(columns = {'count': 'RatingFreq', 'mean': 'RatingAvg'}, inplace=True)

In [71]:
genre_list = ["Action", "Adventure", "Animation","Children's", "Comedy", 
              "Crime","Documentary", "Drama", "Fantasy","Film-Noir", 
              "Horror", "Musical",  "Mystery", "Romance", "Sci-Fi",
               "Thriller", "War", "Western"]

In [72]:
movies_ratings = movies.merge(grouped_ratings, left_on='MovieID', right_on='MovieID')

In [73]:
movies_ratings

Unnamed: 0,MovieID,Title,Genres,RatingFreq,RatingAvg
0,m1,Toy Story (1995),Animation|Children's|Comedy,2077,4.146846
1,m2,Jumanji (1995),Adventure|Children's|Fantasy,701,3.201141
2,m3,Grumpier Old Men (1995),Comedy|Romance,478,3.016736
3,m4,Waiting to Exhale (1995),Comedy|Drama,170,2.729412
4,m5,Father of the Bride Part II (1995),Comedy,296,3.006757
...,...,...,...,...,...
3701,m3948,Meet the Parents (2000),Comedy,862,3.635731
3702,m3949,Requiem for a Dream (2000),Drama,304,4.115132
3703,m3950,Tigerland (2000),Drama,54,3.666667
3704,m3951,Two Family House (2000),Drama,40,3.900000


### Method 1 Top Five most Popular Movies

In [94]:
genre_movie_id_rating_freq = {}
genre_movie_title_rating_freq = {}
K = 5  # top 5
for genre in genre_list:
    true_table = movies_ratings['Genres'].apply(lambda x: True if genre in x else False)
    rating_freq_df = movies_ratings[true_table].sort_values('RatingFreq', ascending=False)
    genre_movie_id_rating_freq[genre] = rating_freq_df['MovieID'].values[:K]
    genre_movie_title_rating_freq[genre] = rating_freq_df['Title'].values[:K]

top5_rating_freq_with_movie_id = pd.DataFrame.from_dict(genre_movie_id_rating_freq).T
top5_rating_freq_with_movie_title = pd.DataFrame.from_dict(genre_movie_title_rating_freq).T 

In [95]:
top5_rating_freq_with_movie_id

Unnamed: 0,0,1,2,3,4
Action,m260,m1196,m1210,m480,m2028
Adventure,m260,m1196,m1210,m480,m1580
Animation,m1,m2987,m2355,m3114,m588
Children's,m1097,m1,m34,m919,m2355
Comedy,m2858,m1270,m1580,m2396,m1197
Crime,m608,m1617,m858,m296,m50
Documentary,m2064,m246,m162,m3007,m1147
Drama,m2858,m1196,m2028,m593,m608
Fantasy,m260,m1097,m2628,m2174,m2797
Film-Noir,m1617,m541,m2987,m1252,m913


In [96]:
top5_rating_freq_with_movie_title

Unnamed: 0,0,1,2,3,4
Action,Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back...,Star Wars: Episode VI - Return of the Jedi (1983),Jurassic Park (1993),Saving Private Ryan (1998)
Adventure,Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back...,Star Wars: Episode VI - Return of the Jedi (1983),Jurassic Park (1993),Men in Black (1997)
Animation,Toy Story (1995),Who Framed Roger Rabbit? (1988),"Bug's Life, A (1998)",Toy Story 2 (1999),Aladdin (1992)
Children's,E.T. the Extra-Terrestrial (1982),Toy Story (1995),Babe (1995),"Wizard of Oz, The (1939)","Bug's Life, A (1998)"
Comedy,American Beauty (1999),Back to the Future (1985),Men in Black (1997),Shakespeare in Love (1998),"Princess Bride, The (1987)"
Crime,Fargo (1996),L.A. Confidential (1997),"Godfather, The (1972)",Pulp Fiction (1994),"Usual Suspects, The (1995)"
Documentary,Roger & Me (1989),Hoop Dreams (1994),Crumb (1994),American Movie (1999),When We Were Kings (1996)
Drama,American Beauty (1999),Star Wars: Episode V - The Empire Strikes Back...,Saving Private Ryan (1998),"Silence of the Lambs, The (1991)",Fargo (1996)
Fantasy,Star Wars: Episode IV - A New Hope (1977),E.T. the Extra-Terrestrial (1982),Star Wars: Episode I - The Phantom Menace (1999),Beetlejuice (1988),Big (1988)
Film-Noir,L.A. Confidential (1997),Blade Runner (1982),Who Framed Roger Rabbit? (1988),Chinatown (1974),"Maltese Falcon, The (1941)"


### Method 2 Top Five high-rated Movies

In [97]:
genre_movie_id_rating_avg = {}
genre_movie_title_rating_avg = {}
K = 5  # top 5
for genre in genre_list:
    true_table = movies_ratings['Genres'].apply(lambda x: True if genre in x else False)
    rating_avg_df = movies_ratings[true_table].sort_values('RatingAvg', ascending=False)
    genre_movie_id_rating_avg[genre] = rating_avg_df['MovieID'].values[:K]
    genre_movie_title_rating_avg[genre] = rating_avg_df['Title'].values[:K]

top5_rating_avg_with_movie_id = pd.DataFrame.from_dict(genre_movie_id_rating_avg).T
top5_rating_avg_with_movie_title = pd.DataFrame.from_dict(genre_movie_title_rating_avg).T 

In [98]:
top5_rating_avg_with_movie_id

Unnamed: 0,0,1,2,3,4
Action,m2905,m2019,m858,m1198,m260
Adventure,m3172,m2905,m1198,m260,m1204
Animation,m745,m1148,m720,m1223,m3429
Children's,m919,m3114,m1,m2761,m1023
Comedy,m3233,m1830,m3607,m745,m1148
Crime,m3656,m858,m50,m3517,m3435
Documentary,m3881,m787,m3338,m2930,m128
Drama,m3382,m989,m3607,m3245,m53
Fantasy,m260,m792,m1097,m247,m1073
Film-Noir,m922,m3435,m913,m1252,m1267


In [99]:
top5_rating_avg_with_movie_title

Unnamed: 0,0,1,2,3,4
Action,Sanjuro (1962),Seven Samurai (The Magnificent Seven) (Shichin...,"Godfather, The (1972)",Raiders of the Lost Ark (1981),Star Wars: Episode IV - A New Hope (1977)
Adventure,Ulysses (Ulisse) (1954),Sanjuro (1962),Raiders of the Lost Ark (1981),Star Wars: Episode IV - A New Hope (1977),Lawrence of Arabia (1962)
Animation,"Close Shave, A (1995)","Wrong Trousers, The (1993)",Wallace & Gromit: The Best of Aardman Animatio...,"Grand Day Out, A (1992)",Creature Comforts (1990)
Children's,"Wizard of Oz, The (1939)",Toy Story 2 (1999),Toy Story (1995),"Iron Giant, The (1999)",Winnie the Pooh and the Blustery Day (1968)
Comedy,Smashing Time (1967),Follow the Bitch (1998),One Little Indian (1973),"Close Shave, A (1995)","Wrong Trousers, The (1993)"
Crime,Lured (1947),"Godfather, The (1972)","Usual Suspects, The (1995)","Bells, The (1926)",Double Indemnity (1944)
Documentary,Bittersweet Motel (2000),"Gate of Heavenly Peace, The (1995)",For All Mankind (1989),Return with Honor (1998),Jupiter's Wife (1994)
Drama,Song of Freedom (1936),Schlafes Bruder (Brother of Sleep) (1995),One Little Indian (1973),I Am Cuba (Soy Cuba/Ya Kuba) (1964),Lamerica (1994)
Fantasy,Star Wars: Episode IV - A New Hope (1977),"Hungarian Fairy Tale, A (1987)",E.T. the Extra-Terrestrial (1982),Heavenly Creatures (1994),Willy Wonka and the Chocolate Factory (1971)
Film-Noir,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Double Indemnity (1944),"Maltese Falcon, The (1941)",Chinatown (1974),"Manchurian Candidate, The (1962)"


## System II: Recommendation Based on Item-Based Collaborative Filtering(IBCF)
For this system, follow these steps. Let `R` denote the 6040-by-3706 rating matrix.
1. Normalize the rating matrix by centering each row. This means subtracting row means from each row of the rating matrix `R`. Row means should be computed based on non-NA entries. For instance, the mean of a vector like `(2, 4, NA, NA)` should be `3`.
2. Compute the Cosine similarity among the `3,706` movies. For movies $i$ and $j$, let $\mathcal{I}_{ij}$ denote the set of users who rated both movies $i$ and $j$. We decide to ignore similarities computed based on less than three user ratings. Thus, define the similarity between movie $i$ and movie $j$ as follows, when the cardinality of $\mathcal{I}_{ij}$ is bigger than two,
$$
    S_{ij} = \frac{1}{2} + \frac{1}{2} \frac{\sum_{l \in \mathcal{I}_{ij}} R_{li} R_{lj}}{\sqrt{\sum_{l \in \mathcal{I}_{ij}} R^2_{li}} \sqrt{\sum_{l \in \mathcal{I}_{ij}} R^2_{lj}}}
$$

This transformation $(1 + cos)/2$ ensures that similarity measures are between 0 and 1. NA values may occur when 1) the set $\mathcal{I}_{ij}$ has a cardinality less than or equal to two (i.e., this pair of movies have been rated by only zero, one, or two users) or 2) one of the denominators is zero.

3. Let $S$ denote the `3706-by-3706` similarity matrix computed in previous step. For each row, sort the non-NA similarity measures and keep the top 30, setting the rest to NA. This new similarity matrix, still denoted as $S$, is no longer symmetric. Save this matrix online. 

Display the pairwise similarity values from the S matrix for the following specified movies: `“m1”, “m10”, “m100”, “m1510”, “m260”, “m3212”`. Please round the results to `7` decimal places.

4. Create a function named `myIBCF`:
- **Input**: `newuser`, a `3706-by-1` vector (denoted as $w$) containing ratings for the `3,706` movies from a new user. Many entries in this vector will be zero. The order of the movies in this vector should match the rating matrix $R$. (Should we center $w$? For IBCF, centering the new user ratings is not necessary.)

- **Inside the function**: Upon receiving this input, your function should download the similarity matrix and use it to compute predictions for movies that have not been rated by this new user yet. Use the following formula to compute the prediction for movie $l$:
$$
    \frac{1}{\sum_{i \in S(l)}S_{li}}{\sum \limits_{i \in S(l)} S_{li} w_i}
$$

where $S(l)$ denotes the set of movies in the 30-nearest neighborhood of movie $l$. Again NA values may occur.

- **Output**: Based on your predictions, recommend the `top 10 movies` to this new user, using the column names of the rating matrix R. Explain what your code should do if fewer than 10 predictions are non-NA. Provide a method to suggest additional movies that have not been rated by this user.


**Test your function**

For your function `myIBCF`, print the `top 10 recommendations` for the following three users:
- User “u1181” from the rating matrix $R$
- User “u1351” from the rating matrix $R$
- A hypothetical user who rates movie “m1613” with 5 and movie “m1755” with 4.

In [109]:
movie_rating_2d_df = pd.read_csv("Movie_Rmat.csv")

In [287]:
movie_rating_2d_df

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
u1,5.0,,,,,,,,,,...,,,,,,,,,,
u10,5.0,,,,,,,,,,...,,,,,,,,,,
u100,,,,,,,,,,,...,,,,,,,,,,
u1000,5.0,,,,,,,,,,...,,,,,,,,,,
u1001,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
u995,,,,,,,,,,,...,,,,,,,,,,
u996,4.0,,,,,,,,,,...,,,,,,,,,,3.0
u997,4.0,,,,,,,,,,...,,,,,,,,,,
u998,,,,,,,,,,,...,,,,,,,,,,


In [288]:
# Step 1: Normalize the rating matrix by centering each row
movie_rating_2d_normalized_df = movie_rating_2d_df.sub(movie_rating_2d_df.mean(axis=1, skipna=True), axis=0)

In [310]:
# Step 2: Compute cosine similarity between users for a given movie
def cosine_similarity(a: np.array,
                      b: np.array,
                      threshold: int = 3):
    masked_a = np.ma.array(a, mask=np.isnan(a)) # Use a mask to mark the NaNs; mask=True (NaN)
    masked_b = np.ma.array(b, mask=np.isnan(b))
    combined_nan_mask = masked_a.mask | masked_b.mask
    # Ignore similiarity if 
    # (1) the movie few than 3 ratings    
    if ((~combined_nan_mask).sum() < threshold):
        return np.nan
    a_valid_norm = np.sqrt((np.ma.array(a, mask=combined_nan_mask)**2).sum())
    b_valid_norm = np.sqrt((np.ma.array(b, mask=combined_nan_mask)**2).sum())
    # Ignore similiarity if 
    # (2) a_valid_norm or b_valid_norm is 0
    if (a_valid_norm > 0.0) and (b_valid_norm > 0.0):
        return (1.0 + np.ma.dot(masked_a, masked_b)/(a_valid_norm * b_valid_norm))/2
    else:
        return np.nan

In [311]:
movie_ids = movie_rating_2d_normalized_df.columns
n_movies = len(movie_ids)
S = np.ones((n_movies, n_movies)) * np.nan

In [313]:
for i in range(n_movies):
    #print(i)
    for j in range(i+1, n_movies):
        cosine_similarity_val = cosine_similarity(movie_rating_2d_normalized_df.iloc[:, i].values, movie_rating_2d_normalized_df.iloc[:, j].values)
        S[i][j] = cosine_similarity_val
        S[j][i] = cosine_similarity_val

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [380]:
report_movie_ids_list = ['m1', 'm10', 'm100', 'm1510', 'm260', 'm3212']
report_index_list = movie_ids.get_indexer(report_movie_ids_list)
S_report_dict = {}
for movie_index in report_index_list:
    S_report_dict[movie_ids[movie_index]] = S[movie_index, report_index_list]
S_report_df = pd.DataFrame.from_dict(S_report_dict)
S_report_df.index = report_movie_ids_list     

In [381]:
S_report_df

Unnamed: 0,m1,m10,m100,m1510,m260,m3212
m1,,0.512106,0.392,,0.741148,
m10,0.512106,,0.547458,,0.534334,
m100,0.392,0.547458,,,0.329694,
m1510,,,,,,
m260,0.741148,0.534334,0.329694,,,
m3212,,,,,,


In [377]:
# 3. For each row, sort the non-NA similarity measures and keep the top 30, setting the rest to NA. 
# This new similarity matrix, still denoted as $S$, is no longer symmetric. Save this matrix online.
K = 30
new_S = np.nan_to_num(S, copy=True, nan=0.0)   # replace nan with 0.0, otherise NaN will be list in the end

In [378]:
for i in range(n_movies):
    temp_nan_mask = np.ones(n_movies, dtype=bool)
    top_K_indices = np.argsort(new_S[i, :])[-K:]  # in ascending order
    temp_nan_mask[top_K_indices] = False   # protect the top K
    new_S[i, temp_nan_mask] = np.nan   # set the rest to NA

In [388]:
new_S_df = pd.DataFrame(new_S, columns=movie_ids, index=movie_ids)

In [391]:
new_S_df.to_csv('S.csv', na_rep='NA')