In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns

In [69]:
#https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset?select=rating.csv

df = pd.read_csv("rating.csv")

In [3]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [4]:
df.describe()

Unnamed: 0,userId,movieId,rating
count,20000260.0,20000260.0,20000260.0
mean,69045.87,9041.567,3.525529
std,40038.63,19789.48,1.051989
min,1.0,1.0,0.5
25%,34395.0,902.0,3.0
50%,69141.0,2167.0,3.5
75%,103637.0,4770.0,4.0
max,138493.0,131262.0,5.0


In [5]:
df.isnull().sum()/len(df)

userId       0.0
movieId      0.0
rating       0.0
timestamp    0.0
dtype: float64

In [6]:
print(f" Total unique user id: { df['userId'].nunique()}")
print(f" Total unique movie id: { df['movieId'].nunique()}")



 Total unique user id: 138493
 Total unique movie id: 26744


In [7]:
# Removing time stemp column
df.drop(columns = ['timestamp'],inplace = True)

In [8]:
# Checking userids whose reviews are less than 30
df_userid = df.groupby(['userId']).count()
df_userid[df_userid['movieId']<30]

Unnamed: 0_level_0,movieId,rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
4,28,28
6,24,24
17,26,26
20,28,28
36,20,20
...,...,...
138469,22,22
138476,26,26
138480,21,21
138488,24,24


In [9]:
# Checking userids whose reviews are less than 30
df_movieid = df.groupby(['movieId']).count()
df_movieid[df_movieid['userId']<30]

Unnamed: 0_level_0,userId,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
51,27,27
109,16,16
133,19,19
143,14,14
395,13,13
...,...,...
131254,1,1
131256,1,1
131258,1,1
131260,1,1


In [10]:
# pd.pivot_table(data= df, index = ['userId'],columns=['movieId'],values='rating')

In [12]:
df_userid.sort_values('movieId',ascending=False).head(10000).index

Index([118205,   8405,  82418, 121535, 125794,  74142,  34576, 131904,  83090,
        59477,
       ...
       130634,  57937,  64096, 110559,  79457,  43988,  31715,  62030,  84397,
       118813],
      dtype='int64', name='userId', length=10000)

### Creating a mapping for movie ids to make movieids sequential

In [70]:

unique_movie_ids = set(df['movieId'])
movie2idx = {}
count=0
for movies in unique_movie_ids:
    movie2idx[movies] = count
    count = count+1

In [71]:
df['movieId'] = df['movieId'].apply(lambda x :movie2idx[x])

In [72]:
df['userId'] = df['userId'] - 1

### Select subset of users and movieids

In [73]:


from collections import Counter
N = df['userId'].max() +1
M = df['movieId'].max()+1

In [74]:
# number of users and movies we would like to keep

n =10000
m = 2000

In [75]:
user_ids_count = Counter(df['userId'])
movie_ids_count = Counter(df['movieId'])

In [76]:
user_ids = [u for u,c in user_ids_count.most_common(n)]
movie_ids = [m for m,c in movie_ids_count.most_common(m)]

In [77]:
df_final  = df[df['userId'].isin(user_ids) & df['movieId'].isin(movie_ids)]

In [78]:
new_user_id_map = {}
i=0
for user in user_ids:
    new_user_id_map[user] = i
    i = i+1

new_movie_id_map = {}
j=0
for movie in movie_ids:
    new_movie_id_map[movie] = j
    j = j+1

In [79]:
df_final['userId'] = df_final['userId'].apply(lambda x : new_user_id_map[x])
df_final['movieId'] = df_final['movieId'].apply(lambda x : new_movie_id_map[x])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['userId'] = df_final['userId'].apply(lambda x : new_user_id_map[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['movieId'] = df_final['movieId'].apply(lambda x : new_movie_id_map[x])


In [82]:
df_final['userId'].max()
df_final['movieId'].max()

1999

In [84]:
df_final.to_csv("./final_rating.csv")

In [85]:
df_final.head()

Unnamed: 0,userId,movieId,rating,timestamp
960,7307,10,4.5,2009-01-02 01:13:41
961,7307,68,2.5,2009-01-02 01:15:59
962,7307,143,3.5,2009-01-01 04:21:44
963,7307,19,5.0,2009-01-01 04:11:35
964,7307,85,4.5,2009-01-02 01:17:12


In [None]:
## Creating dictionaries for faster lookup for below questions
# Given user i which movies j did they rate
# Given moive j which user i have rate it
# Given user i and movie j what is the rating

In [105]:
user2movies= {}
movies2user= {}
usermovies = {}

In [107]:
for i in range(len(df_final)):
    user,movie,rating = df_final.iloc[i][['userId','movieId','rating']]
    if user in user2movies:
        user2movies[user].append(movie)
    else:
        user2movies[user] = [movie]
    
    if movie in movies2user:
        movies2user[movie].append(user)
    else:
        movies2user[movie] = [user]

    usermovies[(user,movie)] = rating

