In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math

In [None]:
# The MovieLens-1M files are '::'-delimited and have no header row, so column
# names are supplied explicitly. A multi-character separator is only supported
# by the (slower) python parsing engine.
movies = pd.read_csv(
    './ml-1m/movies.dat',
    sep='::',
    header=None,
    names=["MovieID", "Title", "Genres"],
    engine='python',
    encoding='latin-1',
)
ratings = pd.read_csv(
    './ml-1m/ratings.dat',
    sep='::',
    header=None,
    engine='python',
    names=["UserID", "MovieID", "Rating", "Timestamp"],
)
# ZIP codes are identifiers, not numbers: read them as strings so that leading
# zeros are never stripped by numeric type inference.
users = pd.read_csv(
    './ml-1m/users.dat',
    sep='::',
    header=None,
    engine='python',
    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],
    dtype={"Zip-code": str},
)

In [None]:
# Preview the first rows of each raw table, in load order.
for frame in (movies, ratings, users):
    display(frame.head())

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [None]:
# Attach every rating to its movie. `join` keeps all rows of the left frame, so
# a movie with no ratings survives as a single row whose rating columns are NaN.
movies_by_id = movies.set_index("MovieID")
ratings_by_id = ratings.set_index("MovieID")
movie_ratings = movies_by_id.join(ratings_by_id, on='MovieID').reset_index()

In [None]:
# Per-column flag: does the joined table contain any missing value?
movie_ratings.isna().any()

MovieID      False
Title        False
Genres       False
UserID        True
Rating        True
Timestamp     True
dtype: bool

In [None]:
# Hold out 10% of the joined rows for evaluation. The seed is fixed so that the
# split, and every metric computed from it, is reproducible across kernel restarts.
movie_train, movie_test = train_test_split(movie_ratings, test_size=0.1, random_state=42)
movie_train.shape

(900347, 6)

In [None]:
# Inspect the training split; pandas abbreviates the rendering of a frame this large.
movie_train

Unnamed: 0,MovieID,Title,Genres,UserID,Rating,Timestamp
113467,416,Bad Girls (1994),Western,6003.0,4.0,9.569801e+08
24301,50,"Usual Suspects, The (1995)",Crime|Thriller,3426.0,5.0,9.673501e+08
123650,457,"Fugitive, The (1993)",Action|Thriller,2898.0,4.0,1.001213e+09
336360,1238,Local Hero (1983),Comedy,4869.0,4.0,9.628867e+08
234110,942,Laura (1944),Crime|Film-Noir|Mystery,1899.0,4.0,9.799429e+08
...,...,...,...,...,...,...
176354,628,Primal Fear (1996),Drama|Thriller,2041.0,3.0,9.791806e+08
812521,3024,Piranha (1978),Horror|Sci-Fi,2228.0,1.0,9.745977e+08
983671,3822,"Girl on the Bridge, The (La Fille sur le Pont)...",Drama|Romance,3049.0,4.0,9.701838e+08
942349,3623,Mission: Impossible 2 (2000),Action|Thriller,3625.0,4.0,9.665620e+08


    Note: movie_index (in matrix) != movie_id, since some movie_ids are missing. Use movie_ind to look up the movie index
    from a movie id, or ind_movie to look up the movie id from a movie index.

In [None]:
# Two-way lookup between MovieIDs and the index labels of `movies`.
movie_id_list = movies["MovieID"].tolist()
movie_index_list = movies.index.tolist()
movie_ind = dict(zip(movie_id_list, movie_index_list))  # movieID as key, index as val
ind_movie = dict(zip(movie_index_list, movie_id_list))  # index as key, movieID as val

In [None]:
# User x movie rating matrix built from the training split; NaN means "no
# training rating for this (user, movie) pair". Allocated directly as a NumPy
# array instead of going through a nested Python list.
mat = np.full((len(users), len(movies)), np.nan)

# Rows with a NaN UserID carry no rating; `> 0` is False for NaN and drops them.
rated_train = movie_train[movie_train["UserID"] > 0]

# Assumes UserIDs run 1..len(users), so UserID - 1 is the row position.
train_row_idx = rated_train["UserID"].to_numpy().astype(int) - 1
# NOTE: the `- 1` shifts every movie index by one column (index 0 wraps around
# to the last column). `mse` applies the identical shift when reading the
# matrix, so the two must be changed together or not at all.
train_col_idx = np.array(
    [int(movie_ind[movie_id]) - 1 for movie_id in rated_train["MovieID"]],
    dtype=int,
)

# Single vectorised scatter instead of a Python-level loop over every rating.
mat[train_row_idx, train_col_idx] = rated_train["Rating"].to_numpy()

In [None]:
def user_mean(mat, min_rating=1, max_rating=5):
    """Fill the missing entries (NaN) of each row with that row's mean, rounded up.

    Parameters
    ----------
    mat : 2-D array-like of float
        Rating matrix; NaN marks a missing rating. Each row is imputed
        independently of the others.
    min_rating, max_rating : int, optional
        Inclusive bounds of the rating scale. They are only used to draw a
        random fill value for rows that have no usable mean.

    Returns
    -------
    numpy.ndarray
        A new array shaped like ``mat`` with every NaN replaced. ``mat`` itself
        is not modified.

    Notes
    -----
    Rows with no observed rating (or a non-positive mean) are filled with one
    random integer drawn from the global NumPy RNG; call ``np.random.seed``
    beforehand for reproducible output.
    """
    filled_rows = []
    for row in mat:
        # np.nanmean emits "Mean of empty slice" on an all-NaN row, so skip the
        # call entirely in that case and compute the mean only once otherwise.
        row_mean = np.nan if np.isnan(row).all() else np.nanmean(row)
        if row_mean > 0:
            fill_value = math.ceil(row_mean)
        else:
            # randint's upper bound is exclusive: +1 makes max_rating reachable.
            fill_value = int(np.random.randint(low=min_rating, high=max_rating + 1))
        filled_row = np.nan_to_num(row, copy=True, nan=fill_value)
        filled_rows.append(filled_row.tolist())
    return np.asarray(filled_rows)
def film_mean(mat):
    """Fill missing entries column by column.

    Transposes ``mat`` so that columns become rows, delegates the row-wise
    imputation to ``user_mean``, and transposes the result back to the
    original orientation.
    """
    return np.transpose(user_mean(np.transpose(mat)))

In [None]:
# Baseline 1: fill each user's (row's) missing ratings from that user's own mean.
user_mean_mat = user_mean(mat)

In [None]:
# Baseline 2: fill each film's (column's) missing ratings from that film's own mean.
film_mean_mat = film_mean(mat)

  if np.nanmean(row) > 0:


In [None]:
def mse(inp_mat, test, movie_index=None):
    """Mean squared error between a filled rating matrix and held-out ratings.

    Parameters
    ----------
    inp_mat : 2-D array-like
        Matrix of predicted ratings, indexed the same way the training matrix
        was written: row ``UserID - 1``, column ``movie_index[MovieID] - 1``.
    test : pandas.DataFrame
        Held-out rows with ``MovieID``, ``UserID`` and ``Rating`` columns. Rows
        whose ``UserID`` is NaN are skipped.
    movie_index : mapping, optional
        MovieID -> index lookup. Defaults to the notebook-level ``movie_ind``.

    Returns
    -------
    float
        Mean of the squared errors, or NaN when ``test`` has no rated rows.

    Raises
    ------
    KeyError
        If a MovieID in ``test`` is absent from ``movie_index``.
    """
    if movie_index is None:
        movie_index = movie_ind

    # `> 0` is False for NaN, which removes rows that carry no rating.
    rated = test[test["UserID"] > 0]
    if len(rated) == 0:
        return np.nan

    row_idx = rated["UserID"].to_numpy().astype(int) - 1
    # The `- 1` mirrors the column shift used when the matrix was populated.
    col_idx = np.array(
        [movie_index[int(movie_id)] - 1 for movie_id in rated["MovieID"]],
        dtype=int,
    )

    predicted = np.asarray(inp_mat)[row_idx, col_idx]
    actual = rated["Rating"].to_numpy()
    return np.mean((predicted - actual) ** 2)

In [None]:
# Score both imputation baselines against the held-out ratings.
for label, filled_mat in (("User Mean MSE: ", user_mean_mat), ("Film Mean MSE: ", film_mean_mat)):
    print(label, mse(filled_mat, movie_test))

User Mean MSE:  1.380542475230697
Film Mean MSE:  1.290793117582956
