In [1]:
import mga.dataset as dataset
from pathlib import Path
import pandas as pd

In [None]:
# Fetch the small MovieLens dataset through the project's `mga.dataset` helper.
# NOTE(review): `force_download=True` presumably re-downloads even when a local
# copy already exists -- confirm against `mga.dataset` before running on a
# slow connection.
dataset.download_ml_small(force_download=True)

In [None]:
# Fetch the MovieLens 25M dataset through the project's `mga.dataset` helper.
# NOTE(review): `force_download=True` presumably re-downloads even when a local
# copy already exists, and this cell may write to the same files as the small
# dataset download -- confirm against `mga.dataset`.
dataset.download_ml_25m(force_download=True)

In [2]:
# Read the movie table and the link table from the working directory.
movie_df, link_df = (
    pd.read_csv(csv_name) for csv_name in ('movies.csv', 'links.csv')
)

In [3]:
# Attach the IMDb / TMDB identifiers to every movie.
#
# The ids are looked up by `movieId` rather than copied row-by-row, so the
# result stays correct even if movies.csv and links.csv are not listed in the
# same order or do not contain exactly the same movies (a movie without a link
# row gets NaN instead of silently receiving another movie's ids).
# A duplicated `movieId` in links.csv makes the lookup raise instead of
# picking an arbitrary row.
movie_df = movie_df.rename(columns={'movieId': 'movielens_id'})
links_by_movie = link_df.set_index('movieId')
movie_df['imdb_id'] = movie_df['movielens_id'].map(links_by_movie['imdbId'])
movie_df['tmdb_id'] = movie_df['movielens_id'].map(links_by_movie['tmdbId'])

In [4]:
# Collect every distinct genre label. Each row of `genres` holds the movie's
# genres as a single '|'-separated string.
genre_set = {
    genre
    for genre_str in movie_df['genres']
    for genre in genre_str.split('|')
}

no_genre_key = '(no genres listed)'

# Vectorised count of the rows whose genre string is exactly the placeholder.
count_movie_with_no_genre = (movie_df['genres'] == no_genre_key).sum()
print(f'Total number of movies with no genre: {count_movie_with_no_genre}')
# `discard` instead of `remove`: a dataset in which every movie has a genre
# must not abort the cell with a KeyError.
genre_set.discard(no_genre_key)

# NOTE(review): this set is only reported below; the genres it names are still
# written to genre.txt by this cell.
ignored_genre_set = {'IMAX', 'Film-Noir', 'Animation', 'Documentary'}

print(f'Genres ({len(genre_set)}): {genre_set}')
print(f'Ignored genres: {ignored_genre_set}')

# Write one genre per line. The genres are sorted because set iteration order
# for strings changes between interpreter runs, which would otherwise make
# genre.txt differ from run to run.
with open('genre.txt', 'w') as file:
    file.writelines(f'{genre}\n' for genre in sorted(genre_set))

Total number of movies with no genre: 5062
Genres (19): {'Film-Noir', 'Comedy', 'Action', 'Crime', 'Western', 'Fantasy', 'Mystery', 'Documentary', 'Drama', 'Thriller', 'Musical', 'Children', 'War', 'Sci-Fi', 'Romance', 'Animation', 'IMAX', 'Horror', 'Adventure'}
Ignored genres: {'Film-Noir', 'Animation', 'IMAX', 'Documentary'}


In [5]:
# Load the ratings, drop the unused timestamp column and normalise the names.
rating_df = (
    pd.read_csv('ratings.csv')
    .drop(columns=['timestamp'])
    .rename(columns={'userId': 'user_id', 'movieId': 'movie_id'})
)

# Get movies with view count larger than the threshold
MIN_MOVIE_VIEW_THRESHOLD = 15

# One row per movie with a single `count` column (number of ratings).
df_count = rating_df.groupby('movie_id')['rating'].agg(['count'])
df_count = df_count.loc[df_count['count'] > MIN_MOVIE_VIEW_THRESHOLD]

# Filter rating df. The explicit copy makes the column assignments further
# down operate on an independent frame instead of on a slice of the original
# (which is what triggers pandas' SettingWithCopyWarning).
rating_df = rating_df[rating_df['movie_id'].isin(df_count.index)].copy()

# Filter movie df and reset index
old_movie_count = movie_df.shape[0]
movie_df = movie_df[movie_df['movielens_id'].isin(df_count.index)]
movie_df = movie_df.reset_index(drop=True)
new_movie_count = movie_df.shape[0]
dropped_movie_count = old_movie_count - new_movie_count
print(f'Dropped movie count: {dropped_movie_count}')
print(f'New movie count: {new_movie_count}')

# Replace each MovieLens movie id with the movie's row position in the
# filtered `movie_df`. The lookup raises KeyError if a retained rating refers
# to a movie that is absent from `movie_df`.
movilens2id = pd.DataFrame(data=movie_df.index, index=movie_df['movielens_id'])
# `.ravel()` flattens the (n, 1) lookup result so a plain 1-D column is assigned.
rating_df['movie_id'] = movilens2id.loc[rating_df['movie_id']].values.ravel()

# Replace each user id with its rank among the sorted distinct user ids that
# still have ratings, giving contiguous ids starting at 0.
id2movielens = pd.Series(rating_df['user_id'].drop_duplicates().sort_values().values)
movielens2id = pd.Series(data=id2movielens.index, index=id2movielens)
rating_df['user_id'] = movielens2id.loc[rating_df['user_id']].values

# NOTE(review): this overwrites the 'movies.csv' that the loading cell reads,
# so re-running the notebook from the top requires downloading the data again.
movie_df.to_csv('movies.csv', index_label='id')

Dropped movie count: 42389
New movie count: 20034


In [6]:
# Matrix dimensions: largest id + 1 along each axis, plus the number of
# rating rows as the non-zero count.
row_count = 1 + rating_df['user_id'].max()
col_count = 1 + rating_df['movie_id'].max()
nnz_count = len(rating_df)

# The first line carries the three sizes; every following line is one row of
# `rating_df`, space-separated, without header or index.
with open('ratings.mtx', 'w') as file:
    print(row_count, col_count, nnz_count, file=file)
    rating_df.to_csv(file, sep=' ', header=False, index=False)

In [7]:
# Delete ratings.csv and links.csv from the working directory.
for csv_name in ("ratings.csv", "links.csv"):
    Path(csv_name).unlink()