In [2]:
%matplotlib inline
from itertools import chain
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# The seaborn style sheets were renamed with a "seaborn-v0_8-" prefix in
# matplotlib 3.6 and the old names were removed in 3.8, so use whichever
# spelling this installation provides (newest first).
darkgrid_style = next(
    (
        name
        for name in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid")
        if name in plt.style.available
    ),
    # Fall back to the original name so a missing style still fails loudly.
    "seaborn-darkgrid",
)
plt.style.use(darkgrid_style)

In [3]:
# Root directory for this notebook's inputs and outputs; the path is relative,
# so it is resolved against the kernel's working directory.
DATA_DIR = "data"
# Sub-directory of DATA_DIR from which ratings.csv and movies.csv are read.
MOVIELENS_DIR = os.path.join(DATA_DIR, "ml-latest-small")

## Ratings

In [4]:
# Load the ratings table, keeping only the three listed columns and fixing
# their dtypes at parse time (any other column in the file is not loaded).
ratings_path = os.path.join(MOVIELENS_DIR, "ratings.csv")
ratings_columns = ["userId", "movieId", "rating"]
ratings_dtypes = {"userId": int, "movieId": int, "rating": np.float32}

ratings = pd.read_csv(ratings_path, usecols=ratings_columns, dtype=ratings_dtypes)

In [5]:
# Preview the first rows (head() defaults to five).
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
# Check that the dtypes requested in read_csv were applied to each column.
ratings.dtypes

userId       int64
movieId      int64
rating     float32
dtype: object

In [7]:
%%time

# Reshape the long ratings table into a matrix with one row per userId and one
# column per movieId (cells hold the rating), and pickle it under DATA_DIR.
pivoted_ratings_path = os.path.join(DATA_DIR, "pivoted_ratings.pickle")
ratings.pivot(index="userId", columns="movieId", values="rating").to_pickle(
    pivoted_ratings_path
)

CPU times: user 49.1 ms, sys: 42.3 ms, total: 91.3 ms
Wall time: 155 ms


## Movie metadata

In [None]:
# Load the movie metadata with pandas' default dtype inference.
movies_path = os.path.join(MOVIELENS_DIR, "movies.csv")
movies = pd.read_csv(movies_path)

In [18]:
# Preview the movie metadata.
# NOTE(review): the output saved with this cell lists ratings columns
# (userId, movieId, rating, timestamp) rather than movie columns, and the
# cell that loads `movies` has no execution count — the output looks stale.
# Re-run on a fresh kernel (Restart & Run All) before relying on it.
movies.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Turn each movie's "|"-delimited genre string into a list of genre names.
genre_lists = movies.genres.str.split("|")
movies = movies.assign(genres=genre_lists)

# Number of genres per movie; used later to repeat each movie row once per genre.
genre_lens = genre_lists.map(len)

movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"


In [6]:
# Concatenate the per-movie genre lists into one flat list, in movie order.
flattened_genre_column = [
    genre for movie_genres in movies.genres for genre in movie_genres
]

In [7]:
# Build one row per (movie, genre) pair, then one-hot encode the genre.
#
# Rows are repeated by position instead of through `.values`: the
# movieId/title pair mixes integers and strings, so its `.values` array is
# object-typed and rebuilding a frame from it turns `movieId` into an object
# column. Positional selection keeps each column's original dtype.
repeated_rows = np.repeat(np.arange(len(movies)), genre_lens)
movie_genre_pairs = (
    movies[["movieId", "title"]]
    .iloc[repeated_rows]
    .reset_index(drop=True)
)

# `flattened_genre_column` is the movies' genre lists chained in row order and
# `genre_lens` holds their lengths, so its entries line up one-to-one with the
# repeated rows above.
genre_indicators = pd.get_dummies(pd.Series(flattened_genre_column, name="genre"))

# Both pieces carry a fresh 0..n-1 index, so the column-wise concat pairs rows
# by position.
movies = pd.concat([movie_genre_pairs, genre_indicators], axis=1)