In [None]:
# default_exp datasets.movielens

# MovieLens Dataset
> Dataset and data-module implementations for the MovieLens 1M and 100K datasets.

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

In [None]:
#export
from typing import Any, Iterable, List, Optional, Tuple, Union, Callable
import os

import pandas as pd

from recohut.utils.common_utils import *
from recohut.datasets.bases.common import Dataset
from recohut.datasets.bases.interactions import InteractionsDataset, InteractionsDataModule

## ML1m Rating Dataset

In [None]:
#export
class ML1mDataset(InteractionsDataset):
    """MovieLens-1M ratings dataset.

    Downloads the ML-1M archive, keeps only ``ratings.dat`` in the raw
    directory, and exposes it as a dataframe with the columns
    ``uid``, ``sid``, ``rating`` and ``timestamp``.
    """
    # Fetched over HTTPS, like the ML-100k archive in this module.
    url = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"

    @property
    def raw_file_names(self):
        """Name of the single raw file this dataset needs."""
        return 'ratings.dat'

    def download(self):
        """Download and unpack the archive, keeping only the ratings file.

        The zip extracts into an ``ml-1m`` sub-directory; the ratings file is
        moved up into ``raw_dir`` and both the extracted directory and the
        downloaded archive are removed afterwards.
        """
        from shutil import move, rmtree

        path = download_url(self.url, self.raw_dir)
        extract_zip(path, self.raw_dir)
        extracted_dir = os.path.join(self.raw_dir, 'ml-1m')
        move(os.path.join(extracted_dir, self.raw_file_names), self.raw_dir)
        rmtree(extracted_dir)
        os.unlink(path)

    def load_ratings_df(self):
        """Read the raw ratings file into a de-duplicated dataframe.

        Returns:
            pd.DataFrame: columns ``uid``, ``sid``, ``rating``, ``timestamp``
            with at most one row per (uid, sid) pair. When a pair occurs more
            than once, the row with the greatest timestamp is kept (ties are
            resolved in favour of the later row in the file). Rows keep their
            original file order and index labels.
        """
        # '::' is a multi-character separator, which requires the python engine.
        df = pd.read_csv(self.raw_paths[0], sep='::', header=None, engine='python')
        df.columns = ['uid', 'sid', 'rating', 'timestamp']
        # Drop duplicate user-item records, keeping the most recent rating.
        # The stable sort by timestamp makes keep='last' select the latest
        # rating rather than whichever row happens to come last in the file;
        # sort_index() then restores the original row order.
        df = (
            df.sort_values('timestamp', kind='mergesort')
              .drop_duplicates(subset=['uid', 'sid'], keep='last')
              .sort_index()
        )
        return df

In [None]:
#export
class ML1mDataModule(InteractionsDataModule):
    """Interactions data module bound to `ML1mDataset`."""
    # Dataset class for this module; presumably instantiated by the
    # InteractionsDataModule base (its implementation is not shown here).
    dataset_cls = ML1mDataset

In [None]:
class Args:
    """Plain attribute container holding the demo settings for `ML1mDataModule`."""

    def __init__(self):
        # Populate the instance namespace in one call; keyword order is kept,
        # so `__dict__` has the same keys, values and ordering as before.
        vars(self).update(
            data_dir='/content/data',
            min_rating=4,
            num_negative_samples=99,
            min_uc=5,
            min_sc=5,
            val_p=0.2,
            test_p=0.2,
            seed=42,
            split_type='stratified',
        )


args = Args()

In [None]:
# Build the ML-1M data module from the demo settings (every attribute of
# `args` is forwarded as a keyword argument), then run its preparation step.
ds = ML1mDataModule(**vars(args))
ds.prepare_data()

Processing...


Turning into implicit ratings
Filtering triplets
Densifying index


Done!


In [None]:
# List the data directory as a tree: -h prints human-readable sizes,
# --du reports accumulated directory sizes, -C forces colorized output.
!tree -h --du -C "{args.data_dir}"

[01;34m/content/data[00m
├── [ 11M]  [01;34mprocessed[00m
│   ├── [2.3M]  data_test_neg.pt
│   ├── [ 95K]  data_test_pos.pt
│   ├── [6.5M]  data_train.pt
│   ├── [2.3M]  data_valid_neg.pt
│   └── [ 95K]  data_valid_pos.pt
└── [ 23M]  [01;34mraw[00m
    └── [ 23M]  ratings.dat

  35M used in 2 directories, 6 files


## ML100k Dataset

In [None]:
#export
class ML100kDataset(Dataset):
    """MovieLens-100k raw dataset.

    Only the download step is implemented: the archive is fetched and its
    contents are placed directly in ``raw_dir``. Processing is left to
    subclasses (`processed_file_names` and `process` raise
    ``NotImplementedError``).
    """
    url = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'

    def __init__(self, root):
        """Initialise the dataset rooted at ``root`` (forwarded to the base class)."""
        super().__init__(root)

    @property
    def raw_file_names(self) -> List[str]:
        """Names of the files expected in ``raw_dir`` after download."""
        return ['u1.base', 'u1.test', 'u4.test', 'allbut.pl', 'u.item',
                'ua.test', 'u.occupation', 'u3.test', 'u5.base', 'ub.test',
                'u2.test', 'u3.base', 'u.genre', 'u.data', 'u4.base',
                'u5.test', 'u.info', 'README', 'ub.base', 'mku.sh', 'u2.base',
                'u.user', 'ua.base']

    @property
    def processed_file_names(self) -> str:
        """Not defined for the raw dataset; subclasses must provide it."""
        raise NotImplementedError

    def download(self):
        """Download and unpack the archive, flattening it into ``raw_dir``.

        The zip extracts into an ``ml-100k`` sub-directory; every entry in it
        is moved up into ``raw_dir``, after which the emptied directory and
        the downloaded archive are removed.
        """
        from shutil import move, rmtree

        path = download_url(self.url, self.raw_dir)
        extract_zip(path, self.raw_dir)
        # Use os.path explicitly: this module imports `os` only, not an
        # `osp` alias.
        extracted_dir = os.path.join(self.raw_dir, 'ml-100k')
        for file_name in os.listdir(extracted_dir):
            move(os.path.join(extracted_dir, file_name), self.raw_dir)
        rmtree(extracted_dir)
        os.unlink(path)

    def process(self):
        """Not implemented for the raw dataset; subclasses must provide it."""
        raise NotImplementedError

In [None]:
#hide
# Record provenance for this run: author (-a), machine info (-m), versions of
# imported packages (-iv), last-updated stamp (-u) with time (-t) and date (-d),
# and the installed version of recohut (-p).
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2022-01-10 10:04:23

recohut: 0.0.10

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

IPython: 5.5.0
pandas : 1.1.5

