In [None]:
# default_exp utils.data

# Data Utilities
> Utilities for listing the available datasets and loading one of them into a pandas dataframe.

In [None]:
#export
import pandas as pd
import tempfile
import os

from recohut.utils.common_utils import download_url

## List datasets

In [None]:
#export
def list_datasets(top_n:int=None):
    """
    Returns a pandas dataframe of all the available datasets and info

    Args:
        top_n (int): returns only the first top_n rows; None (the default)
            returns every row. The value is passed straight to `DataFrame.head`.

    Returns:
        pandas dataframe read from the CSV export of the catalog spreadsheet,
        indexed by the sheet's first column, with missing cells replaced by
        the string 'NA'
    """
    url = 'https://docs.google.com/spreadsheets/d/1wY_83y2ltu6tzMNHFOQRNslrgb0VWH_wa7zP7lT6AvM/export?gid=0&format=csv'
    df = pd.read_csv(url, index_col=[0]).fillna('NA')
    # Compare against None rather than relying on truthiness: with `if top_n:`
    # a request for top_n=0 silently returned the whole catalog instead of
    # an empty frame.
    if top_n is not None:
        return df.head(top_n)
    return df

In [None]:
# Preview the first three rows of the dataset catalog.
list_datasets(top_n=3)

Unnamed: 0_level_0,url,nrows,columns,type,format,size (kb)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
music30,https://github.com/RecoHut-Datasets/30music/ra...,3408682,"['session_id','user_id','ts','playtime']",pandas_df,parquet.snappy,32200.0
music30_sample,https://github.com/RecoHut-Datasets/30music/ra...,343182,"['session_id','user_id','ts','playtime']",pandas_df,parquet.snappy,10100.0
sample_session,https://github.com/RecoHut-Datasets/sample/raw...,5,"['session_id','sequence','ts','user_id']",pandas_df,parquet.snappy,


## Load dataset

In [None]:
#export
def load_dataset(data_id, data_dir=None, log=False):
    """
    Downloads a dataset listed in the catalog and loads it into a pandas dataframe

    Args:
        data_id (str): id of the dataset; must be one of the index values of
            the dataframe returned by `list_datasets()`
        data_dir (str): directory handed to `download_url` as the download
            location; defaults to a folder named after `data_id` inside the
            system temp directory
        log (bool): forwarded unchanged to `download_url`

    Returns:
        pandas dataframe read from the downloaded file

    Raises:
        AssertionError: if `data_id` is not present in the catalog
        ValueError: if the catalog lists a format other than 'parquet.snappy'
    """
    # Fetch the catalog once and reuse it for both validation and lookup;
    # each list_datasets() call re-reads the remote sheet.
    catalog = list_datasets()
    dataset_list = list(catalog.index)
    assert data_id in dataset_list, f'data id not exist, available ids are {dataset_list}'

    if data_dir is None:
        data_dir = os.path.join(tempfile.gettempdir(), data_id)

    data_info = catalog.loc[data_id]
    # Bracket access keeps the column lookups from being shadowed by
    # attributes that pandas defines on Series.
    path = download_url(data_info['url'], data_dir, log=log)

    data_format = data_info['format']
    if data_format == 'parquet.snappy':
        return pd.read_parquet(path)

    # Any other format used to fall through to `return df` with `df` never
    # assigned, surfacing as an opaque UnboundLocalError.
    raise ValueError(
        f"unsupported format {data_format!r} for data id {data_id!r}; "
        "only 'parquet.snappy' is supported"
    )

In [None]:
# No data_dir is given, so the download location defaults to a folder
# named after the dataset id inside the system temp directory.
df = load_dataset('music30_sample')
df.head()

Unnamed: 0,session_id,user_id,song_id,ts,playtime
0,1902204,4,16,1421163674,274
1,1902204,4,17,1421163948,250
2,1902204,4,18,1421164198,271
3,780919,10,60,1411009500,228
4,780919,10,61,1411014936,206
