In [1]:

import os
from collections import OrderedDict

import datasets
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Loading existing datasets from the Hub

In [2]:
# Fetch the "emotion" dataset by name; the result is indexed by split below.
emotions = datasets.load_dataset(path="emotion")

Downloading builder script: 100%|██████████| 3.97k/3.97k [00:00<00:00, 844kB/s]
Downloading metadata: 100%|██████████| 3.28k/3.28k [00:00<00:00, 1.28MB/s]
Downloading readme: 100%|██████████| 8.78k/8.78k [00:00<00:00, 5.18MB/s]
No config specified, defaulting to: emotion/split


Downloading and preparing dataset emotion/split to /Users/spayot/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd...


Downloading data: 100%|██████████| 592k/592k [00:00<00:00, 1.99MB/s]
Downloading data: 100%|██████████| 74.0k/74.0k [00:00<00:00, 603kB/s]
Downloading data: 100%|██████████| 74.9k/74.9k [00:00<00:00, 588kB/s]
Downloading data files: 100%|██████████| 3/3 [00:05<00:00,  1.81s/it]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 172.16it/s]
                                                                                      

Dataset emotion downloaded and prepared to /Users/spayot/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 1138.00it/s]


In [3]:
# Show the column schema (feature name -> feature type) of the train split.
emotions["train"].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

# Generating your own datasets
## Generating the data

In [24]:
FPATH = "../data/simple_dataset.csv"

# Toy corpus: five short texts, each tagged with a category name.
data = {
    "text": [
        "i love kyries",
        "best running shoes",
        "cleats for forwards",
        "everyday cool looking snkrs",
        "new pegs",
    ],
    "category": [
        "basketball",
        "running",
        "global football",
        "lifestyle",
        "running",
    ],
}

df = pd.DataFrame(data)

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (otherwise a fresh checkout fails here).
os.makedirs(os.path.dirname(FPATH), exist_ok=True)
df.to_csv(FPATH, index=False)

df

Unnamed: 0,text,category
0,i love kyries,basketball
1,best running shoes,running
2,cleats for forwards,global football
3,everyday cool looking snkrs,lifestyle
4,new pegs,running


## Loading a CSV file directly as a dataset

In [None]:
# Load the CSV written earlier in this notebook with the generic "csv" builder.
ds_pd3 = datasets.load_dataset(
    "csv",
    data_files=FPATH,  # same path the CSV was saved to, instead of repeating the literal
    download_mode="force_redownload",  # useful if previous version of dataset has already been cached in the past
)

Using custom data configuration default-d1697df20e98b33b


Downloading and preparing dataset csv/default (download: 157 bytes, generated: 173 bytes, post-processed: Unknown size, total: 330 bytes) to /Users/spayot/.cache/huggingface/datasets/csv/default-d1697df20e98b33b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 10754.63it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 737.40it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Dataset csv downloaded and prepared to /Users/spayot/.cache/huggingface/datasets/csv/default-d1697df20e98b33b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 1169.31it/s]


In [None]:
# Display the loaded object (its splits, feature names and row counts).
ds_pd3

DatasetDict({
    train: Dataset({
        features: ['text', 'category'],
        num_rows: 5
    })
})

## Creating a Dataset from a pandas DataFrame

In [25]:
# Category name -> integer class id, in id order. OrderedDict is imported in the
# notebook's top import cell. "other" has no example in the toy data above;
# presumably it is the catch-all class.
label2id = OrderedDict(
    [
        ("basketball", 0),
        ("running", 1),
        ("global football", 2),
        ("lifestyle", 3),
        ("other", 4),
    ]
)

In [26]:
# Wrap the in-memory DataFrame as a Dataset (feature types are inferred here).
ds_pd = datasets.Dataset.from_pandas(df=df)


def convert_category(example: dict) -> dict[str, int]:
    """Translate one example's category name into its integer label id.

    Args:
        example: A single dataset row; only its ``"category"`` value is read.

    Returns:
        A one-key dict ``{"label": <id>}`` looked up in the module-level
        ``label2id`` mapping. Categories missing from the mapping get the id
        of ``"other"`` instead of ``None``, so the value is always an int.

    Raises:
        KeyError: If ``example`` has no ``"category"`` key, or ``label2id``
            has no ``"other"`` entry.
    """
    fallback_id = label2id["other"]
    return {"label": label2id.get(example["category"], fallback_id)}
    
# Apply convert_category to every row; the returned "label" key becomes a new column.
ds_pd = ds_pd.map(function=convert_category)

ds_pd

100%|██████████| 5/5 [00:00<00:00, 15534.46ex/s]


Dataset({
    features: ['text', 'category', 'label'],
    num_rows: 5
})

In [27]:
def print_dict(d: dict) -> None:
    """Print each ``key: value`` pair of ``d`` on its own tab-indented line."""
    rendered = (f"\t{key}: {value}" for key, value in d.items())
    print("\n".join(rendered))
    
def print_ds(ds) -> None:
    """Print a dataset's feature schema, a blank line, then all of its rows.

    ``ds`` must expose ``.features`` and support ``ds[:]``; both results are
    handed to ``print_dict``.
    """
    schema = ds.features
    print_dict(schema)
    print()
    all_rows = ds[:]
    print_dict(all_rows)

# Print the schema and all rows of the dataset with the added "label" column.
print_ds(ds_pd)

	text: Value(dtype='string', id=None)
	category: Value(dtype='string', id=None)
	label: Value(dtype='int64', id=None)

	text: ['i love kyries', 'best running shoes', 'cleats for forwards', 'everyday cool looking snkrs', 'new pegs']
	category: ['basketball', 'running', 'global football', 'lifestyle', 'running']
	label: [0, 1, 2, 3, 1]


## Setting the right feature types and saving

In [28]:
# Series.map yields NaN for any category missing from label2id, which would turn
# the column into floats. Send those rows to the "other" id and keep int64 so the
# column can be declared as a ClassLabel in the next cell.
df["label"] = df["category"].map(label2id).fillna(label2id["other"]).astype("int64")

In [29]:
# Explicit schema: "label" is a ClassLabel whose names follow label2id's key order,
# the two remaining columns are plain strings.
label_names = list(label2id)
features = datasets.Features(
    {
        "label": datasets.ClassLabel(names=label_names),
        "text": datasets.Value("string"),
        "category": datasets.Value("string"),
    }
)

ds_pd2 = datasets.Dataset.from_pandas(df, features=features)
print_ds(ds_pd2)

	label: ClassLabel(names=['basketball', 'running', 'global football', 'lifestyle', 'other'], id=None)
	text: Value(dtype='string', id=None)
	category: Value(dtype='string', id=None)

	label: [0, 1, 2, 3, 1]
	text: ['i love kyries', 'best running shoes', 'cleats for forwards', 'everyday cool looking snkrs', 'new pegs']
	category: ['basketball', 'running', 'global football', 'lifestyle', 'running']


In [30]:
# split
# train_test_split shuffles on its own by default, using an unseeded generator,
# so a preceding shuffle(seed=42) does not make the split reproducible. Pass the
# seed to the split itself to get the same 80/20 partition on every run.
ds_pd2 = ds_pd2.train_test_split(test_size=0.2, shuffle=True, seed=42)

In [42]:
# save to disk in arrow format
# ds_pd2 is the train/test split produced by the previous cell.
PATH_TO_DATASET = "../data/example_dataset/"
ds_pd2.save_to_disk(PATH_TO_DATASET)




Flattening the indices: 100%|██████████| 1/1 [00:00<00:00, 960.45ba/s]



[A[A[A


[A[A[A


[A[A[A


Flattening the indices: 100%|██████████| 1/1 [00:00<00:00, 1422.28ba/s]



[A[A[A


[A[A[A


[A[A[A

## Loading a local dataset from disk

In [48]:
# Reload the saved splits from the directory they were written to, reusing
# PATH_TO_DATASET rather than repeating the path literal.
ds_pd4 = datasets.load_from_disk(PATH_TO_DATASET)
ds_pd4



DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'category'],
        num_rows: 4
    })
    test: Dataset({
        features: ['label', 'text', 'category'],
        num_rows: 1
    })
})