# What if my dataset isn't on the Hub

## Loading a local dataset

In [None]:
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz

In [None]:
!gzip -dkv SQuAD_it-*.json.gz

In [None]:
from datasets import load_dataset
squad_it_dataset = load_dataset("json", data_files="SQuAD_it-train.json", field="data")
squad_it_dataset

In [None]:
squad_it_dataset["train"][0]

In [None]:
data_files = {"train":"SQuAD_it-train.json", "test":"SQuAD_it-test.json"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

In [None]:
data_files = {"train":"SQuAD_it-train.json.gz", "test":"SQuAD_it-test.json.gz"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

## Loading a remote dataset

In [None]:
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

# Time to slice and dice

## Slicing and dicing our data

In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

In [None]:
from datasets import load_dataset

data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
# \t is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

In [None]:
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
drug_sample[:3]

In [None]:
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

In [None]:
drug_dataset = drug_dataset.rename_column(
    original_column_name = "Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

In [None]:
len(drug_dataset["train"].unique('drugName'))

In [None]:
len(drug_dataset["test"].unique('drugName'))

In [None]:
len(drug_dataset["train"].unique('condition'))

In [None]:
len(drug_dataset["test"].unique('condition'))

In [None]:
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}

In [None]:
drug_dataset.map(lowercase_condition)

In [None]:
def filter_nones(x):
    return x["condition"] is not None

In [None]:
drug_dataset = drug_dataset.filter(filter_nones)

In [None]:
drug_dataset = drug_dataset.map(lowercase_condition)

In [None]:
len(drug_dataset["train"].unique('condition'))

In [None]:
len(drug_dataset["test"].unique('condition'))

## Creating new columns

In [None]:
# check lengths
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}

In [None]:
drug_dataset = drug_dataset.map(compute_review_length)
drug_dataset["train"][0]

In [None]:
drug_dataset["train"].sort("review_length")[:3]

In [None]:
# filter out short reviews
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

In [None]:
drug_dataset["train"].sort("review_length", reverse=True)[:3]

In [None]:
# unescape html character codes
import html
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

## The `map()` method's superpowers

In [None]:
# batched=True for increased performance
new_drug_dataset = drug_dataset.map(
    lambda x: {"review":[html.unescape(o) for o in x["review"]]}, batched=True
)

In [None]:
# fast tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True)

In [None]:
%time drug_dataset.map(tokenize_function, batched=True)

In [None]:
%time drug_dataset.map(tokenize_function, batched=False)

In [None]:
# slow tokenizer
from transformers import AutoTokenizer
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
def slow_tokenize_function(examples):
    return slow_tokenizer(examples["review"], truncation=True)

In [None]:
%time drug_dataset.map(slow_tokenize_function, batched=True)

In [None]:
%time drug_dataset.map(slow_tokenize_function, batched=False)

In [None]:
def tokenize_and_split(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )

In [None]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

In [None]:
# error above as 1000 examples but 1463 sets of tokens, so remove old columns
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names)

In [None]:
len(tokenized_dataset["train"]), len(drug_dataset["train"])

In [None]:
# alternatively to keep old columns use `overflow_to_sample_mapping` from tokenizer to generate new columns for the extra tokens that overflowed
def tokenize_and_split(examples):
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Extract mapping between new and old indices
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key]=[values[i] for i in sample_map]
    return result    
    

In [None]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

## From `Datasets` to `DataFrames` and back

In [None]:
drug_dataset.set_format("pandas")

In [None]:
drug_dataset["train"][:3]

In [None]:
train_df = drug_dataset["train"][:]
train_df.head()

In [None]:
frequencies = (train_df["condition"].value_counts().to_frame().reset_index().rename(columns={"index":"condition", "condition":"frequency"}))
frequencies.head()

In [None]:
from datasets import Dataset
freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

In [None]:
avg_rating_df = train_df[["drugName","rating"]].groupby(train_df["drugName"]).agg({"rating":"mean"})

avg_rating_df.head()

In [None]:
avg_rating_dataset = Dataset.from_pandas(avg_rating_df)
avg_rating_dataset

In [None]:
drug_dataset.reset_format()

## Creating a validation set

In [None]:
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)

In [None]:
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")

In [None]:
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean
                   

## Saving a dataset

In [None]:
drug_dataset_clean.save_to_disk("drug-reviews")

In [None]:
from datasets import load_from_disk
drug_dataset_reloaded = load_from_disk("drug-reviews")
drug_dataset_reloaded

In [None]:
for split, dataset in drug_dataset_clean.items():
    dataset.to_json(f"drug-reviews-{split}.jsonl")

In [None]:
!head -n 1 drug-reviews-train.jsonl

In [None]:
data_files = {
    "train": "drug-reviews-train.jsonl",
    "validation": "drug-reviews-validation.jsonl",
    "test": "drug-reviews-test.jsonl",
}
drug_dataset_reloaded = load_dataset("json", data_files=data_files)

# Big Data

## What is the Pile

In [None]:
!pip install zstandard

In [None]:
from datasets import load_dataset

# This takes a few minutes to run, so go grab a tea or coffee while you wait :)
data_files = "https://mystic.the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"
pubmed_dataset = load_dataset("json", data_files=data_files, split="train")
pubmed_dataset

# Creating your own dataset

## Getting the data

In [None]:
import requests

url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)

In [None]:
response.status_code

In [None]:
response.json()

In [None]:
import getpass
GITHUB_TOKEN = getpass.getpass()  # Copy your GitHub token here


In [None]:

headers = {"Authorization": f"token {GITHUB_TOKEN}"}

In [None]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [None]:
fetch_issues()

In [1]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, datasets
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.12.0
    Uninstalling huggingface-hub-0.12.0:
      Successfully uninstalled huggingface-hub-0.12.0
  Attempting uninstall: datasets
    Found existing installation: datasets 2.4.0
    Uninstalling datasets-2.4.0:
      Successfully uninstalled datasets-2.4.0
Successfully installed datasets-2.14.4 huggingface-hub-0.16.4
[0m

In [3]:
import pandas as pd
from datasets import Dataset
df = pd.read_json('datasets-issues.jsonl', orient='records', lines=True)
issues_dataset = Dataset.from_pandas(df[:2000], split="train", )
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'draft', 'pull_request', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason'],
    num_rows: 2000
})

## Cleaning up the data

In [4]:
sample = issues_dataset.shuffle(seed=666).select(range(3))

# Print out the URL and pull request entries
for url, pr in zip(sample["html_url"], sample["pull_request"]):
    print(f">> URL: {url}")
    print(f">> Pull request: {pr}\n")

>> URL: https://github.com/huggingface/datasets/issues/5442
>> Pull request: None

>> URL: https://github.com/huggingface/datasets/issues/5385
>> Pull request: None

>> URL: https://github.com/huggingface/datasets/issues/6111
>> Pull request: None



In [5]:
issues_dataset = issues_dataset.map(
    lambda x: {"is_pull_request": False if x["pull_request"] is None else True}
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

## Augmenting the dataset

In [6]:
import getpass
GITHUB_TOKEN = getpass.getpass()  # Copy your GitHub token here

headers = {"Authorization": f"token {GITHUB_TOKEN}"}

 ········


In [7]:
import requests
issue_number = 2792
url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/comments/897594128',
  'html_url': 'https://github.com/huggingface/datasets/pull/2792#issuecomment-897594128',
  'issue_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792',
  'id': 897594128,
  'node_id': 'IC_kwDODunzps41gDMQ',
  'user': {'login': 'bhavitvyamalik',
   'id': 19718818,
   'node_id': 'MDQ6VXNlcjE5NzE4ODE4',
   'avatar_url': 'https://avatars.githubusercontent.com/u/19718818?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/bhavitvyamalik',
   'html_url': 'https://github.com/bhavitvyamalik',
   'followers_url': 'https://api.github.com/users/bhavitvyamalik/followers',
   'following_url': 'https://api.github.com/users/bhavitvyamalik/following{/other_user}',
   'gists_url': 'https://api.github.com/users/bhavitvyamalik/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/

In [17]:
def get_comments(issue_number):
    url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    response = requests.get(url, headers=headers)
    try:
        returnValue = [r["body"] for r in response.json()]
        return returnValue
    except:
        print(response.json())
        return []

# Test our function works as expected
get_comments(2792)

["@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = 'gooaq'\r\n\r\n    def test_load_dataset(self, dataset_name):\r\n        configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n>       self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n    self.parent.assertTrue(len(dataset[split]) > 0)\r\nE   AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?",
 'Thanks for the help, @albertvillanova! All tests are passing now.']

In [18]:
issues_with_comments_dataset = issues_dataset.map(
    lambda x: {"comments": get_comments(x["number"])}
)                                     
issues_with_comments_dataset            

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'message': 'Not Found', 'documentation_url': 'https://docs.github.com/rest/issues/comments#list-issue-comments'}
{'message': 'Not Found', 'documentation_url': 'https://docs.github.com/rest/issues/comments#list-issue-comments'}


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'draft', 'pull_request', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'is_pull_request'],
    num_rows: 2000
})

## Uploading the dataset to the Hugging Face Hub

In [10]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
issues_with_comments_dataset.push_to_hub("github-issues")


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/6.61k [00:00<?, ?B/s]

In [13]:
from datasets import load_dataset
remote_dataset = load_dataset("ptah23/github-issues", split="train")
remote_dataset

Downloading readme:   0%|          | 0.00/6.23k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/4.27M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'draft', 'pull_request', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'is_pull_request'],
    num_rows: 2000
})

# Semantic search with FAISS

## Loading and preparing the dataset

In [1]:
from datasets import load_dataset

issues_dataset = load_dataset("ptah23/github-issues", split="train")
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'draft', 'pull_request', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'is_pull_request'],
    num_rows: 2000
})

In [2]:
issues_dataset = issues_dataset.filter(
    lambda x: (x["is_pull_request"] == False and len(x["comments"]) > 0)
)
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'draft', 'pull_request', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'is_pull_request'],
    num_rows: 837
})

In [3]:
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 837
})

In [4]:
#use pandas to explode comments arrays in each row to (html_url, title, body, comment) tuple.
issues_dataset.set_format("pandas")
df = issues_dataset[:]

In [5]:
df["comments"][0].tolist()

['Thanks for reporting, but we can only fix this issue if you can provide a reproducer that consistently reproduces it.',
 '@mariosasko Ok. What exactly does it mean to provide a reproducer',
 'To provide a code that reproduces the issue :)',
 '@mariosasko I complete the above code, is it enough?']

In [6]:
comments_df = df.explode("comments", ignore_index=True)
comments_df.head(4)

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,DatasetInfo.__init__() got an unexpected keywo...,"Thanks for reporting, but we can only fix this...",### Describe the bug\n\nWhen I was in load_dat...
1,https://github.com/huggingface/datasets/issues...,DatasetInfo.__init__() got an unexpected keywo...,@mariosasko Ok. What exactly does it mean to p...,### Describe the bug\n\nWhen I was in load_dat...
2,https://github.com/huggingface/datasets/issues...,DatasetInfo.__init__() got an unexpected keywo...,To provide a code that reproduces the issue :),### Describe the bug\n\nWhen I was in load_dat...
3,https://github.com/huggingface/datasets/issues...,DatasetInfo.__init__() got an unexpected keywo...,"@mariosasko I complete the above code, is it e...",### Describe the bug\n\nWhen I was in load_dat...


In [7]:
from datasets import Dataset
comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2991
})

In [8]:
comments_dataset = comments_dataset.map(
    lambda x: {"comment_length": len(x["comments"].split())}
)

Map:   0%|          | 0/2991 [00:00<?, ? examples/s]

In [9]:
comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] > 15)
comments_dataset[887]

Filter:   0%|          | 0/2991 [00:00<?, ? examples/s]

{'html_url': 'https://github.com/huggingface/datasets/issues/5434',
 'title': 'sample_dataset module not found',
 'comments': "working on the setfit example script\r\n\r\n   from setfit import SetFitModel, SetFitTrainer, sample_dataset\r\n\r\nImportError: cannot import name 'sample_dataset' from 'setfit' (C:\\Python\\Python38\\lib\\site-packages\\setfit\\__init__.py)\r\n\r\n apart from that, I also had to hack these loads to import thses modules:\r\n    from datasets.load import load_dataset                        \r\n    from datasets.arrow_dataset import Dataset\r\n    from datasets.dataset_dict import DatasetDict",
 'body': None,
 'comment_length': 46}

In [10]:
def concatenate_text(examples):
    return {
        "text": str('' if examples["title"] is None else examples["title"])
        + " \n "
        +  str('' if examples["body"] is None else examples["body"])
        + " \n "
        +  str('' if examples["comments"] is None else examples["comments"])
    }


comments_dataset = comments_dataset.map(concatenate_text)

Map:   0%|          | 0/2189 [00:00<?, ? examples/s]

## Creating text embeddings

In [11]:
# Load model directly
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en")
model = AutoModel.from_pretrained("BAAI/bge-base-en")

In [12]:
import torch
device = torch.device("cuda")
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [13]:
def get_embeddings(text_list):
    encoded_input = tokenizer( text_list, padding=True, truncation=True, return_tensors="pt")
    encoded_input ={k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return torch.nn.functional.normalize(model_output[0][:,0], p=2, dim=1)

In [14]:
embedding = get_embeddings(comments_dataset["text"][0])
embedding.shape

torch.Size([1, 768])

In [15]:
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
)

Map:   0%|          | 0/2189 [00:00<?, ? examples/s]

## Using FAISS for efficient similarity search

In [51]:
!pip install faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4
[0m

In [16]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 2189
})

In [17]:
question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [18]:
scores, samples = embeddings_dataset.get_nearest_examples("embeddings", question_embedding,k=5)

In [19]:
import pandas as pd
samples_df = pd.DataFrame.from_dict(samples)

In [20]:
samples_df["scores"] = scores

In [21]:
samples_df.sort_values("scores", ascending=False, inplace=True)

In [22]:
for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()

COMMENT: Hi, thanks for the suggestion. It's not possible at the moment. The viewer is part of the Hub codebase and only works on public datasets. Also, it relies on [Datasets Server](https://github.com/huggingface/datasets-server/), which prepares the data and provides an API to access the rows, size, etc.

If you're interested in hosting your data as a private dataset on the Hub, you might want to look at https://github.com/huggingface/datasets-server/issues/39.
SCORE: 0.2756989300251007
TITLE: Offline dataset viewer
URL: https://github.com/huggingface/datasets/issues/6139

COMMENT: Hi, thanks for the suggestion. It's not possible at the moment. The viewer is part of the Hub codebase and only works on public datasets. Also, it relies on [Datasets Server](https://github.com/huggingface/datasets-server/), which prepares the data and provides an API to access the rows, size, etc.

If you're interested in hosting your data as a private dataset on the Hub, you might want to look at https: