In [None]:
!pip install -q datasets

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K

In [None]:
from datasets import load_dataset, get_dataset_config_names, Dataset
import random
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import DataLoader
import pandas as pd
from datasets import Dataset
from sklearn.utils import resample
from sklearn.utils import resample
from copy import copy
from torch.utils.data import DataLoader
import argparse
from tqdm import tqdm

In [None]:
# Run-wide settings used by the cells below.
SEED = 42      # single seed shared by every random number generator
NUM_PROC = 5   # worker processes handed to Dataset.map
CACHE = None   # cache_dir handed to load_dataset

# Seed PyTorch (CPU and every CUDA device), NumPy and Python's `random`.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [None]:
def fill_template(templates, values):
    """Pick one template at random and fill in its ``${N}`` placeholders.

    Args:
        templates: Non-empty sequence of template strings. Placeholders are
            written ``${1}``, ``${2}``, ... (1-based).
        values: Sequence of strings; ``values[i]`` replaces every occurrence
            of ``${i+1}`` in the chosen template.

    Returns:
        The chosen template with each placeholder that has a matching entry
        in ``values`` substituted. Placeholders whose index falls outside
        ``values`` are left untouched.

    Raises:
        IndexError: if ``templates`` is empty.
    """
    import re  # local import so the function does not depend on the import cell

    chosen = random.choice(templates)

    def _substitute(match):
        position = int(match.group(1)) - 1
        if position < len(values):
            return values[position]
        return match.group(0)

    # One pass over the template: text inserted from `values` is never scanned
    # again, so a value that happens to contain "${2}" is kept verbatim instead
    # of being overwritten by a later replacement.
    return re.sub(r"\$\{([1-9][0-9]*)\}", _substitute, chosen)

def generate_eval(dataset_name, templates, split, label_column, choices, question=None, template_key=None, template_labels=None, generate_other=False, input_sentences=None):
  """
  Turn every two-letter language config of a HuggingFace dataset into
  templated statements (columns statement1, statement2, ...).

    - dataset_name: Path name to HuggingFace repo
    - templates: list of template strings, or (when template_key and
      template_labels are given) a 2D list with one list of templates per
      entry of template_labels
    - split: list of dataset splits to load, i.e. ['test']; only the first
      loaded split is processed and returned
    - label_column: column holding the gold label; it is the only original
      column kept. When template_key is not given the label is converted to
      int and decremented by one (0-indexed for argmax)
    - choices: columns holding the candidate answers; one statement is
      generated per choice
    - question: optional column whose value is the first template value
    - template_key: column that selects the template list
      ('question' for XCOPA, 'label' for XNLI)
    - template_labels: values of template_key, in the same order as templates
      (['cause', 'effect'] for XCOPA, [0, 1, 2] for XNLI)
    - generate_other: needed for XNLI; additionally builds one statement for
      every label other than the example's own, so statement{k+1} is written
      with the templates of label k
    - input_sentences: needed for XStoryCloze; extra columns whose values are
      placed between the question and the choice

  Template values are passed to fill_template in the order
  [question, *input_sentences, choice], i.e. they fill ${1}, ${2}, ...

  Returns (statements, langs): a dict mapping each language code to its
  processed dataset, and the list of language codes that were processed.

  Raises ValueError if the dataset exposes no two-letter language config.
  """
  langs = get_dataset_config_names(dataset_name)
  # sanity check: remove any langs that have more than 2 letters (should only be 2 letter code)
  langs = [lang for lang in langs if len(lang) == 2]
  if not langs:
    raise ValueError(f"no two-letter language configs found for {dataset_name!r}")

  data = {}
  for lang in langs:
    print(f"loading dataset lang: {lang}")
    data[lang] = load_dataset(dataset_name, lang, split=split, cache_dir=CACHE)

  # every original column except the label is dropped after mapping
  col_names = list(data[langs[0]][0].column_names)
  col_names.remove(label_column)

  def create_statements_labels(example):
    template = ""
    # XCOPA, XNLI: pick the template list that matches this example
    if template_key and template_labels:
      for idx, val in enumerate(template_labels):
        if example[template_key] == val:
          template = templates[idx]

    # XWinograd, XStoryCloze: a single flat list of templates
    if not template:
      template = templates

    # Choose one template and reuse it for every choice of this example
    temp = random.choice(template)

    statements = []
    for choice_column in choices:
      values = []
      if question:
        values.append(example[question])
      if input_sentences:
        values.extend(example[column] for column in input_sentences)
      values.append(example[choice_column])
      statements.append(fill_template([temp], values))

    # XNLI: one statement per label, ordered by label index
    if generate_other:
      true_label = example[template_key]
      by_label = {true_label: statements[0]}

      for other_idx in range(len(templates)):
        if other_idx == true_label:
          continue
        temp = random.choice(templates[other_idx])
        by_label[other_idx] = fill_template([temp], [example[question], example[choices[0]]])

      statements = [by_label[i] for i in range(len(templates))]

    for i, statement in enumerate(statements):
      example[f'statement{i+1}'] = statement

    # if not XCOPA, XNLI, we have to adjust labels to match (0-indexed for argmax)
    if not template_key:
      example[label_column] = int(example[label_column]) - 1

    return example

  resulting_statements = {}
  for lang in langs:
    print(f"Processing {lang}...")
    # only the first requested split is returned, so only that one is mapped
    resulting_statements[lang] = data[lang][0].map(create_statements_labels, remove_columns=col_names, num_proc=NUM_PROC)

  return resulting_statements, langs

In [None]:
def create_dataloaders(statements, langs, batch_size=32):
  """Wrap each language's dataset in a non-shuffling DataLoader.

  Args:
    statements: mapping from language code to a dataset accepted by DataLoader.
    langs: language codes to build loaders for; each must be a key of
      `statements`.
    batch_size: number of examples per batch (defaults to 32).

  Returns:
    dict mapping each language code in `langs` to its DataLoader.
  """
  return {
    lang: DataLoader(statements[lang], batch_size=batch_size, shuffle=False)
    for lang in langs
  }

In [None]:
# XCOPA: one template family per question type ('cause' / 'effect').
print("Processing XCOPA...")
dataset = "xcopa"
templates = [
    # templates used when example['question'] == 'cause'
    [
        'The cause of "${1}" is that "${2}"',
        '"${1}" because "${2}"',
        '"${1}" due to "${2}"',
    ],
    # templates used when example['question'] == 'effect'
    [
        'The effect of "${1}" is that "${2}"',
        '"${1}" therefore "${2}"',
        '"${1}", so "${2}"',
    ],
]
split = ['test']
label_column = 'label'
question = 'premise'
choices = ['choice1', 'choice2']
template_key = 'question'
template_labels = ['cause', 'effect']

xcopa_statements, xcopa_langs = generate_eval(
    dataset,
    templates,
    split,
    label_column,
    choices,
    question=question,
    template_key=template_key,
    template_labels=template_labels,
)
xcopa_dataloaders = create_dataloaders(xcopa_statements, xcopa_langs)

Processing XCOPA...
loading dataset lang: et
loading dataset lang: ht
loading dataset lang: id
loading dataset lang: it
loading dataset lang: qu
loading dataset lang: sw
loading dataset lang: ta
loading dataset lang: th
loading dataset lang: tr
loading dataset lang: vi
loading dataset lang: zh
Processing et...


Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Processing ht...


Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Processing id...


Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Processing it...


Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Processing qu...


Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Processing sw...


Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Processing ta...


Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Processing th...


Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Processing tr...


Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Processing vi...


Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Processing zh...


Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Show the type of the object generate_eval stored for the 'et' config.
print(type(xcopa_statements['et']))

<class 'datasets.arrow_dataset.Dataset'>


In [None]:
# List the language configs that generate_eval kept for XCOPA.
print(xcopa_langs)

['et', 'ht', 'id', 'it', 'qu', 'sw', 'ta', 'th', 'tr', 'vi', 'zh']


In [None]:
# Inspect the first generated XCOPA example of the 'et' config.
print(xcopa_statements['et'][0])

{'label': 0, 'statement1': '"Ese oli mullikilesse mässitud." due to "See oli õrn."', 'statement2': '"Ese oli mullikilesse mässitud." due to "See oli väike."'}


In [None]:
# xcopa_statements['et'] is already a datasets.Dataset, so it is printed
# directly; Dataset.from_dict() expects a plain dict of columns and raised
# AttributeError when it was handed a Dataset.
print(xcopa_statements['et'])

AttributeError: 'Dataset' object has no attribute 'items'

In [None]:
def push_dataset(statements, langs, dataset_name, namespace="mbzuai-ugrip-statement-tuning", split='test'):
  """Push every language's dataset to the Hub as its own config.

  Args:
    statements: mapping from language code to an object exposing
      `push_to_hub(repo_id, config_name, split=...)`.
    langs: language codes to upload; each becomes the config name.
    dataset_name: repository name; the repo id is "<namespace>/<dataset_name>".
    namespace: Hub user or organisation that owns the repository.
    split: split name the data is stored under on the Hub.
  """
  repo_id = f"{namespace}/{dataset_name}"
  for lang_code in langs:
    statements[lang_code].push_to_hub(repo_id, lang_code, split=split)

In [None]:
# Interactive Hugging Face Hub login; run this before the push_dataset cells.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Upload each XCOPA language config to the Hub repository named 'xcopa'.
push_dataset(xcopa_statements, xcopa_langs, 'xcopa')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/353 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/675 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/997 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.29k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.61k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.93k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

In [None]:
  # XNLI
print("Processing XNLI...")
dataset = "facebook/xnli"
# templates: 'Entailment', 'Neutral', 'Contradiction'
templates = [["\"${1}\" entails \"${2}\"", "\"${1}\"? yes, \"${2}\"", "Premise: \"${1}\", Hypothesis: \"${2}\", label: Entailment"],
            ["\"${1}\" is neutral with regards to \"${2}\"", "\${1}\? maybe, \"${2}\"", "Premise: \"${1}\", Hypothesis: \"${2}\", label: Neutral"],
            ["\"${1}\" contradicts \"${2}\"", "\"${1}\"? no, \"${2}\"", "Premise: \"${1}\", Hypothesis: \"${2}\", label: Contradiction"]]
split = ['test']
label_column = 'label'
question = 'premise'
choices = ['hypothesis']
# 0: Entailment, 1: Neutral, 2: Contradiction
template_key = 'label'
template_labels = [0, 1, 2]

xnli_statements, xnli_langs = generate_eval(dataset, templates, split, label_column, choices, question, template_key, template_labels, generate_other=True)
xnli_dataloaders = create_dataloaders(xnli_statements, xnli_langs)

Processing XNLI...


Downloading readme:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

loading dataset lang: ar


Downloading data:   0%|          | 0.00/58.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/392k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/194k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

loading dataset lang: bg


Downloading data:   0%|          | 0.00/65.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/447k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/223k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

loading dataset lang: de


Downloading data:   0%|          | 0.00/55.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/356k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

loading dataset lang: el


Downloading data:   0%|          | 0.00/73.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/490k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/247k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

loading dataset lang: en


Downloading data:   0%|          | 0.00/50.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/157k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

loading dataset lang: es


Downloading data:   0%|          | 0.00/53.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/342k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/173k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

loading dataset lang: fr


Downloading data:   0%|          | 0.00/55.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/360k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/183k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

loading dataset lang: hi


Downloading data:   0%|          | 0.00/70.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/249k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

loading dataset lang: ru


Downloading data:   0%|          | 0.00/70.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/477k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/239k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

loading dataset lang: sw


Downloading data:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/158k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

loading dataset lang: th


Downloading data:   0%|          | 0.00/76.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/503k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/252k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

loading dataset lang: tr


Downloading data:   0%|          | 0.00/48.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/338k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/172k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

loading dataset lang: ur


Downloading data:   0%|          | 0.00/46.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/428k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/216k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

loading dataset lang: vi


Downloading data:   0%|          | 0.00/57.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/364k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/186k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

loading dataset lang: zh


Downloading data:   0%|          | 0.00/47.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/310k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/157k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Processing ar...


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Processing bg...


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Processing de...


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Processing el...


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Processing en...


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Processing es...


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Processing fr...


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Processing hi...


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Processing ru...


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Processing sw...


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Processing th...


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Processing tr...


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Processing ur...


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Processing vi...


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Processing zh...


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

In [None]:
# Inspect the first generated XNLI example of the 'en' config.
print(xnli_statements['en'][0])

{'label': 2, 'statement1': '"Well, I wasn\'t even thinking about that, but I was so frustrated, and, I ended up talking to him again."? yes, "I havent spoken to him again."', 'statement2': '\\Well, I wasn\'t even thinking about that, but I was so frustrated, and, I ended up talking to him again.\\? maybe, "I havent spoken to him again."', 'statement3': 'Premise: "Well, I wasn\'t even thinking about that, but I was so frustrated, and, I ended up talking to him again.", Hypothesis: "I havent spoken to him again.", label: Contradiction'}


In [None]:
# Upload each XNLI language config to the Hub repository named 'xnli'.
push_dataset(xnli_statements, xnli_langs, 'xnli')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/505 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/979 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.88k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.35k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/4.30k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/4.77k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/5.25k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/5.72k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/6.19k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/6.67k [00:00<?, ?B/s]

In [None]:
# XWinograd: a single flat list of templates, no per-label template selection.
print("Processing XWinograd...")
dataset = "Muennighoff/xwinograd"
templates = [
    'In "${1}", _ is: "${2}"',
    'Q:"${1}", A: "${2}"',
    'The missing word in "${1}" is "${2}"',
    '_ in: "${1}" is "${2}"',
    '"${1}", _ is: "${2}"',
]
split = ['test']
label_column = 'answer'
question = 'sentence'
choices = ['option1', 'option2']

xwinograd_statements, xwinograd_langs = generate_eval(
    dataset,
    templates,
    split,
    label_column,
    choices,
    question=question,
)
xwinograd_dataloaders = create_dataloaders(xwinograd_statements, xwinograd_langs)

Processing XWinograd...
loading dataset lang: en


Downloading data:   0%|          | 0.00/136k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2325 [00:00<?, ? examples/s]

loading dataset lang: fr


Downloading data:   0%|          | 0.00/7.76k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/83 [00:00<?, ? examples/s]

loading dataset lang: jp


Downloading data:   0%|          | 0.00/71.5k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/959 [00:00<?, ? examples/s]

loading dataset lang: pt


Downloading data:   0%|          | 0.00/17.9k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/263 [00:00<?, ? examples/s]

loading dataset lang: ru


Downloading data:   0%|          | 0.00/30.8k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/315 [00:00<?, ? examples/s]

loading dataset lang: zh


Downloading data:   0%|          | 0.00/82.1k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/504 [00:00<?, ? examples/s]

Processing en...


Map (num_proc=5):   0%|          | 0/2325 [00:00<?, ? examples/s]

Processing fr...


Map (num_proc=5):   0%|          | 0/83 [00:00<?, ? examples/s]

Processing jp...


Map (num_proc=5):   0%|          | 0/959 [00:00<?, ? examples/s]

Processing pt...


Map (num_proc=5):   0%|          | 0/263 [00:00<?, ? examples/s]

Processing ru...


Map (num_proc=5):   0%|          | 0/315 [00:00<?, ? examples/s]

Processing zh...


Map (num_proc=5):   0%|          | 0/504 [00:00<?, ? examples/s]

In [None]:
# Upload each XWinograd language config to the Hub repository named 'xwinograd'.
push_dataset(xwinograd_statements, xwinograd_langs, 'xwinograd')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/358 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/680 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
# XStoryCloze: four context sentences followed by one candidate ending.
print("Processing XStoryCloze...")
dataset = "juletxara/xstory_cloze"
templates = ['"${1}" "${2}" "${3}" "${4}" "${5}"']
split = ['eval']
label_column = 'answer_right_ending'
input_sentences = [
    'input_sentence_1',
    'input_sentence_2',
    'input_sentence_3',
    'input_sentence_4',
]
choices = ['sentence_quiz1', 'sentence_quiz2']

xstorycloze_statements, xstorycloze_langs = generate_eval(
    dataset,
    templates,
    split,
    label_column,
    choices,
    input_sentences=input_sentences,
)
xstorycloze_dataloaders = create_dataloaders(xstorycloze_statements, xstorycloze_langs)

Processing XStoryCloze...
loading dataset lang: ar


Downloading data:   0%|          | 0.00/112k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/430k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/360 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/1511 [00:00<?, ? examples/s]

loading dataset lang: en


Downloading data:   0%|          | 0.00/92.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/357k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/360 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/1511 [00:00<?, ? examples/s]

loading dataset lang: es


Downloading data:   0%|          | 0.00/100k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/388k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/360 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/1511 [00:00<?, ? examples/s]

loading dataset lang: eu


Downloading data:   0%|          | 0.00/97.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/373k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/360 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/1511 [00:00<?, ? examples/s]

loading dataset lang: hi


Downloading data:   0%|          | 0.00/134k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/532k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/360 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/1511 [00:00<?, ? examples/s]

loading dataset lang: id


Downloading data:   0%|          | 0.00/96.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/369k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/360 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/1511 [00:00<?, ? examples/s]

loading dataset lang: my


Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/607k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/360 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/1511 [00:00<?, ? examples/s]

loading dataset lang: ru


Downloading data:   0%|          | 0.00/125k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/496k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/360 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/1511 [00:00<?, ? examples/s]

loading dataset lang: sw


Downloading data:   0%|          | 0.00/94.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/362k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/360 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/1511 [00:00<?, ? examples/s]

loading dataset lang: te


Downloading data:   0%|          | 0.00/135k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/533k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/360 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/1511 [00:00<?, ? examples/s]

loading dataset lang: zh


Downloading data:   0%|          | 0.00/96.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/374k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/360 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/1511 [00:00<?, ? examples/s]

Processing ar...


Map (num_proc=5):   0%|          | 0/1511 [00:00<?, ? examples/s]

Processing en...


Map (num_proc=5):   0%|          | 0/1511 [00:00<?, ? examples/s]

Processing es...


Map (num_proc=5):   0%|          | 0/1511 [00:00<?, ? examples/s]

Processing eu...


Map (num_proc=5):   0%|          | 0/1511 [00:00<?, ? examples/s]

Processing hi...


Map (num_proc=5):   0%|          | 0/1511 [00:00<?, ? examples/s]

Processing id...


Map (num_proc=5):   0%|          | 0/1511 [00:00<?, ? examples/s]

Processing my...


Map (num_proc=5):   0%|          | 0/1511 [00:00<?, ? examples/s]

Processing ru...


Map (num_proc=5):   0%|          | 0/1511 [00:00<?, ? examples/s]

Processing sw...


Map (num_proc=5):   0%|          | 0/1511 [00:00<?, ? examples/s]

Processing te...


Map (num_proc=5):   0%|          | 0/1511 [00:00<?, ? examples/s]

Processing zh...


Map (num_proc=5):   0%|          | 0/1511 [00:00<?, ? examples/s]

In [None]:
# Upload each XStoryCloze language config to the Hub repository named
# 'xstorycloze'; push_dataset stores the data under the split name 'test'.
push_dataset(xstorycloze_statements, xstorycloze_langs, 'xstorycloze')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/373 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/713 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.76k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.10k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.44k [00:00<?, ?B/s]