In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.5-py3-none-any.whl (7.8 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.5


In [2]:
import datasets
from pprint import pprint

lamini_doc = datasets.load_dataset('kotzeje/lamini_docs.jsonl', split='train')

Downloading readme:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/283k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1400 [00:00<?, ? examples/s]

In [3]:
pprint(lamini_doc)

Dataset({
    features: ['question', 'answer'],
    num_rows: 1400
})


In [4]:
import pandas as pd


lamini_df = pd.DataFrame(lamini_doc)

lamini_df.head()

Unnamed: 0,question,answer
0,How can I evaluate the performance and quality...,There are several metrics that can be used to ...
1,Can I find information about the code's approa...,"Yes, the code includes methods for submitting ..."
2,How does Lamini AI handle requests for generat...,Lamini AI offers features for generating text ...
3,Does the `submit_job()` function expose any ad...,It is unclear which `submit_job()` function is...
4,Does the `add_data()` function support differe...,"No, the `add_data()` function does not support..."


In [5]:
examples = lamini_df.to_dict()

pprint(f"Question: {examples['question'][0]}")

print()

pprint(f"answer: {examples['answer'][0]}")


('Question: How can I evaluate the performance and quality of the generated '
 'text from Lamini models?')

('answer: There are several metrics that can be used to evaluate the '
 'performance and quality of generated text from Lamini models, including '
 'perplexity, BLEU score, and human evaluation. Perplexity measures how well '
 'the model predicts the next word in a sequence, while BLEU score measures '
 'the similarity between the generated text and a reference text. Human '
 'evaluation involves having human judges rate the quality of the generated '
 'text based on factors such as coherence, fluency, and relevance. It is '
 'recommended to use a combination of these metrics for a comprehensive '
 "evaluation of the model's performance.")


In [6]:
prompt_template = """### Question:
{question}

### Answer:
{answer}"""


question = examples["question"][0]
answer = examples['answer'][0]

text = prompt_template.format(question=question, answer=answer)

pprint(text)


('### Question:\n'
 'How can I evaluate the performance and quality of the generated text from '
 'Lamini models?\n'
 '\n'
 '### Answer:\n'
 'There are several metrics that can be used to evaluate the performance and '
 'quality of generated text from Lamini models, including perplexity, BLEU '
 'score, and human evaluation. Perplexity measures how well the model predicts '
 'the next word in a sequence, while BLEU score measures the similarity '
 'between the generated text and a reference text. Human evaluation involves '
 'having human judges rate the quality of the generated text based on factors '
 'such as coherence, fluency, and relevance. It is recommended to use a '
 "combination of these metrics for a comprehensive evaluation of the model's "
 'performance.')


In [7]:
prompt_template = """### Question:
{question}

### Answer:"""

num_examples = len(examples["question"])
finetuning_dataset = []
for i in range(num_examples):
  question = examples["question"][i]
  answer = examples["answer"][i]
  text_with_prompt_template = prompt_template.format(question=question)
  finetuning_dataset.append({"question": text_with_prompt_template, "answer": answer})


print("One datapoint in the finetuning dataset:")
pprint(finetuning_dataset[0])

One datapoint in the finetuning dataset:
{'answer': 'There are several metrics that can be used to evaluate the '
           'performance and quality of generated text from Lamini models, '
           'including perplexity, BLEU score, and human evaluation. Perplexity '
           'measures how well the model predicts the next word in a sequence, '
           'while BLEU score measures the similarity between the generated '
           'text and a reference text. Human evaluation involves having human '
           'judges rate the quality of the generated text based on factors '
           'such as coherence, fluency, and relevance. It is recommended to '
           'use a combination of these metrics for a comprehensive evaluation '
           "of the model's performance.",
 'question': '### Question:\n'
             'How can I evaluate the performance and quality of the generated '
             'text from Lamini models?\n'
             '\n'
             '### Answer:'}


In [8]:
from transformers import AutoTokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [9]:
text = finetuning_dataset[0]["question"] + finetuning_dataset[0]["answer"]


In [10]:
pprint(text)

('### Question:\n'
 'How can I evaluate the performance and quality of the generated text from '
 'Lamini models?\n'
 '\n'
 '### Answer:There are several metrics that can be used to evaluate the '
 'performance and quality of generated text from Lamini models, including '
 'perplexity, BLEU score, and human evaluation. Perplexity measures how well '
 'the model predicts the next word in a sequence, while BLEU score measures '
 'the similarity between the generated text and a reference text. Human '
 'evaluation involves having human judges rate the quality of the generated '
 'text based on factors such as coherence, fluency, and relevance. It is '
 'recommended to use a combination of these metrics for a comprehensive '
 "evaluation of the model's performance.")


In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

tokenizer.pad_token = tokenizer.eos_token


tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    padding=True
)

pprint(tokenized_inputs)

In [14]:
pprint(tokenized_inputs)

{'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_ids': array([[ 4118, 19782,    27,   187,  2347,   476,   309,  7472,   253,
         3045,   285,  3290,   273,   253,  4561,  2505,   432,   418,
         4988,    74,  3210,    32,   187,   187,  4118, 37741,    27,
         2512,   403,  2067, 17082,   326,   476,   320,   908,   281,
         7472,   253,  3045,   285,  3290,   273,  4561,  2505,   432,
          418,  4988,    74,  3210,    13,  1690, 44229,   414,    13,
          378,  1843,    54,  4868,    13,   285,  1966,

In [16]:
max_length = 2048
max_length = min(
    tokenized_inputs["input_ids"].shape[1],
    max_length,
)

print(max_length)

142


In [10]:
def tokenize_function(examples):
    if "question" in examples and "answer" in examples:
      text = examples["question"][0] + examples["answer"][0]
    elif "input" in examples and "output" in examples:
      text = examples["input"][0] + examples["output"][0]
    else:
      text = examples["text"][0]

    tokenizer.pad_token = tokenizer.eos_token
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )

    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        2048
    )
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length
    )

    return tokenized_inputs

In [16]:
type(finetuning_dataset)

print

list

In [36]:
# Restructure data into a dictionary of lists
finetuning_data_dict = {key: [dic[key] for dic in finetuning_dataset] for key in finetuning_dataset[0]}

# Create the Hugging Face Dataset
training_dataset = datasets.Dataset.from_dict(finetuning_data_dict)

print(training_dataset[0])

{'question': '### Question:\nHow can I evaluate the performance and quality of the generated text from Lamini models?\n\n### Answer:', 'answer': "There are several metrics that can be used to evaluate the performance and quality of generated text from Lamini models, including perplexity, BLEU score, and human evaluation. Perplexity measures how well the model predicts the next word in a sequence, while BLEU score measures the similarity between the generated text and a reference text. Human evaluation involves having human judges rate the quality of the generated text based on factors such as coherence, fluency, and relevance. It is recommended to use a combination of these metrics for a comprehensive evaluation of the model's performance."}


In [30]:
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = dataset['train']

type(train_dataset)

list

In [28]:
pprint(train_dataset[0])

{'answer': 'There are several metrics that can be used to evaluate the '
           'performance and quality of generated text from Lamini models, '
           'including perplexity, BLEU score, and human evaluation. Perplexity '
           'measures how well the model predicts the next word in a sequence, '
           'while BLEU score measures the similarity between the generated '
           'text and a reference text. Human evaluation involves having human '
           'judges rate the quality of the generated text based on factors '
           'such as coherence, fluency, and relevance. It is recommended to '
           'use a combination of these metrics for a comprehensive evaluation '
           "of the model's performance.",
 'question': '### Question:\n'
             'How can I evaluate the performance and quality of the generated '
             'text from Lamini models?\n'
             '\n'
             '### Answer:'}


In [37]:
# Restructure data into a dictionary of lists
finetuning_data_dict = {key: [dic[key] for dic in finetuning_dataset] for key in finetuning_dataset[0]}

# Create the Hugging Face Dataset
training_dataset = datasets.Dataset.from_dict(finetuning_data_dict)

tokenized_dataset = training_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

print(tokenized_dataset)

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask'],
    num_rows: 1400
})


In [40]:
pprint(tokenized_dataset['question'][0])

('### Question:\n'
 'How can I evaluate the performance and quality of the generated text from '
 'Lamini models?\n'
 '\n'
 '### Answer:')


In [42]:
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])




ValueError: ignored

In [44]:
print(tokenized_dataset)

pprint(tokenized_dataset['labels'][0])

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1400
})
[4118,
 19782,
 27,
 187,
 2347,
 476,
 309,
 7472,
 253,
 3045,
 285,
 3290,
 273,
 253,
 4561,
 2505,
 432,
 418,
 4988,
 74,
 3210,
 32,
 187,
 187,
 4118,
 37741,
 27,
 2512,
 403,
 2067,
 17082,
 326,
 476,
 320,
 908,
 281,
 7472,
 253,
 3045,
 285,
 3290,
 273,
 4561,
 2505,
 432,
 418,
 4988,
 74,
 3210,
 13,
 1690,
 44229,
 414,
 13,
 378,
 1843,
 54,
 4868,
 13,
 285,
 1966,
 7103,
 15,
 3545,
 12813,
 414,
 5593,
 849,
 973,
 253,
 1566,
 26295,
 253,
 1735,
 3159,
 275,
 247,
 3425,
 13,
 1223,
 378,
 1843,
 54,
 4868,
 5593,
 253,
 14259,
 875,
 253,
 4561,
 2505,
 285,
 247,
 3806,
 2505,
 15,
 8801,
 7103,
 8687,
 1907,
 1966,
 16006,
 2281,
 253,
 3290,
 273,
 253,
 4561,
 2505,
 1754,
 327,
 2616,
 824,
 347,
 25253,
 13,
 2938,
 1371,
 13,
 285,
 17200,
 15,
 733,
 310,
 8521,
 281,
 897,
 247,
 5019,
 273,
 841,
 17082,
 323,
 247,
 11088,
 7103,
 273,
 253,

In [45]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})


In [49]:
finetuning_dataset_path2 = "lamini/lamini_docs"
finetuning_dataset2 = datasets.load_dataset(finetuning_dataset_path2, split='train')
pprint(finetuning_dataset2)

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1260
})


In [50]:
pprint(finetuning_dataset2['question'][0])

('How can I evaluate the performance and quality of the generated text from '
 'Lamini models?')


In [51]:
!pip install huggingface_hub
#



In [58]:
!huggingface-cli login
# split_dataset.push_to_hub(dataset_path_hf)


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
[1m[31mCannot authenticate through 

In [54]:
split_dataset.push_to_hub(tokenized_dataset)

HFValidationError: ignored

In [55]:
type(tokenized_dataset)

datasets.arrow_dataset.Dataset

In [59]:
tokenized_dataset.push_to_hub("relhousieny/tokenized_lamini_template")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [62]:
rany_dataset = datasets.load_dataset("relhousieny/tokenized_lamini_template")

print(rany_dataset)

Downloading readme:   0%|          | 0.00/453 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/693k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1400
    })
})
