In [1]:
import sys

import numpy as np
from transformers import AutoTokenizer

sys.path.append("../")
from src import TaskAdapter, Params

In [3]:
# Load the experiment settings from the SAMSum training config.
# Params is a project class imported from src; presumably it parses this JSON
# file into attribute-style settings -- confirm against src.
config_path = "../configs/train_config_samsum.json"
config = Params(config_path)

In [4]:
# Load the tokenizer that belongs to the base model named in the config.
model_name = config.MODEL.BASE_MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
def visualize_samples(dataset_dict, split="train", num_samples=2):
    """Print randomly chosen examples from one split of a dataset.

    Args:
        dataset_dict: Mapping from split name to an indexable, sized collection
            of examples. Each example must provide "text" and "label" keys.
        split: Name of the split to sample from. Defaults to "train".
        num_samples: Number of examples to print. Defaults to 2. Indices are
            drawn with replacement, so an example can be shown more than once.

    Raises:
        KeyError: If ``split`` is not present in ``dataset_dict``.
        ValueError: If the selected split contains no examples.
    """
    examples = dataset_dict[split]
    if len(examples) == 0:
        # np.random.randint would fail with an opaque "low >= high" error.
        raise ValueError(f"Cannot sample from empty split {split!r}")

    # Drawn from NumPy's global random state; call np.random.seed(...) first
    # if the same examples must be shown on every run.
    example_indices = np.random.randint(0, len(examples), num_samples).tolist()

    # Separator: 100 spaces joined by dashes (" - - - ... - ").
    dash_line = "-".join(" " * 100)

    for position, index in enumerate(example_indices, start=1):
        example = examples[index]
        print(dash_line)
        print(f"Example {position}")
        print(dash_line)
        print("TEXT:")
        print(example["text"])
        print(dash_line)
        print("LABEL:")
        print(example["label"])
        print(dash_line)
        print()

# WikiSQL Dataset

In [6]:
# Build the task adapter for WikiSQL from the dataset name and the tokenizer
# loaded above. TaskAdapter is a project class imported from src.
wikisql_datasetname = "wikisql"
wikisql_dataset_adapter = TaskAdapter(wikisql_datasetname, tokenizer)

**Text and Label**

In [40]:
# Print random WikiSQL training examples (text and label).
visualize_samples(wikisql_dataset_adapter.dataset_dict)

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Example 1
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
TEXT:
Kim Weiskopf and Jeff Franklin wrote all the no. in series.
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
LABEL:
SELECT No. in series FROM table WHERE Written by = Kim Weiskopf and Jeff Franklin
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

 - - - - - - - - - - - - - - - - -

**Start and End Prompts**

In [8]:
# Render newlines as a literal "\n" so each prompt prints on a single line.
start_prompt, end_prompt = [
    raw_prompt.replace("\n", "\\n")
    for raw_prompt in (
        wikisql_dataset_adapter.start_prompt,
        wikisql_dataset_adapter.end_prompt,
    )
]

print(f"Start prompt: {start_prompt}")
print(f"End prompt: {end_prompt}")

Start prompt: Translate this query into SQL:\n\n
End prompt: \n\nSQL:


# Samsum Dataset

In [9]:
# Build the task adapter for SAMSum from the dataset name and the tokenizer
# loaded above. TaskAdapter is a project class imported from src.
samsum_datasetname = "samsum"
samsum_dataset_adapter = TaskAdapter(samsum_datasetname, tokenizer)

**Text and Label**

In [61]:
# Print random SAMSum training examples (text and label).
visualize_samples(samsum_dataset_adapter.dataset_dict)

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Example 1
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
TEXT:
George: Hey, tomek!
George: How is it going? 
Tomek: Hi George! Long time no see! It's going great, I'm back home, going to the gym, eating a lot and enjoying life of a jobless bum ^^. How are you?
George: I'm glad to hear that. You deserve a little break from everything that's on the other side (of the world)
George: You're coming to Ania's we'd, right?
George: wedding*
Tomek: Sure thing! Are you :d?
George: It would be weird if I won't
George: Yep, of course :D
Tomek: Haha! Awesome!
Tomek: I was hoping we could meet there:D
George: Aaam, I want ask you about the tradition of 

**Start and End Prompts**

In [11]:
# Render newlines as a literal "\n" so each prompt prints on a single line.
start_prompt, end_prompt = [
    raw_prompt.replace("\n", "\\n")
    for raw_prompt in (
        samsum_dataset_adapter.start_prompt,
        samsum_dataset_adapter.end_prompt,
    )
]

print(f"Start prompt: {start_prompt}")
print(f"End prompt: {end_prompt}")

Start prompt: Summarize the following conversation:\n\n
End prompt: \n\nSummary:


# SST2 Dataset

In [12]:
# Build the task adapter for SST-2 from the dataset name and the tokenizer
# loaded above. TaskAdapter is a project class imported from src.
# NOTE(review): the progress bars in this cell's output suggest construction
# runs a map over each split -- confirm in src.
sst2_datasetname = "sst2"
sst2_dataset_adapter = TaskAdapter(sst2_datasetname, tokenizer)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

**Text and Label**

In [66]:
# Print random SST-2 training examples (text and label).
visualize_samples(sst2_dataset_adapter.dataset_dict)

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Example 1
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
TEXT:
leaves us wondering less about its ideas and more about its characterization of hitler and the contrived nature of its provocative conclusion . 
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
LABEL:
negative
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

 - - - - - - - - - - -

**Start and End Prompts**

In [14]:
# Render newlines as a literal "\n" so each prompt prints on a single line.
start_prompt, end_prompt = [
    raw_prompt.replace("\n", "\\n")
    for raw_prompt in (
        sst2_dataset_adapter.start_prompt,
        sst2_dataset_adapter.end_prompt,
    )
]

print(f"Start prompt: {start_prompt}")
print(f"End prompt: {end_prompt}")

Start prompt: Analyze the sentiment of the following sentence:\n\n
End prompt: \n\nSentiment:
