### Explore data

In [7]:
import json

# Load the raw PersonaChat dump and print a summary of its top-level layout.
# NOTE(review): a later cell reads "./datasets/persona_chat/persona_chat.json";
# confirm which of the two locations is the canonical one.
# The encoding is pinned so the read does not depend on the platform default.
with open('./datasets/persona_chat.json', encoding='utf-8') as f:
    data = json.load(f)

# The file is already closed here; the summary only needs the parsed object.
print(data.keys())
for key, samples in data.items():
    # Number of entries in the split, then the field names of its first entry.
    print(key, len(samples))
    print(samples[0].keys())

dict_keys(['train', 'valid'])
train 17878
dict_keys(['personality', 'utterances'])
valid 1000
dict_keys(['personality', 'utterances'])


In [17]:
# Display the 'personality' entry of training dialogue 12 from the raw dump.
data['train'][12]['personality']

['i watch basketball .',
 'i go to a local college .',
 'i work at a smoothie shop .',
 'i listen to classic rock .']

In [1]:
from dimweb_persona_bot.datasets_transformers.persona_chat_dataset_transformer import persona_chat_dataset_tranformer_v1

# Run the dataset transformer on the raw dump. Per the recorded output it
# reports train/valid/test lengths and then "Datasets saved.".
raw_dump_path = "./datasets/persona_chat/persona_chat.json"
splits_folder = "./datasets/persona_chat"
persona_chat_dataset_tranformer_v1(
    initial_dataset_path=raw_dump_path,
    output_folder=splits_folder,
)

Dataset lengths: train 17878, valid 500, test 500
Datasets saved.


In [27]:
# even-numbered replies in history (2nd, 4th, ...) contain the persona
data['train'][12]['utterances'][-1]['history']

['hey how are you today ?',
 'great ! just go off work at the smoothie shop . you ?',
 'i have been eating tacos and getting ready to move to school .',
 'are you going to college ? i go to a local one .',
 'yes , i am going to university of michigan . what year are you ?',
 'first yr ! do you have any hobbies ?',
 'i love doing anything outdoors . especially in summer . you ?',
 'i love watching college basketball and rocking out to classic rock .',
 'fun . have you decided on your major for school ?',
 'not yet . have you decided ?',
 "pre med . i'd love to be a doctor"]

### Default dataset

In [32]:
from dimweb_persona_bot.dataloaders.persona_chat_dataloaders import PersonaChatDatasetV1
# Open the transformed training split and display sample 12.
train_json_path = "./datasets/persona_chat/train.json"
dataset = PersonaChatDatasetV1(input_dataset_path=train_json_path)
dataset[12]

{'persona': ['my mom is my best friend .',
  'i have four sisters .',
  'i believe that mermaids are real .',
  'i love iced tea .'],
 'history': ['hi , how are you doing today ?',
  'i am spending time with my 4 sisters what are you up to',
  'wow , four sisters . just watching game of thrones .',
  'that is a good show i watch that while drinking iced tea',
  'i agree . what do you do for a living ?',
  "i'm a researcher i'm researching the fact that mermaids are real",
  "interesting . i'm a website designer . pretty much spend all my time on the computer .",
  "that's cool my mom does the same thing",
  "that's awesome . i have always had a love for technology .",
  'tell me more about yourself',
  'i really enjoy free diving , how about you , have any hobbies ?']}

### Causal datasets

In [4]:
from dimweb_persona_bot.dataloaders.persona_chat_dataloaders import PersonaChatDatasetV1
from dimweb_persona_bot.dataloaders.causal_samplers import CausalTrainPersonaSampleV1, CausalValidPersonaSampleV1
from dimweb_persona_bot.dataloaders.lighting import LightningDataModuleV1
from dimweb_persona_bot.hyperparameters.causal_modeling_hyperparameters import (
    PersonaChatHyperparametersV1,
)

from transformers import AutoTokenizer

# Default hyperparameters supply the model name used to load the tokenizer.
hyperparameters = PersonaChatHyperparametersV1()
tokenizer = AutoTokenizer.from_pretrained(hyperparameters.model_name)

# Bundle the split paths, dataset classes, sample classes, tokenizer and
# hyperparameters into a single data module.
lighting_data = LightningDataModuleV1(
    train_path_dataset="./datasets/persona_chat/train.json",
    valid_path_dataset="./datasets/persona_chat/valid.json",
    hyperparameters=hyperparameters,
    tokenizer=tokenizer,
    base_train_dataset_class=PersonaChatDatasetV1,
    base_valid_dataset_class=PersonaChatDatasetV1,
    base_train_sample_class=CausalTrainPersonaSampleV1,
    base_valid_sample_class=CausalValidPersonaSampleV1,
)
lighting_data.setup()

# Display the first training batch.
# NOTE(review): the recorded output shows -100 inside 'input_ids' as well as
# 'labels'; -100 is usually reserved for ignored label positions, so confirm
# that padding input_ids with it is intended.
train_loader = lighting_data.train_dataloader()
next(iter(train_loader))

{'input_ids': tensor([[50256, 15332,   287,  ...,  -100,  -100,  -100],
         [50256,  1820,  4004,  ...,  -100,  -100,  -100],
         [50256,    72,   588,  ...,  -100,  -100,  -100],
         ...,
         [50256,    72,  1842,  ...,  -100,  -100,  -100],
         [50256,    72,  1842,  ...,  -100,  -100,  -100],
         [50256,    72,   588,  ...,  -100,  -100,  -100]]),
 'labels': tensor([[50256, 15332,   287,  ...,  -100,  -100,  -100],
         [50256,  1820,  4004,  ...,  -100,  -100,  -100],
         [50256,    72,   588,  ...,  -100,  -100,  -100],
         ...,
         [50256,    72,  1842,  ...,  -100,  -100,  -100],
         [50256,    72,  1842,  ...,  -100,  -100,  -100],
         [50256,    72,   588,  ...,  -100,  -100,  -100]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0,

In [2]:
from dimweb_persona_bot.dataloaders.persona_chat_dataloaders import PersonaChatDatasetV1


# Training split as written by the dataset transformer.
train_json_path = "./datasets/persona_chat/train.json"
train_dataset = PersonaChatDatasetV1(input_dataset_path=train_json_path)

In [6]:
# Display training sample 96; the recorded output has 'persona', 'history'
# and 'sample_id' keys.
train_dataset[96]

{'persona': ['i like to snowboard .',
  'my favorite food is popcorn .',
  'i like to ride horses .',
  'i live in rural wisconsin .'],
 'history': ['i am frank . nice to meet you . what is your name ?',
  'my name is gary . great to meet you too .',
  'i work as a general manager at a grocery store . what about you ?',
  "i'm an insurance salesman"],
 'sample_id': '15_2'}

In [7]:
# Validation split as written by the dataset transformer.
valid_json_path = "./datasets/persona_chat/valid.json"
valid_dataset = PersonaChatDatasetV1(input_dataset_path=valid_json_path)

In [8]:
# Display the first validation sample.
valid_dataset[0]

{'persona': ['i read twenty books a year .',
  "i'm a stunt double as my second job .",
  'i only eat kosher .',
  'i was raised in a single parent household .'],
 'history': ['hello what are doing today ?',
  'i am good , i just got off work and tired , i have two jobs .'],
 'sample_id': '0_1'}

In [12]:
# Dialogue history of validation sample 2.
# NOTE(review): per the recorded outputs this looks like a longer prefix of the
# same dialogue as sample 0 — confirm that samples are growing prefixes.
valid_dataset[2]['history']

['hello what are doing today ?',
 'i am good , i just got off work and tired , i have two jobs .',
 'i just got done watching a horror movie',
 "i rather read , i've read about 20 books this year .",
 'wow ! i do love a good horror movie . loving this cooler weather',
 'but a good movie is always good .']

In [17]:
# Last two entries of the history of validation sample 2.
valid_dataset[2]['history'][-2:]

['wow ! i do love a good horror movie . loving this cooler weather',
 'but a good movie is always good .']