In [16]:
import pandas as pd
import json
from datasets import load_dataset

In [17]:
jsonl_file_path = 'amzn_massive_dataset/data/en-US.jsonl'

# Parse the JSON Lines file into a list of dicts, one per non-empty line.
# - Iterate the file handle directly so lines are streamed rather than all
#   being materialised up front by readlines().
# - Pin the encoding so parsing does not depend on the platform default
#   (assumes the file is UTF-8, the standard encoding for JSON -- TODO confirm).
# - Skip whitespace-only lines (e.g. a trailing blank line) instead of letting
#   json.loads raise on them.
with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file:
    data = [json.loads(line) for line in jsonl_file if line.strip()]

# Peek at the first record to see the available fields.
data[0]

{'id': '0',
 'locale': 'en-US',
 'partition': 'test',
 'scenario': 'alarm',
 'intent': 'alarm_set',
 'utt': 'wake me up at five am this week',
 'annot_utt': 'wake me up at [time : five am] [date : this week]',
 'worker_id': '1'}

In [18]:
# Build the (prompt, intent) table with a single DataFrame construction rather
# than concatenating one row at a time, which re-copies the whole frame on
# every iteration (quadratic in the number of utterances).
columns = ['prompt', 'intent']

# The earlier row-by-row version prepended each new row, so the frame ended up
# in reverse file order. Keep that order here so the seeded shuffle below
# produces the same permutation of utterances as before.
records = [[d['utt'], d['intent']] for d in reversed(data)]
df_intent = pd.DataFrame(records, columns=columns)

# Shuffle all rows reproducibly (fixed seed) and renumber the index from 0.
df_intent = df_intent.sample(frac=1.0, random_state=42).reset_index(drop=True)
print(f'Dataframe Shape: {df_intent.shape}')
df_intent.head()

Dataframe Shape: (16521, 2)


Unnamed: 0,prompt,intent
0,hey olly mute the speakers,audio_volume_mute
1,present time in new york,datetime_query
2,add diapers in cosmetic list,lists_createoradd
3,remove noodles from the list,lists_remove
4,olly tell me where the local shopping district is,recommendation_locations


In [19]:
# Fractions of the shuffled dataset assigned to each split.
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1  # informational: the test split takes the rows left after train and val

# Fail fast if the fractions are edited inconsistently; test_ratio is not used
# when slicing, so a mismatch would otherwise go unnoticed.
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, 'split ratios must sum to 1'

In [20]:
# Row positions that delimit the three splits:
#   train -> rows [0, train_index)
#   val   -> rows [train_index, val_index)
#   test  -> rows [val_index, end)
train_index = int(len(df_intent) * train_ratio)
val_index = train_index + int(len(df_intent) * val_ratio)

In [21]:
# Cut the shuffled frame into three contiguous, non-overlapping row ranges.
intent_df_train = df_intent.iloc[:train_index]
intent_df_val = df_intent.iloc[train_index:val_index]
intent_df_test = df_intent.iloc[val_index:]

# Report how many rows landed in each split.
print(
    f'Size of Train df: {len(intent_df_train)}, '
    f'size of Validation df: {len(intent_df_val)}, '
    f'size of Test df: {len(intent_df_test)}'
)

Size of Train df: 13216, size of Validation df: 1652, size of Test df: 1653


In [22]:
# Write each split to its own CSV file, without the index column.
for split_df, csv_path in (
    (intent_df_train, 'intent_train.csv'),
    (intent_df_val, 'intent_val.csv'),
    (intent_df_test, 'intent_test.csv'),
):
    split_df.to_csv(csv_path, index=False)

In [23]:
# Load each exported CSV back through the `datasets` CSV builder, in the order
# train, val, test. Each call is given a single file and is bound to its own name.
train, val, test = (
    load_dataset("csv", data_files=csv_path)
    for csv_path in ("intent_train.csv", "intent_val.csv", "intent_test.csv")
)
train, val, test

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

(DatasetDict({
     train: Dataset({
         features: ['prompt', 'intent'],
         num_rows: 13216
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['prompt', 'intent'],
         num_rows: 1652
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['prompt', 'intent'],
         num_rows: 1653
     })
 }))