In [36]:
import random
from functools import partial
import pandas as pd

In [15]:
# Load the raw table and keep only the rows that have a value for every
# field the generated questions ask about.
df = pd.read_csv('AgeDataset-V1.csv').dropna(
    subset=['Country', 'Occupation', 'Manner of death']
)

In [39]:
# Question templates asking for a person's occupation. "{person}" is filled
# in with the row's Name by create_question. Templates are applied to every
# row regardless of gender, so the wording must stay gender-neutral.
occupation = [
    "What profession was {person} known for?",
    "Can you tell me what job {person} held during their lifetime?",
    "What was the primary occupation of {person}?",
    "In what field did {person} work?",
]

# Question templates asking for a person's country. "{person}" is the
# placeholder that create_question fills in via str.format.
country = [
    "Which country is {person} originally from?",
    "What nationality was {person}?",
    "Can you identify {person}'s country of origin?",
    "From which country did {person} hail?",
]

# Question templates asking how a person died. "{person}" is filled in with
# the row's Name by create_question.
# Every entry needs its trailing comma: adjacent string literals are
# concatenated implicitly, which silently merges two templates into one.
manner_of_death = [
    "What was the cause of death for {person}?",
    "How did {person} die?",
    "What led to {person}'s demise?",
    "What was the reason behind {person}'s death?",
]

def create_question(row, category):
    """Render one randomly chosen question template for the person in ``row``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a ``'Name'`` entry.
        category: ``'occupation'``, ``'country'`` or ``'manner_of_death'``;
            selects the module-level template list of the same name.

    Returns:
        The chosen template with ``{person}`` replaced by ``row['Name']``, or
        ``None`` when ``category`` is not one of the three known values.
    """
    # Pick the template pool first so the render step is written only once.
    if category == 'occupation':
        templates = occupation
    elif category == 'country':
        templates = country
    elif category == 'manner_of_death':
        templates = manner_of_death
    else:
        return None

    return random.choice(templates).format(person=row['Name'])

def apply_all_question(df):
    """Add one generated question column per category to ``df``.

    Fills ``question_occupation``, ``question_country`` and
    ``question_manner_of_death`` by calling ``create_question`` on every row.
    The frame is modified in place and is also returned.
    """
    # (output column, category passed to create_question), in the order the
    # columns are written.
    targets = (
        ('question_occupation', 'occupation'),
        ('question_country', 'country'),
        ('question_manner_of_death', 'manner_of_death'),
    )
    for column, category in targets:
        # Bind category as a default so each lambda keeps its own value.
        df[column] = df.apply(
            lambda row, category=category: create_question(row, category=category),
            axis=1,
        )
    return df

In [84]:
def create_dataset(df, n_choices=10):
    """Flatten per-person rows into one question/answer example per category.

    Each input row yields three output rows (occupation, country, manner of
    death). Every output row gets its own random list of names drawn from the
    other people in ``df``.

    Args:
        df: DataFrame with the columns ``Name``, ``Occupation``, ``Country``,
            ``Manner of death``, ``question_occupation``, ``question_country``
            and ``question_manner_of_death``.
        n_choices: Number of names drawn for each ``choices`` list
            (default 10, the previously hard-coded value).

    Returns:
        A new DataFrame with the columns ``question``, ``answer``, ``person``
        and ``choices`` holding ``3 * len(df)`` rows.

    Raises:
        ValueError: If ``df['Name']`` holds fewer than ``n_choices`` names
            other than the current person's.
    """
    # (question column, answer column) for each example emitted per person.
    qa_columns = (
        ('question_occupation', 'Occupation'),
        ('question_country', 'Country'),
        ('question_manner_of_death', 'Manner of death'),
    )

    # The name pool is loop-invariant, so build it once instead of per sample.
    all_names = list(df['Name'])

    questions = []
    answers = []
    persons = []
    choices = []

    for _, row in df.iterrows():
        person = row['Name']

        # Sample only from the other people. This gives the same distribution
        # as re-drawing until the person is absent, but cannot loop forever
        # when the pool is too small to avoid them.
        distractors = [name for name in all_names if name != person]
        if len(distractors) < n_choices:
            raise ValueError(
                f"need at least {n_choices} names other than {person!r} "
                f"to build choices, got {len(distractors)}"
            )

        for question_col, answer_col in qa_columns:
            questions.append(row[question_col])
            answers.append(row[answer_col])
            persons.append(person)  # same person repeated for each question
            choices.append(random.sample(distractors, n_choices))

    return pd.DataFrame({
        'question': questions,
        'answer': answers,
        'person': persons,
        'choices': choices,
    })

In [86]:
# Fix the seed so template and distractor choices are identical on every
# Restart & Run All.
random.seed(42)

# create_dataset reads the question_* columns, which exist only after
# apply_all_question has run. Generate them here for the 300 rows actually
# used; the copy keeps the loaded frame untouched and avoids assigning
# columns onto a slice.
people = apply_all_question(df.iloc[:300].copy())

# First 200 people -> train, next 100 -> valid.
train, valid = create_dataset(people.iloc[:200]), create_dataset(people.iloc[200:300])

In [98]:
from datasets import Dataset, DatasetDict
# Wrap each pandas split in a Dataset object.
train_dataset, valid_dataset = (
    Dataset.from_pandas(frame) for frame in (train, valid)
)

In [99]:
# Bundle both splits into one DatasetDict keyed by split name.
dataset = DatasetDict(dict(train=train_dataset, valid=valid_dataset))

In [102]:
# Write the DatasetDict (both splits) to the relative directory
# 'datasets/age-dataset'.
dataset.save_to_disk('datasets/age-dataset')

Saving the dataset (0/1 shards):   0%|          | 0/600 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/300 [00:00<?, ? examples/s]

In [104]:
from datasets import load_from_disk
# Read the saved dataset back from the same directory, rebinding `dataset`
# to the on-disk copy (presumably a round-trip check of the save above).
dataset = load_from_disk('datasets/age-dataset')

In [105]:
# Show the reloaded object's repr: split names, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'person', 'choices'],
        num_rows: 600
    })
    valid: Dataset({
        features: ['question', 'answer', 'person', 'choices'],
        num_rows: 300
    })
})

In [None]:
|