In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("/content/sample_data/structured_data.csv")

In [3]:
df = df.dropna(subset=['Age']).reset_index()

In [18]:
pip install sentencepiece
pip install transformers torch

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [4]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import DataLoader, Dataset

class T5Dataset(Dataset):
    def __init__(self, tokenizer, input_texts, target_texts, max_length):
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        source_encoding = tokenizer(
            self.input_texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        target_encoding = tokenizer(
            self.target_texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        return {
            'source_text': self.input_texts[idx],
            'target_text': self.target_texts[idx],
            'source_input_ids': source_encoding['input_ids'].flatten(),
            'target_input_ids': target_encoding['input_ids'].flatten(),
            'source_attention_mask': source_encoding['attention_mask'].flatten(),
            'target_attention_mask': target_encoding['attention_mask'].flatten()
        }

# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Prepare the dataset
max_length = 512
input_texts = df['transcription'].loc[:10].tolist() # Replace with your transcription texts
target_texts = df['Age'].loc[:10].apply(str).tolist() # Replace with your target texts (ages)

dataset = T5Dataset(tokenizer, input_texts, target_texts, max_length)
loader = DataLoader(dataset, batch_size=8, shuffle=True)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Load T5 model
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model.train()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 3
for epoch in range(num_epochs):
    for batch in loader:
        optimizer.zero_grad()
        input_ids = batch['source_input_ids']
        attention_mask = batch['source_attention_mask']
        labels = batch['target_input_ids']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{num_epochs} completed. Loss: {loss.item()}')


Epoch 1/3 completed. Loss: 22.839197158813477


In [None]:
model.eval()

test_text = "Your test transcription text here"
input_encoding = tokenizer(
    test_text,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors='pt'
)

with torch.no_grad():
    outputs = model.generate(
        input_ids=input_encoding['input_ids'],
        attention_mask=input_encoding['attention_mask'],
        max_length=20
    )

print(f"Input: {test_text}\nPredicted Age: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")
