In [118]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [119]:
# colab pro 더 빠른 GPU 사용
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Not connected to a GPU


In [120]:
# colab pro 추가 메모리
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 37.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [121]:
import numpy as np

## Loading the dataset

In [122]:
# # install datasets
!pip install datasets



In [123]:
from datasets import load_dataset

dataset_file = '/content/drive/MyDrive/Colab Notebooks/Aiffel_Hackathon/financial_documents/Training_Data_200'

dataset = load_dataset('csv', data_files=dataset_file, split='train')

dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset['train']
val_dataset = dataset['test']

Using custom data configuration default-c6d4a6a8c6f12a1d
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-c6d4a6a8c6f12a1d/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


In [124]:
# test dataset

test_dataset_file = '/content/drive/MyDrive/Colab Notebooks/Aiffel_Hackathon/financial_documents/Val_Data_Clean.csv'

test_dataset = load_dataset('csv', data_files=test_dataset_file, split=['train'])

Using custom data configuration default-94fc8c6155ea1649
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-94fc8c6155ea1649/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

In [125]:
train_dataset

Dataset({
    features: ['Unnamed: 0', 'Original_Filename_x', 'Gold_Filename', 'Document', 'Full_Text', 'Summary_Text'],
    num_rows: 180
})

In [126]:
val_dataset

Dataset({
    features: ['Unnamed: 0', 'Original_Filename_x', 'Gold_Filename', 'Document', 'Full_Text', 'Summary_Text'],
    num_rows: 20
})

In [139]:
test_dataset
test_dataset = test_dataset[0] # 리스트 -> 데이터셋으로 변환

## Tokenization

In [128]:
# # transformer 설치
!pip install transformers



In [129]:
train_dataset

Dataset({
    features: ['Unnamed: 0', 'Original_Filename_x', 'Gold_Filename', 'Document', 'Full_Text', 'Summary_Text'],
    num_rows: 180
})

In [130]:
max_source = max([len(i.split()) for i in train_dataset['Full_Text']])
print(max_source)
max_target = max([len(i.split()) for i in train_dataset['Summary_Text']])
print(max_target)

2435
241


In [131]:
len(train_dataset['Full_Text'][0].split())

855

In [140]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('t5-base')

def tokenize(batch):
  tokenized_input = tokenizer(batch['Full_Text'], padding = 'max_length', truncation=True, max_length = 1000)
  tokenized_label = tokenizer(batch['Summary_Text'], padding = 'max_length', truncation=True, max_length = max_target)

  tokenized_input['labels'] = tokenized_label['input_ids']

  return tokenized_input

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=64)
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=64) # 추가

train_dataset.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/t5-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/91e9fe874e06c44883b535d6c950b8b89d6eaa3298d8e7fb3b2c78039e9f8b7b.66b9637a52aa11e9285cdd6e668cc0df14b3bcf0b6674cf3ba5353c542649637
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length":

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [141]:
val_dataset

Dataset({
    features: ['Document', 'Full_Text', 'Gold_Filename', 'Original_Filename_x', 'Summary_Text', 'Unnamed: 0', 'attention_mask', 'input_ids', 'labels'],
    num_rows: 20
})

In [142]:
train_dataset[0]

{'attention_mask': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 

In [144]:
train_dataset.save_to_disk('/content/drive/MyDrive/Colab Notebooks/Aiffel_Hackathon/T5/')
val_dataset.save_to_disk('/content/drive/MyDrive/Colab Notebooks/Aiffel_Hackathon/T5/')
test_dataset.save_to_disk('/content/drive/MyDrive/Colab Notebooks/Aiffel_Hackathon/T5/')

In [105]:
test_data = '/content/drive/MyDrive/Colab Notebooks/Aiffel_Hackathon/financial_documents/Val_Data_Clean.csv'

In [145]:
# 처음에는 max_source, max_target = 0으로 설정했는데, 인덱싱 오류 나와서 일단 위에서 출력한 숫자를 적용했음.
# max_source, max_target = 0으로 설정했을 때 tokenized된 max_source = 6448, max_target = 1124임

import pandas as pd

# df = pd.read_csv(train_data) # 필요 없는 부분인것 같음

source_text = train_dataset['Full_Text']
target_text = train_dataset['Summary_Text']

tokenized_source_text = tokenizer(list(source_text), truncation=False, padding=False)
tokenized_target_text = tokenizer(list(target_text), truncation=False, padding=False)

# max_source = 1000
# for item in tokenized_source_text['input_ids']:
#   if len(item) > max_source:
#     max_source = len(item)

# max_target = 243
# for item in tokenized_target_text['input_ids']:
#   if len(item) > max_target:
#     max_target = len(item)

Token indices sequence length is longer than the specified maximum sequence length for this model (1273 > 512). Running this sequence through the model will result in indexing errors


## Training

In [146]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
##
import torch
import gc
gc.collect()
torch.cuda.empty_cache()
##
model = T5ForConditionalGeneration.from_pretrained('t5-base')

output_dir = '/content/drive/MyDrive/Colab Notebooks/Aiffel_Hackathon/T5'

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=2,
    per_device_train_batch_size = 1, # default 8
    per_device_eval_batch_size = 1, # default 8
    eval_accumulation_steps = 4, # Number of eval steps to keep in GPU (the higher, the more vRAM used) default 1
    prediction_loss_only = True, # If i need to cumpute only loss and not other metrics, setting this to true will use less RAM
    learning_rate=0.001,
    evaluation_strategy = 'steps', # Run evaluation every eval_steps
    save_steps = 1000, # How often to save a checkpoint
    save_total_limit = 1, # Number of maximum checkpoints to save
    remove_unused_columns=True, # Removes unless columns from the dataset
    # run_name = 'run_name', # Wandb run name
    # logging_steps=1000, # How often to log loss to wandb
    eval_steps = 1000, # How often to run evaluation on the val_set
    # logging_first_step = False, # Wheter to log also the very first training step to wandb
    load_best_model_at_end = True, # Whether to load the best model found at each evaluation.
    metric_for_best_model = "loss", # Use loss to evaluate best model.
    greater_is_better = False # Best model is the one with the lowest loss, not highest.
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)



trainer.train()
trainer.save_model(output_dir + '/model')

loading configuration file https://huggingface.co/t5-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/91e9fe874e06c44883b535d6c950b8b89d6eaa3298d8e7fb3b2c78039e9f8b7b.66b9637a52aa11e9285cdd6e668cc0df14b3bcf0b6674cf3ba5353c542649637
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "p

Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/Aiffel_Hackathon/T5/model
Configuration saved in /content/drive/MyDrive/Colab Notebooks/Aiffel_Hackathon/T5/model/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/Aiffel_Hackathon/T5/model/pytorch_model.bin


## Evaluation

In [148]:
model_dir = '/content/drive/MyDrive/Colab Notebooks/Aiffel_Hackathon/T5/model'
output_dir = '/content/drive/MyDrive/Colab Notebooks/Aiffel_Hackathon/T5'

model = T5ForConditionalGeneration.from_pretrained(model_dir)

pred_args = TrainingArguments(
    output_dir=output_dir,
    per_device_eval_batch_size=4,
    remove_unused_columns=True,
    eval_accumulation_steps=1
)

trainer = Trainer(model=model, args=pred_args)

preds, labels, *_ = trainer.predict(test_dataset)
# preds = np.asarray(preds) # 기존 자료형이 튜플이기 때문에 오류 발생. numpy로 변환하면 모양이 안맞아서 broad casting 안됨
preds_tokens = preds.argmax(axis=2)

decoded_sources = []
for row in test_dataset:
    decoded_sources.append(tokenizer.decode(row['input_ids']))

decoded_preds = [tokenizer.decode(pred) for pred in preds_tokens]
decoded_labels = [tokenizer.decode(label) for label in labels]

output = pd.DataFrame({'Source Text': decoded_sources, 'Target Text': decoded_labels, 'Generated Text': decoded_preds})
output.to_excel(output_dir + "/predictions.xlsx")

loading configuration file /content/drive/MyDrive/Colab Notebooks/Aiffel_Hackathon/T5/model/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "n

AttributeError: ignored

In [151]:
preds

(array([[[ -8.8167515, -11.349447 , -14.951155 , ..., -44.507427 ,
          -44.28397  , -44.361774 ],
         [-30.52831  , -15.63776  ,  -8.775997 , ..., -46.43926  ,
          -46.27089  , -46.385086 ],
         [-52.70977  , -27.755772 , -27.75125  , ..., -70.00102  ,
          -69.837135 , -70.02378  ],
         ...,
         [ 39.024258 ,  -7.18725  ,  -8.601317 , ..., -30.532154 ,
          -30.390512 , -30.358355 ],
         [ 38.910435 ,  -7.1637287,  -8.609358 , ..., -30.50736  ,
          -30.365067 , -30.334715 ],
         [ 39.360703 ,  -7.131469 ,  -8.646362 , ..., -30.611734 ,
          -30.468115 , -30.445229 ]],
 
        [[-11.504383 , -12.996344 , -15.867767 , ..., -43.720524 ,
          -43.627098 , -43.66737  ],
         [-60.891907 , -28.500023 , -25.09755  , ..., -72.279854 ,
          -72.196754 , -72.397415 ],
         [-58.528572 , -30.0933   , -26.258911 , ..., -75.82728  ,
          -75.55645  , -75.83769  ],
         ...,
         [ 37.132545 ,  -4.515740

In [185]:
temp = np.asarray(preds)

  return array(a, dtype, copy=False, order=order)


ValueError: ignored

In [153]:
temp = list(preds)

In [178]:
labels.shape

(246, 241)

In [159]:
len(preds[1])

246

In [164]:
len(preds[0])

246

In [176]:
len(preds)

2

preds, labels의 타입이 왜 다른지 모르겠다.  
* preds: Tuple
* labels: numpyarray
  
해당 오류로 Evaluation을 진행하지 못했다.