<a href="https://colab.research.google.com/github/pranay8297/BitcoinSummaryGeneration/blob/main/SummaryFineTune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the required libs
!pip install datasets
!pip install evaluate
!pip install rouge_score
!pip install accelerate -U
!pip install transformers[torch]
!pip install ipdb

In [2]:
import torch
import accelerate
import transformers
import evaluate
import numpy as np
import pandas as pd
import os

from ipdb import set_trace as st
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
from google.colab import drive

from huggingface_hub import notebook_login

drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Test
billsum = load_dataset("billsum", split="ca_test")

Downloading builder script:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.70k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/67.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [6]:
df = pd.read_csv('/content/drive/MyDrive/EAI6120DS/final_data.csv')
df.head()

Unnamed: 0,date,data,gen_summary
0,2023-11-05,"The price of Bitcoin today is 35393.84, and ye...",:\nThe price of Bitcoin has remained relativel...
1,2023-11-06,"The price of Bitcoin today is 35288.36, and ye...",:\nThe price of Bitcoin has seen a minor decre...
2,2023-11-07,"The price of Bitcoin today is 35929.83, and ye...",:\nThe news articles today provide insight int...
3,2023-11-08,"The price of Bitcoin today is 36109.43, and ye...",:\nThe price of Bitcoin has been relatively st...
4,2023-11-09,"The price of Bitcoin today is 37990.51, and ye...",:\nThe price of Bitcoin has increased by $1090...


In [7]:
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['date', 'data', 'gen_summary'],
    num_rows: 26
})

In [8]:
dataset = dataset.train_test_split(test_size=0.15)
dataset

DatasetDict({
    train: Dataset({
        features: ['date', 'data', 'gen_summary'],
        num_rows: 22
    })
    test: Dataset({
        features: ['date', 'data', 'gen_summary'],
        num_rows: 4
    })
})

In [9]:
checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [10]:
prefix = "Summarize the given text on Bitcoin's current and previous prices along with key news insights, focusing on the price changes and news impact: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["data"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    labels = tokenizer(text_target=examples["gen_summary"], max_length=512, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [12]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['date', 'data', 'gen_summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 22
    })
    test: Dataset({
        features: ['date', 'data', 'gen_summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4
    })
})

### **Lets Get a Evaluator**

In [13]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

### Lets Get a Model and trian!

In [17]:
os.environ['CUDA_VISIBLE_DEVICES'] = "1"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

In [15]:
device

device(type='cuda')

In [18]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

training_args = Seq2SeqTrainingArguments(
    output_dir = "SummaryGenerator",
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    weight_decay = 0.01,
    save_total_limit = 3,
    num_train_epochs = 10,
    predict_with_generate = True,
    fp16 = True,
    # push_to_hub = True,

)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["test"],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,11.504471,0.0773,0.0272,0.0653,0.0653,19.0
2,No log,7.023804,0.0773,0.0272,0.0653,0.0653,19.0
3,No log,4.294915,0.075,0.0272,0.0654,0.0654,19.0
4,No log,3.366152,0.0798,0.0253,0.0662,0.0662,19.0
5,No log,3.007705,0.0883,0.0362,0.0742,0.0742,19.0
6,No log,2.810476,0.0883,0.0362,0.0742,0.0742,19.0
7,No log,2.690889,0.0883,0.0362,0.0742,0.0742,19.0
8,No log,2.620088,0.0882,0.0339,0.0765,0.0765,19.0
9,No log,2.583548,0.0882,0.0339,0.0765,0.0765,19.0
10,No log,2.570549,0.0882,0.0339,0.0765,0.0765,19.0


TrainOutput(global_step=110, training_loss=4.9133522727272725, metrics={'train_runtime': 43.8781, 'train_samples_per_second': 5.014, 'train_steps_per_second': 2.507, 'total_flos': 65983917680640.0, 'train_loss': 4.9133522727272725, 'epoch': 10.0})

# Inference

In [55]:
text = prefix + dataset['test']["gen_summary"][0]

In [56]:
def inference(text):
  input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
  output_ids = model.generate(input_ids, max_length = 1024)
  summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
  return summary

In [57]:
inference(text)

'the Mt. Gox bankruptcy saga may have a positive impact on the price of Bitcoin. the fall in jobless claims to a five-week low is a positive economic indicator. high inflation can lead to increased regulatory scrutiny and potential bans on cryptocurrency trading. the volatility of the cryptocurrency market can be affected by a wide range of factors.'