In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://us

In [None]:
# Colab-only setup: mount Google Drive, then put the project folder first on
# the import path so modules stored there can be imported.
import sys
from google.colab import drive

drive.mount('/content/drive')
sys.path.insert(0, '/content/drive/MyDrive/Applied_ML_Project/')

Mounted at /content/drive


In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.

import torch

has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if has_gpu else "cpu")

if not has_gpu:
  print('No GPU available, using the CPU instead.')
else:
  print('There are %d GPU(s) available.' % torch.cuda.device_count())
  print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never store an access token in the notebook source. The token that
# used to be hardcoded in this cell is visible to anyone who has seen the
# notebook and should be revoked in the Hugging Face account settings.
# The token is taken from the environment when available; otherwise the user is
# prompted for it (getpass does not echo the value into the cell output).
hf_token = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    or getpass("Hugging Face access token: ")
)

login(token=hf_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


Loading dataset and preprocessing data

In [None]:
import pandas as pd
import transformers
from datasets import load_metric
from sklearn.model_selection import train_test_split
import re

# Load the preprocessed summarization table from Drive and drop every row that
# has a missing value in any column.
# NOTE: read_pickle executes arbitrary pickled code; only load trusted files.
data = pd.read_pickle("/content/drive/MyDrive/Applied_ML_Project/data_preprocessing/summary_data1.pkl")
data = data.dropna(how='any')

# Keep only documents of at most 1024 space-separated words. This is a word
# count, not a character count or a tokenizer-token count.
word_counts = data['original_text'].apply(lambda text: len(text.split(' ')))
new_data = data[word_counts <= 1024]
len(new_data)

15991

Splitting into train-test-val sets

In [None]:
# First hold out 30% of the rows, then cut that hold-out in half to obtain the
# validation and test sets. The fixed seeds keep both splits reproducible.
train_data, val_test_data = train_test_split(
    new_data,
    test_size=0.3,
    random_state=42,
)
val_data, test_data = train_test_split(
    val_test_data,
    test_size=0.5,
    random_state=123,
)

Tokenization of dataset

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer and the model loaded later.
# Alternative checkpoint, left disabled:
#model_checkpoint = "facebook/bart-base"
model_checkpoint = "t5-small"
# The recorded output of this cell warns that t5-small's default 512-token
# truncation must not be relied upon; preprocess() passes an explicit
# max_length on every tokenizer call.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
max_input_length = 1024
max_target_length = 256

def preprocess(df):
    """Tokenize every (document, summary) row of ``df`` for seq2seq training.

    Args:
        df: table with an ``original_text`` column (source documents) and a
            ``reference_summary`` column (target summaries).

    Returns:
        A list with one tokenizer encoding per row. Each encoding holds the
        tokenizer's outputs for the document, truncated/padded to
        ``max_input_length``, plus a ``labels`` entry with the summary token
        ids truncated/padded to ``max_target_length``. Padding positions in
        ``labels`` are set to -100 so the model's loss ignores them.
    """
    pad_token_id = tokenizer.pad_token_id
    tokenized_data = []
    for source_text, target_text in zip(df['original_text'], df['reference_summary']):
        # Collapse every run of two or more whitespace characters to one space.
        # Raw string: "\s" in a normal string literal is an invalid escape.
        inputs = re.sub(r"\s\s+", " ", source_text)
        model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding='max_length')

        # `text_target` replaces the deprecated `as_target_tokenizer()` context.
        labels = tokenizer(text_target=target_text, max_length=max_target_length, truncation=True, padding='max_length')

        # Mask label padding with -100 (the loss ignore index) so that padded
        # positions do not contribute to the training loss.
        model_inputs['labels'] = [
            token_id if token_id != pad_token_id else -100
            for token_id in labels['input_ids']
        ]

        tokenized_data.append(model_inputs)

    return tokenized_data

In [None]:
# Tokenize the three splits in train, validation, test order.
tokenized_train_data, tokenized_val_data, tokenized_test_data = (
    preprocess(split_frame) for split_frame in (train_data, val_data, test_data)
)



DataLoader

In [None]:
import datasets


# Turn the lists of tokenized examples into Hugging Face Datasets.
# The guards make this cell safe to re-run: it rebinds its own input names, so
# without them a second run would pass an existing Dataset back through
# pd.DataFrame / from_pandas instead of the original list of encodings.
if not isinstance(tokenized_train_data, datasets.Dataset):
    tokenized_train_data = datasets.Dataset.from_pandas(pd.DataFrame(data=tokenized_train_data))
if not isinstance(tokenized_val_data, datasets.Dataset):
    tokenized_val_data = datasets.Dataset.from_pandas(pd.DataFrame(data=tokenized_val_data))


In [None]:
# Have both datasets return torch tensors when indexed.
for split_dataset in (tokenized_train_data, tokenized_val_data):
    split_dataset.set_format("torch")


In [None]:
from torch.utils.data import DataLoader

# Training batches are reshuffled; validation keeps the dataset order.
train_dataloader = DataLoader(dataset=tokenized_train_data, batch_size=8, shuffle=True)
eval_dataloader = DataLoader(dataset=tokenized_val_data, batch_size=8)

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained seq2seq model for the same checkpoint name that was used
# to load the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Optimizer and Learning Rate Scheduler

In [None]:
from torch.optim import AdamW

# AdamW over every model parameter with a 5e-5 learning rate.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [None]:
from transformers import get_scheduler

epochs = 4
# The training loop steps the scheduler once per batch, so the schedule spans
# (batches per epoch) x (epochs) steps.
training_steps = len(train_dataloader) * epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=training_steps,
)


Training

In [None]:
# Ask PyTorch to release the GPU memory it has cached but is not using, before
# training starts.
torch.cuda.empty_cache()
# Optional and left disabled: delete large objects that are no longer needed and
# run the garbage collector so their memory can be reclaimed as well.
# import gc
# del variables
# gc.collect()

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(training_steps))

model.to(device)
model.train()
for _ in range(epochs):
    for cpu_batch in train_dataloader:
        # Move every tensor of the batch onto the training device.
        device_batch = {name: tensor.to(device) for name, tensor in cpu_batch.items()}

        # The batch contains `labels`, so the forward pass returns the loss.
        loss = model(**device_batch).loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear the
        # gradients before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/5600 [00:00<?, ?it/s]

In [None]:
# Persist the fine-tuned model to Drive. Passing the model object itself (not a
# state_dict) makes torch.save pickle the whole object.
# NOTE(review): loading such a file presumably needs a compatible transformers
# install at load time - confirm before relying on it long term.
torch.save(model, "/content/drive/MyDrive/Applied_ML_Project/Summarization Final Models/t5-small_finetuned.pt")

In [None]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


In [None]:
!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24954 sha256=f19f845c1bc842e444df91354662dd2d19d6f256d5f2d68354cabe5fa8ccf516
  Stored in directory: /root/.cache/pip/wheels/9b/3d/39/09558097d3119ca0a4d462df68f22c6f3c1b345ac63a09b86e
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import evaluate
from rouge_score import rouge_scorer

# Load ROUGE metric (kept so `metric` stays defined; the scoring below uses the
# rouge_score scorer directly).
metric = evaluate.load("rouge")

# Create scorer object for ROUGE
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Set model to evaluation mode (disables dropout)
model.eval()

# Running sums of F-measure / precision / recall for each ROUGE measure
scores = {rouge_type: {'f': 0.0, 'p': 0.0, 'r': 0.0}
          for rouge_type in ('rouge1', 'rouge2', 'rougeL')}
num_examples = 0

# Iterate over batches in eval_dataloader
for batch in eval_dataloader:
    # Transfer batch to device
    batch = {k: v.to(device) for k, v in batch.items()}

    # Generate summaries from the source documents only. A teacher-forced
    # forward pass (argmax over logits computed with the gold labels as decoder
    # input) does not measure what the model produces on its own.
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch.get("attention_mask"),
            # Labels are padded to the target length, so their width is the
            # generation budget.
            max_length=batch["labels"].shape[-1],
        )

    # Labels may contain -100 (loss ignore index), which is not a token id;
    # map it back to the pad token before decoding.
    label_ids = batch["labels"].masked_fill(batch["labels"] < 0, tokenizer.pad_token_id)

    # ROUGE is defined on text, so decode token ids back to strings.
    predicted_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    reference_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Score each example separately; RougeScorer.score takes (target, prediction).
    for reference_text, predicted_text in zip(reference_texts, predicted_texts):
        example_scores = scorer.score(reference_text, predicted_text)
        for rouge_type in scores:
            scores[rouge_type]['f'] += example_scores[rouge_type].fmeasure
            scores[rouge_type]['p'] += example_scores[rouge_type].precision
            scores[rouge_type]['r'] += example_scores[rouge_type].recall
        num_examples += 1

# Average over all evaluated examples (guard against an empty dataloader)
for rouge_type in scores:
    for key in ('f', 'p', 'r'):
        scores[rouge_type][key] /= max(num_examples, 1)

# Print ROUGE scores
print('ROUGE-1: F={:.2f}, P={:.2f}, R={:.2f}'.format(scores['rouge1']['f'], scores['rouge1']['p'], scores['rouge1']['r']))
print('ROUGE-2: F={:.2f}, P={:.2f}, R={:.2f}'.format(scores['rouge2']['f'], scores['rouge2']['p'], scores['rouge2']['r']))
print('ROUGE-L: F={:.2f}, P={:.2f}, R={:.2f}'.format(scores['rougeL']['f'], scores['rougeL']['p'], scores['rougeL']['r']))

ROUGE-1: F=0.90, P=0.90, R=0.90
ROUGE-2: F=0.77, P=0.77, R=0.77
ROUGE-L: F=0.84, P=0.84, R=0.84


In [None]:
# Convert the tokenized test examples into a torch-formatted Dataset.
# The guard makes this cell safe to re-run: it rebinds `tokenized_test_data`,
# so a second run would otherwise pass an existing Dataset back through
# pd.DataFrame / from_pandas (presumably the cause of the ArrowInvalid error
# recorded under this cell).
if not isinstance(tokenized_test_data, datasets.Dataset):
    tokenized_test_data = datasets.Dataset.from_pandas(pd.DataFrame(data=tokenized_test_data))
tokenized_test_data.set_format("torch")

ArrowInvalid: ignored

In [None]:
# Set decoding parameters
num_beams = 4
max_length = 256
sample_index = 500  # position of the test example to summarize

# Make sure dropout is off even if this cell is run right after training.
model.eval()

# Generate summary using beam search decoding
sample = tokenized_test_data[sample_index]
outputs = model.generate(
    # Add a batch dimension of 1 to the single example.
    input_ids=sample['input_ids'].unsqueeze(0).to(device),
    # The input is padded to a fixed length; pass the tokenizer's mask
    # explicitly so padding positions are not attended to.
    attention_mask=sample['attention_mask'].unsqueeze(0).to(device),
    max_length=max_length,
    num_beams=num_beams,
    early_stopping=True,
    no_repeat_ngram_size=2,
    num_return_sequences=1
)

# Decode summary tokens back into text
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print summary
print("Generated summary:", summary)

Generated summary: (iv) Any Federal judge, without regard to the article of the Constitution pursuant to which appointed. (ii) A member of Congress (including any Delegate and Resident Commissioner) or any federal official appointed by the President, Vice-President, or a federal agency head and subject to confirmation by any Senate. (b) Effective Date.--The amendment made by this section shall apply to contributions made after the date of enactment of this Act.


In [None]:
# Display column 0 of the test row at position 500, the same position that was
# summarized above (presumably the source text - confirm the column order).
test_data.iloc[500,0]

"SECTION 1. SHORT TITLE. This Act may be cited as the ``Contributions Legally Interdicted from Noncitizens To Our Nonprofits Act of 2016'' or as the ``CLINTON Act of 2016''. SEC. 2. CERTAIN CHARITABLE ORGANIZATIONS PROHIBITED FOR ACCEPTING CONTRIBUTIONS FROM PERSONS CONNECTED TO FOREIGN GOVERNMENTS. (a) In General.--Section 501 of the Internal Revenue Code of 1986 is amended by adding at the end the following new subsection: ``(s) Prohibition on Acceptance of Contributions From Persons Connected to Foreign Governments by 501(c)(3) Organizations Established by Certain Federal Officials.-- ``(1) Termination of tax-exempt status.--A Federal official organization shall not be treated as described in subsection (c)(3) with respect to any period after the date on which such organization knowingly or willingly accepts or solicits any contribution from any person connected to a foreign government. If a Federal official organization accepts a contribution from any person and learns that such pe

In [None]:
# Display the generated summary string as the cell's output value.
summary

'(iv) Any Federal judge, without regard to the article of the Constitution pursuant to which appointed. (ii) A member of Congress (including any Delegate and Resident Commissioner) or any federal official appointed by the President, Vice-President, or a federal agency head and subject to confirmation by any Senate. (b) Effective Date.--The amendment made by this section shall apply to contributions made after the date of enactment of this Act.'