In [1]:
import os
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel
from tqdm.notebook import tqdm, trange
import nltk
from nltk.tokenize import sent_tokenize

In [2]:
# Report the versions of the main libraries used in this notebook.
# (The Python version itself would come from `sys.version`; `sys` is not imported here.)
for label, module in (
    ("numpy version", np),
    ("pandas version", pd),
    ("seaborn version", sns),
    ("torch version", torch),
    ("nltk version", nltk),
):
    print(label, module.__version__)

numpy version 2.2.0
pandas version 2.2.3
seaborn version 0.13.2
torch version 2.5.1.post306
nltk version 3.9.1


In [3]:
# Run a full garbage-collection pass and release cached CUDA memory.
# First line printed: the value returned by gc.collect().
# Second line printed: the return value of torch.cuda.empty_cache() (None).
print(gc.collect(), torch.cuda.empty_cache(), sep="\n")

858
None


In [4]:
def collect_garbage():
    """Run Python's garbage collector, then empty torch's CUDA cache."""
    for release in (gc.collect, torch.cuda.empty_cache):
        release()

In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [6]:
# Initialize tokenizer
BERT_MODEL_CKPT = "bert-base-uncased"
bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_CKPT)
bert_model = BertModel.from_pretrained(BERT_MODEL_CKPT).to(device)

In [17]:
# Load the pretrained DistilBERT tokenizer and encoder from the same checkpoint.
# The encoder is moved to `device` (as is done for `bert_model`) so that inputs
# placed on `device` and the model weights live on the same device.
DISTILBERT_MODEL_CKPT = "distilbert-base-uncased"
distilbert_tokenizer = DistilBertTokenizer.from_pretrained(DISTILBERT_MODEL_CKPT)
distilbert_model = DistilBertModel.from_pretrained(DISTILBERT_MODEL_CKPT).to(device)

In [7]:
# Download nltk models
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/andy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/andy/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
# Root folder of the CNN / DailyMail CSV files (under the user's home directory).
CNNDM_BASE_PATH = os.path.expanduser("~/data/news/cnn_dailymail")
# Show which files are present before building the per-split paths.
print(os.listdir(CNNDM_BASE_PATH))
CNNDM_TRAIN_PATH, CNNDM_TEST_PATH, CNNDM_VAL_PATH = (
    os.path.join(CNNDM_BASE_PATH, split_file)
    for split_file in ("train.csv", "test.csv", "validation.csv")
)

['test.csv', 'train.csv', 'validation.csv']


In [9]:
def tokenize(text, max_sequence_length, tokenizer, model):
    """
        Convert text into per-sentence contextual token embeddings.

        The text is split into sentences with ``sent_tokenize``; every sentence
        is truncated / padded to ``max_sequence_length`` tokens by ``tokenizer``
        and then encoded by ``model`` with gradient tracking disabled.

        Args:
            text: raw document string.
            max_sequence_length: token length every sentence is truncated or
                padded to.
            tokenizer: tokenizer matching ``model``; called with a list of
                sentences and ``return_tensors="pt"``.
            model: encoder whose output exposes ``last_hidden_state``. It is
                the model actually used for encoding (previously the global
                ``bert_model`` was always used and this argument was ignored).

        Returns:
            ``last_hidden_state`` tensor of shape
            (num_sentences, max_sequence_length, hidden_size), located on the
            device that holds ``model``'s parameters.
    """
    sents = sent_tokenize(text)

    # Put the inputs on the same device as the model weights so the forward
    # pass works no matter where the caller placed the model.
    model_device = next(model.parameters()).device

    tokenized_sentences = tokenizer(sents,
                          truncation=True,
                          max_length=max_sequence_length,
                          padding="max_length",
                          return_tensors="pt").to(model_device)
    input_ids = tokenized_sentences["input_ids"]
    attention_mask = tokenized_sentences["attention_mask"]

    # Inference only: no gradients are needed for the embeddings.
    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)
        last_hidden_state = output.last_hidden_state

    return last_hidden_state

In [11]:
# Sanity check: display the shape of the embeddings for the sample `text`
# with a maximum sequence length of 20.
tokenize(text, max_sequence_length=20, tokenizer=bert_tokenizer, model=bert_model).shape


torch.Size([15, 20, 768])

In [10]:
# Hard-coded sample news article used for quick experiments with `tokenize`;
# the article is printed followed by its length in characters.
text = """By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordained bishops in Italy last month. Symptoms of hepatitis A include fever, tiredness, loss of appetite, nausea and abdominal discomfort. Fargo Catholic Diocese in North Dakota (pictured) is where the bishop is located ."""
print(text)
print(len(text))

By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordained b

In [38]:
# tokenize(text, language_to_index=english_to_index, max_sequence_length=1_500, start_token=True, end_token=True).size()

torch.Size([1500])

In [99]:
# Article text of the first training example, followed by its character count.
text = train_dataset[0][0]
print(text, len(text), sep="\n")

By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordained b

In [22]:
# Load the training split CSV into a DataFrame for exploratory analysis.
train_df = pd.read_csv(CNNDM_TRAIN_PATH)

In [None]:

# Character length of every reference summary ("highlights") in the training split.
train_df_slen = train_df["highlights"].str.len().to_numpy()
print("Mean ", train_df_slen.mean())
print("Max ", train_df_slen.max())
print("Min ", train_df_slen.min())

# Kernel density estimate of the highlight lengths. The y-axis of a KDE is a
# density (it integrates to 1 over the x-axis), not a probability, so it is
# labelled accordingly. An explicit Axes is used instead of pyplot's implicit state.
fig, ax = plt.subplots(figsize=(5, 3))
sns.kdeplot(train_df_slen, fill=True, ax=ax)
ax.set_xlabel("Length")
ax.set_ylabel("Density")
plt.show()

In [19]:
# Character count and article text of the first training example.
text = train_dataset[0][0]
print(len(text), text, sep="\n")

# Encode the whole article as one sequence: padded up to 512 token ids,
# with truncation disabled.
t = bert_tokenizer.encode(
    text,
    max_length=512,
    padding="max_length",
    truncation=False,
    return_tensors="pt",
)
print(t.shape)

1211
By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordai

### Create Dataset

In [12]:
class CnnDmDataset(Dataset):
    """
    CNN / DailyMail news summarization dataset backed by a CSV file.

    The file is read eagerly with pandas at construction time and must
    provide the columns "article" and "highlights".
    """
    def __init__(self, filename: str):
        super().__init__()
        self.df = pd.read_csv(filename)

    def __len__(self):
        # Number of rows in the CSV.
        return self.df.shape[0]

    def __getitem__(self, idx):
        """
            Look up the row at position ``idx`` and return it as a
            tuple (text, summary).
        """
        row = self.df.iloc[idx]
        return row["article"], row["highlights"]

In [13]:
# Set batch size, max_sequence_length
MAX_SEQUENCE_LENGTH = 20
BATCH_SIZE = 1
NUM_EPOCHS = 1

In [14]:
# Create datasets
train_dataset = CnnDmDataset(CNNDM_TRAIN_PATH)
test_dataset = CnnDmDataset(CNNDM_TEST_PATH)
val_dataset = CnnDmDataset(CNNDM_VAL_PATH)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle = True)

In [15]:
def train_model(tok_tokenizer, tok_model, num_epochs = 1, gc_itergap = 1_000, data_loader = None):
    """
        Iterate over (articles, highlights) batches and embed every article and
        highlight with ``tokenize``.

        NOTE: no parameters are optimized here yet; the embeddings are computed,
        moved to ``device`` and then discarded.

        tok_tokenizer : tokenizer forwarded to ``tokenize``
        tok_model : model forwarded to ``tokenize``
        num_epochs : number of passes over the data loader
        gc_itergap : number of iterations (batches) between two calls to
                     ``collect_garbage``; the first call happens on the first
                     batch. A value <= 0 disables the periodic collection.
        data_loader : iterable yielding (articles, highlights) batches;
                      defaults to the notebook-level ``train_loader``.
    """
    if data_loader is None:
        data_loader = train_loader

    # Global batch counter, deliberately not reset between epochs.
    step = 0
    for _ in trange(num_epochs, desc = "Epochs") :
        for articles, highlights in tqdm(data_loader, desc="Training"):
            # Handle every sample of the batch (not only the first one), so the
            # loop also works for batch sizes larger than 1.
            for article, highlight in zip(articles, highlights):
                t_article = tokenize(article,
                                     max_sequence_length = MAX_SEQUENCE_LENGTH,
                                     tokenizer = tok_tokenizer,
                                     model = tok_model)
                t_article = t_article.to(device)
                t_highlight = tokenize(highlight,
                                       max_sequence_length = MAX_SEQUENCE_LENGTH,
                                       tokenizer = tok_tokenizer,
                                       model = tok_model)
                t_highlight = t_highlight.to(device)

            # Collect on batches 0, gc_itergap, 2 * gc_itergap, ...
            # The previous `(i + 1) % gap == 1` test never fired for gap == 1
            # and raised ZeroDivisionError for gap == 0.
            if gc_itergap > 0 and step % gc_itergap == 0:
                collect_garbage()
            step += 1

        
        

In [18]:
# Run the loop with the DistilBERT tokenizer / model, calling the garbage
# collector every 1,000 iterations.
# NOTE(review): the recorded output below ends in a KeyboardInterrupt at 0/287113
# training batches, so no complete run is captured in this notebook.
train_model(
    tok_tokenizer = distilbert_tokenizer,
    tok_model = distilbert_model,
    num_epochs= NUM_EPOCHS,
    gc_itergap=1_000)

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/287113 [00:00<?, ?it/s]

KeyboardInterrupt: 