In [None]:
!pip install transformers
!pip install sentencepiece

In [None]:
import pandas as pd
import os
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers.optimization import  Adafactor 
import time
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# NOTE(review): the TSV files loaded later in this notebook are opened via
# relative paths, not via /content/gdrive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Read the tab-separated training and validation splits into DataFrames.
train_df, valid_df = (
    pd.read_csv(tsv_path, sep = '\t')
    for tsv_path in (
        'task_2_event_summarization_train.tsv',
        'task_2_event_summarization_valid.tsv',
    )
)

In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Row counts of the training and validation splits. A notebook cell only
# displays its final expression, so both counts are returned as one tuple.
len(train_df), len(valid_df)

In [None]:
# Preview the first rows of the training split (same call as the earlier preview cell).
train_df.head()

In [None]:
# Number of rows fed to the model per optimisation step.
batch_size = 5
# Ceiling division: a trailing partial batch still counts as a batch, so the
# last len(train_df) % batch_size rows are not silently left out of training.
num_of_batches = -(-len(train_df) // batch_size)
# num_of_epochs = 4

In [None]:
# Coerce the batch count to int so it can be passed to range() in the training loop.
num_of_batches=int(num_of_batches)

In [None]:
# Pick the first CUDA device when one is available, otherwise fall back to the CPU,
# and report which one was chosen.
dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Running on the GPU" if dev.type == "cuda" else "Running on the CPU")

In [None]:
# Load the pretrained tokenizer and seq2seq model from the same checkpoint name.
pretrained_name = 't5-large'
tokenizer = T5Tokenizer.from_pretrained(pretrained_name)
model = T5ForConditionalGeneration.from_pretrained(pretrained_name, return_dict=True)
# Place the model on the selected device (GPU/CPU).
model.to(dev)

In [None]:
# Adafactor configured with an explicit, constant learning rate.
optimizer = Adafactor(
    model.parameters(),
    # Learning-rate handling: use the external `lr` as given instead of the
    # optimizer's own time-dependent / parameter-scaled schedule.
    relative_step=False,
    scale_parameter=False,
    warmup_init=False,
    lr=1e-3,
    # Update-rule constants.
    beta1=None,             # no first-moment (momentum) running average
    decay_rate=-0.8,        # exponent for the squared-gradient running average
    eps=(1e-30, 1e-3),      # regularisation terms (squared gradient, parameter scale)
    clip_threshold=1.0,     # RMS clipping threshold for the update
    weight_decay=0.0,
)

In [None]:
from IPython.display import HTML, display

def progress(loss,value, max=100):
    """Build the HTML progress bar shown while training.

    Args:
        loss: Value rendered after the "Batch loss :" label.
        value: Current position of the bar.
        max: Value at which the bar is full. The name shadows the builtin but
            is kept so existing keyword callers keep working.

    Returns:
        An ``HTML`` display object wrapping a ``<progress>`` element.
    """
    # HTML attributes are separated by whitespace only; the template must not
    # put a comma between `max` and `style`.
    return HTML(""" Batch loss :{loss}
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(loss=loss,value=value, max=max))

In [None]:
# Number of full passes over train_df made by the training loop.
num_of_epochs=1

In [None]:
# Fine-tune the model on train_df, batch_size rows per optimisation step.

def _event_prompt(row):
    """Return the 'TEXT: ...' encoder prompt for one event row.

    The date, source, fatalities, event type, sub-event type and location are
    joined with ' | '. ACTOR1 is appended when it is present, otherwise ACTOR2
    when it is present; when both are missing no actor field is added.
    """
    fields = [
        str(row['EVENT_DATE']),
        row['SOURCE'],
        str(row['FATALITIES']),
        row['EVENT_TYPE'],
        row['SUB_EVENT_TYPE'],
        row['LOCATION'],
    ]
    if not pd.isnull(row['ACTOR1']):
        fields.append(str(row['ACTOR1']))
    elif not pd.isnull(row['ACTOR2']):
        fields.append(str(row['ACTOR2']))
    return 'TEXT: ' + ' | '.join(fields)


#Sets the module in training mode
model.train()

# Loss of every 10th batch, kept for later inspection/plotting.
loss_per_10_steps=[]
for epoch in range(1,num_of_epochs+1):
    print('Running epoch: {}'.format(epoch))

    running_loss=0

    # progress() takes (loss, value, max): start with an empty bar whose
    # maximum is the number of batches.
    out = display(progress('n/a', 0, num_of_batches+1), display_id=True)
    for i in range(num_of_batches):
        batch_df = train_df[i*batch_size:i*batch_size+batch_size]
        prompts = [_event_prompt(row) for _, row in batch_df.iterrows()]
        targets = [row['NOTES']+'</s>' for _, row in batch_df.iterrows()]

        # max_length only takes effect together with truncation=True.
        encoded_inputs = tokenizer.batch_encode_plus(
            prompts, padding=True, truncation=True, max_length=400, return_tensors='pt')
        inputbatch = encoded_inputs["input_ids"].to(dev)
        # Lets the encoder ignore the padding added to shorter prompts.
        attention_mask = encoded_inputs["attention_mask"].to(dev)

        labelbatch = tokenizer.batch_encode_plus(
            targets, padding=True, truncation=True, max_length=400, return_tensors="pt")["input_ids"]
        # Label positions set to -100 are excluded from the loss, so padding
        # does not contribute to it.
        labelbatch[labelbatch == tokenizer.pad_token_id] = -100
        labelbatch = labelbatch.to(dev)

        # clear out the gradients of all Variables
        optimizer.zero_grad()

        # Forward propagation
        outputs = model(input_ids=inputbatch, attention_mask=attention_mask, labels=labelbatch)
        loss = outputs.loss
        loss_num = loss.item()
        running_loss += loss_num
        if i%10 ==0:
            loss_per_10_steps.append(loss_num)
        out.update(progress(loss_num,i, num_of_batches+1))

        # calculating the gradients
        loss.backward()

        #updating the params
        optimizer.step()

    # Mean batch loss over the epoch.
    running_loss=running_loss/int(num_of_batches)
    print('Epoch: {} , Running loss: {}'.format(epoch,running_loss))

In [None]:
len(valid_df)
# One ' | '-separated prompt per validation row. ACTOR1 is appended when
# present, otherwise ACTOR2 when present; with neither, no actor field is added.
final_valid = []
for row_idx, event in valid_df.iterrows():
    parts = [
        str(event['EVENT_DATE']),
        event['SOURCE'],
        str(event['FATALITIES']),
        event['EVENT_TYPE'],
        event['SUB_EVENT_TYPE'],
        event['LOCATION'],
    ]
    first_actor, second_actor = event['ACTOR1'], event['ACTOR2']
    if not pd.isnull(first_actor):
        parts.append(str(first_actor))
    elif not pd.isnull(second_actor):
        parts.append(str(second_actor))

    final_valid.append(' | '.join(parts))

In [None]:
# Number of validation prompts that were built.
len(final_valid)

In [None]:
# Generate one summary per validation prompt.
model.eval()  # switch the module to evaluation mode
final_valid_text = []
encodings = []
for text in final_valid:
    input_ids = tokenizer.encode("TEXT: {} </s>".format(text), return_tensors="pt")  # Batch size 1
    input_ids=input_ids.to(dev)
    # encodings.append(input_ids)
    # Without an explicit max_length, generate() stops at the library/config
    # default, which cuts summaries short; 400 matches the max_length used
    # when tokenizing the training targets.
    outputs = model.generate(input_ids, max_length=400)
    # Have the tokenizer drop special tokens instead of string-replacing
    # '<pad>' and '</s>' by hand.
    final_txt = tokenizer.decode(outputs[0], skip_special_tokens=True)
    final_valid_text.append(final_txt)

In [None]:
# Number of generated summaries.
len(final_valid_text)

In [None]:
import torch
from tqdm import tqdm

# Strided negative-log-likelihood over `input_ids`, turned into a perplexity.
# NOTE(review): `input_ids` is whatever the generation cell left behind, i.e.
# the encoding of the last validation prompt only — confirm this is intended.
max_length = model.config.n_positions
stride = 512

nlls = []
# Measure the sequence once; the loop must not re-measure a sliced tensor.
seq_len = input_ids.size(1)
for i in tqdm(range(0, seq_len, stride)):
    begin_loc = max(i + stride - max_length, 0)
    end_loc = min(i + stride, seq_len)
    trg_len = end_loc - i  # may be different from stride on last loop
    # Slice into a separate variable so `input_ids` is not overwritten while
    # later iterations still need to index the full sequence.
    window_ids = input_ids[:, begin_loc:end_loc].to(dev)
    target_ids = window_ids.clone()
    # Only the last trg_len positions of the window are scored.
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(window_ids, labels=target_ids)
        neg_log_likelihood = outputs[0] * trg_len

    nlls.append(neg_log_likelihood)

ppl = torch.exp(torch.stack(nlls).sum() / end_loc)

In [None]:
# Display the perplexity tensor computed in the previous cell.
ppl

In [None]:
! pip install rouge

In [None]:
# Scorer object used by the final cell to compute ROUGE via get_scores().
from rouge import Rouge
rouge = Rouge()

In [None]:
# Reference texts: the NOTES column of the validation split, in row order.
reference = list(valid_df['NOTES'])


In [None]:
# Number of generated summaries; compare with the reference count in the next cell.
len(final_valid_text)

In [None]:
# Number of reference NOTES texts.
len(reference)

In [None]:
# Score the generated summaries (first argument) against the reference NOTES
# (second argument); avg = True requests scores averaged over all pairs.
rouge.get_scores(final_valid_text, reference, avg = True)