<a href="https://colab.research.google.com/github/skywalker00001/Conterfactual-Reasoning-Project/blob/main/huggingface_t5_3_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparation

In [2]:
# Mount Google Drive so the dataset and results folders are reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')
# Project folder on Drive. The path is relative, so it resolves against the current working directory.
root = 'drive/MyDrive/LM/'

Mounted at /content/drive


In [3]:
!pip install sentencepiece
!pip install transformers -q
!pip install wandb -q

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 26.8 MB/s eta 0:00:01[K     |▌                               | 20 kB 26.1 MB/s eta 0:00:01[K     |▉                               | 30 kB 17.5 MB/s eta 0:00:01[K     |█                               | 40 kB 15.9 MB/s eta 0:00:01[K     |█▍                              | 51 kB 8.5 MB/s eta 0:00:01[K     |█▋                              | 61 kB 9.1 MB/s eta 0:00:01[K     |██                              | 71 kB 9.2 MB/s eta 0:00:01[K     |██▏                             | 81 kB 10.3 MB/s eta 0:00:01[K     |██▍                             | 92 kB 10.1 MB/s eta 0:00:01[K     |██▊                             | 102 kB 8.1 MB/s eta 0:00:01[K     |███                             | 112 kB 8.1 MB/s eta 0:00:01[K     |███▎                            | 122 kB 8.1 MB/s eta 0:00:01[K     |███▌    

In [4]:
# Importing stock libraries
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
import os
import regex as re
import torch
from torch import cuda
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

# WandB – Import the wandb library
import wandb

In [5]:
# Show the GPU attached to this runtime (the saved output is from a Google Colab session).
!nvidia-smi

Wed Feb  9 18:13:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    26W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.

device = 'cuda' if cuda.is_available() else 'cpu'
print("Device is: ", device)

# Seed PyTorch and NumPy and request deterministic cuDNN kernels for reproducibility.
SEED = 42
torch.manual_seed(SEED) # PyTorch random seed
np.random.seed(SEED) # NumPy random seed
torch.backends.cudnn.deterministic = True

Device is:  cuda


In [7]:
# Login to wandb to log the model run and all the parameters
# 7229adacb32965027d73056a6927efd0365a00bc
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [51]:
# WandB – start a new run in the "counterfactual" project.
wandb.init(project="counterfactual")

# WandB – `config` stores the hyperparameters alongside the run.
# The values below are read later when building the datasets, loaders, optimizer and training loop.
config = wandb.config          # handle to the run's config object
config.TRAIN_BATCH_SIZE = 16    # batch size of the training DataLoader
config.VALID_BATCH_SIZE = 32    # batch size of the validation DataLoader
config.TRAIN_EPOCHS = 2        # number of training epochs
config.VAL_EPOCHS = 1  # not referenced elsewhere in the visible notebook
config.LEARNING_RATE = 1e-4    # learning rate passed to the Adam optimizer
config.SEED = 42               # recorded seed; same value as SEED set earlier
config.SOURCE_LEN = 128        # max_length used when tokenizing the source text
config.TARGET_LEN = 128        # max_length used when tokenizing the target text

[34m[1mwandb[0m: Currently logged in as: [33mskywalk3r[0m (use `wandb login --relogin` to force relogin)


In [8]:
# Global parameter: version tag embedded in the file name of the results CSV written at the end.
model_version = "3.0"

In [9]:
PRETRAINED_MODEL_NAME = "t5-base"
# Tokenizer used to encode the text.
tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
# Load the pretrained T5 conditional-generation model from the same checkpoint,
# then move it to the selected device (GPU when available).
model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_MODEL_NAME)
model = model.to(device)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [None]:
# Optimizer that updates the model weights during fine-tuning: Adam with the configured learning rate.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=config.LEARNING_RATE)

# Load dataframe

In [10]:
# Training data: train_supervised_small.json, read as JSON Lines (one record per line).
# `root` already ends with '/', so the concatenated path contains a doubled slash.
small_path = root + '/TimeTravel/train_supervised_small.json'
small_df = pd.read_json(small_path, lines=True)
small_df.head()

Unnamed: 0,story_id,premise,initial,counterfactual,original_ending,edited_ending
0,080198fc-d0e7-42b3-8e63-b2144e59d816,On my way to work I stopped to get some coffee.,I went through the drive through and placed my...,I went inside to place my order.,I paid the cashier and patiently waited for my...,[I paid the cashier and patiently waited at th...
1,1ba02a18-8807-4f39-9271-ef555597ce21,Terry aspired to be a chef.,His father is one.,He moved to Italy and opened a restaurant.,He decided he would continue the business. He ...,"[He decided he would continue the business., H..."
2,dbb0ad3e-9389-44ee-8290-7c3458e3fa0f,Kim and her glass went on a field trip to an a...,Everyone enjoyed looking at the sea creatures.,Everyone did not enjoy looking at the sea cre...,"But when they went to the shark exhibit, Kim w...",[Instead of going home they went to the shark ...
3,e5955040-5b87-4acb-a8c7-7e81d0ffb9f5,Susie was sitting on her barstool.,She kept kicking the counter with her feet.,She kept herself steady with her feet.,"Suddenly, her kick sent her falling backwards....","[Suddenly, an earthquake sent her falling back..."
4,dc234072-2e69-4999-9e2f-632d3ea30b78,Celeste rode her motorcycle across the woods.,"When she almost arrived at the intersection, a...","When she almost arrived at the intersection, a...",She didn't have enough time to brake and drove...,[She didn't have enough time to brake and drov...


In [11]:
# `edited_ending` holds a list of sentences per story; show the list of the first story.
print(small_df.loc[:,"edited_ending"][0])

['I paid the cashier and patiently waited at the counter for my drink.', 'When she handed me the drink, the lid came off and spilled on me.', 'The coffee hurt and I had to go home and change clothes.']


In [12]:
# Build the seq2seq pairs: text_a collects the model inputs, text_b the expected outputs.
text_a, text_b = [], []

for row_idx in range(len(small_df)):
  story = small_df.loc[row_idx]
  ending_sentences = story['edited_ending']

  # Input: the four story fields, each introduced by its field name.
  source_text = "premise: " + story['premise']
  source_text = source_text + " initial: " + story['initial']
  source_text = source_text + " counterfactual: " + story['counterfactual']
  source_text = source_text + " original_ending: " + story['original_ending']
  text_a.append(source_text)

  # Target: the first three edited-ending sentences separated by single spaces.
  target_text = "edited_ending: " + ending_sentences[0] + " " + ending_sentences[1] + " " + ending_sentences[2]
  text_b.append(target_text)

In [13]:
# Collect the source/target pairs into the training DataFrame.
train_df = pd.DataFrame({'source_text': text_a, 'target_text': text_b}) 
train_df.head()

Unnamed: 0,source_text,target_text
0,premise: On my way to work I stopped to get so...,edited_ending: I paid the cashier and patientl...
1,premise: Terry aspired to be a chef. initial: ...,edited_ending: He decided he would continue th...
2,premise: Kim and her glass went on a field tri...,edited_ending: Instead of going home they went...
3,premise: Susie was sitting on her barstool. in...,"edited_ending: Suddenly, an earthquake sent he..."
4,premise: Celeste rode her motorcycle across th...,edited_ending: She didn't have enough time to ...


In [14]:
# Print one complete source/target pair to check the formatting.
print(train_df.loc[0, "source_text"])
print("-------------")
print(train_df.loc[0, "target_text"])

premise: On my way to work I stopped to get some coffee. initial: I went through the drive through and placed my order. counterfactual: I went inside to place my order. original_ending: I paid the cashier and patiently waited for my drink. When she handed me the drink, the lid came off and spilled on me. The coffee hurt and I had to go home and change clothes.
-------------
edited_ending: I paid the cashier and patiently waited at the counter for my drink. When she handed me the drink, the lid came off and spilled on me. The coffee hurt and I had to go home and change clothes.


In [15]:
# Number of training pairs and columns.
print(train_df.shape)

(16752, 2)


In [None]:
# Validation data is drawn from train_supervised_large.json (JSON Lines, one record per line).
# `root` already ends with '/', so the concatenated path contains a doubled slash.
large_path = root + '/TimeTravel/train_supervised_large.json'
df_large = pd.read_json(large_path, lines=True)
print(len(df_large))

In [None]:
# Story ids of the training (small) split, collected so those stories can be excluded
# from the validation data. Reading the column directly avoids a row-by-row `.loc` lookup.
small_ids = small_df['story_id'].tolist()

print(len(small_ids))

In [None]:
# Drop every story that also appears in the training split so validation data is unseen.
df_large = df_large[~df_large.story_id.isin(small_ids)]
# Renumber the rows 0..n-1 after the filter. drop=True discards the old index instead of
# inserting it into the frame as an extra 'index' column.
df_large = df_large.reset_index(drop=True)
print(len(df_large))

In [None]:
# Take (up to) the first 1000 held-out stories as the validation subset.
# The subset itself is re-indexed (not df_large), which guarantees the 0..n-1 labels the
# following loop looks up. drop=True adds no extra index column, so the cell can be re-run.
part_df_large = df_large.iloc[:1000].reset_index(drop=True)
print(len(part_df_large))

In [None]:
# Build the validation pairs: `text` collects the model inputs, `gt` the ground-truth endings.
text, gt = [], []

for row_idx in range(len(part_df_large)):
  story = part_df_large.loc[row_idx]
  ending_sentences = story['edited_ending']

  # Input: the four story fields, each introduced by its field name.
  source_text = "premise: " + story['premise']
  source_text = source_text + " initial: " + story['initial']
  source_text = source_text + " counterfactual: " + story['counterfactual']
  source_text = source_text + " original_ending: " + story['original_ending']
  text.append(source_text)

  # Ground truth: the first three edited-ending sentences separated by single spaces.
  truth_text = "edited_ending: " + ending_sentences[0] + " " + ending_sentences[1] + " " + ending_sentences[2]
  gt.append(truth_text)

print(len(text))

In [None]:
# Collect the source/ground-truth pairs into the validation DataFrame.
valid_df = pd.DataFrame({'source_text': text, 'target_text': gt}) 
valid_df.head()

# Dataset and Dataloader

In [30]:
# Custom dataset that reads the source/target DataFrame and tokenizes one pair at a time, so a
# DataLoader can batch the examples for fine-tuning and for generating predictions.

class CustomDataset(Dataset):
    """Tokenized (source, target) text pairs for seq2seq fine-tuning.

    Args:
        dataframe: DataFrame with `source_text` and `target_text` columns.
        tokenizer: tokenizer exposing `encode_plus(text, max_length=..., padding=...,
            truncation=..., return_tensors='pt')` and returning a mapping with
            `input_ids` and `attention_mask` tensors whose first dimension is 1.
        ori_len: fixed token length of every encoded source text.
        con_len: fixed token length of every encoded target text.
    """

    def __init__(self, dataframe, tokenizer, ori_len, con_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.ori_len = ori_len
        self.con_len = con_len
        self.original = self.data.source_text
        self.counterfactual = self.data.target_text

    def __len__(self):
        """Return the number of text pairs."""
        return len(self.counterfactual)

    def __getitem__(self, index):
        """Tokenize the pair at position `index`.

        Returns:
            dict with 1-D long tensors: `source_ids` and `source_mask` (length `ori_len`),
            `target_ids` and `target_ids_y` (length `con_len`; both hold the target ids).
        """
        # Positional access: samplers hand out positions 0..len-1, which only coincide with
        # the index labels when the DataFrame has a default RangeIndex.
        original = str(self.original.iloc[index])
        original = ' '.join(original.split())  # collapse runs of whitespace

        counterfactual = str(self.counterfactual.iloc[index])
        counterfactual = ' '.join(counterfactual.split())

        # truncation=True caps long texts at max_length; together with padding='max_length'
        # every example has the same length, which default batching requires.
        source = self.tokenizer.encode_plus(
            original, max_length=self.ori_len, padding='max_length',
            truncation=True, return_tensors='pt')
        target = self.tokenizer.encode_plus(
            counterfactual, max_length=self.con_len, padding='max_length',
            truncation=True, return_tensors='pt')

        # Drop the leading batch dimension of size 1 added by return_tensors='pt'.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [35]:
# Wrap the training and validation DataFrames; sources and targets use the configured token lengths.
trainingset = CustomDataset(dataframe=train_df, tokenizer=tokenizer, ori_len=config.SOURCE_LEN , con_len=config.TARGET_LEN )
validset = CustomDataset(dataframe=valid_df, tokenizer=tokenizer, ori_len=config.SOURCE_LEN , con_len=config.TARGET_LEN )

In [36]:
# Pick one training example and unpack its tensors for inspection.
sample_idx = 4
sample = trainingset[sample_idx]

source_ids = sample["source_ids"]
source_mask = sample["source_mask"]
target_ids = sample["target_ids"]
target_ids_y = sample["target_ids_y"]

print(source_ids)

tensor([    3, 17398,    10, 13136,   849,  6102,    15,   160, 11718,   640,
            8,  1679,     7,     5,  2332,    10,   366,   255,   966,  4363,
           44,     8, 15415,     6,     3,     9,     3, 31062,    15,  4283,
            5,  3485,    89, 25481,    10,   366,   255,   966,  4363,    44,
            8, 15415,     6,     3,     9,    20,    49,  4283,     5,   926,
          834,  9303,    10,   451,   737,    31,    17,    43,   631,    97,
           12,  9563,    11, 10719,  1587,     8,     3, 31062,    15,     5,
          328,   130,   321,  7673,    15,    26,    91,     5,   451,  3725,
          530,    95,    11,   718,     8,  2095,     5,     1,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [40]:
# Compare the raw target text with the decoded target ids of the same example.
print(train_df.iloc[sample_idx].target_text)

sen = tokenizer.decode(target_ids, skip_special_tokens=False) # False keeps the </s> and <pad> markers visible in the decoded string
print(sen)

edited_ending: She didn't have enough time to brake and drove towards the deer, They were both knocked out. She eventually got up and called the police.
edited_ending: She didn't have enough time to brake and drove towards the deer, They were both knocked out. She eventually got up and called the police.</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [50]:
# DataLoaders: batch the tokenized examples for training and validation.

# Training batches are reshuffled each epoch.
train_params = {
    'batch_size': config.TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 2
    }

# Validation keeps the dataset order so predictions line up with their inputs.
val_params = {
    'batch_size': config.VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 2
    }

training_loader = DataLoader(trainingset, **train_params)
val_loader = DataLoader(validset, **val_params)
# Number of batches per pass over each loader.
print(len(training_loader))
print(len(val_loader))

1047


# Define train() and val()

In [None]:
# Training routine, called once per epoch from the main loop.
# Puts the model in train mode, walks over the loader and takes one optimizer step per batch.

def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one pass over `loader`, updating `model` after every batch.

    Args:
        epoch: epoch number, used only in the printed progress line.
        tokenizer: supplies `pad_token_id`, used to mask padding in the labels.
        model: model called with `input_ids`, `attention_mask` and `labels`; the first
            element of its output is used as the loss.
        device: device the batch tensors are moved to.
        loader: iterable of batches with `source_ids`, `source_mask` and `target_ids`.
        optimizer: optimizer stepped once per batch.
    """
    model.train()
    for step, batch in enumerate(loader):
        input_ids = batch['source_ids'].to(device, dtype = torch.long)
        attention_mask = batch['source_mask'].to(device, dtype = torch.long)
        labels = batch['target_ids'].to(device, dtype = torch.long)

        # Padding positions are overwritten with -100 so that they are ignored by the loss.
        pad_positions = labels[: ,:] == tokenizer.pad_token_id
        labels[pad_positions] = -100
        labels = labels.to(device)

        loss = model(input_ids = input_ids, attention_mask = attention_mask, labels=labels)[0]

        # Log to wandb every 50 batches and print every 600 batches.
        if step % 50 == 0:
            wandb.log({"Training Loss": loss.item()})

        if step % 600 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
def validate(tokenizer, model, device, loader):
    """Generate a prediction for every example in `loader` and decode everything to text.

    Args:
        tokenizer: object whose `decode(ids, skip_special_tokens=..., clean_up_tokenization_spaces=...)`
            turns one row of token ids into a string.
        model: model providing `eval()` and `generate(input_ids=..., attention_mask=..., ...)`.
        device: device the batch tensors are moved to.
        loader: sized iterable of batches with `source_ids`, `source_mask` and `target_ids`.

    Returns:
        (raw, predictions, actuals): three lists of decoded strings with one entry per example,
        in loader order: the source texts, the generated texts and the reference targets.
    """
    model.eval()
    raw = []
    predictions = []
    actuals = []

    num_batches = len(loader)
    # Report progress about five times per pass (every batch when there are fewer than five).
    report_every = max(1, num_batches // 5)

    def decode_rows(token_rows):
        """Decode each row of token ids, dropping special tokens."""
        return [
            tokenizer.decode(row, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for row in token_rows
        ]

    with torch.no_grad():
        for i, data in enumerate(loader):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            # NOTE(review): no max_length is passed, so the output length falls back to the
            # model's generation defaults; confirm that is long enough for full endings.
            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
                )

            # Accumulate all three lists across batches so they stay aligned one-to-one.
            raw.extend(decode_rows(ids))
            predictions.extend(decode_rows(generated_ids))
            actuals.extend(decode_rows(y))

            if (i + 1) % report_every == 0:
                print(f'valid Completed {i + 1} / {num_batches}')

    return raw, predictions, actuals

# main

In [None]:
import time
# Helper that converts a pair of timestamps into whole minutes and seconds for the epoch report.
def epoch_time(start_time, end_time):
    """Return the time between two timestamps as `(minutes, seconds)`.

    Both timestamps are in seconds. Each component of the result is truncated to an integer.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - (60 * whole_minutes))
    return whole_minutes, leftover_seconds

In [None]:
# Log metrics with wandb (currently disabled):
#wandb.watch(model, log="all")
# Training loop: one call to train() per configured epoch, reporting how long each epoch took.
print('Initiating Fine-Tuning for the model on counterfactual dataset:')


for epoch in tqdm(range(config.TRAIN_EPOCHS)):
    start_time = time.time()
    train(epoch, tokenizer, model, device, training_loader, optimizer)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch: {epoch:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')


In [None]:
# Run inference on the validation set and save the source texts, ground-truth endings and
# generated endings side by side as results/output<model_version>.csv under `root`.
print('Now inferecing:')
raw, predictions, actuals = validate(tokenizer, model, device, val_loader)
final_df = pd.DataFrame({'raw_text': raw, 'ground_truth': actuals, 'generated_text': predictions})
# Create the results folder first so that a missing directory cannot make the write fail
# after training and inference have already finished.
results_dir = root + 'results/'
os.makedirs(results_dir, exist_ok=True)
final_df.to_csv(results_dir + 'output' + model_version + '.csv')
print('Output Files generated for review')