In [1]:
from torch import cuda

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.model_selection import train_test_split

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
data = pd.read_csv("../data/raw_data/rawdat/AFG/quadruple.txt", sep='\t', names = ["source","relation","target","date"])
data.head()

Unnamed: 0,source,relation,target,date
0,Armed Gang (Afghanistan),Carry out suicide bombing,United States,2010-01-01
1,Central Intelligence Agency,Make statement,Taliban,2010-01-01
2,Taliban,Make statement,Attacker (Afghanistan),2010-01-01
3,Citizen (Afghanistan),Demonstrate or rally,Unspecified Actor,2010-01-01
4,Armed Gang (Afghanistan),Carry out suicide bombing,Central Intelligence Agency,2010-01-01


In [4]:
text =pd.read_csv("../data/raw_data/rawdat/AFG/text.txt", sep='\t', names = ["text"])
text.head()

Unnamed: 0,text
0,"According to a foreign news agency, a suicide ..."
1,The CIA said a Taliban bomber on Wednesday man...
2,The Taliban claimed responsibility for the att...
3,The case in Kunar has already prompted Afghans...
4,The CIA base attacked by a suicide bomber in A...


In [5]:
data['source'] = data['source'].str.strip()
data['relation'] = data['relation'].str.strip()
data['target'] = data['target'].str.strip()
text['text'] = text['text'].str.strip()

In [6]:
result = pd.concat([data,text],axis=1)
result.head()

Unnamed: 0,source,relation,target,date,text
0,Armed Gang (Afghanistan),Carry out suicide bombing,United States,2010-01-01,"According to a foreign news agency, a suicide ..."
1,Central Intelligence Agency,Make statement,Taliban,2010-01-01,The CIA said a Taliban bomber on Wednesday man...
2,Taliban,Make statement,Attacker (Afghanistan),2010-01-01,The Taliban claimed responsibility for the att...
3,Citizen (Afghanistan),Demonstrate or rally,Unspecified Actor,2010-01-01,The case in Kunar has already prompted Afghans...
4,Armed Gang (Afghanistan),Carry out suicide bombing,Central Intelligence Agency,2010-01-01,The CIA base attacked by a suicide bomber in A...


In [7]:
result['text'][0]

'According to a foreign news agency, a suicide bomber attacked the US CIA center in Khost, Afghanistan, killing eight CIA officials and wounding six others.'

In [8]:
unique_result = result.drop_duplicates()
# unique_result = result.drop_duplicates(subset=['text'],keep='last')

In [9]:
# len(result)

In [10]:
# len(unique_result)

In [11]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map="auto")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Tried In-context learning, gives answers close to what we want, but hallucinates sometimes. There is a limit to number of input tokens that can be provided.

In [12]:
# print(f"Unique Sources: {len(result['source'].unique())} \nUnique Targets: {len(result['target'].unique())}\nUnique Relations: {len(result['relation'].unique())} \nUnique Texts: {len(result['text'].unique())}")

In [13]:
# input_text = "Given an event text, extract the source, target and relation between them: For e.g.\n"
# # for t in range(len(result)):
# for t in range(len(result)):
#     if len(input_text.split())>=512:
#         input_text += f"Given Text: {result['text'][t]}"
#         print(f"Prediction text :{result['text'][t]}\n Extracted information -> Source: {result['source'][t]}, Target: {result['target'][t]}, Relation: {result['relation'][t]}\n")
#         break
#     input_text += f"Given text:{result['text'][t]}\n Extracted information -> Source:{result['source'][t]}, Target:{result['target'][t]}, Relation:{result['relation'][t]}\n"

# input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

In [14]:
# outputs = model.generate(input_ids,max_new_tokens=1000)
# print(tokenizer.decode(outputs[0],skip_special_tokens=True))

Tried Chain-of-Thought Prompting. Didn't work well as the reasoning didn't make much sense.

In [15]:
# cot_input = f"Given a text: 'In western Farah province, which like Badghis is experiencing escalating violence as Taliban influence spreads to previously peaceful areas, three militants were killed by police in a shootout late Thursday, police said', we extract the source:'Police (Afghanistan)', target:'Police (Afghanistan)' and relation:' Use conventional military force'. Explain the reasoning as to how the source, target and relation are extracted based on the text."
# cot_input_ids = tokenizer(cot_input, return_tensors="pt").input_ids.to("cuda")

In [16]:
# cot_outputs = model.generate(cot_input_ids,max_new_tokens=1000)
# print(tokenizer.decode(cot_outputs[0],skip_special_tokens=True))

### Training a supervised model using FLAN-T5 for multiclass classification of event type

In [17]:
relation_ids = pd.read_csv('../data/raw_data/rawdat/AFG/relation2id.txt', sep='\t', names=['relation','id'])
relation_ids.head()

Unnamed: 0,relation,id
0,Carry out suicide bombing,0
1,Make statement,1
2,Demonstrate or rally,2
3,Meet at a 'third' location,3
4,Employ aerial weapons,4


In [18]:
relation_id_maps = pd.Series(relation_ids.id.values, index=relation_ids.relation).to_dict()
relation_id_maps

{'Carry out suicide bombing': 0,
 'Make statement': 1,
 'Demonstrate or rally': 2,
 "Meet at a 'third' location": 3,
 'Employ aerial weapons': 4,
 'Acknowledge or claim responsibility': 5,
 'Engage in negotiation': 6,
 'Make an appeal or request': 7,
 'Use conventional military force': 8,
 'Consult': 9,
 'Threaten with military force': 10,
 'Mobilize or increase armed forces': 11,
 'Abduct, hijack, or take hostage': 12,
 'Use unconventional violence': 13,
 'Make a visit': 14,
 'Host a visit': 15,
 'Appeal for diplomatic cooperation (such as policy support)': 16,
 'Provide aid': 17,
 'Conduct suicide, car, or other non-military bombing': 18,
 'Investigate': 19,
 'Accuse': 20,
 'Criticize or denounce': 21,
 'Reject': 22,
 'Arrest, detain, or charge with legal action': 23,
 'Praise or endorse': 24,
 'Demand': 25,
 'fight with artillery and tanks': 26,
 'Deny responsibility': 27,
 'Use as human shield': 28,
 'Accuse of human rights abuses': 29,
 'fight with small arms and light weapons': 3

In [19]:
unique_result['relation_to_id'] = unique_result['relation'].map(relation_id_maps)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_result['relation_to_id'] = unique_result['relation'].map(relation_id_maps)


In [20]:
unique_result.head()

Unnamed: 0,source,relation,target,date,text,relation_to_id
0,Armed Gang (Afghanistan),Carry out suicide bombing,United States,2010-01-01,"According to a foreign news agency, a suicide ...",0
1,Central Intelligence Agency,Make statement,Taliban,2010-01-01,The CIA said a Taliban bomber on Wednesday man...,1
2,Taliban,Make statement,Attacker (Afghanistan),2010-01-01,The Taliban claimed responsibility for the att...,1
3,Citizen (Afghanistan),Demonstrate or rally,Unspecified Actor,2010-01-01,The case in Kunar has already prompted Afghans...,2
4,Armed Gang (Afghanistan),Carry out suicide bombing,Central Intelligence Agency,2010-01-01,The CIA base attacked by a suicide bomber in A...,0


In [21]:
# y = F.one_hot(torch.randint(0, 10, (10,)),num_classes = 10)

In [22]:
# labels = torch.randint(0, 10, (10,))

# # labels --> one-hot 
# one_hot = torch.nn.functional.one_hot(labels)
# # one-hot --> labels
# labels_again = torch.argmax(one_hot, dim=1)

In [23]:
# y_ids = y[:, :-1].contiguous()
# lm_labels = y[:, 1:].clone().detach()
# lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100

In [24]:
class EventDataset(Dataset):
    def __init__(self, dataframe, tokenizer, source_len, source_text, target_label):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.source_text = dataframe[source_text]
        self.target_label = dataframe[target_label]

    def __len__(self):
        return len(self.target_label)
    
    def __getitem__(self, index):
        source_text = str(self.source_text[index])
        target_label = int(self.target_label[index])
        
        source = self.tokenizer.batch_encode_plus(
            [source_text],
            max_length=self.source_len,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        source_ids = source["input_ids"].squeeze()
        source_mask = source["attention_mask"].squeeze()

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "labels": torch.tensor(target_label, dtype=torch.long),
        }

In [25]:
def train(epoch, model, device, loader, optimizer):
    model.train()
    for _, data in enumerate(loader, 0):
        labels = data["labels"].to(device, dtype=torch.long)
        ids = data["source_ids"].to(device, dtype=torch.long)
        mask = data["source_mask"].to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        del ids, mask, labels, outputs  # Free up memory
        torch.cuda.empty_cache()

        if _ % 10 == 0:
            training_logger.add_row(str(epoch), str(_), str(loss.item()))
            console.print(training_logger)

In [26]:
def validate(epoch, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            labels = data["labels"].to(device, dtype=torch.long)
            ids = data["source_ids"].to(device, dtype=torch.long)
            mask = data["source_mask"].to(device, dtype=torch.long)
            
            outputs = model(input_ids=ids, attention_mask=mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            
            predictions.extend(preds.cpu().numpy())
            actuals.extend(labels.cpu().numpy())
            if _ % 10 == 0:
                print(f'Completed {_}')
    
    return predictions, actuals

In [27]:
def T5Trainer(dataframe, source_text, tokenizer, target_text, model, model_params, output_dir="./outputs/"):
    torch.manual_seed(model_params["SEED"])
    np.random.seed(model_params["SEED"])
    torch.backends.cudnn.deterministic = True

    print(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    tokenizer = tokenizer
    model = model.to(device)

    print(f"[Data]: Reading data...\n")

    dataframe = dataframe[[source_text, target_text]]
    print(dataframe.head(2))

    train_size = 0.8
    train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
    val_dataset = dataframe.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    print(f"FULL Dataset: {dataframe.shape}")
    print(f"TRAIN Dataset: {train_dataset.shape}")
    print(f"TEST Dataset: {val_dataset.shape}\n")

    training_set = EventDataset(
        train_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    val_set = EventDataset(
        val_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }

    val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
    }

    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    print(f"[Initiating Fine Tuning]...\n")
    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train(epoch, model, device, training_loader, optimizer)


    print(f"[Saving Model]...\n")
    path = os.path.join(output_dir, "model_files")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)


    print(f"[Initiating Validation]...\n")
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(epoch, model, device, val_loader)
        final_df = pd.DataFrame({"Predictions": predictions, "Actuals": actuals})
        final_df.to_csv(os.path.join(output_dir, "predictions.csv"))

    print(f"[Validation Completed.]\n")
    print(f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n""")
    print(f"""[Validation] Predictions on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n""")
    print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")

In [32]:
# let's define model parameters specific to T5
model_params = {
    "MODEL": "t5-xl",  # model_type: t5-base/t5-large
    "TRAIN_BATCH_SIZE": 8,  # training batch size
    "VALID_BATCH_SIZE": 8,  # validation batch size
    "TRAIN_EPOCHS": 3,  # number of training epochs
    "VAL_EPOCHS": 1,  # number of validation epochs
    "LEARNING_RATE": 1e-4,  # learning rate
    "MAX_SOURCE_TEXT_LENGTH": 512,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH": 50,  # max length of target text
    "SEED": 42,  # set seed for reproducibility
}

In [33]:
T5Trainer(
    dataframe=unique_result,
    source_text="text",
    tokenizer = tokenizer,
    target_text="relation_to_id",
    model_params=model_params,
    model = model,
    output_dir="../notebooks and .py/outputs/",
)



[Model]: Loading t5-xl...



RuntimeError: You can't move a model that has some modules offloaded to cpu or disk.