## Run this notebook after installing the libraries listed below; change the dataset path if needed

### To reproduce our final predictions, run the cells in sequence, exactly as they are.

## Installation Libraries
## Transformers
## Datasets
## TQDM
## PyTorch
## Safetensors

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from tqdm.auto import tqdm
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel, AutoConfig, PreTrainedModel
from transformers.trainer import Trainer
from transformers.training_args import TrainingArguments
from transformers.data.data_collator import DataCollatorForTokenClassification
from safetensors.torch import safe_open
from huggingface_hub import hf_hub_download
import torch.nn as nn
import torch
import gc

# Part-1 NER Detection


### Config & Parameters

In [2]:
# Upper bound on the tokenized sequence length; it is passed as `max_length`
# (with truncation enabled) to the tokenizer calls in this notebook.
INFERENCE_MAX_LENGTH = 1024
# When the softmax probability of the "O" class is below this value, the
# post-processing step takes the best non-"O" label instead of the arg-max.
CONF_THRESH = 0.01  # threshold for "O" class
# Model identifier handed to `from_pretrained` for both tokenizer and model.
MODEL_PATH = 'psresearch/deberta-v3-large-NER-Scholarly-text' ##NER Model Path

### Utils

In [3]:
def find_span(target: list[str], document: list[str]) -> list[list[int]]:
    """Locate every non-overlapping occurrence of ``target`` in ``document``.

    The document is scanned left to right. Whenever the tokens starting at
    the current position equal ``target``, the matching indices are recorded
    and scanning resumes right after the match; otherwise the scan advances
    by a single token, so a failed partial match never hides a later one.

    Args:
        target: Token sequence to search for.
        document: Token sequence to search in.

    Returns:
        One list of consecutive document indices per occurrence, in order of
        appearance. An empty ``target`` yields no spans.
    """
    needle = list(target)
    width = len(needle)
    if width == 0:
        return []

    spans = []
    start = 0
    last_start = len(document) - width
    while start <= last_start:
        if list(document[start:start + width]) == needle:
            spans.append(list(range(start, start + width)))
            start += width  # matches are non-overlapping
        else:
            start += 1
    return spans

### Tokenizer

In [4]:
class CustomTokenizer:
    """Callable that tokenizes one example and records a character-to-word map."""

    def __init__(self, tokenizer: PreTrainedTokenizerBase, max_length: int) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, example: dict) -> dict:
        """Tokenize the words of ``example["tokens"]`` as one concatenated string.

        The words are joined without any separator. ``token_map`` holds, for
        every character of that joined string, the index of the word the
        character came from.

        Returns:
            The tokenizer output (offset mapping requested, truncated to
            ``max_length``) plus the ``token_map`` list.
        """
        words = example["tokens"]

        # One entry per character, carrying the index of its source word.
        token_map = [position for position, word in enumerate(words) for _ in word]

        encoded = self.tokenizer(
            "".join(words),
            return_offsets_mapping=True,
            truncation=True,
            max_length=self.max_length,
        )

        return {**encoded, "token_map": token_map}

In [5]:
# Read the whole test file into memory; every line becomes one document below.
with open("/kaggle/input/workshop-task-acl/SOMD2025-PhaseII/test_texts.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Split on "\n" only. NOTE(review): if the file ends with a newline this
# produces a final empty document — confirm against the dataset.
text_list = text.split('\n')

In [6]:
# One row per input line: the raw text, its space-separated tokens and a
# running document index.
df = pd.DataFrame({'train_text': text_list})
df = df.assign(
    train_text_list=df['train_text'].str.split(' '),
    document=np.arange(len(df)),
)

In [7]:
# BIO label inventory; the position of a label in this list is its class id.
all_labels = [
    'B-Extension', 'I-Extension',
    'B-Application', 'I-Application',
    'B-Abbreviation',
    'B-Citation', 'I-Citation',
    'B-SoftwareCoreference', 'I-SoftwareCoreference',
    'B-URL', 'I-URL',
    'B-AlternativeName', 'I-AlternativeName',
    'B-OperatingSystem', 'I-OperatingSystem',
    'B-Developer', 'I-Developer',
    'O',
    'B-License', 'I-License',
    'B-PlugIn', 'I-PlugIn',
    'B-Release', 'I-Release',
    'B-ProgrammingEnvironment', 'I-ProgrammingEnvironment',
    'B-Version', 'I-Version',
]

# Lookup tables in both directions, and the list of every label except "O".
id2label = dict(enumerate(all_labels))
label2id = {label: index for index, label in enumerate(all_labels)}
target = [label for label in all_labels if label != "O"]

In [None]:
# Wrap the documents in a dataset; document ids are stored as strings.
ds = Dataset.from_dict(
    {
        "full_text": df['train_text'].tolist(),
        "tokens": df['train_text_list'].tolist(),
        "document": list(map(str, df['document'].tolist())),
    }
)

# Tokenize every document (one worker per CPU core) with the NER tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
ds = ds.map(
    CustomTokenizer(tokenizer=tokenizer, max_length=INFERENCE_MAX_LENGTH),
    num_proc=os.cpu_count(),
)

### Loading NER Model

In [None]:
# Load the token-classification model and wrap it in a Trainer; in this
# notebook the Trainer is only used for `predict`.
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
# Collator that batches the tokenized examples for token classification.
collator = DataCollatorForTokenClassification(tokenizer)
# "." is the first positional argument of TrainingArguments (presumably the
# output directory — confirm); evaluation batch size is 1 and reporting
# integrations are disabled.
args = TrainingArguments(".", per_device_eval_batch_size=1, report_to="none")
trainer = Trainer(
    model=model, args=args, data_collator=collator, tokenizer=tokenizer,
)

### Prediction

In [None]:
# Run the NER model over every document and keep only the raw output scores.
predictions = trainer.predict(ds).predictions  # (n_sample, len, n_labels)

### Post-processing

In [11]:
# Per-token class probabilities (softmax over the label axis).
pred_softmax = torch.from_numpy(predictions).softmax(dim=2).numpy()

# From here on the label tables of the loaded model are authoritative.
id2label = model.config.id2label
o_index = model.config.label2id["O"]

# Plain arg-max over the raw scores.
preds = np.argmax(predictions, axis=-1)

# Best label when "O" is ruled out: zero its probability on a copy first.
preds_without_o = pred_softmax.copy()
preds_without_o[..., o_index] = 0
preds_without_o = np.argmax(preds_without_o, axis=-1)

# Where the model is not confident in "O", fall back to the best non-"O" label.
o_preds = pred_softmax[..., o_index]
preds_final = np.where(o_preds < CONF_THRESH, preds_without_o, preds)

In [12]:
# Collapse sub-token predictions into one record per (document, word):
# `processed` collects the records, `pairs` remembers which words already
# have one, so the first sub-token that reaches a word decides its label.
processed =[]
pairs = set()

# Iterate over document
for p, token_map, offsets, tokens, doc in zip(
    preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]
):
    # Iterate over sequence
    # zip() stops at the shorter input, so positions of `p` beyond this
    # document's own offsets are ignored.
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[token_pred]

        # Skip entries whose offsets are (0, 0) — presumably special tokens;
        # verify against the tokenizer.
        if start_idx + end_idx == 0:
            continue

        # CustomTokenizer in this notebook never writes -1 into token_map, so
        # this branch is not expected to fire for maps built there.
        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n"
        # Advance past characters that belong to whitespace-only words.
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        # Ran off the end of the character map: nothing left in this document.
        if start_idx >= len(token_map): 
            break

        # Index of the word that owns the first character of this sub-token.
        token_id = token_map[start_idx]
        pair = (doc, token_id)

        # # ignore certain labels and whitespace
        # if label_pred in ("O", "B-EMAIL", "B-URL_PERSONAL", "B-PHONE_NUM", "I-PHONE_NUM") or token_id == -1:
        #     continue        

        # Only the first sub-token of a word contributes a record.
        if pair in pairs:
            continue
            
        processed.append(
            {"document": doc, "token": token_id, "label": label_pred, "token_str": tokens[token_id]}
        )
        pairs.add(pair)

In [13]:
# Word-level predictions as a table, with a running row id.
df_pred = pd.DataFrame(processed)
df_pred["row_id"] = list(range(len(df_pred)))

# One space-joined string of labels (`l`) and of word indices (`token_map`)
# per document, in order of first appearance.
# NOTE(review): a document that produced no record is absent from both lists,
# while later code indexes `l` by position — confirm every document is present.
l = []
token_map = []
for _, doc_rows in df_pred.groupby('document', sort=False):
    l.append(' '.join(doc_rows['label'].values))
    token_map.append(' '.join(doc_rows['token'].values.astype(str)))

### Correcting BIO tags

In [14]:
def correct_bio_tagging(tags):
    """Turn a ``B-X`` that directly follows another ``X`` tag into ``I-X``.

    Each tag is compared with its predecessor in the *input* sequence (not in
    the partially corrected one). The first tag is never changed.

    Args:
        tags: List of BIO tag strings.

    Returns:
        The corrected tags joined by single spaces.
    """
    fixed = tags[:]

    for position, (before, current) in enumerate(zip(tags, tags[1:]), start=1):
        # Entity type of the previous tag, or None for tags without a hyphen.
        previous_kind = before.rsplit('-', 1)[-1] if '-' in before else None

        prefix, *remainder = current.split('-')
        kind = remainder[0] if remainder else None

        if prefix == 'B' and previous_kind == kind:
            fixed[position] = f'I-{kind}'

    return ' '.join(fixed)

### Aligning model-predicted NER tags

In [15]:
def align_labels_to_tokens(tokens, labels):
    """Return a copy of ``labels`` whose length equals ``len(tokens)``.

    Missing labels are filled with ``'O'`` at the end; surplus labels are cut
    off at the end. The caller's ``labels`` list is never modified.

    Args:
        tokens: Sequence of tokens; only its length is used.
        labels: Sequence of label strings.

    Returns:
        A new list with exactly ``len(tokens)`` labels.
    """
    token_len = len(tokens)
    missing = token_len - len(labels)

    if missing > 0:
        # Build a new list instead of `+=`, which would extend the caller's list.
        return list(labels) + ['O'] * missing

    return list(labels[:token_len])

In [16]:
# Hard-coded expected number of labels per document, indexed by document
# position; the alignment loop below asserts its output against these values.
# NOTE(review): specific to one test file — must be regenerated for other data.
len_list = [46, 8, 26, 49, 29, 42, 37, 85, 36, 41, 30, 21, 28, 44, 23, 15, 36, 27, 18, 34, 11, 44, 19, 
            9, 26, 56, 31, 16, 36, 48, 62, 29, 45, 31, 31, 38, 30, 33, 35, 22, 30, 43, 19, 17, 6, 47, 
            39, 25, 28, 32, 24, 17, 38, 30, 22, 44, 39, 30, 37, 28, 29, 24, 27, 9, 33, 20, 21, 35, 18, 
            38, 28, 17, 42, 81, 27, 8, 29, 42, 44, 63, 22, 57, 31, 59, 36, 56, 126, 85, 62, 63, 81, 56, 
            51, 79, 52, 48, 87, 50, 30, 28, 174, 39, 31, 31, 23, 24, 40, 30, 63, 28, 24, 40, 34, 41, 76, 
            39, 67, 39, 17, 120, 69, 19, 40, 22, 65, 43, 61, 34, 70, 27, 17, 30, 21, 25, 36, 15, 12, 48, 
            22, 17, 26, 59, 61, 57, 51, 58, 54, 35, 84, 44, 52, 31, 205, 45, 29, 28, 13, 52, 24, 29, 40, 
            55, 30, 23, 21, 16, 29, 20, 12, 19, 19, 22, 27, 47, 51, 41, 33, 21, 61, 64, 80, 36, 26, 39, 
            33, 43, 20, 31, 25, 101, 29, 28, 36, 31, 31, 29, 50, 57, 33, 20, 31, 78, 26, 14, 20, 27, 20, 
            21, 17, 39, 11, 41, 32, 32, 16, 27, 48, 25, 22, 30]


In [17]:
# For every document: repair the BIO sequence, fit it to the document's token
# count, check the count against `len_list`, and keep the space-joined result.
corrected_bio_tagging = []
for i, doc_labels in enumerate(l):
    w = doc_labels.split(' ')
    repaired = correct_bio_tagging(w).split(' ')
    aligned = align_labels_to_tokens(df['train_text_list'].values[i], repaired)
    labels_text = ' '.join(aligned)
    assert len_list[i] == len(labels_text.split(' '))
    corrected_bio_tagging.append(labels_text)

### Entity Recognition Pipeline Ends

In [18]:
# Persist the entity predictions: one newline-terminated label line per document.
with open("predictions.entities.txt", "w") as f:
    f.writelines(text + "\n" for text in corrected_bio_tagging)

# RE Pipeline

###  Load dataset

In [19]:
# Reload the raw test documents, one per line.
with open("/kaggle/input/workshop-task-acl/SOMD2025-PhaseII/test_texts.txt", "r", encoding="utf-8") as file:
    text = file.read()

text_list = text.split('\n')

# Reload the BIO label lines written by the entity-recognition stage.
with open("predictions.entities.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Every label line in that file ends with "\n", so the last element of the
# split is an empty string; it is dropped with `[:-1]` below.
labels_list = text.split('\n')
dfl = pd.DataFrame()
dfl['train_text'] = text_list
dfl['train_labels'] = labels_list[:-1]
dfl['train_text_list'] = dfl['train_text'].str.split(' ')
dfl['train_labels_list'] = dfl['train_labels'].str.split(' ')
dfl['document'] = np.arange(len(dfl))
# Distinct labels per document; sorted so the string is identical on every
# run (plain set iteration order is not stable between interpreter runs).
dfl['unique_labels'] = dfl['train_labels_list'].apply(lambda x: ' '.join(sorted(set(x))))

### Converting NER entities prediction to dataset

In [20]:
def extract_bio_entities_with_indices(tokens, labels):
    """Collect the BIO-tagged entities of one document.

    Both arguments are single strings whose items are separated by one space.
    A ``B-`` label opens a new entity; an ``I-`` label extends the entity that
    is currently open (whatever its type); any other label — or an ``I-``
    label with no open entity — closes the open entity, if there is one.

    Returns:
        A list of ``(entity_text, start_index, entity_type)`` tuples, where
        ``entity_text`` is the entity's tokens joined by spaces,
        ``start_index`` is the position of its first token and
        ``entity_type`` is the full ``B-...`` label that opened it.
    """
    found = []
    open_words = []
    open_start = None
    open_type = None

    pairs = zip(tokens.split(' '), labels.split(' '))
    for position, (word, tag) in enumerate(pairs):
        # Continuation of the entity that is currently open.
        if tag.startswith("I-") and open_words:
            open_words.append(word)
            continue

        # Anything else ends the open entity first.
        if open_words:
            found.append((" ".join(open_words), open_start, open_type))
            open_words, open_start, open_type = [], None, None

        if tag.startswith("B-"):
            open_words, open_start, open_type = [word], position, tag

    # Entity still open at the end of the document.
    if open_words:
        found.append((" ".join(open_words), open_start, open_type))

    return found

In [None]:
# Build one table with every extracted entity of every document. Documents
# without any entity contribute no rows; their index is printed so they can
# be handled separately.
entity_frames = []
for i in range(len(dfl)):
    doc_row = dfl[dfl['document'] == i]
    train_text = doc_row['train_text'].values[0]
    train_labels = doc_row['train_labels'].values[0]

    entities = extract_bio_entities_with_indices(train_text, train_labels)
    if not entities:
        # Explicit check instead of a bare `except:` so that genuine errors
        # are no longer swallowed together with the "no entities" case.
        print(i)
        continue

    entities = pd.DataFrame(
        entities, columns=['entity_text', 'token_start', 'entity_label']
    )
    entities['document'] = i
    entities['train_text'] = train_text
    entities['train_labels'] = train_labels
    entity_frames.append(entities)

# Concatenate once at the end rather than growing the frame inside the loop.
if entity_frames:
    relations_df = pd.concat(entity_frames).reset_index(drop = True)
else:
    relations_df = pd.DataFrame()

In [None]:
# Placeholder entity row for document 208: entity text and label are both
# 'O' and the start index is 0.
# NOTE(review): the document id is hard-coded — presumably this document has
# no extracted entity; confirm when the data changes.
_doc_208 = dfl[dfl['document'] == 208]
df_208 = pd.DataFrame(
    [[
        'O',
        0,
        'O',
        208,
        _doc_208['train_text'].values[0],
        _doc_208['train_labels'].values[0],
    ]],
    columns=['entity_text', 'token_start', 'entity_label', 'document', 'train_text', 'train_labels'],
)
df_208

In [23]:
# Add the placeholder row, order the table by document, and pack each entity
# into one string: "<label> [SEP] <text> [TOKEN] <start> [DOC] <document>".
relations_df = pd.concat([relations_df, df_208]).reset_index(drop = True)
relations_df.sort_values(['document'], inplace = True)
relations_df['entity_text_labels'] = (
    relations_df['entity_label']
    + ' [SEP] '
    + relations_df['entity_text']
    + ' [TOKEN] '
    + relations_df['token_start'].astype(str)
    + ' [DOC] '
    + relations_df['document'].astype(str)
)

In [24]:
def _token_index(packed):
    """Return the text between ' [TOKEN] ' and ' [DOC] ' of a packed entity string."""
    return packed.split(' [TOKEN] ')[1].split(' [DOC] ')[0]


def _label_and_text(packed):
    """Return everything before ' [TOKEN] ' of a packed entity string."""
    return packed.split(' [TOKEN] ')[0]


def _doc_id(packed):
    """Return everything after ' [DOC] ' of a packed entity string."""
    return packed.split(' [DOC] ')[1]


# For every document, build all ordered pairs of its entities (self-pairs
# included) together with the two model input strings for each pair.
final_df = pd.DataFrame()
for k in relations_df['document'].unique():
    doc_rows = relations_df[relations_df['document'] == k]
    data = doc_rows['entity_text_labels'].values

    combinations = pd.DataFrame(
        [(first, second) for first in data for second in data],
        columns=['entity_text_1', 'entity_text_2'],
    )
    combinations['entity_token_1'] = combinations['entity_text_1'].apply(_token_index)
    combinations['entity_token_2'] = combinations['entity_text_2'].apply(_token_index)
    combinations['entity_token_a'] = combinations['entity_text_1'].apply(_label_and_text)
    combinations['entity_token_b'] = combinations['entity_text_2'].apply(_label_and_text)
    combinations['entity_doc'] = combinations['entity_text_1'].apply(_doc_id)
    combinations['train_text'] = doc_rows['train_text'].values[0]
    combinations = combinations.drop(columns = ['entity_text_1', 'entity_text_2'])

    # Document text followed by the two entities, in both orders.
    context = combinations['train_text'] + ' [SEP] '
    combinations['relation_text_1'] = (
        context + combinations['entity_token_a'] + ' [SEP] ' + combinations['entity_token_b']
    )
    combinations['relation_text_2'] = (
        context + combinations['entity_token_b'] + ' [SEP] ' + combinations['entity_token_a']
    )
    final_df = pd.concat([final_df, combinations]).reset_index(drop = True)

# final_df = final_df[final_df['entity_token_a']!=final_df['entity_token_b']].reset_index(drop = True)

In [25]:
# Set aside every pair of documents '14', '126' and '208' (entity_doc holds
# strings) before identical pairs are filtered out; they are appended back
# afterwards. NOTE(review): the ids are hard-coded — presumably these are the
# documents the filter would otherwise leave empty; confirm.
no_rel = final_df[final_df['entity_doc'].isin(['14', '126', '208'])].reset_index(drop = True)

In [None]:
# Keep only pairs whose two "label [SEP] text" strings differ.
distinct_pair = final_df['entity_token_a'].ne(final_df['entity_token_b'])
final_df = final_df.loc[distinct_pair].reset_index(drop = True)
final_df.shape

In [None]:
# Print every document index below 220 that has no pair left in final_df.
docs_with_pairs = set(final_df['entity_doc'].astype(int).unique())
for i in range(220):
    if i not in docs_with_pairs:
        print(i)

In [None]:
# Append the pairs that were set aside in `no_rel` and renumber the rows.
final_df = pd.concat([final_df, no_rel], ignore_index=True)
final_df.shape

### Relation Dataset Created

In [29]:
# Persist the candidate pairs (without the index column); the prediction
# step reads this file back.
final_df.to_csv('relations_test.csv', index = False)

In [30]:
from torch.utils.data import DataLoader, Dataset

def prepare_input(text, tokenizer, max_length=None):
    """Encode ``text`` into fixed-length ``torch.long`` tensors.

    Args:
        text: Input string to encode.
        tokenizer: Object exposing ``encode_plus``.
        max_length: Length to pad/truncate to. Defaults to the module-level
            ``INFERENCE_MAX_LENGTH`` when omitted.

    Returns:
        The mapping returned by ``encode_plus`` with every value replaced by
        a ``torch.long`` tensor.
    """
    if max_length is None:
        max_length = INFERENCE_MAX_LENGTH

    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length', # TODO: check padding to max sequence in batch
        truncation=True
    )
    # Same keys are reassigned, so the mapping's size does not change while
    # it is being iterated.
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long) # TODO: check dtypes
    return inputs


def collate(inputs):
    """
    Cut every tensor of the batch down to the longest attended sequence.

    The length is the largest row sum of ``inputs["attention_mask"]``. The
    mapping is modified in place and also returned.
    """
    longest = int(inputs["attention_mask"].sum(dim=1).max())
    for name, tensor in inputs.items():
        inputs[name] = tensor[:, :longest]
    return inputs


class CustomDataset(Dataset):
    """Dataset over the rows of a relation table.

    Reads the ``relation_text``, ``labels`` and ``document`` columns of ``df``
    and tokenizes one text per item on access.
    """

    def __init__(self, df, tokenizer):
        self.tokenizer = tokenizer
        self.texts = df['relation_text'].values
        self.labels = df['labels'].values
        self.text_ids = df['document'].values
        self.num_classes = 12  # Number of unique classes

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return the encoded text, its label tensor and its document id."""
        return {
            "inputs": prepare_input(self.texts[item], self.tokenizer),
            "labels": torch.tensor(self.labels[item], dtype=torch.long),
            "document": self.text_ids[item],
        }

### Loading Relation Extraction Model

In [None]:
# Hub repository of the relation-extraction model; used for its config,
# weights file and tokenizer.
MODEL_NAME = "psresearch/RE_scholarly_text_deberta_v3_large" #RE Model Path

class MeanPooling(nn.Module):
    """Average token embeddings over the positions selected by the attention mask."""

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so masked positions
        # contribute nothing to the sum.
        weights = attention_mask.unsqueeze(-1).expand_as(last_hidden_state).float()
        summed = (last_hidden_state * weights).sum(dim=1)
        # Clamp the divisor so a fully masked row does not divide by zero.
        counts = weights.sum(dim=1).clamp(min=1e-9)
        return summed / counts

class CustomHFModel(PreTrainedModel):
    """Backbone model followed by mean pooling and a 12-output linear layer."""

    def __init__(self, config):
        super().__init__(config)
        # The backbone is created from the config alone; weights are loaded
        # into this module separately.
        self.model = AutoModel.from_config(config)
        self.pool = MeanPooling()
        # 12 output scores, one per relation class.
        self.fc = nn.Linear(config.hidden_size, 12)
        self._init_weights(self.fc)

    def forward(self, input_ids):
        # NOTE: despite its name, `input_ids` is a mapping of keyword
        # arguments for the backbone and must contain 'attention_mask'.
        outputs = self.model(**input_ids)
        # First element of the backbone output is used as the hidden states.
        last_hidden_states = outputs[0]
        pooled = self.pool(last_hidden_states, input_ids['attention_mask'])
        return self.fc(pooled)

# Build the model skeleton from the published config.
config = AutoConfig.from_pretrained(MODEL_NAME, output_hidden_states=True)
model = CustomHFModel(config)

# Download the weights file and read every tensor onto the CPU.
model_path = hf_hub_download(repo_id=MODEL_NAME, filename="model.safetensors")

with safe_open(model_path, framework="pt", device="cpu") as f:
    state_dict = {key: f.get_tensor(key) for key in f.keys()}

# Non-strict load: key mismatches are reported below instead of raising.
missing, unexpected = model.load_state_dict(state_dict, strict=False)
print("Missing keys:", missing)
print("Unexpected keys:", unexpected)

In [None]:
### Predictions on NER model outputs

In [None]:
# Load the candidate pairs, add a dummy label column (all zeros) and rename
# the columns to the names CustomDataset reads.
test_folds = (
    pd.read_csv('relations_test.csv')
    .assign(labels=0)
    .rename(columns={"entity_doc": "document", "relation_text_1": "relation_text"})
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

test_dataset = CustomDataset(test_folds, tokenizer)

# Sequential, unshuffled batches of 64 so predictions stay in row order.
test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
    drop_last=False,
)

In [33]:
# Move the relation-extraction model to the GPU; the device name is
# hard-coded, so a CUDA device is required.
model = model.to('cuda')

In [None]:
# Fetch the first sample as a quick look at what the dataset yields.
test_dataset[0]

In [35]:
def valid_epoch(valid_loader, model, device):
    """Run ``model`` over ``valid_loader`` without gradients and collect outputs.

    Args:
        valid_loader: Iterable of batches; each batch is a mapping with the
            keys ``"inputs"`` (mapping of tensors incl. ``attention_mask``)
            and ``"document"``.
        model: Callable taking the ``inputs`` mapping and returning a tensor.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        A dict with:
            ``"predictions"``: ``np.ndarray``, the per-batch outputs
                concatenated along the first axis.
            ``"document"``: list with the document id of every sample, in
                loader order.
            ``""``: document ids of the *last* batch only. Legacy key kept
                for backward compatibility; prefer ``"document"``.

    Raises:
        ValueError: If ``valid_loader`` yields no batches.
    """
    model.eval() # set model in evaluation mode
    preds = []
    all_document_ids = []
    document_ids = None
    with tqdm(valid_loader, unit="valid_batch", desc='Validation') as tqdm_valid_loader:
        for batch in tqdm_valid_loader:
            inputs = batch.pop("inputs")
            document_ids = batch.pop("document")
            inputs = collate(inputs) # trim padding to the batch's longest sequence
            for k, v in inputs.items():
                inputs[k] = v.to(device) # send inputs to device
            with torch.no_grad():
                y_preds = model(inputs) # forward propagation pass
            preds.append(y_preds.to('cpu').numpy()) # save predictions

            # Keep the ids of every batch, not just the last one.
            if torch.is_tensor(document_ids):
                all_document_ids.extend(document_ids.tolist())
            else:
                all_document_ids.extend(document_ids)

    if not preds:
        raise ValueError("valid_loader yielded no batches")

    prediction_dict = {}
    prediction_dict["predictions"] = np.concatenate(preds) # np.array() of shape (fold_size, target_cols)
    prediction_dict["document"] = all_document_ids
    prediction_dict[""] = document_ids
    return prediction_dict

In [None]:
# Softmax over dimension 1 of the collected scores.
softmax = nn.Softmax(dim=1)

# Score every candidate pair with the relation model on the GPU.
test_dict = valid_epoch(test_loader, model, 'cuda')
test_pred = test_dict["predictions"]
# torch.max along dim=1 returns (values, indices); only the indices — the
# predicted class ids — are kept.
_, test_pred = torch.max(softmax(torch.tensor(test_pred)), dim=1)

test_folds["preds"] = test_pred

# Release cached GPU memory and run the garbage collector.
torch.cuda.empty_cache()
gc.collect()

### Mapping predicted class to relation text

In [37]:
# Relation names in class-id order: the position in this list is the class id.
_RELATION_NAMES = [
    'no_relation',
    'Developer_of',
    'URL_of',
    'Citation_of',
    'PlugIn_of',
    'Version_of',
    'Extension_of',
    'Specification_of',
    'Release_of',
    'Abbreviation_of',
    'License_of',
    'AlternativeName_of',
]

# Name -> id and id -> name lookups.
re_lab_dict = {name: class_id for class_id, name in enumerate(_RELATION_NAMES)}
re_lab_inv_map = dict(enumerate(_RELATION_NAMES))

In [38]:
# Translate predicted class ids into relation names (get_preds performs the
# same assignment again).
test_folds['relations'] = test_folds['preds'].apply(lambda x: re_lab_inv_map[x])

In [39]:
def get_preds(df):
    """Keep the predicted relations whose type matches the first entity's label.

    A row is kept when the label of entity A — the part of ``entity_token_a``
    before ``' [SEP] '`` with its two-character BIO prefix removed — equals
    the part of the predicted relation name before the first underscore
    (e.g. ``Developer`` for ``Developer_of``).

    The input frame is not modified.

    Args:
        df: DataFrame with the columns ``preds``, ``entity_token_1``,
            ``entity_token_a``, ``entity_token_b``, ``entity_token_2`` and
            ``document``.

    Returns:
        A new DataFrame sorted by ``document`` with the columns
        ``entity_token_1``, ``entity_token_a``, ``entity_token_b``,
        ``entity_token_2``, ``document`` and ``relations``.
    """
    relations = df['preds'].apply(lambda x: re_lab_inv_map[x])
    entity_label_a = df['entity_token_a'].apply(lambda x: x.split(' [SEP] ')[0][2:])
    relations_type = relations.apply(lambda x: x.split('_')[0])

    # Work on a fresh frame so no helper column leaks into the caller's data.
    kept = df[['entity_token_1', 'entity_token_a', 'entity_token_b',
               'entity_token_2', 'document']].assign(relations=relations)
    kept = kept[entity_label_a == relations_type].reset_index(drop = True)
    kept = kept.sort_values('document')
    return kept

In [40]:
# Reduce the table to the relations accepted by get_preds.
test_folds = get_preds(test_folds)

In [41]:
# Order by document and renumber the rows from zero.
test_folds = test_folds.sort_values('document').reset_index(drop = True)

### Computing final Relations Predictions

In [42]:
import pandas as pd

# One output line per document index 0..219: the document's relations as
# "<relation>\t<token 1>\t<token 2>" items joined by "; ", or an empty line
# when the document has none.
docs_with_relations = set(test_folds['document'].unique())

relation_dict = {}
for doc_id in range(220):
    if doc_id not in docs_with_relations:
        relation_dict[doc_id] = ''
        continue

    doc_rows = test_folds[test_folds['document'] == doc_id]
    relation_dict[doc_id] = "; ".join(
        f"{relation}\t{first}\t{second}"
        for relation, first, second in zip(
            doc_rows['relations'], doc_rows['entity_token_1'], doc_rows['entity_token_2']
        )
    )

with open("predictions.relations.txt", "w") as f:
    f.writelines(f"{text}\n" for text in relation_dict.values())