In [None]:
# Cell 1: Setup Environment & Download Data (Updated for namdtgk14 dataset)
!git clone https://github.com/kevinscaria/instructabsa.git
%cd instructabsa
!pip install -r requirements.txt
!pip install kagglehub

import kagglehub
import pandas as pd
import os
import ast

# 1. Download the new dataset
print("Downloading dataset from Kaggle...")
path = kagglehub.dataset_download("namdtgk14/aspect-based-sentiment-analysis-for-financial-news")
print("Path to dataset:", path)

# Find the CSV file.
# os.listdir() returns entries in arbitrary order, so sort to make the pick reproducible,
# and fail with a clear message (instead of a bare IndexError) when no CSV was downloaded.
csv_candidates = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
if not csv_candidates:
    raise FileNotFoundError(f"No .csv file found in downloaded dataset directory: {path}")
csv_file = csv_candidates[0]
full_path = os.path.join(path, csv_file)
print(f"Found CSV file: {full_path}")

# 2. Process Data using the 'split' column
def _parse_decisions(raw_decisions):
    """Return the entity -> sentiment dict encoded in a 'Decisions' cell, or None if unusable."""
    if isinstance(raw_decisions, dict):
        return raw_decisions
    if not isinstance(raw_decisions, str):
        return None  # e.g. NaN for an empty cell

    # Try the text exactly as read first; only fall back to collapsing doubled quotes when
    # that fails, so literals that legitimately contain "" are not corrupted.
    candidates = [raw_decisions]
    if '""' in raw_decisions:
        candidates.append(raw_decisions.replace('""', '"'))
    for candidate in candidates:
        try:
            parsed = ast.literal_eval(candidate)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def process_and_split_data(input_path, output_dir='.'):
    """Convert the downloaded CSV into InstructABSA-style train/val/test CSV files.

    Each usable input row becomes one output row with the columns ``sentenceId``,
    ``raw_text``, ``aspectTerms`` (stringified list of ``{'term', 'polarity'}`` dicts)
    and ``aspectCategories``. Rows are routed by their ``split`` value (trimmed,
    lower-cased; ``validation``/``valid``/``dev`` are treated as ``val``).

    Args:
        input_path: Path of the source CSV; must contain the columns ``Title``,
            ``Decisions`` and ``split``.
        output_dir: Directory that receives ``train_internal.csv``,
            ``val_internal.csv`` and ``test_internal.csv``. Defaults to the
            current working directory.

    Raises:
        KeyError: If a required column is missing from the input CSV.

    Rows are skipped (and counted in the printed summary) when ``Decisions`` cannot be
    parsed into a non-empty dict, when a sentiment value is not a string, or when
    ``Title`` is missing, since a non-string title cannot be concatenated into a prompt.
    """
    df = pd.read_csv(input_path)

    # Fail up front on schema problems instead of silently skipping every row.
    missing = [col for col in ('Title', 'Decisions', 'split') if col not in df.columns]
    if missing:
        raise KeyError(f"Input CSV is missing required column(s): {missing}")

    split_aliases = {'validation': 'val', 'valid': 'val', 'dev': 'val'}
    formatted_data = []
    skipped_rows = 0

    for index, title, raw_decisions, raw_split in zip(df.index, df['Title'], df['Decisions'], df['split']):
        decisions = _parse_decisions(raw_decisions)
        usable = (
            bool(decisions)
            and isinstance(title, str)
            and all(isinstance(sentiment, str) for sentiment in decisions.values())
        )
        if not usable:
            skipped_rows += 1
            continue

        # Convert to list of dicts for InstructABSA
        aspect_terms = [
            {'term': entity, 'polarity': sentiment.lower()}
            for entity, sentiment in decisions.items()
        ]

        # Normalize to lowercase and strip just in case (e.g. "Train " -> "train")
        split_type = str(raw_split).strip().lower()
        split_type = split_aliases.get(split_type, split_type)

        formatted_data.append({
            'sentenceId': f"{index}:1",
            'raw_text': title,
            'aspectTerms': str(aspect_terms),
            'aspectCategories': "[{'category': 'none', 'polarity': 'none'}]",
            'split_type': split_type  # Keep track of the split
        })

    # Explicit columns keep the frame well-formed even when no row survived, so the
    # split filters below still work and header-only CSVs are written.
    final_df = pd.DataFrame(
        formatted_data,
        columns=['sentenceId', 'raw_text', 'aspectTerms', 'aspectCategories', 'split_type'],
    )

    # Split on 'split_type' and save the internal CSVs for the training script.
    os.makedirs(output_dir, exist_ok=True)
    split_sizes = {}
    for split_name in ('train', 'val', 'test'):
        split_df = final_df[final_df['split_type'] == split_name].drop(columns=['split_type'])
        split_df.to_csv(os.path.join(output_dir, f'{split_name}_internal.csv'), index=False)
        split_sizes[split_name] = len(split_df)

    print("Done! Processed data using provided 'split' column.")
    for split_name, n_rows in split_sizes.items():
        print(f" - Created '{split_name}_internal.csv' ({n_rows} rows)")

    # Surface data loss that used to be silent.
    if skipped_rows:
        print(f" - Skipped {skipped_rows} row(s) with unusable 'Decisions' or 'Title' values")
    unassigned_rows = len(final_df) - sum(split_sizes.values())
    if unassigned_rows:
        print(f" - Dropped {unassigned_rows} row(s) whose 'split' value is not train/val/test")

# Run the preprocessing on the CSV located above; this writes train_internal.csv,
# val_internal.csv and test_internal.csv relative to the current working directory.
process_and_split_data(full_path)

Cloning into 'instructabsa'...
remote: Enumerating objects: 562, done.[K
remote: Counting objects: 100% (224/224), done.[K
remote: Compressing objects: 100% (114/114), done.[K
remote: Total 562 (delta 123), reused 197 (delta 106), pack-reused 338 (from 1)[K
Receiving objects: 100% (562/562), 1.58 MiB | 3.16 MiB/s, done.
Resolving deltas: 100% (320/320), done.
/content/instructabsa
Collecting evaluate (from -r requirements.txt (line 4))
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6
Downloading dataset from Kaggle...
Downloading from https://www.kaggle.com/api/v1/datasets/download/namdtgk14/aspect-based-sentiment-analysis-for-financial-ne

100%|██████████| 361k/361k [00:00<00:00, 973kB/s]

Extracting files...





Path to dataset: /root/.cache/kagglehub/datasets/namdtgk14/aspect-based-sentiment-analysis-for-financial-news/versions/1
Found CSV file: /root/.cache/kagglehub/datasets/namdtgk14/aspect-based-sentiment-analysis-for-financial-news/versions/1/SEntFiN-v1.1_with_split.csv
Done! Processed data using provided 'split' column.
 - Created 'train_internal.csv' (7693 rows)
 - Created 'val_internal.csv' (833 rows)
 - Created 'test_internal.csv' (2227 rows)


In [None]:
# Cell 2: Create Instructions and Data Prep Scripts
# 1. instructions.py
with open("instructions.py", "w") as f:
    f.write('''class InstructionsHandler:
    def __init__(self):
        self.ate = {
            'bos_instruct1': """Definition: The output will be the financial entities or aspects (both implicit and explicit) which have an associated sentiment/opinion extracted from the input text. In cases where there are no aspects the output should be noaspectterm.\\n    Positive example 1-\\n    input: Net profit surged by 20% in the last quarter surpassing analyst estimates.\\n    output: Net profit\\n    Positive example 2-\\n    input: The company's balance sheet remains strong with high liquidity.\\n    output: balance sheet, liquidity\\n    Negative example 1-\\n    input: Shares of ABC Corp plummeted due to the scandal.\\n    output: Shares\\n    Negative example 2-\\n    input: High inflation continues to hurt the operating margin.\\n    output: operating margin\\n    Neutral example 1-\\n    input: The board of directors announced a meeting scheduled for next Monday.\\n    output: board of directors\\n    Neutral example 2-\\n    input: SpiceJet to issue 6.4 crore warrants to promoters.\\n    output: SpiceJet\\n    Now complete the following example-\\n    input: """,
            'bos_instruct2': """Definition: The output will be the financial entities or aspects (both implicit and explicit) which have an associated sentiment/opinion extracted from the input text. In cases where there are no aspects the output should be noaspectterm.\\n    Positive example 1-\\n    input: Net profit surged by 20% in the last quarter surpassing analyst estimates.\\n    output: Net profit\\n    Positive example 2-\\n    input: The company's balance sheet remains strong with high liquidity.\\n    output: balance sheet, liquidity\\n    Negative example 1-\\n    input: Shares of ABC Corp plummeted due to the scandal.\\n    output: Shares\\n    Negative example 2-\\n    input: High inflation continues to hurt the operating margin.\\n    output: operating margin\\n    Neutral example 1-\\n    input: The board of directors announced a meeting scheduled for next Monday.\\n    output: board of directors\\n    Neutral example 2-\\n    input: SpiceJet to issue 6.4 crore warrants to promoters.\\n    output: SpiceJet\\n    Now complete the following example-\\n    input: """,
            'eos_instruct': ' \\noutput:'
        }
        self.atsc = {
            'bos_instruct1': """Definition: The output will be 'positive' if the sentiment of the identified financial entity or aspect in the input is positive (good news, growth, profit). If the sentiment is negative (loss, drop, risk), the answer will be 'negative'. Otherwise, the output should be 'neutral'. For aspects which are classified as noaspectterm, the sentiment is none.\\n    Positive example 1-\\n    input: Profits for Apple surged by 20% this quarter exceeding expectations. The aspect is Profits.\\n    output: positive\\n    Positive example 2-\\n    input: The bank maintains a healthy capital adequacy ratio. The aspect is capital adequacy ratio.\\n    output: positive\\n    Negative example 1-\\n    input: Stocks of Tesla fell sharply due to production delays. The aspect is Stocks.\\n    output: negative\\n    Negative example 2-\\n    input: Rising debt levels are a major concern for the investors. The aspect is debt levels.\\n    output: negative\\n    Neutral example 1-\\n    input: SpiceJet to issue 6.4 crore warrants to promoters. The aspect is SpiceJet.\\n    output: neutral\\n    Neutral example 2-\\n    input: The merger discussion is still ongoing with no final decision. The aspect is merger.\\n    output: neutral\\n    Now complete the following example-\\n    input: """,
            'bos_instruct2': """Definition: The output will be 'positive' if the sentiment of the identified financial entity or aspect in the input is positive (good news, growth, profit). If the sentiment is negative (loss, drop, risk), the answer will be 'negative'. Otherwise, the output should be 'neutral'. For aspects which are classified as noaspectterm, the sentiment is none.\\n    Positive example 1-\\n    input: Profits for Apple surged by 20% this quarter exceeding expectations. The aspect is Profits.\\n    output: positive\\n    Positive example 2-\\n    input: The bank maintains a healthy capital adequacy ratio. The aspect is capital adequacy ratio.\\n    output: positive\\n    Negative example 1-\\n    input: Stocks of Tesla fell sharply due to production delays. The aspect is Stocks.\\n    output: negative\\n    Negative example 2-\\n    input: Rising debt levels are a major concern for the investors. The aspect is debt levels.\\n    output: negative\\n    Neutral example 1-\\n    input: SpiceJet to issue 6.4 crore warrants to promoters. The aspect is SpiceJet.\\n    output: neutral\\n    Neutral example 2-\\n    input: The merger discussion is still ongoing with no final decision. The aspect is merger.\\n    output: neutral\\n    Now complete the following example-\\n    input: """,
            'delim_instruct': ' The aspect is ',
            'eos_instruct': ' \\noutput:'
        }
        self.joint = {
            'bos_instruct1': """Definition: The output will be the financial aspects and their sentiment polarity. Format: aspect:sentiment.\\n    Positive example 1-\\n    input: Revenue grew significantly.\\n    output: Revenue:positive\\n    Negative example 1-\\n    input: Costs are spiraling out of control.\\n    output: Costs:negative\\n    Neutral example 1-\\n    input: The CEO spoke at the conference.\\n    output: CEO:neutral\\n    Now complete the following example-\\n    input: """,
            'bos_instruct2': """Definition: The output will be the financial aspects and their sentiment polarity. Format: aspect:sentiment.\\n    Positive example 1-\\n    input: Revenue grew significantly.\\n    output: Revenue:positive\\n    Negative example 1-\\n    input: Costs are spiraling out of control.\\n    output: Costs:negative\\n    Neutral example 1-\\n    input: The CEO spoke at the conference.\\n    output: CEO:neutral\\n    Now complete the following example-\\n    input: """,
            'eos_instruct': ' \\noutput:'
        }

    def load_instruction_set1(self):
        # Currently a no-op: each attribute is re-assigned to itself, so the templates
        # built in __init__ are left exactly as they are.
        # NOTE(review): presumably meant to select a distinct instruction set - confirm intent.
        self.ate = self.ate
        self.atsc = self.atsc
        self.joint = self.joint

    def load_instruction_set2(self):
        # Currently a no-op, identical in effect to load_instruction_set1: each attribute
        # is re-assigned to itself.
        # NOTE(review): presumably meant to select a second instruction set - confirm intent.
        self.ate = self.ate
        self.atsc = self.atsc
        self.joint = self.joint
''')

# 2. InstructABSA/data_prep.py
with open("InstructABSA/data_prep.py", "w") as f:
    f.write('''from datasets import Dataset
from datasets.dataset_dict import DatasetDict
import ast

class DatasetLoader:
    """Prepare ABSA DataFrames as instruction-formatted text/label pairs and Hugging Face datasets.

    The in-domain (``*_id``) and out-of-domain (``*_ood``) frames are stored as attributes.
    The ``create_data_in_*`` helpers add a ``text`` column (instruction + input) and a
    ``labels`` column (target string); ``set_data_for_training_semeval`` wraps whichever
    frames are set into raw and tokenized ``DatasetDict`` objects.
    """

    def __init__(self, train_df_id=None, test_df_id=None,
                 train_df_ood=None, test_df_ood=None, sample_size=1,
                 val_df_id=None, val_df_ood=None):
        # Only the in-domain training frame is sampled; with the default frac=1 this
        # keeps every row but shuffles them with a fixed seed.
        self.train_df_id = train_df_id.sample(frac=sample_size, random_state=1999) if train_df_id is not None else train_df_id
        self.test_df_id = test_df_id
        self.train_df_ood = train_df_ood
        self.test_df_ood = test_df_ood
        self.val_df_id = val_df_id
        self.val_df_ood = val_df_ood

    def reconstruct_strings(self, df, col):
        """Parse string-encoded Python literals in ``df[col]`` in place and return ``df``.

        Lists and dicts are kept as they are, strings go through ``ast.literal_eval``,
        and anything else (e.g. NaN) or an unparseable string becomes ``[]``.
        """
        parsed = []
        for cell in df[col]:
            if isinstance(cell, (list, dict)):
                parsed.append(cell)
            elif isinstance(cell, str):
                try:
                    parsed.append(ast.literal_eval(cell))
                except (ValueError, SyntaxError):
                    parsed.append([])
            else:
                parsed.append([])
        df[col] = parsed
        return df

    def _ensure_parsed(self, df, col, key):
        """Run ``reconstruct_strings`` on ``col`` unless its first cell already holds dicts with ``key``."""
        try:
            df.iloc[0][col][0][key]
        except Exception:
            # Not a parsed list of dicts (or an empty frame/list): normalise the column.
            df = self.reconstruct_strings(df, col)
        return df

    def _repeat_rows(self, df, counts, min_val):
        """Return a copy of ``df`` with each row repeated ``counts`` times plus a ``record_idx`` column.

        When ``min_val`` is given, rows whose count is 0 are repeated ``min_val`` times
        instead of being dropped. ``record_idx`` numbers the copies of each source row.
        """
        counts = counts.astype('int64')
        if min_val is not None:
            counts = counts.where(counts != 0, min_val)
        expanded = df.loc[df.index.repeat(counts)].copy()
        expanded['record_idx'] = expanded.groupby(expanded.index).cumcount()
        return expanded

    def extract_rowwise_aspect_polarity(self, df, on, key, min_val = None):
        """Explode ``df`` to one row per aspect entry, adding ``aspect`` and ``polarity`` columns.

        ``df[on]`` holds (possibly string-encoded) lists of dicts; ``key`` selects the aspect
        text and the literal key 'polarity' selects the label. Rows with an empty list are
        dropped unless ``min_val`` is given, in which case they are kept ``min_val`` times
        with empty aspect and polarity. The returned frame has a fresh 0..n-1 index.
        """
        df = self._ensure_parsed(df, on, key)
        # Lengths are kept in a separate Series so no helper column leaks into the caller's frame.
        df = self._repeat_rows(df, df[on].apply(len), min_val)
        # Iterate over the named columns instead of positional row indexing (x[0], x[1]),
        # which pandas deprecates for label-indexed rows.
        pairs = [
            (items[idx][key], items[idx]['polarity']) if len(items) != 0 else ('', '')
            for items, idx in zip(df[on], df['record_idx'])
        ]
        df['aspect'] = [pair[0] for pair in pairs]
        df['polarity'] = [pair[-1] for pair in pairs]
        df = df.drop(['record_idx'], axis=1).reset_index(drop = True)
        return df

    def extract_rowwise_aspect_opinions(self, df, aspect_col, opinion_col, key, min_val = None):
        """Explode ``df`` to one row per aspect, adding space-joined ``aspect`` and ``opinion_term`` columns.

        The i-th entry of ``df[aspect_col]`` is paired with the i-th entry of
        ``df[opinion_col]``; ``entry[key]`` is joined with single spaces. Empty lists
        yield an empty string. Both columns must already hold parsed lists.
        """
        df = self._repeat_rows(df, df[aspect_col].apply(len), min_val)
        df['aspect'] = [
            ' '.join(items[idx][key]) if len(items) != 0 else ''
            for items, idx in zip(df[aspect_col], df['record_idx'])
        ]
        df['opinion_term'] = [
            ' '.join(items[idx][key]) if len(items) != 0 else ''
            for items, idx in zip(df[opinion_col], df['record_idx'])
        ]
        df = df.drop(['record_idx'], axis=1).reset_index(drop = True)
        return df

    def create_data_in_ate_format(self, df, key, text_col, aspect_col, bos_instruction = '', eos_instruction = ''):
        """Add ``labels`` (comma-joined aspect terms) and ``text`` (instruction-wrapped input) to ``df``.

        Returns None when ``df`` is None; otherwise ``df`` is modified in place and returned.
        """
        if df is None: return
        df = self._ensure_parsed(df, aspect_col, key)
        df['labels'] = df[aspect_col].apply(lambda x: ', '.join([i[key] for i in x]))
        df['text'] = df[text_col].apply(lambda x: bos_instruction + x + eos_instruction)
        return df

    def create_data_in_atsc_format(self, df, on, key, text_col, aspect_col, bos_instruction = '', delim_instruction = '', eos_instruction = ''):
        """Build one row per aspect with ``text`` = bos + input + delim + aspect + eos and ``labels`` = polarity.

        ``aspect_col`` names the per-row aspect column to read after the explode step
        (``extract_rowwise_aspect_polarity`` creates it as 'aspect'). Rows without any
        aspect are kept once, with an empty aspect and an empty label. Returns None when
        ``df`` is None.
        """
        if df is None: return
        df = self.extract_rowwise_aspect_polarity(df, on=on, key=key, min_val=1)
        df['text'] = [
            bos_instruction + text + delim_instruction + aspect + eos_instruction
            for text, aspect in zip(df[text_col], df[aspect_col])
        ]
        df = df.rename(columns = {'polarity': 'labels'})
        return df

    def create_data_in_aspe_format(self, df, key, label_key, text_col, aspect_col, bos_instruction = '', eos_instruction = ''):
        """Add ``labels`` as comma-joined ``aspect:label`` pairs and the instruction-wrapped ``text``.

        Returns None when ``df`` is None; otherwise ``df`` is modified in place and returned.
        """
        if df is None: return
        df = self._ensure_parsed(df, aspect_col, key)
        df['labels'] = df[aspect_col].apply(lambda x: ', '.join([f"{i[key]}:{i[label_key]}" for i in x]))
        df['text'] = df[text_col].apply(lambda x: bos_instruction + x + eos_instruction)
        return df

    def create_data_in_aooe_format(self, df, aspect_col, opinion_col, key, text_col, bos_instruction = '', delim_instruction = '', eos_instruction = ''):
        """Build one row per aspect with ``text`` = bos + input + delim + aspect + eos and ``labels`` = opinion term.

        Returns None when ``df`` is None.
        """
        if df is None: return
        df = self.extract_rowwise_aspect_opinions(df, aspect_col=aspect_col, opinion_col=opinion_col, key=key, min_val=1)
        df['text'] = [
            bos_instruction + text + delim_instruction + aspect + eos_instruction
            for text, aspect in zip(df[text_col], df['aspect'])
        ]
        df = df.rename(columns = {'opinion_term': 'labels'})
        return df

    def create_data_in_aope_format(self, df, key, text_col, aspect_col, opinion_col, bos_instruction = '', eos_instruction = ''):
        """Add ``labels`` as comma-joined ``aspect:opinion`` pairs and the instruction-wrapped ``text``.

        Aspect and opinion entries are paired positionally and ``entry[key]`` is joined
        with single spaces. Returns None when ``df`` is None, like the sibling helpers.
        """
        if df is None: return
        df['labels'] = [
            ', '.join([f"{' '.join(i[key])}:{' '.join(j[key])}" for i, j in zip(aspects, opinions)])
            for aspects, opinions in zip(df[aspect_col], df[opinion_col])
        ]
        df['text'] = df[text_col].apply(lambda x: bos_instruction + x + eos_instruction)
        return df

    def create_data_in_aoste_format(self, df, key, label_key, text_col, aspect_col, opinion_col, bos_instruction = '', eos_instruction = ''):
        """Add ``labels`` as comma-joined ``aspect:opinion:sentiment`` triplets and the wrapped ``text``.

        The aspect entry's ``label_key`` value must be one of 'POS', 'NEG' or 'NEU'
        (anything else raises KeyError). Returns None when ``df`` is None.
        """
        if df is None: return
        label_map = {'POS':'positive', 'NEG':'negative', 'NEU':'neutral'}
        df['labels'] = [
            ', '.join([f"{' '.join(i[key])}:{' '.join(j[key])}:{label_map[i[label_key]]}" for i, j in zip(aspects, opinions)])
            for aspects, opinions in zip(df[aspect_col], df[opinion_col])
        ]
        df['text'] = df[text_col].apply(lambda x: bos_instruction + x + eos_instruction)
        return df

    def _build_dataset_pair(self, frames, tokenize_function):
        """Return ``(DatasetDict, tokenized DatasetDict)`` for the non-None frames, or ``({}, {})`` if none."""
        splits = {name: Dataset.from_pandas(frame) for name, frame in frames.items() if frame is not None}
        if not splits:
            return {}, {}
        dataset = DatasetDict(splits)
        return dataset, dataset.map(tokenize_function, batched=True)

    def set_data_for_training_semeval(self, tokenize_function):
        """Wrap the stored frames into raw and tokenized dataset dicts.

        Returns ``(indomain, indomain_tokenized, other_domain, other_domain_tokenized)``.
        Each domain contains the 'train', 'test' and 'validation' splits whose frame is
        set; a domain without any frame yields two empty dicts. A single available split
        (for example evaluation with only a test set) is still built, and the
        out-of-domain pair depends only on the out-of-domain frames.
        """
        indomain_dataset, indomain_tokenized_datasets = self._build_dataset_pair(
            {'train': self.train_df_id, 'test': self.test_df_id, 'validation': self.val_df_id},
            tokenize_function,
        )
        other_domain_dataset, other_domain_tokenized_dataset = self._build_dataset_pair(
            {'train': self.train_df_ood, 'test': self.test_df_ood, 'validation': self.val_df_ood},
            tokenize_function,
        )
        return indomain_dataset, indomain_tokenized_datasets, other_domain_dataset, other_domain_tokenized_dataset
''')

# 3. InstructABSA/utils.py (Initial version for training)
with open("InstructABSA/utils.py", "w") as f:
    f.write('''import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from transformers import (
    DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments, Trainer, Seq2SeqTrainer
)

class T5Generator:
    """Seq2seq wrapper (tokenizer, model, collator) for the generative tasks of this pipeline."""

    def __init__(self, model_checkpoint):
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
        self.data_collator = DataCollatorForSeq2Seq(self.tokenizer)
        self.device = self._detect_device()

    @staticmethod
    def _detect_device():
        """Return 'cuda', 'mps' or 'cpu' depending on what is usable at runtime.

        torch.has_cuda / torch.has_mps only report how torch was built, so a CUDA build
        on a CPU-only machine would select 'cuda' and fail on the first ``.to()`` call.
        """
        if torch.cuda.is_available():
            return 'cuda'
        mps_backend = getattr(torch.backends, 'mps', None)  # absent on older torch builds
        if mps_backend is not None and mps_backend.is_available():
            return 'mps'
        return 'cpu'

    def tokenize_function_inputs(self, sample):
        """Tokenize a batch: ``sample['text']`` (max 512 tokens) as inputs, ``sample['labels']`` (max 64) as targets."""
        model_inputs = self.tokenizer(sample['text'], max_length=512, truncation=True)
        labels = self.tokenizer(sample["labels"], max_length=64, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def train(self, tokenized_datasets, **kwargs):
        """Fine-tune the model with ``Seq2SeqTrainer``, save it, and return the trainer.

        ``kwargs`` are passed straight to ``Seq2SeqTrainingArguments``. The 'validation'
        split is used for evaluation when present, otherwise the 'test' split (may be None).
        """
        args = Seq2SeqTrainingArguments(**kwargs)
        eval_ds = tokenized_datasets.get("validation")
        if eval_ds is None: eval_ds = tokenized_datasets.get("test")
        trainer = Seq2SeqTrainer(
            self.model, args, train_dataset=tokenized_datasets["train"],
            eval_dataset=eval_ds, tokenizer=self.tokenizer, data_collator=self.data_collator,
        )
        torch.cuda.empty_cache()
        trainer.train()
        trainer.save_model()
        return trainer

    def get_labels(self, tokenized_dataset, batch_size = 4, max_length = 128, sample_set = 'train'):
        """Generate and decode predictions for ``tokenized_dataset[sample_set]``.

        Inputs are padded per batch with the tokenizer's pad token id. Returns a list
        with one decoded string per example, in dataset order.
        """
        def collate_fn(batch):
            input_ids = [torch.tensor(example['input_ids']) for example in batch]
            input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
            return input_ids
        dataloader = DataLoader(tokenized_dataset[sample_set], batch_size=batch_size, collate_fn=collate_fn)
        predicted_output = []
        self.model.to(self.device)
        # A model that was just trained is still in training mode; switch dropout off so
        # predictions are deterministic.
        self.model.eval()
        for batch in tqdm(dataloader):
            batch = batch.to(self.device)
            output_ids = self.model.generate(batch, max_length = max_length)
            output_texts = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            predicted_output.extend(output_texts)
        return predicted_output

    def get_metrics(self, y_true, y_pred, is_triplet_extraction=False):
        """Return ``(precision, recall, f1, None)``, macro-averaged over exact label matches.

        ``is_triplet_extraction`` is accepted for interface compatibility and ignored in
        this simplified version.
        """
        # Simplified for brevity in this combined script
        return precision_score(y_true, y_pred, average='macro'), recall_score(y_true, y_pred, average='macro'), f1_score(y_true, y_pred, average='macro'), None

class T5Classifier:
    """Seq2seq wrapper (tokenizer, model, collator) used for the 'atsc' classification task."""

    def __init__(self, model_checkpoint):
        # force_download=True asks from_pretrained to fetch the files again rather than
        # reuse a cached copy. NOTE(review): probably unnecessary on repeated runs - confirm.
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, force_download = True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, force_download = True)
        self.data_collator = DataCollatorForSeq2Seq(self.tokenizer)
        self.device = self._detect_device()

    @staticmethod
    def _detect_device():
        """Return 'cuda', 'mps' or 'cpu' depending on what is usable at runtime.

        torch.has_cuda / torch.has_mps only report how torch was built, so a CUDA build
        on a CPU-only machine would select 'cuda' and fail on the first ``.to()`` call.
        """
        if torch.cuda.is_available():
            return 'cuda'
        mps_backend = getattr(torch.backends, 'mps', None)  # absent on older torch builds
        if mps_backend is not None and mps_backend.is_available():
            return 'mps'
        return 'cpu'

    def tokenize_function_inputs(self, sample):
        """Add ``input_ids`` (from ``sample['text']``, max 512 tokens) and replace ``labels`` with token ids (max 64)."""
        sample['input_ids'] = self.tokenizer(sample["text"], max_length = 512, truncation = True).input_ids
        sample['labels'] = self.tokenizer(sample["labels"], max_length = 64, truncation = True).input_ids
        return sample

    def train(self, tokenized_datasets, **kwargs):
        """Fine-tune the model with ``Trainer``, save it, and return the trainer.

        ``kwargs`` are passed straight to ``Seq2SeqTrainingArguments``. The 'validation'
        split is used for evaluation when present, otherwise the 'test' split (may be None).
        """
        args = Seq2SeqTrainingArguments(**kwargs)
        eval_ds = tokenized_datasets.get("validation")
        if eval_ds is None: eval_ds = tokenized_datasets.get("test")
        trainer = Trainer(
            self.model, args, train_dataset=tokenized_datasets["train"],
            eval_dataset=eval_ds, tokenizer=self.tokenizer, data_collator = self.data_collator
        )
        torch.cuda.empty_cache()
        trainer.train()
        trainer.save_model()
        return trainer

    def get_labels(self, tokenized_dataset, batch_size = 4, sample_set = 'train', max_length = None):
        """Generate and decode predictions for ``tokenized_dataset[sample_set]``.

        ``max_length`` is optional and appended last so existing positional calls keep
        working; run_model.py passes it by keyword for every task. When it is None,
        ``generate`` runs with the model's own default length.
        Returns a list with one decoded string per example, in dataset order.
        """
        def collate_fn(batch):
            input_ids = [torch.tensor(example['input_ids']) for example in batch]
            input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
            return input_ids
        dataloader = DataLoader(tokenized_dataset[sample_set], batch_size=batch_size, collate_fn=collate_fn)
        generate_kwargs = {} if max_length is None else {'max_length': max_length}
        predicted_output = []
        self.model.to(self.device)
        # A model that was just trained is still in training mode; switch dropout off so
        # predictions are deterministic.
        self.model.eval()
        for batch in tqdm(dataloader):
            batch = batch.to(self.device)
            output_ids = self.model.generate(batch, **generate_kwargs)
            output_texts = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            predicted_output.extend(output_texts)
        return predicted_output

    def get_metrics(self, y_true, y_pred):
        """Return macro-averaged ``(precision, recall, f1)`` plus ``accuracy`` over exact label matches."""
        return precision_score(y_true, y_pred, average='macro'), recall_score(y_true, y_pred, average='macro'), f1_score(y_true, y_pred, average='macro'), accuracy_score(y_true, y_pred)
''')

# 4. run_model.py
with open("run_model.py", "w") as f:
    f.write('''import os
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import torch
from InstructABSA.data_prep import DatasetLoader
from InstructABSA.utils import T5Generator, T5Classifier
from InstructABSA.config import Config
from instructions import InstructionsHandler

# torch.has_mps only reports whether this torch build includes MPS support; ask the
# backend whether an MPS device is actually usable.
try:
    use_mps = bool(torch.backends.mps.is_available())
except AttributeError:
    # Older torch builds have no torch.backends.mps module at all.
    use_mps = False

# Read the run configuration and pick the instruction set it asks for.
config = Config()
instruct_handler = InstructionsHandler()
if config.inst_type == 1: instruct_handler.load_instruction_set1()
else: instruct_handler.load_instruction_set2()

# Fail fast when the selected mode has no data to work on.
if config.mode == 'train' and config.id_tr_data_path is None: raise Exception('Provide training data path.')
if config.mode == 'eval' and config.id_te_data_path is None and config.ood_te_data_path is None: raise Exception('Provide testing data path.')

# Training with an experiment name writes to <output_dir>/<task>/<checkpoint without slashes>-<experiment_name>;
# in every other case the checkpoint location itself doubles as the output path.
if config.experiment_name is not None and config.mode == 'train':
    model_checkpoint = config.model_checkpoint
    model_out_path = os.path.join(config.output_dir, config.task, f"{model_checkpoint.replace('/', '')}-{config.experiment_name}")
else:
    model_checkpoint = config.model_checkpoint
    model_out_path = config.model_checkpoint

# In-domain (id) and out-of-domain (ood) train/test CSV paths; each may be unset.
id_tr_data_path = config.id_tr_data_path
ood_tr_data_path = config.ood_tr_data_path
id_te_data_path = config.id_te_data_path
ood_te_data_path = config.ood_te_data_path

# Load whichever CSVs were supplied; frames without a path stay None.
if config.mode != 'cli':
    id_tr_df, id_te_df, ood_tr_df, ood_te_df = None, None, None, None
    if id_tr_data_path: id_tr_df = pd.read_csv(id_tr_data_path)
    if id_te_data_path: id_te_df = pd.read_csv(id_te_data_path)
    if ood_tr_data_path: ood_tr_df = pd.read_csv(ood_tr_data_path)
    if ood_te_data_path: ood_te_df = pd.read_csv(ood_te_data_path)
    print('Loaded data...')

# Keyword arguments handed to the model wrapper's train(), which forwards them to Seq2SeqTrainingArguments.
training_args = {
    'output_dir': model_out_path,
    # Evaluation is switched off when no in-domain test file was given.
    'eval_strategy': config.evaluation_strategy if config.id_te_data_path is not None else 'no',
    'learning_rate': config.learning_rate,
    'per_device_train_batch_size': config.per_device_train_batch_size,
    'per_device_eval_batch_size': config.per_device_eval_batch_size,
    'num_train_epochs': config.num_train_epochs,
    'weight_decay': config.weight_decay,
    'warmup_ratio': config.warmup_ratio,
    'save_strategy': config.save_strategy,
    'load_best_model_at_end': config.load_best_model_at_end,
    'push_to_hub': config.push_to_hub,
    'eval_accumulation_steps': config.eval_accumulation_steps,
    'predict_with_generate': config.predict_with_generate,
    'use_mps_device': use_mps
}

if config.set_instruction_key == 1:
    indomain, outdomain = 'bos_instruct1', 'bos_instruct2'
else:
    indomain, outdomain = 'bos_instruct2', 'bos_instruct1'

# Pick the model wrapper and the instruction fragments for the requested task.
# The out-of-domain prompt is only needed when an out-of-domain file was supplied.
if config.task == 'ate':
    t5_exp = T5Generator(model_checkpoint)
    bos_instruction_id = instruct_handler.ate[indomain]
    bos_instruction_ood = instruct_handler.ate[outdomain] if (ood_tr_data_path or ood_te_data_path) else ''
    eos_instruction = instruct_handler.ate['eos_instruct']
elif config.task == 'atsc':
    t5_exp = T5Classifier(model_checkpoint)
    bos_instruction_id = instruct_handler.atsc[indomain]
    bos_instruction_ood = instruct_handler.atsc[outdomain] if (ood_tr_data_path or ood_te_data_path) else ''
    delim_instruction = instruct_handler.atsc['delim_instruct']
    eos_instruction = instruct_handler.atsc['eos_instruct']
elif config.task == 'joint':
    t5_exp = T5Generator(model_checkpoint)
    bos_instruction_id = instruct_handler.joint[indomain]
    bos_instruction_ood = instruct_handler.joint[outdomain] if (ood_tr_data_path or ood_te_data_path) else ''
    eos_instruction = instruct_handler.joint['eos_instruct']
else:
    # Without this guard an unknown task only surfaces later as a NameError on t5_exp.
    raise ValueError(f"Unsupported task {config.task!r}; this script handles 'ate', 'atsc' and 'joint'.")

if config.mode != 'cli':
    loader = DatasetLoader(id_tr_df, id_te_df, ood_tr_df, ood_te_df, config.sample_size)

    # Add the instruction-wrapped 'text' column and the target 'labels' column for the selected task.
    if config.task == 'ate':
        if loader.train_df_id is not None: loader.train_df_id = loader.create_data_in_ate_format(loader.train_df_id, 'term', 'raw_text', 'aspectTerms', bos_instruction_id, eos_instruction)
        if loader.test_df_id is not None: loader.test_df_id = loader.create_data_in_ate_format(loader.test_df_id, 'term', 'raw_text', 'aspectTerms', bos_instruction_id, eos_instruction)
    elif config.task == 'atsc':
        if loader.train_df_id is not None: loader.train_df_id = loader.create_data_in_atsc_format(loader.train_df_id, 'aspectTerms', 'term', 'raw_text', 'aspect', bos_instruction_id, delim_instruction, eos_instruction)
        if loader.test_df_id is not None: loader.test_df_id = loader.create_data_in_atsc_format(loader.test_df_id, 'aspectTerms', 'term', 'raw_text', 'aspect', bos_instruction_id, delim_instruction, eos_instruction)
    elif config.task == 'joint':
        # The joint task previously got a model and instructions but no 'text'/'labels'
        # columns, so tokenization had nothing to work on. Labels are 'term:polarity' pairs.
        if loader.train_df_id is not None: loader.train_df_id = loader.create_data_in_aspe_format(loader.train_df_id, 'term', 'polarity', 'raw_text', 'aspectTerms', bos_instruction_id, eos_instruction)
        if loader.test_df_id is not None: loader.test_df_id = loader.create_data_in_aspe_format(loader.test_df_id, 'term', 'polarity', 'raw_text', 'aspectTerms', bos_instruction_id, eos_instruction)

    id_ds, id_tokenized_ds, ood_ds, ood_tokenized_ds = loader.set_data_for_training_semeval(t5_exp.tokenize_function_inputs)

    if config.mode == 'train':
        model_trainer = t5_exp.train(id_tokenized_ds, **training_args)
        print('Model saved at: ', model_out_path)
    elif config.mode == 'eval':
        print('Model loaded from: ', model_checkpoint)
        if id_tokenized_ds.get("test") is not None:
            id_te_pred_labels = t5_exp.get_labels(tokenized_dataset = id_tokenized_ds, sample_set = 'test', batch_size=config.per_device_eval_batch_size, max_length = config.max_token_length)
            # Keep prompts, gold labels and predictions side by side for later inspection.
            id_te_df = pd.DataFrame(id_ds['test'])[['text', 'labels']]
            id_te_df['pred_labels'] = id_te_pred_labels
            id_te_df.to_csv(os.path.join(config.output_path, f'{config.experiment_name}_id_test.csv'), index=False)
            print('*****Test Metrics*****')
            precision, recall, f1, accuracy = t5_exp.get_metrics(id_te_df['labels'], id_te_pred_labels)
            print('Precision: ', precision)
            print('Recall: ', recall)
            print('F1-Score: ', f1)
            if config.task == 'atsc': print('Accuracy: ', accuracy)
else:
    print("CLI mode not fully implemented in this unified script.")
''')

print("Configuration files created successfully.")

Configuration files created successfully.


In [None]:
# Cell 3: Run Training
# Fine-tunes google/flan-t5-base on the 'atsc' task (one sentiment label per aspect) for 5 epochs.
# The validation split is passed as -id_te_data_path: run_model.py hands that frame to
# DatasetLoader as the in-domain "test" set, and the trainer evaluates on the "test" split
# when no "validation" split exists.
# run_model.py builds the output path as <output_dir>/<task>/<checkpoint without "/">-<experiment_name>,
# i.e. sentfin_model_output/atsc/googleflan-t5-base-run1 for the values below.
# NOTE(review): the flags are presumably parsed by InstructABSA.config.Config - confirm names there.
!python run_model.py \
    -task atsc \
    -mode train \
    -id_tr_data_path "train_internal.csv" \
    -id_te_data_path "val_internal.csv" \
    -model_checkpoint "google/flan-t5-base" \
    -num_train_epochs 5 \
    -per_device_train_batch_size 8 \
    -learning_rate 2e-5 \
    -output_dir "sentfin_model_output" \
    -experiment_name "run1" \
    -evaluation_strategy "epoch" \
    -save_strategy "epoch"

2025-12-22 13:52:45.828572: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766411565.864731     410 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766411565.874214     410 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766411565.898921     410 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766411565.898958     410 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766411565.898967     410 computation_placer.cc:177] computation placer alr

In [None]:
# Cell 4: Save Model to Drive
!zip -r my_final_model.zip sentfin_model_output/

from google.colab import drive
import shutil
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define destination
destination_folder = '/content/drive/MyDrive/'
source_file = 'my_final_model.zip'

if os.path.exists(source_file):
    print(f"Copying {source_file} to Google Drive...")
    shutil.copy(source_file, destination_folder)
    print(f"Success! Model saved to: {destination_folder}{source_file}")
else:
    print("Error: my_final_model.zip not found.")

updating: sentfin_model_output/ (stored 0%)
updating: sentfin_model_output/atsc/ (stored 0%)
updating: sentfin_model_output/atsc/googleflan-t5-base-run1/ (stored 0%)
updating: sentfin_model_output/atsc/googleflan-t5-base-run1/special_tokens_map.json (deflated 85%)
updating: sentfin_model_output/atsc/googleflan-t5-base-run1/tokenizer_config.json (deflated 95%)
updating: sentfin_model_output/atsc/googleflan-t5-base-run1/checkpoint-6430/ (stored 0%)
updating: sentfin_model_output/atsc/googleflan-t5-base-run1/checkpoint-6430/special_tokens_map.json (deflated 85%)
updating: sentfin_model_output/atsc/googleflan-t5-base-run1/checkpoint-6430/tokenizer_config.json (deflated 95%)
updating: sentfin_model_output/atsc/googleflan-t5-base-run1/checkpoint-6430/training_args.bin (deflated 53%)
updating: sentfin_model_output/atsc/googleflan-t5-base-run1/checkpoint-6430/config.json (deflated 62%)
updating: sentfin_model_output/atsc/googleflan-t5-base-run1/checkpoint-6430/scheduler.pt (deflated 61%)
updat

In [None]:
# Cell 5: Apply Inference Patch (rewrites InstructABSA/utils.py)
#
# What this cell does: it overwrites InstructABSA/utils.py with the source text
# held in `updated_utils_code` below. run_model.py is NOT modified by this cell.
#
# The replacement utils.py defines two classes, T5Generator and T5Classifier.
# In both of them:
#   * train() uses the "validation" split for evaluation and falls back to the
#     "test" split when no "validation" entry is present;
#   * get_labels() calls model.generate(..., max_new_tokens=max_length), so its
#     `max_length` argument bounds the number of newly generated tokens.
#
# NOTE: `updated_utils_code` is a regular (non-raw) triple-quoted string, so the
# backslash-newline inside T5Classifier.get_metrics is consumed when the string
# literal is evaluated; in the file written to disk that return statement ends
# up on a single line.

updated_utils_code = """
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from transformers import (
    DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments, Trainer, Seq2SeqTrainer
)

class T5Generator:
    def __init__(self, model_checkpoint):
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
        self.data_collator = DataCollatorForSeq2Seq(self.tokenizer)
        self.device = 'cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu')

    def tokenize_function_inputs(self, sample):
        model_inputs = self.tokenizer(sample['text'], max_length=512, truncation=True)
        labels = self.tokenizer(sample["labels"], max_length=64, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def train(self, tokenized_datasets, **kwargs):
        args = Seq2SeqTrainingArguments(**kwargs)
        eval_ds = tokenized_datasets.get("validation")
        if eval_ds is None: eval_ds = tokenized_datasets.get("test")
        trainer = Seq2SeqTrainer(
            self.model, args, train_dataset=tokenized_datasets["train"],
            eval_dataset=eval_ds, tokenizer=self.tokenizer, data_collator=self.data_collator,
        )
        torch.cuda.empty_cache()
        trainer.train()
        trainer.save_model()
        return trainer

    def get_labels(self, tokenized_dataset, batch_size=4, max_length=128, sample_set='train'):
        def collate_fn(batch):
            input_ids = [torch.tensor(example['input_ids']) for example in batch]
            input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
            return input_ids
        dataloader = DataLoader(tokenized_dataset[sample_set], batch_size=batch_size, collate_fn=collate_fn)
        predicted_output = []
        self.model.to(self.device)
        for batch in tqdm(dataloader):
            batch = batch.to(self.device)
            # FIXED: max_new_tokens
            output_ids = self.model.generate(batch, max_new_tokens=max_length)
            output_texts = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            for output_text in output_texts: predicted_output.append(output_text)
        return predicted_output

    def get_metrics(self, y_true, y_pred, is_triplet_extraction=False):
        return precision_score(y_true, y_pred, average='macro'), recall_score(y_true, y_pred, average='macro'), f1_score(y_true, y_pred, average='macro'), accuracy_score(y_true, y_pred)

class T5Classifier:
    def __init__(self, model_checkpoint):
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, force_download=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, force_download=True)
        self.data_collator = DataCollatorForSeq2Seq(self.tokenizer)
        self.device = 'cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu')

    def tokenize_function_inputs(self, sample):
        sample['input_ids'] = self.tokenizer(sample["text"], max_length=512, truncation=True).input_ids
        sample['labels'] = self.tokenizer(sample["labels"], max_length=64, truncation=True).input_ids
        return sample

    def train(self, tokenized_datasets, **kwargs):
        args = Seq2SeqTrainingArguments(**kwargs)
        eval_ds = tokenized_datasets.get("validation")
        if eval_ds is None: eval_ds = tokenized_datasets.get("test")
        trainer = Trainer(
            self.model, args, train_dataset=tokenized_datasets["train"],
            eval_dataset=eval_ds, tokenizer=self.tokenizer, data_collator=self.data_collator
        )
        torch.cuda.empty_cache()
        trainer.train()
        trainer.save_model()
        return trainer

    def get_labels(self, tokenized_dataset, batch_size=4, max_length=128, sample_set='train'):
        def collate_fn(batch):
            input_ids = [torch.tensor(example['input_ids']) for example in batch]
            input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
            return input_ids
        dataloader = DataLoader(tokenized_dataset[sample_set], batch_size=batch_size, collate_fn=collate_fn)
        predicted_output = []
        self.model.to(self.device)
        for batch in tqdm(dataloader):
            batch = batch.to(self.device)
            # FIXED: max_new_tokens (This was the critical bug in the inference NB)
            output_ids = self.model.generate(batch, max_new_tokens=max_length)
            output_texts = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            for output_text in output_texts: predicted_output.append(output_text)
        return predicted_output

    def get_metrics(self, y_true, y_pred):
        return precision_score(y_true, y_pred, average='macro'), recall_score(y_true, y_pred, average='macro'), \
            f1_score(y_true, y_pred, average='macro'), accuracy_score(y_true, y_pred)
"""

# Opening with "w" truncates the existing utils.py, so the whole module is
# replaced by the text above (not appended to or merged with it).
with open("InstructABSA/utils.py", "w") as f:
    f.write(updated_utils_code)

print("Inference patches applied (switched to max_new_tokens for generation).")

Inference patches applied (switched to max_new_tokens for generation).


In [None]:
# Cell 6: Run Evaluation
# Invokes run_model.py in eval mode for the ATSC task on test_internal.csv,
# loading the model from sentfin_model_output/atsc/googleflan-t5-base-run1
# (the same directory that appears in the zip listing printed by Cell 4).
# NOTE(review): how -output_path, -per_device_eval_batch_size and
# -max_token_length are consumed depends on run_model.py and its Config, which
# this cell does not show.
!python run_model.py \
    -task atsc \
    -mode eval \
    -id_te_data_path "test_internal.csv" \
    -model_checkpoint "sentfin_model_output/atsc/googleflan-t5-base-run1" \
    -output_dir "sentfin_model_output" \
    -output_path "sentfin_model_output" \
    -experiment_name "run1_eval" \
    -per_device_eval_batch_size 16 \
    -max_token_length 128

2025-12-22 15:47:33.146820: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766418453.174633   30982 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766418453.181657   30982 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766418453.199789   30982 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766418453.199819   30982 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766418453.199826   30982 computation_placer.cc:177] computation placer alr

In [None]:
#Cell 7: Restore the zipped model from Google Drive into a local directory
import os
import zipfile

from google.colab import drive

# force_remount=True remounts Drive even when it is already mounted.
drive.mount('/content/drive', force_remount=True)

# Archive that Cell 4 copies into the Drive root folder.
gdrive_path = '/content/drive/MyDrive/my_final_model.zip'

# Local directory the archive is extracted into.
unzip_destination = 'unzipped_model/'

# Create the destination directory if it doesn't exist
os.makedirs(unzip_destination, exist_ok=True)

if os.path.exists(gdrive_path):
    print(f"Unzipping {gdrive_path} to {unzip_destination}...")
    # Extract in-process rather than shelling out to `unzip`:
    #   * existing files are overwritten without an interactive prompt, so the
    #     cell can be re-run safely;
    #   * the paths need no shell quoting;
    #   * a missing/corrupt archive raises here, so the success message below
    #     is only printed when extraction really finished.
    with zipfile.ZipFile(gdrive_path) as archive:
        archive.extractall(unzip_destination)
    print("Unzipping complete!")
else:
    print(f"Error: {gdrive_path} not found in Google Drive.")


Mounted at /content/drive
Unzipping /content/drive/MyDrive/my_final_model.zip to unzipped_model/...
Unzipping complete!


In [None]:
# Cell 8: Manual Inference Tool
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model_path = "sentfin_model_output/atsc/googleflan-t5-base-run1"

# Use the GPU when one is attached, otherwise fall back to CPU instead of
# crashing on a hard-coded .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

def predict_sentiment(text, aspect):
    """Generate the sentiment label for one aspect of one sentence.

    Builds an instruction prompt (task definition, one positive and one
    negative worked example, then the query), runs it through the
    module-level `model`, and decodes the generated tokens.

    Args:
        text: The sentence / headline to analyse.
        aspect: The entity or aspect whose sentiment is requested.

    Returns:
        The decoded generation as a string (special tokens removed);
        presumably one of 'positive', 'negative' or 'neutral', but the value
        is whatever the model emits.
    """
    prompt = (
        "Definition: The output will be 'positive' if the sentiment of the identified "
        "financial entity or aspect in the input is positive (good news, growth, profit). "
        "If the sentiment is negative (loss, drop, risk), the answer will be 'negative'. "
        "Otherwise, the output should be 'neutral'.\n"
        "Positive example 1- input: Profits for Apple surged. The aspect is Profits. output: positive\n"
        "Negative example 1- input: Tesla stock crashed. The aspect is stock. output: negative\n"
        "Now complete the following example-\n"
        f"input: {text} The aspect is {aspect}.\noutput:"
    )
    # Truncate to 512 tokens, the same input limit tokenize_function_inputs uses
    # in InstructABSA/utils.py, and place the tensors wherever the model lives
    # rather than assuming CUDA.
    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=512
    ).to(model.device)
    # Using max_new_tokens here as well to align with the patch
    outputs = model.generate(**inputs, max_new_tokens=10)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Try it out!
# Case 1: the aspect ("China") appears verbatim in the headline.
# The apostrophe is restored to U+2019; it had been mis-decoded into "‚Äô",
# which would otherwise be fed to the tokenizer as three junk characters.
text = "China’s exports to U.S. extend double-digit declines, dropping 29% in November, despite trade truce"
aspect = "China"
print(f"Input: {text} | Aspect: {aspect}")
print(f"Prediction: {predict_sentiment(text, aspect)}")
print("\n" + "="*50 + "\n")

# New test case: term not in text
# Case 2: the aspect ("stock price") never occurs in the sentence itself.
text_implicit = "The company reported unexpectedly low quarterly earnings, causing investor concern."
aspect_implicit = "stock price"
print(f"Input: {text_implicit} | Aspect: {aspect_implicit}")
print(f"Prediction: {predict_sentiment(text_implicit, aspect_implicit)}")


KeyboardInterrupt: 

In [None]:
# Preview the test split.
# The first column of test_internal.csv is an unnamed saved index; without
# index_col=0 pandas surfaces it as a spurious "Unnamed: 0" data column.
test = pd.read_csv("test_internal.csv", index_col=0)
test.head()

Unnamed: 0,sentenceId,raw_text,aspectTerms,aspectCategories
0,0:1,SpiceJet to issue 6.4 crore warrants to promoters,"[{'term': 'SpiceJet', 'polarity': 'neutral'}]","[{'category': 'none', 'polarity': 'none'}]"
1,11:1,Wait and watch on Bharti Airtel: Vinay Khattar,"[{'term': 'Bharti Airtel', 'polarity': 'neutra...","[{'category': 'none', 'polarity': 'none'}]"
2,17:1,US stocks finish mixed amid more tech selling,"[{'term': 'tech', 'polarity': 'negative'}, {'t...","[{'category': 'none', 'polarity': 'none'}]"
3,21:1,Gur closes quiet on some support,"[{'term': 'Gur', 'polarity': 'neutral'}]","[{'category': 'none', 'polarity': 'none'}]"
4,22:1,Gur closes steady on low demand,"[{'term': 'Gur', 'polarity': 'neutral'}]","[{'category': 'none', 'polarity': 'none'}]"


In [None]:
# Cell 10
# In-notebook evaluation: load the fine-tuned model, build ATSC prompts for the
# test split, generate predictions, and print macro metrics.
import ast

import pandas as pd
from datasets import Dataset

from InstructABSA.data_prep import DatasetLoader
from InstructABSA.utils import T5Classifier
from instructions import InstructionsHandler

# --- 1. Model ---------------------------------------------------------------
model_path = "sentfin_model_output/atsc/googleflan-t5-base-run1"
t5_classifier = T5Classifier(model_path)

# --- 2. Data ----------------------------------------------------------------
# The complete test_internal.csv is used. Only the second DatasetLoader
# argument is populated; the frame is read back below as loader.test_df_id
# (presumably the same slot -- confirm in InstructABSA/data_prep.py).
original_test_df = pd.read_csv("test_internal.csv")
loader = DatasetLoader(None, original_test_df, None, None)

# Prompt pieces for the ATSC task; swap 'bos_instruct2' for another
# instruction set if preferred.
ih = InstructionsHandler()
bos_instruct, delim_instruct, eos_instruct = (
    ih.atsc['bos_instruct2'],
    ih.atsc['delim_instruct'],
    ih.atsc['eos_instruct'],
)

# Expand the frame to one row per aspect term and attach the prompt text.
processed_data_for_prediction = loader.create_data_in_atsc_format(
    loader.test_df_id,
    'aspectTerms', 'term', 'raw_text', 'aspect',
    bos_instruct, delim_instruct, eos_instruct,
)

# Tokenise via a Hugging Face Dataset; get_labels indexes its input by split
# name, hence the one-entry dict wrapper.
hf_dataset = Dataset.from_pandas(processed_data_for_prediction)
tokenized_dataset = hf_dataset.map(
    t5_classifier.tokenize_function_inputs,
    batched=True,
)
wrapped_dataset = {"test": tokenized_dataset}

# --- 3. Predictions ---------------------------------------------------------
# max_length=10 is enough for a single sentiment word (e.g. 'positive').
predicted_labels = t5_classifier.get_labels(
    wrapped_dataset,
    sample_set="test",
    max_length=10,
)

# --- 4. Metrics -------------------------------------------------------------
y_true = processed_data_for_prediction['labels'].tolist()
y_pred = predicted_labels

precision, recall, f1, accuracy = t5_classifier.get_metrics(y_true, y_pred)

print('\n***** Evaluation Metrics on Test Data *****')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')
print(f'Accuracy: {accuracy:.4f}')


  df['aspect'] = df[[on, 'record_idx']].apply(lambda x : (x[0][x[1]][key], x[0][x[1]]['polarity']) if len(x[0]) != 0 else ('',''), axis=1)
  df['text'] = df[[text_col, aspect_col]].apply(lambda x: bos_instruction + x[0] + delim_instruction + x[1] + eos_instruction, axis=1)


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

100%|██████████| 750/750 [01:37<00:00,  7.68it/s]


***** Evaluation Metrics on Test Data *****
Precision: 0.8945
Recall: 0.8977
F1-Score: 0.8958
Accuracy: 0.8953





In [None]:
# Mount Google Drive at /content/drive (no force_remount here, unlike Cell 7).
from google.colab import drive
drive.mount('/content/drive')