# Install and import libraries

In [None]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U trl

In [None]:
import wandb
from google.colab import userdata
# Read the Weights & Biases API key from the Colab secret named 'WandB' and
# authenticate with it. Previously the key was fetched but never used, so the
# run started below was not tied to this key.
wb_token = userdata.get('WandB')
wandb.login(key=wb_token)

In [None]:
# Start a Weights & Biases run for this notebook; `run` is the handle
# returned by wandb.init.
wandb_settings = dict(
    project='Fine-tune-Microsoft-Phi-3.5-mini-instruct-Fixed',
    job_type="training",
    anonymous="allow",
)
run = wandb.init(**wandb_settings)

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix,
                             f1_score)
from sklearn.model_selection import train_test_split

# With hybrid dataset

In [None]:
# Mount Google Drive so the project CSV is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

# Load the summary dataset and preview its first rows.
SUMMARY_CSV_PATH = '/content/gdrive/MyDrive/DSSI 2025/Group 1: DSSI Summer 2025/Data/summary_banktrak - summary_banktrak.csv'
df = pd.read_csv(SUMMARY_CSV_PATH)
df.head()

In [None]:
# Shuffle all rows with a fixed seed so the split below is reproducible.
# NOTE(review): `df` is rebound to its shuffled self, so re-running this cell
# without reloading the CSV shuffles an already-shuffled frame and produces a
# different split.
df = df.sample(frac=1, random_state=85).reset_index(drop=True)

# Split fractions: 80% train, 10% eval; the remaining rows form the test set.
train_size = 0.8
eval_size = 0.1 # different from 80/20

# Row positions where the train and eval slices end.
n_rows = len(df)
train_end = int(train_size * n_rows)
eval_end = train_end + int(eval_size * n_rows)

# Positional slices, copied so later column assignments do not touch `df`.
X_train = df.iloc[:train_end].copy()
X_eval = df.iloc[train_end:eval_end].copy()
X_test = df.iloc[eval_end:].copy()

# IMPORTANT: keep the raw text in its own column, because 'text' is
# overwritten with the generated prompt further down.
for split_frame in (X_train, X_eval, X_test):
    split_frame['original_text'] = split_frame['text'].copy()

# Define the prompt generation functions - now using 'original_text'
# Refined prompt and made it more concise without examples due to previous errors
def generate_prompt(data_point):
    """Build the detailed instruction prompt for one row.

    Args:
        data_point: Mapping-like row that exposes an "original_text" entry.

    Returns:
        The prompt string with leading/trailing whitespace stripped. It ends
        with "Classification:" and does not include the row's label.
    """
    source_text = data_point["original_text"]
    prompt = f"""You are a financial text classifier. Answer ONLY "True" if the text explicitly mentions debt instruments (bonds, loans, credit agreement, debt settlement, promissory notes).
    If the item just references a debt instrument, this column should still be false. There must be some details outside of the name (such as the start date, the amount, the lenders, etc.).
    Answer "False" for all other topics.

Now classify this text:
Text: {source_text}
Classification: """
    return prompt.strip()

def generate_test_prompt(data_point):
    """Build the short classification prompt used for the held-out rows.

    NOTE: a later cell in this notebook defines another function with this
    same name that reads the "text" key and uses "text:"/"label:" markers;
    once that cell has run, it replaces this definition.

    Args:
        data_point: Mapping-like row that exposes an "original_text" entry.

    Returns:
        The prompt string with leading/trailing whitespace stripped, ending
        with "Classification:".
    """
    source_text = data_point["original_text"]
    prompt = f"""
            Classify the text into True (debt-related) or False (non-debt-related).
Text: {source_text}
Classification: """
    return prompt.strip()

# Overwrite 'text' in the train and eval splits with the detailed prompt
# built from 'original_text'.
# NOTE(review): these prompts stop at "Classification:" and carry no label;
# if they are meant as supervised fine-tuning text, confirm whether the
# target answer should be appended.
for split_frame in (X_train, X_eval):
    split_frame['text'] = split_frame.apply(generate_prompt, axis=1)

# Keep the ground-truth labels of the test split, then overwrite its 'text'
# with the short test prompt.
y_true = X_test['contains_debt_instrument_information'].copy()
X_test['text'] = X_test.apply(generate_test_prompt, axis=1)

In [None]:
# Class balance of the training split.
X_train['contains_debt_instrument_information'].value_counts()

In [None]:
# Class balance of the held-out test labels.
y_true.value_counts()

In [None]:
# Wrap the prompt column of the train and eval splits as `datasets.Dataset`
# objects; only the 'text' column is kept.
train_data, eval_data = (
    Dataset.from_pandas(split_frame[["text"]])
    for split_frame in (X_train, X_eval)
)

In [None]:
# Spot-check one generated training prompt (row 3).
train_data[3]['text']

In [None]:
# Hugging Face checkpoint used as the base model.
base_model_name = "microsoft/Phi-3.5-mini-instruct"

# 4-bit NF4 quantization without double quantization; compute in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype="float16",
)

# Load the quantized model and let `device_map="auto"` decide placement.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    torch_dtype="float16",
    device_map="auto",
)

# Config tweaks applied after loading.
model.config.pretraining_tp = 1
model.config.use_cache = False

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Reuse the end-of-sequence token id as the padding token id.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
from datasets import load_dataset

# Memory-optimized predict function
def predict(test, model, tokenizer, max_new_tokens=5):
    """Classify every prompt in ``test`` and return one boolean per row.

    The label is parsed from the newly generated text only. Previously the
    pipeline returned prompt + completion and the "Response:" marker never
    occurred in the prompts, so the substring check scanned the prompt itself
    (which contains the word "True") and every row was classified as True.

    Args:
        test: DataFrame whose "text" column holds one prompt string per row.
        model: Already-loaded causal language model given to the pipeline.
        tokenizer: Tokenizer that matches ``model``.
        max_new_tokens: Maximum number of tokens generated per prompt.

    Returns:
        list[bool]: predictions in row order. True when the generated
        continuation contains "true" (case-insensitive); False otherwise,
        including unparseable output and rows whose generation raised.
    """
    import torch

    # Clear GPU cache before starting
    torch.cuda.empty_cache()

    # Build the pipeline once instead of once per row. The model object is
    # passed in already loaded, so no dtype/device arguments are repeated
    # here. Greedy decoding (do_sample=False) keeps the classification
    # deterministic; return_full_text=False makes the pipeline return only
    # the continuation, not the prompt.
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=max_new_tokens,
                    do_sample=False,
                    return_full_text=False)

    y_pred = []
    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]

        try:
            generated = pipe(prompt)[0]['generated_text']

            # Defensive: if the prompt is still echoed back, cut it off so
            # words inside the prompt can never decide the label.
            if generated.startswith(prompt):
                generated = generated[len(prompt):]

            # "true" anywhere in the continuation -> True; "false" or
            # anything unclear -> False.
            y_pred.append("true" in generated.lower())

        except Exception as e:
            # Best effort: report the failure and fall back to False so the
            # output stays aligned with the input rows.
            print(f"Error processing sample {i}: {e}")
            y_pred.append(False)  # Default on error

        # Clear cache every 10 samples to prevent memory buildup
        if i % 10 == 0:
            torch.cuda.empty_cache()

    return y_pred

In [None]:
# predict(X_test, model, tokenizer)

In [None]:
# Run the loaded model over the prompts of the held-out split (X_test).
y_pred = predict(X_test, model, tokenizer)

In [None]:
def evaluate(y_true, y_pred):
    """Print accuracy, F1, per-class accuracy and a classification report.

    Args:
        y_true: Ground-truth boolean labels (pandas Series, array or list).
        y_pred: Predicted boolean labels, indexable by position and aligned
            with ``y_true``.

    Returns:
        None. All metrics are printed.
    """
    labels = [False, True]  # Binary labels for debt classification
    label_names = ["Non-debt-related", "Debt-related"]

    # Work on a plain list so positional indexing is safe; accept inputs that
    # have no .tolist() (e.g. an ordinary Python list) as well.
    y_true_list = y_true.tolist() if hasattr(y_true, "tolist") else list(y_true)

    # Overall accuracy and F1 score
    accuracy = accuracy_score(y_true=y_true_list, y_pred=y_pred)
    f1 = f1_score(y_true=y_true_list, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')
    print(f'F1 Score: {f1:.3f}')

    # Accuracy within each true class, in a fixed False -> True order;
    # classes without any sample in y_true are skipped.
    for label, label_name in zip(labels, label_names):
        label_indices = [i for i, truth in enumerate(y_true_list) if truth == label]
        if not label_indices:
            continue
        label_y_true = [y_true_list[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for {label_name}: {label_accuracy:.3f}')

    # Pass `labels` explicitly so the two target names always line up with
    # the rows of the report, even when one class is absent from the data
    # (without it the report raises on a names/classes mismatch).
    class_report = classification_report(y_true=y_true_list,
                                         y_pred=y_pred,
                                         labels=labels,
                                         target_names=label_names,
                                         zero_division=0)
    print('\nClassification Report:')
    print(class_report)

In [None]:
# Print the metrics for the held-out split predictions.
evaluate(y_true, y_pred)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Compute the confusion matrix with an explicit label order. Without
# `labels`, the matrix only covers classes that actually occur, so it can be
# 1x1 and no longer match the two display labels below.
cm = confusion_matrix(y_true, y_pred, labels=[False, True])

# Plot; display labels follow the same False -> True order as `labels`.
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=['False', 'True'])
disp.plot(cmap='Blues')
plt.title('Phi Model Performance on Original Dataset')
plt.tight_layout()
plt.show()

In [None]:
from datasets import load_dataset

# Memory-optimized predict function
def predict(test, model, tokenizer, max_new_tokens=2):
    """Classify every prompt in ``test`` and return one boolean per row.

    This redefines the ``predict`` from an earlier cell with a smaller default
    generation budget (2 tokens). The label is parsed from the newly generated
    text only. Previously the pipeline returned prompt + completion and the
    "Response:" marker never occurred in the prompts, so the substring check
    scanned the prompt itself (which contains the word "True") and every row
    was classified as True.

    Args:
        test: DataFrame whose "text" column holds one prompt string per row.
        model: Already-loaded causal language model given to the pipeline.
        tokenizer: Tokenizer that matches ``model``.
        max_new_tokens: Maximum number of tokens generated per prompt.

    Returns:
        list[bool]: predictions in row order. True when the generated
        continuation contains "true" (case-insensitive); False otherwise,
        including unparseable output and rows whose generation raised.
    """
    import torch

    # Clear GPU cache before starting
    torch.cuda.empty_cache()

    # Build the pipeline once instead of once per row. The model object is
    # passed in already loaded, so no dtype/device arguments are repeated
    # here. Greedy decoding (do_sample=False) keeps the classification
    # deterministic; return_full_text=False makes the pipeline return only
    # the continuation, not the prompt.
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=max_new_tokens,
                    do_sample=False,
                    return_full_text=False)

    y_pred = []
    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]

        try:
            generated = pipe(prompt)[0]['generated_text']

            # Defensive: if the prompt is still echoed back, cut it off so
            # words inside the prompt can never decide the label.
            if generated.startswith(prompt):
                generated = generated[len(prompt):]

            # "true" anywhere in the continuation -> True; "false" or
            # anything unclear -> False.
            y_pred.append("true" in generated.lower())

        except Exception as e:
            # Best effort: report the failure and fall back to False so the
            # output stays aligned with the input rows.
            print(f"Error processing sample {i}: {e}")
            y_pred.append(False)  # Default on error

        # Clear cache every 10 samples to prevent memory buildup
        if i % 10 == 0:
            torch.cuda.empty_cache()

    return y_pred

# Load the separate test CSV from Drive through `datasets`; a single CSV file
# is exposed under the "train" split name.
TEST_CSV_PATH = "/content/gdrive/MyDrive/DSSI 2025/Group 1: DSSI Summer 2025/Data/test.csv"
test_dataset = load_dataset("csv", data_files=TEST_CSV_PATH, split="train")

# Work with a pandas DataFrame from here on.
test_df = test_dataset.to_pandas()

# Generate test prompts (without labels)
def generate_test_prompt(data_point):
    """Build the short classification prompt for one row of the test CSV.

    NOTE: this shadows the ``generate_test_prompt`` defined in an earlier
    cell; unlike that one it reads the "text" key and uses "text:"/"label:"
    markers.

    Args:
        data_point: Mapping-like row that exposes a "text" entry.

    Returns:
        The prompt string with leading/trailing whitespace stripped, ending
        with "label:".
    """
    raw_text = data_point["text"]
    prompt = f"""
            Classify the text into True (debt-related) or False (non-debt-related).
text: {raw_text}
label: """
    return prompt.strip()

# Ground-truth labels of the test CSV.
y_true_test = test_df["contains_debt_instrument_information"]

# One prompt per row, stored in a single-column frame named "text" because
# that is the column `predict` reads.
prompt_series = test_df.apply(generate_test_prompt, axis=1)
test_df_prompts = pd.DataFrame(prompt_series, columns=["text"])

# Ask the CUDA caching allocator for expandable segments.
# NOTE(review): the model was already loaded in an earlier cell, so the
# allocator has presumably been initialised by now; this variable likely
# needs to be set before the first CUDA allocation to take effect — confirm.
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Predict a label for every prompt of the test CSV.
y_pred_test = predict(test_df_prompts, model, tokenizer)

In [None]:
# Print the metrics for the predictions on the separate test CSV.
evaluate(y_true_test, y_pred_test)

In [None]:
# Create confusion matrix with increased font sizes
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

y_true_list = y_true_test.tolist()

# Fix the label order explicitly. Without `labels`, the matrix only covers
# classes that actually occur, so it can be 1x1 and no longer match the two
# display labels below.
cm = confusion_matrix(y_true=y_true_list, y_pred=y_pred_test, labels=[False, True])

# Raise the base font size for this figure, remembering the previous value so
# it can be restored afterwards instead of assuming it was 10.
previous_font_size = plt.rcParams['font.size']
plt.rcParams.update({'font.size': 14})  # Base font size

disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=['Non-debt-related', 'Debt-related'])

# Create the plot with adjusted figure size for better spacing
fig, ax = plt.subplots(figsize=(9, 7))
disp.plot(cmap='Blues', ax=ax)

# Customize font sizes for different elements
ax.set_title('Phi Model Performance on Newly Annotated Dataset', fontsize=20, pad=20)
ax.set_xlabel('Predicted Label', fontsize=18)
ax.set_ylabel('True Label', fontsize=18)

# Increase tick label font size
ax.tick_params(axis='both', which='major', labelsize=16)

# The numbers inside the matrix cells are not given an explicit size here;
# they use the base font size set above.

plt.tight_layout(pad=3.0)  # Increase padding further
plt.show()

# Restore the font size that was active before this cell ran.
plt.rcParams.update({'font.size': previous_font_size})

In [None]:
# Save the model and tokenizer into a local directory named below.
# NOTE(review): no training step is visible in this notebook, so this
# presumably writes the model as it was loaded above — confirm that is the
# intended artifact.
new_model_local = "stpereir-Phi-3.5-summarized-and-test"
model.save_pretrained(new_model_local)
tokenizer.save_pretrained(new_model_local)