## Installing Required Libraries for Model Fine-Tuning and Optimization

In [1]:
!pip install -q -U torch --index-url https://download.pytorch.org/whl/cu117

In [2]:
# Install the specified version of the HuggingFace Transformers library, for model loading, fine-tuning, and inference.
!pip install -q -U transformers==4.38.2  # The '-q' flag suppresses output to keep it clean, and '-U' ensures the version is upgraded if already installed.

# Install the specified version of the Accelerate library, which is useful for efficient training on multiple devices.
!pip install -q accelerate==0.26.1  # Accelerate helps in simplifying training and inference on multi-GPU or TPU setups.

# Install BitsAndBytes library for 4-bit model loading and optimization.
!pip install -q -i https://pypi.org/simple/ bitsandbytes  # This installs bitsandbytes from the default PyPI index, optimizing large models with 4-bit quantization.

# Install the specified version of the Datasets library, which is used for handling and processing datasets, especially for machine learning tasks.
!pip install -q -U datasets==2.16.1  # Datasets is used for easily loading and working with datasets in the HuggingFace ecosystem.

# Install the specified version of the TRL (Transformer Reinforcement Learning) library, which


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.7/130.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 3.3.1 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.38.2 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m 

## Import Libraries

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
# Import essential libraries for numerical and data manipulation
import numpy as np  # For numerical operations and array manipulations
import pandas as pd  # For data handling, particularly DataFrames
import os  # For interacting with the operating system (e.g., file paths)
from tqdm import tqdm  # For creating progress bars in loops

# Import torch and PyTorch-specific modules for deep learning
import torch  # For tensor operations and neural network models
import torch.nn as nn  # For defining neural network layers and models

# Importing the HuggingFace Transformers library for NLP models
import transformers  # Main HuggingFace package
from transformers import (AutoModelForCausalLM,  # For loading causal language models (e.g., GPT)
                          AutoTokenizer,  # For tokenizing text data for model inputs
                          BitsAndBytesConfig,  # For configuring model quantization
                          TrainingArguments,  # For specifying training configurations
                          pipeline,  # For easily using pre-trained models for specific tasks
                          logging)  # For controlling logging and debugging
                          
# Import the datasets library from HuggingFace for efficient data loading and preprocessing
from datasets import Dataset  # For loading and working with datasets in HuggingFace format

# Import PEFT (Parameter-Efficient Fine-T


In [6]:
print(f"transformers=={transformers.__version__}")

transformers==4.38.2


## Model Loading and Configuration with Quantization

In [7]:
# Define the path to the pre-trained model stored in the Kaggle input directory
model_name = "/kaggle/input/gemma/transformers/7b-it/1"  # The model is loaded from this specific directory.

# Set the data type for computations to 'float16' for memory efficiency and faster computation on compatible hardware (e.g., GPUs).
compute_dtype = getattr(torch, "float16")  # Retrieve 'float16' precision type from the torch module.

# Configuration for quantization with BitsAndBytes for 4-bit model weights to reduce memory usage.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable loading the model weights in 4-bit precision.
    bnb_4bit_use_double_quant=False,  # Disable double quantization to optimize memory usage.
    bnb_4bit_quant_type="nf4",  # Set the quantization type to 'nf4' for improved memory efficiency.
    bnb_4bit_compute_dtype=compute_dtype,  # Set the computation data type to 'float16' for efficient computation.
)

# Load the pre-trained language model with the specified quantization configuration and automatic device placement (GPU/CPU).
model = AutoModelForCausalLM.from_pretrained(
    model_name,  # Path to the model directory.
    device_map="auto",  # Automatically place the model on available GPUs or CPUs.
    quantization_config=bnb_config,  # Apply the BitsAndBytes quantization configuration for efficient memory usage.
)

# Disable caching to prevent memory overhead during inference (especially useful in fine-tuning or evaluation).
model.config.use_cache = False  # Disable caching mechanism in the model for saving memory.

# Set the pretraining tensor parallelism (helps when distributing model computation over multiple GPUs).
model.config.pretraining_tp = 1  # Set tensor parallelism to 1, which may help with memory optimization.

# Define the maximum sequence length the model can handle for tokenization.
max_seq_length = 2048  # The maximum sequence length allowed by the tokenizer for each input.

# Load the tokenizer for the model, ensuring that the max sequence length is respected during tokenization.
tokenizer = AutoTokenizer.from_pretrained(model_name, max_seq_length=max_seq_length)  # Load the tokenizer for the model.

# Store the end-of-sequence (EOS) token for later use during text generation.
EOS_TOKEN = tokenizer.eos_token  # Retrieve the EOS token used to mark the end of a generated text sequence.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Data Preprocessing, Prompt Generation, Model Training

In [8]:
# Specify the path to the CSV file containing the sentiment analysis data
filename = "../input/sentiment-analysis-for-financial-news/all-data.csv"

# Load the CSV file into a pandas DataFrame. The file is expected to have two columns: 'sentiment' and 'text'.
df = pd.read_csv(filename, 
                 names=["sentiment", "text"],  # Column names: 'sentiment' for labels and 'text' for headlines
                 encoding="utf-8", encoding_errors="replace")  # Handle encoding errors by replacing invalid characters

# Initialize empty lists to store training and testing data for each sentiment class
X_train = list()
X_test = list()

# Split the dataset into training and testing sets for each sentiment category: 'positive', 'neutral', and 'negative'
for sentiment in ["positive", "neutral", "negative"]:
    # Filter the DataFrame by sentiment, and split into training and testing sets with 300 samples each
    train, test  = train_test_split(df[df.sentiment == sentiment], 
                                    train_size=300,  # 300 samples for training
                                    test_size=300,   # 300 samples for testing
                                    random_state=42)  # Set a random seed for reproducibility
    # Append the split data to the respective lists
    X_train.append(train)
    X_test.append(test)

# Combine all training and testing data into single DataFrames and shuffle the training data
X_train = pd.concat(X_train).sample(frac=1, random_state=10)  # Shuffle the training data
X_test = pd.concat(X_test)

# Identify the indices of the rows not used in training or testing to create the evaluation dataset
eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
# Create the evaluation dataset using the identified indices
X_eval = df[df.index.isin(eval_idx)]

# For each sentiment, sample 50 examples from the evaluation dataset to ensure balanced representation
X_eval = (X_eval
          .groupby('sentiment', group_keys=False)  # Group by sentiment
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))  # Sample 50 examples per group

# Reset the index of the training data after shuffling
X_train = X_train.reset_index(drop=True)

# Function to generate a training prompt for the model, including the sentiment label and text headline
def generate_prompt(data_point):
    return f"""generate_prompt
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative"

            [{data_point["text"]}] = {data_point["sentiment"]}
            """.strip() + EOS_TOKEN  # Add the EOS (End-of-Sequence) token

# Function to generate a test prompt for the model without a sentiment label (for prediction)
def generate_test_prompt(data_point):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative"

            [{data_point["text"]}] = 

            """.strip()  # No sentiment label included for test prompts

# Apply the 'generate_prompt' function to each row in the training and evaluation datasets to generate input prompts
X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1), columns=["text"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1), columns=["text"])

# Extract the true sentiment labels from the test set for evaluation
y_true = X_test.sentiment
# Apply the 'generate_test_prompt' function to the test set to generate prompts for evaluation
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

# Convert the training and evaluation datasets into HuggingFace Datasets for model training
train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)


## Evaluation Function  with Accuracy, Classification Report, and Confusion Matrix

In [9]:
def evaluate(y_true, y_pred):
    # Define the possible sentiment labels
    labels = ['positive', 'neutral', 'negative']
    
    # Mapping of sentiment labels to numeric values for evaluation
    mapping = {'positive': 2, 'neutral': 1, 'none': 1, 'negative': 0}

    # Define a function to map sentiment labels to numeric values
    def map_func(x):
        # Return the mapped value for the sentiment, defaulting to 'neutral' (1) if not found
        return mapping.get(x, 1)
    
    # Apply the map function to the true and predicted labels using numpy's vectorize method
    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)
    
    # Calculate the overall accuracy of the predictions compared to the true labels
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate an accuracy report for each unique sentiment label
    unique_labels = set(y_true)  # Get unique labels from the true labels
    
    # Loop through each unique label to calculate its accuracy separately
    for label in unique_labels:
        # Find indices of all occurrences of the current label in the true labels
        label_indices = [i for i in range(len(y_true)) if y_true[i] == label]
        
        # Extract the corresponding true and predicted labels for this sentiment
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        
        # Calculate the accuracy for the current label
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')
        
    # Generate a classification report that includes precision, recall, and F1-score
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)
    
    # Generate and print a confusion matrix to show the counts of correct and incorrect predictions
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)


## Prediction Function

In [10]:
def predict(X_test, model, tokenizer):
    # Initialize an empty list to store the predicted labels
    y_pred = []
    
    # Iterate through each test sample in X_test
    for i in tqdm(range(len(X_test))):  # Using tqdm for a progress bar over the loop
        # Extract the text of the current sample
        prompt = X_test.iloc[i]["text"]
        
        # Tokenize the input text and move the tokenized input to GPU (if available)
        input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")  # Convert text to input IDs
        
        # Generate the output from the model by passing the tokenized input
        outputs = model.generate(**input_ids, max_new_tokens=1, temperature=0.0)  # Set temperature to 0 for deterministic output
        
        # Decode the output to get the generated text from the model
        result = tokenizer.decode(outputs[0])
        
        # Split the result at "=" and extract the part after the equal sign as the predicted sentiment label
        answer = result.split("=")[-1].lower()  # Convert to lowercase for uniformity
        
        # Check the content of the answer and append the corresponding sentiment label to y_pred
        if "positive" in answer:
            y_pred.append("positive")  # Assign "positive" if found in the answer
        elif "negative" in answer:
            y_pred.append("negative")  # Assign "negative" if found in the answer
        elif "neutral" in answer:
            y_pred.append("neutral")  # Assign "neutral" if found in the answer
        else:
            y_pred.append("none")  # Assign "none" if no sentiment was detected in the answer
    
    return y_pred  # Return the list of predicted sentiment labels


In [11]:
y_pred = predict(X_test, model, tokenizer)

100%|██████████| 900/900 [06:26<00:00,  2.33it/s]


In [12]:
evaluate(y_true, y_pred)

Accuracy: 0.632
Accuracy for label 0: 0.807
Accuracy for label 1: 0.193
Accuracy for label 2: 0.897

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.81      0.87       300
           1       0.46      0.19      0.27       300
           2       0.52      0.90      0.66       300

    accuracy                           0.63       900
   macro avg       0.64      0.63      0.60       900
weighted avg       0.64      0.63      0.60       900


Confusion Matrix:
[[242  41  17]
 [ 13  58 229]
 [  4  27 269]]


## PEFT (Parameter-Efficient Fine-Tuning) with LoRA Configuration and Trainer Setup

In [13]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
)

training_arguments = TrainingArguments(
    output_dir="logs",
    num_train_epochs=1,
    gradient_checkpointing=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=False,
    evaluation_strategy='steps',
    eval_steps = 112,
    eval_accumulation_steps=1,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    args=training_arguments,
    packing=False,
)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [14]:
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained("trained-model")

Step,Training Loss,Validation Loss
112,0.8343,1.525713


In [15]:
%load_ext tensorboard
%tensorboard --logdir logs/runs

<IPython.core.display.Javascript object>

## Predict and Evaluate Sentiment Labels on Test Data

In [16]:
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)

100%|██████████| 900/900 [07:01<00:00,  2.14it/s]

Accuracy: 0.844
Accuracy for label 0: 0.920
Accuracy for label 1: 0.873
Accuracy for label 2: 0.740

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.92      0.93       300
           1       0.74      0.87      0.80       300
           2       0.88      0.74      0.80       300

    accuracy                           0.84       900
   macro avg       0.85      0.84      0.85       900
weighted avg       0.85      0.84      0.85       900


Confusion Matrix:
[[276  20   4]
 [ 11 262  27]
 [  4  74 222]]





In [17]:
evaluation = pd.DataFrame({'text': X_test["text"], 
                           'y_true':y_true, 
                           'y_pred': y_pred},
                         )
evaluation.to_csv("test_predictions.csv", index=False)