In [1]:
import torch
# Record the PyTorch build in the notebook output so the run can be reproduced
print(torch.__version__)

2.5.1+cu121


In [2]:
!python --version

Python 3.10.12


# <font color = 'indianred'>**Multilabel Classification of Emotion Tweet Dataset using LLAMA Models** </font>























# <font color = 'indianred'> **1. Setting up the Environment** </font>



In [3]:
import sys
# Detect a Colab kernel from the string form of the active IPython shell
running_in_colab = 'google.colab' in str(get_ipython())
if running_in_colab:
  # The drive module only exists inside Colab, hence the guarded import
  from google.colab import drive
  # Expose files stored on Google Drive under /content/drive
  drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import gc
def free_memory():
    """
    Release cached GPU memory on every visible CUDA device.

    Runs Python's garbage collector, empties PyTorch's CUDA cache on each
    device, then collects once more. The function does not delete any
    variables itself: objects that are still referenced stay alive, so
    callers must `del` them before calling this.

    The CUDA device that was current before the call is still current
    afterwards. When no CUDA device is visible only the garbage collector runs.

    Returns:
        None
    """
    gc.collect()
    for device_id in range(torch.cuda.device_count()):
        # The context manager selects `device_id` and restores the previously
        # selected device on exit, unlike a bare torch.cuda.set_device call.
        with torch.cuda.device(device_id):
            torch.cuda.empty_cache()
    gc.collect()

In [5]:

import torch

def check_cuda():
    """
    Print whether CUDA is usable and, if it is, the name of GPU 0.

    Returns:
        None
    """
    if not torch.cuda.is_available():
        print("CUDA is not available. Using CPU.")
        return

    gpu_name = torch.cuda.get_device_name(0)
    print("CUDA is available. Using GPU:", gpu_name)

check_cuda()

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB


In [7]:
# pip install evaluate bitsandbytes trl

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting trl
  Downloading trl-0.12.2-py3-none-any.whl.metadata (11 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloadin

<font color = 'indianred'> *Load Libraries* </font>

In [8]:
# Standard Python libraries
from pathlib import Path
import re
import gc
import time
from typing import Dict, List, Union, Optional
from tqdm import tqdm
import itertools
import json
import joblib
import ast
from datetime import datetime
from difflib import get_close_matches

# Data science libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix, precision_score, recall_score, f1_score

# Pytorch
import torch
import torch.nn as nn

# Hugging Face libraries
import evaluate
from datasets import load_dataset, DatasetDict, Dataset, ClassLabel
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import (
    TrainingArguments,
    Trainer,
    set_seed,
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    AutoConfig,
    pipeline,
    BitsAndBytesConfig,
)
from peft import (
    TaskType,
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model,
    AutoPeftModelForCausalLM,
    PeftConfig
)

from huggingface_hub import login

# Logging and secrets
import wandb
from google.colab import userdata


In [9]:
# Fix the random seed (transformers.set_seed) so repeated runs are comparable
set_seed(42)

In [10]:
import os
from pathlib import Path
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Work from the NLP folder on Drive so relative paths resolve against it
nlp_folder = Path('/content/drive/My Drive/NLP')
os.chdir(nlp_folder)
print(f"Current working directory: {os.getcwd()}")

# The NLP folder doubles as the base path for every project directory
basepath = nlp_folder

# Locations of the data, model and custom-function folders
data_folder = basepath.joinpath('data_folder')
model_folder = basepath.joinpath('lect 7', 'assignment 7', 'sentiment', 'nn')
custom_functions = basepath.joinpath('lect 7', 'assignment 7', 'CustomFiles')

# Create the model and custom-function folders if they are missing
# (data_folder is only defined here, not created)
model_folder.mkdir(parents=True, exist_ok=True)
custom_functions.mkdir(parents=True, exist_ok=True)

# Confirm both folders now exist
print(f"Model folder created: {model_folder.exists()}")
print(f"Custom functions folder created: {custom_functions.exists()}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Current working directory: /content/drive/My Drive/NLP
Model folder created: True
Custom functions folder created: True


In [11]:
import wandb

# Authenticate this runtime with Weights & Biases (prompts for an API key if none is stored)
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [12]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub before the meta-llama checkpoint is downloaded
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# <font color = 'indianred'> **2. Load Dataset** </font>
    


In [13]:
import pandas as pd
import numpy as np
# In[3]: Load and preview the data
def load_data(data_dir='data_folder'):
    """
    Read the train and test CSV files and return them with the emotion label names.

    Args:
        data_dir: Directory that contains `train.csv` and `test.csv`. Defaults
            to 'data_folder', which is resolved against the current working
            directory, so the default only works after the notebook has
            changed into the project folder.

    Returns:
        Tuple `(train_df, test_df, emotion_labels)`: the two pandas DataFrames
        and the list of eleven emotion label names.

    Raises:
        FileNotFoundError: If either CSV file is missing from `data_dir`.
    """
    data_dir = Path(data_dir)
    train_df = pd.read_csv(data_dir / 'train.csv')
    test_df = pd.read_csv(data_dir / 'test.csv')

    print("Training set shape:", train_df.shape)
    print("Test set shape:", test_df.shape)

    # Define emotion labels
    emotion_labels = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
                     'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

    return train_df, test_df, emotion_labels

# Load both splits once; the returned label list is reused by later cells
train_df, test_df, emotion_labels = load_data()

Training set shape: (7724, 13)
Test set shape: (3259, 13)


In [14]:
# In[4]: Create emotion descriptions and prompts
def create_emotion_prompts():
    """
    Build the per-emotion description table used for zero-shot classification.

    Returns:
        Dict keyed by emotion name. Each value is a dict with three keys:
        'descriptors' (list of related words), 'context' (short description)
        and 'examples' (list of two sample sentences). A fresh structure is
        built on every call.
    """
    # One row per emotion: (name, descriptors, context, examples)
    catalog = (
        ('anger',
         ['anger', 'fury', 'rage', 'irritation', 'hostility'],
         'expressing frustration, hostility, or intense displeasure',
         ['This makes me so mad!', 'I hate when this happens!']),
        ('anticipation',
         ['anticipation', 'expectation', 'looking forward', 'awaiting'],
         'awaiting or expecting something with excitement or anxiety',
         ["Can't wait for tomorrow!", 'Looking forward to this!']),
        ('disgust',
         ['disgust', 'repulsion', 'revulsion', 'distaste'],
         'feeling strong aversion or profound disapproval',
         ['This is so gross!', "I can't stand this!"]),
        ('fear',
         ['fear', 'anxiety', 'worry', 'dread', 'terror'],
         'feeling afraid or anxious about something',
         ["I'm scared about what might happen", 'This is terrifying']),
        ('joy',
         ['joy', 'happiness', 'delight', 'pleasure', 'elation'],
         'feeling or expressing great happiness or pleasure',
         ['This makes me so happy!', 'What a wonderful day!']),
        ('love',
         ['love', 'affection', 'adoration', 'fondness'],
         'feeling or expressing deep affection or attachment',
         ['I love this so much!', 'You mean everything to me']),
        ('optimism',
         ['optimism', 'hope', 'positivity', 'confidence'],
         'having a positive outlook or expectation',
         ['Things will get better!', 'I believe in a bright future']),
        ('pessimism',
         ['pessimism', 'negativity', 'doubt', 'cynicism'],
         'having a negative or doubtful outlook',
         ['This will never work', 'Everything always goes wrong']),
        ('sadness',
         ['sadness', 'sorrow', 'grief', 'melancholy'],
         'feeling or expressing unhappiness or sorrow',
         ['This makes me so sad', 'I feel heartbroken']),
        ('surprise',
         ['surprise', 'shock', 'astonishment', 'amazement'],
         'feeling or expressing unexpected amazement',
         ["I can't believe this!", 'This is so unexpected!']),
        ('trust',
         ['trust', 'confidence', 'faith', 'reliability'],
         'having or expressing confidence or faith in someone/something',
         ['I believe in you', 'You can count on this']),
    )

    return {
        name: {'descriptors': descriptors, 'context': context, 'examples': examples}
        for name, descriptors, context, examples in catalog
    }

emotion_descriptions = create_emotion_prompts()
print("Emotion descriptions created successfully!")

Emotion descriptions created successfully!


In [16]:
# In[5]: Setup zero-shot classifier
def setup_zero_shot_classifier(model_name="meta-llama/Llama-3.2-1B"):
    """
    Initialize the Hugging Face zero-shot classification pipeline.

    Args:
        model_name: Hub id or local path of the checkpoint to load. The default
            is kept for backward compatibility, but it has no entailment head:
            this notebook's own output reports `score.weight` as newly
            initialized and the entailment label id as undetermined (-1), so
            the scores it produces come from an untrained classification layer.
            Pass an NLI-fine-tuned checkpoint to get meaningful scores.

    Returns:
        The pipeline object, placed on GPU 0 when CUDA is available and on the
        CPU otherwise.
    """
    import warnings

    classifier = pipeline(
        "zero-shot-classification",
        model=model_name,
        device=0 if torch.cuda.is_available() else -1
    )

    # The pipeline reports -1 when the model config's label2id has no label
    # starting with "entail"; surface that instead of silently scoring anyway.
    if getattr(classifier, "entailment_id", None) == -1:
        warnings.warn(
            f"Model '{model_name}' has no 'entailment' label in its config; "
            "zero-shot scores from it are not reliable. "
            "Pass an NLI-fine-tuned checkpoint via model_name.",
            stacklevel=2,
        )
    return classifier

classifier = setup_zero_shot_classifier()
print("Zero-shot classifier initialized!")

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


Zero-shot classifier initialized!


In [17]:
# In[6]: Create custom prompt template
def create_custom_prompt(text, emotion, emotion_info):
    """
    Build the yes/no classification prompt for one tweet and one emotion.

    Args:
        text: Tweet text; it is placed between double quotes in the prompt.
        emotion: Emotion name, repeated throughout the prompt.
        emotion_info: Mapping with 'descriptors' (joined with ', '),
            'context', and 'examples' (joined with ' | ').

    Returns:
        The multi-line prompt string. The template is written inside this
        indented function, so the text starts with a newline and every
        non-empty line keeps its four leading spaces.
    """
    indicators = ', '.join(emotion_info['descriptors'])
    context = emotion_info['context']
    examples = ' | '.join(emotion_info['examples'])

    prompt = f"""
    Task: Determine if the following tweet expresses {emotion}.

    Context about {emotion}:
    - Key indicators: {indicators}
    - Context: {context}
    - Example expressions: {examples}

    Tweet: "{text}"

    Question: Does this tweet express {emotion}?
    Let's approach this step by step:
    1. First, identify the key emotions in the tweet
    2. Then, determine if {emotion} is present
    3. Finally, provide a yes/no answer

    Answer:
    """
    return prompt

# Smoke-test the template on one hand-written tweet
sample_text = "I can't wait for the weekend!"
sample_prompt = create_custom_prompt(
    text=sample_text,
    emotion="anticipation",
    emotion_info=emotion_descriptions["anticipation"],
)
print("Sample prompt created!")

Sample prompt created!


In [21]:
# In[7]: Implement prediction function
def zero_shot_predict(text, classifier, emotion_descriptions,
                      hypothesis_template="This tweet expresses {}"):
    """
    Score one text against every emotion with a zero-shot classifier.

    The classifier is called once per emotion, with that emotion's descriptor
    words as the candidate labels and `multi_label=True`. The emotion's score
    is the highest score returned for any of its descriptors.

    Args:
        text: The text to classify.
        classifier: Callable invoked as
            `classifier(text, labels, hypothesis_template=..., multi_label=True)`
            that returns a mapping with a 'scores' sequence.
        emotion_descriptions: Mapping of emotion name to a dict holding a
            'descriptors' list.
        hypothesis_template: Template with a `{}` placeholder that the
            classifier fills with each candidate label.

    Returns:
        Dict mapping each emotion name to its maximum descriptor score, in the
        iteration order of `emotion_descriptions`.
    """
    results = {}

    for emotion, info in emotion_descriptions.items():
        prediction = classifier(
            text,
            info['descriptors'],
            hypothesis_template=hypothesis_template,
            multi_label=True
        )

        # Take the maximum probability among the descriptors
        results[emotion] = max(prediction['scores'])

    return results

# Add calibration
def calibrate_predictions(raw_predictions, calibration_factor=1.5):
    """
    Pass every score through a logistic curve centred on 0.5.

    Each score `s` becomes `1 / (1 + exp(-calibration_factor * (s - 0.5)))`.
    The mapping is monotonic and sends 0.5 to 0.5, so for a positive factor a
    score is at or above 0.5 after calibration exactly when it was before.

    Args:
        raw_predictions: Mapping of emotion name to raw score.
        calibration_factor: Steepness of the logistic curve.

    Returns:
        New dict with the same keys, in the same order, and the transformed scores.
    """
    calibrated = {}
    for emotion, score in raw_predictions.items():
        centered = score - 0.5
        calibrated[emotion] = 1 / (1 + np.exp(-calibration_factor * centered))
    return calibrated

In [22]:
# In[8]: Implement batch prediction
def batch_predict(df, classifier, emotion_descriptions, batch_size=32):
    """
    Score every tweet in `df` and return the calibrated predictions.

    The 'Tweet' column is walked in slices of `batch_size` rows; the progress
    bar advances once per slice. Within a slice each tweet is still scored on
    its own with `zero_shot_predict` and then passed through
    `calibrate_predictions`.

    Args:
        df: DataFrame with a 'Tweet' column.
        classifier: Zero-shot classifier forwarded to `zero_shot_predict`.
        emotion_descriptions: Emotion table forwarded to `zero_shot_predict`.
        batch_size: Number of rows per progress-bar step.

    Returns:
        List with one calibrated `{emotion: score}` dict per row, in row order.
    """
    tweets = df['Tweet']
    collected = []

    for start in tqdm(range(0, len(df), batch_size)):
        chunk = tweets.iloc[start:start + batch_size]
        collected.extend(
            calibrate_predictions(
                zero_shot_predict(tweet, classifier, emotion_descriptions)
            )
            for tweet in chunk
        )

    return collected

In [37]:
# In[9]: Run predictions on test set
# Score every tweet in the test set
print("Starting predictions on test set...")
predictions = batch_predict(test_df, classifier, emotion_descriptions)

# One row per tweet, one column per emotion
pred_df = pd.DataFrame(predictions)

# A score at or above the threshold counts as the emotion being present
threshold = 0.5
binary_predictions = pred_df.ge(threshold).astype(int)

print("Predictions completed!")

Starting predictions on test set...


100%|██████████| 102/102 [45:29<00:00, 26.76s/it]

Predictions completed!





In [40]:
# Preview the first few calibrated score dicts instead of rendering the whole list
predictions[:3]

[{'anger': 0.46614834083065215,
  'anticipation': 0.5271418351442289,
  'disgust': 0.4840506614038529,
  'fear': 0.42926921840017745,
  'joy': 0.4451678722585144,
  'love': 0.6241727427261573,
  'optimism': 0.47016939611234315,
  'pessimism': 0.4120833240376993,
  'sadness': 0.5643866958412525,
  'surprise': 0.46211117851062794,
  'trust': 0.39627505682051306},
 {'anger': 0.3618572216788695,
  'anticipation': 0.4457787797884014,
  'disgust': 0.40028393928824213,
  'fear': 0.38402907153305493,
  'joy': 0.34821991967361937,
  'love': 0.5732891692153645,
  'optimism': 0.4674209950782008,
  'pessimism': 0.3771161224147959,
  'sadness': 0.43662220462855056,
  'surprise': 0.41674511199125697,
  'trust': 0.36314181610729557},
 {'anger': 0.3476522978342893,
  'anticipation': 0.4009420268883334,
  'disgust': 0.38080940574656014,
  'fear': 0.4013971232731511,
  'joy': 0.3502367592406325,
  'love': 0.5075905600467835,
  'optimism': 0.39575299078638226,
  'pessimism': 0.3614564741939168,
  'sadnes

In [41]:
def convert_to_binary(label_list, emotion_labels, threshold=0.5):
    """
    Turn per-sample labels into 0/1 indicator rows.

    Args:
        label_list: One entry per sample. An entry is either an iterable of
            label names that are present, or a dict mapping label name to a
            score, in which case a label counts as present when its score is
            at or above `threshold`.
        emotion_labels: Ordered label names; they define the column order of
            every output row.
        threshold: Cut-off applied to dict entries only.

    Returns:
        List of lists of ints, one row per sample and one column per entry of
        `emotion_labels`. Labels not found in `emotion_labels` are ignored.
    """
    label_to_index = {label: i for i, label in enumerate(emotion_labels)}
    binary_labels = []

    for labels in label_list:
        if isinstance(labels, dict):
            # Iterating a score dict yields every key, which would switch every
            # label on; keep only the labels whose score reaches the threshold.
            labels = [label for label, score in labels.items() if score >= threshold]

        binary_row = [0] * len(emotion_labels)
        for label in labels:
            if label in label_to_index:
                binary_row[label_to_index[label]] = 1
        binary_labels.append(binary_row)

    return binary_labels

In [42]:
# `predictions` holds {emotion: score} dicts. Iterating a dict yields every key, so
# handing the dicts straight to a helper that expects lists of label names risks
# marking every emotion as present for every tweet. Select the emotions whose
# calibrated score reaches the threshold first.
predicted_labels = [
    [emotion for emotion, score in scores.items() if score >= threshold]
    for scores in predictions
]
test_preds = convert_to_binary(predicted_labels, emotion_labels)

In [43]:
# NOTE(review): this writes the predictions into test_df in place, overwriting (or
# creating) the emotion columns, so the frame no longer matches what was loaded.
# The CSV saved further down is written from `binary_predictions`, not from test_df.
test_df[emotion_labels] = test_preds

In [45]:
# Write the thresholded 0/1 predictions to a CSV in the working directory (no index column)
output_csv = 'hw8_zero_shot_predictions.csv'
binary_predictions.to_csv(output_csv, index=False)
print(f"\nPredictions saved to '{output_csv}'")


Predictions saved to 'hw8_zero_shot_predictions.csv'


In [46]:
# Download hw8_zero_shot_predictions.csv from the Colab runtime to the local machine
from google.colab import files
files.download('hw8_zero_shot_predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>