Import Necessary Libraries

In [None]:
import json
import numpy as np

# Optional: if you use a deep-learning library such as PyTorch or TensorFlow, import it here as well
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset


Define the Function to Format the Dataset


In [None]:
def format_dataset_with_pairs(input_file_path, output_file_path):
    """Flatten emotion-cause annotations into one JSON record per pair.

    The input file must contain a JSON list of conversations. Each
    conversation provides ``'conversation'`` (a list of utterance dicts with
    a ``'text'`` field) and ``'emotion-cause_pairs'`` (a list of
    ``[emotion_ref, cause_ref]`` pairs). Each reference is a string of the
    form ``"<utterance id>_<rest>"``; for the emotion reference ``<rest>`` is
    the emotion label. Utterance ids are used as 1-based positions into the
    conversation's utterance list.

    Args:
        input_file_path: Path of the JSON file to read (UTF-8).
        output_file_path: Path of the JSON file to write (UTF-8, indented,
            non-ASCII characters kept as-is).

    Returns:
        None. The formatted records are written to ``output_file_path``.

    Raises:
        KeyError: If an expected key is missing from a conversation.
        ValueError: If an utterance id is not an integer.
        IndexError: If a reference has no underscore or an id points past
            the end of the conversation.
    """
    with open(input_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    formatted_data = []
    for conversation in data:
        utterances = conversation['conversation']
        # The joined context is the same for every pair of a conversation,
        # so build it once per conversation instead of once per pair.
        context = ' '.join(utterance['text'] for utterance in utterances)

        for pair in conversation['emotion-cause_pairs']:
            # maxsplit=1: only the first underscore separates the id from the
            # rest, so a label that itself contains '_' is kept whole.
            emotion_parts = pair[0].split('_', 1)
            cause_parts = pair[1].split('_', 1)
            emotion_id, emotion = int(emotion_parts[0]), emotion_parts[1]
            cause_id = int(cause_parts[0])

            formatted_data.append({
                'context': context,
                # Ids are 1-based, list positions are 0-based.
                'emotion_utterance': utterances[emotion_id - 1]['text'],
                'emotion': emotion,
                'cause_utterance': utterances[cause_id - 1]['text'],
                'emotion_id': emotion_id,
                'cause_id': cause_id
            })

    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(formatted_data, outfile, indent=4, ensure_ascii=False)


In [None]:
# Input file for the formatting step (relative path, resolved against the working directory).
train_file = 'Subtask_1_train.json'
# Destination for the flattened, one-record-per-pair JSON.
output_train_json = 'formatted_train.json'
# Writes the output file as a side effect; the call returns None, so the cell displays nothing.
format_dataset_with_pairs(train_file, output_train_json)

Define the Function to Generate Instructional Prompts

In [None]:
def generate_prompts_with_cause(data):
    """Build one prompt/expected-response pair per formatted record.

    Args:
        data: Iterable of dicts with the keys ``'context'``,
            ``'emotion_utterance'``, ``'cause_utterance'``, ``'emotion'``,
            ``'emotion_id'`` and ``'cause_id'``.

    Returns:
        list[dict]: One dict per input record, in input order, with the keys
        ``'prompt'`` and ``'expected_response'``.
    """
    def _to_example(record):
        # Instruction text: the full context plus the two quoted utterances.
        instruction = f"Context: {record['context']} \nIdentify the emotion and the cause in the conversation. Emotion appears in: '{record['emotion_utterance']}' and the cause appears in: '{record['cause_utterance']}'"
        # Target text: the emotion label with both utterance ids.
        target = f"Emotion: {record['emotion']} (Utterance {record['emotion_id']}), Cause: Utterance {record['cause_id']}"
        return {'prompt': instruction, 'expected_response': target}

    return [_to_example(record) for record in data]


Load Vicuna, Run a Sample Generation, and Define a Placeholder Fine-Tuning Function (no real training is performed yet)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face Hub checkpoint id, defined once so tokenizer and model cannot drift apart.
MODEL_NAME = "lmsys/vicuna-7b-v1.5"

# Load tokenizer and model (downloads the checkpoint shards on first use).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Example prompt
prompt = "Translate the following English text to French: 'Hello, how are you?'"

# Encode the prompt and generate a response.
inputs = tokenizer(prompt, return_tensors="pt")
# max_new_tokens bounds only the generated continuation; max_length would also
# count the prompt tokens, leaving little or no room to generate for long prompts.
output = model.generate(**inputs, max_new_tokens=100)

# Decode and print the response.
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def fine_tune_model(prompts):
    """Stand-in for a fine-tuning step; no training is performed.

    Args:
        prompts: Training examples. Currently unused.

    Returns:
        str: The constant placeholder ``"Trained Model"``.
    """
    # Placeholder only: a real implementation would train Vicuna on `prompts`
    # (for example something like model.train(prompts)).
    placeholder_result = "Trained Model"
    print("Fine-tuning the model with provided prompts...")
    return placeholder_result


Define the Evaluation Function

In [None]:
def evaluate_model(test_data, model):
    """Return the exact-match accuracy of placeholder predictions.

    For every instance a prediction string is built from the instance's own
    ``'emotion'`` value and compared, after stripping surrounding whitespace,
    with its ``'expected_response'``.

    Args:
        test_data: Sized collection of dicts with the keys ``'emotion'`` and
            ``'expected_response'``.
        model: Currently unused; the prediction is a hypothetical stand-in
            and does not consult the model.

    Returns:
        float: Fraction of instances whose prediction equals the expected
        response, or ``0.0`` when ``test_data`` is empty.

    Raises:
        KeyError: If an instance lacks ``'emotion'`` or ``'expected_response'``.
    """
    total_predictions = len(test_data)
    if total_predictions == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct_predictions = 0
    for test_instance in test_data:
        # Hypothetical prediction call
        prediction = f"The emotion is {test_instance['emotion']}."
        if prediction.strip() == test_instance['expected_response'].strip():
            correct_predictions += 1
    return correct_predictions / total_predictions


Main Execution Flow

In [None]:
def main():
    """Run the pipeline end to end: format, build prompts, fine-tune, evaluate.

    Reads ``Subtask_1_train.json`` from the working directory, writes
    ``formatted_train_data.json`` next to it, and prints the accuracy
    computed on the first ten formatted records.
    """
    formatted_path = 'formatted_train_data.json'

    # Step 1: Format the dataset
    format_dataset_with_pairs('Subtask_1_train.json', formatted_path)

    # Step 2: Load formatted data and generate prompts.
    # The file was written as UTF-8 with non-ASCII characters kept, so it
    # must be read back as UTF-8 regardless of the platform default.
    with open(formatted_path, 'r', encoding='utf-8') as f:
        formatted_data = json.load(f)
    prompts = generate_prompts_with_cause(formatted_data)

    # Step 3: Fine-tune the model
    trained_model = fine_tune_model(prompts)

    # Step 4: Evaluate the model (assuming you have test data formatted similarly).
    # evaluate_model reads both 'emotion' (from the formatted record) and
    # 'expected_response' (from the prompt), so merge the two per example.
    test_data = [
        {**entry, **prompt}
        for entry, prompt in zip(formatted_data[:10], prompts[:10])
    ]  # Simplified for example
    test_accuracy = evaluate_model(test_data, trained_model)
    print(f"Model accuracy on test data: {test_accuracy:.2%}")

if __name__ == "__main__":
    main()


NameError: name 'format_dataset' is not defined