In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM

# # Load tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
# model = AutoModelForCausalLM.from_pretrained("lmsys/vicuna-7b-v1.5")

# # Example prompt
# prompt = "Translate the following English text to French: 'Hello, how are you?'"

# # Encode the prompt and generate response
# inputs = tokenizer(prompt, return_tensors="pt")
# output = model.generate(**inputs, max_length=100)

# # Decode and print the response
# decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
# print(decoded_output)


In [None]:
# !pip install "transformers==4.34.0" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.23.0" "bitsandbytes==0.41.1" "trl==0.4.7" "safetensors>=0.3.1" --upgrade
!pip install transformers datasets peft accelerate bitsandbytes trl safetensors


Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m33.4 

In [None]:
import json
import numpy as np

# Optional: If you use any specific library for deep learning like PyTorch or TensorFlow, import them as well
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset


In [None]:
def format_dataset_with_pairs(input_file_path, output_file_path):
    with open(input_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    formatted_data = []
    for conversation in data:
        context = [utterance['text'] for utterance in conversation['conversation']]
        emotion_cause_pairs = conversation['emotion-cause_pairs']

        for pair in emotion_cause_pairs:
            emotion_utterance, cause_utterance = pair[0].split('_'), pair[1].split('_')
            emotion_id, emotion = int(emotion_utterance[0]), emotion_utterance[1]
            cause_id = int(cause_utterance[0])
            emotion_text = conversation['conversation'][emotion_id - 1]['text']
            cause_text = conversation['conversation'][cause_id - 1]['text']

            formatted_data.append({
                'context': ' '.join(context),
                'emotion_utterance': emotion_text,
                'emotion': emotion,
                'cause_utterance': cause_text,
                'emotion_id': emotion_id,
                'cause_id': cause_id
            })

    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(formatted_data, outfile, indent=4, ensure_ascii=False)


In [None]:
import os
# os.listdir('/kaggle/input/subtask-1-train-json')
base_dir = './'

In [None]:
train_file = base_dir+'Subtask_1_train.json'
output_train_json = 'formatted_train.json'
format_dataset_with_pairs(train_file, output_train_json)

In [None]:
def generate_prompts_with_cause(data):
    prompts = []
    for entry in data:
        prompt = f"Context: {entry['context']} \nIdentify the emotion and the cause in the conversation. Emotion appears in: '{entry['emotion_utterance']}' and the cause appears in: '{entry['cause_utterance']}'"
        expected_response = f"Emotion: {entry['emotion']} (Utterance {entry['emotion_id']}), Cause: Utterance {entry['cause_id']}"
        prompts.append({'prompt': prompt, 'expected_response': expected_response})
    return prompts


In [None]:
def format_instruction1(entry):
    prompt = f"Context: {entry['context']} \nIdentify the emotion and the cause in the conversation. Emotion appears in: '{entry['emotion_utterance']}' and the cause appears in: '{entry['cause_utterance']}'"
    expected_response = f"Emotion: {entry['emotion']} (Utterance {entry['emotion_id']}), Cause: Utterance {entry['cause_id']}"
    return f"""### Prompt: {prompt}
            ### response: {expected_response}
            """


In [None]:
def format_instruction(entry):
    prompt = f"Context: {entry['context']} \nIdentify the emotion and the cause in the conversation. Emotion appears in: '{entry['emotion_utterance']}' and the cause appears in: '{entry['cause_utterance']}'"
    expected_response = f"Emotion: {entry['emotion']} (Utterance {entry['emotion_id']}), Cause: Utterance {entry['cause_id']}"
    return {'prompt': prompt, 'expected_response': expected_response}


In [None]:
import json
from sklearn.model_selection import train_test_split

# Load the formatted dataset
with open('formatted_train.json', 'r', encoding='utf-8') as file:
    formatted_data = json.load(file)

# Split the dataset into train and eval sets
train_data, eval_data = train_test_split(formatted_data, test_size=0.2, random_state=42)

# Save the split datasets
with open('formatted_train.json', 'w', encoding='utf-8') as train_file:
    json.dump(train_data, train_file, indent=4, ensure_ascii=False)

with open('formatted_eval.json', 'w', encoding='utf-8') as eval_file:
    json.dump(eval_data, eval_file, indent=4, ensure_ascii=False)

print("Dataset split into formatted_train.json and formatted_eval.json.")


Dataset split into formatted_train.json and formatted_eval.json.


In [None]:
import os
os.listdir('./')

['.config',
 'Subtask_1_train.json',
 'formatted_train.json',
 'formatted_eval.json',
 'sample_data']

In [None]:
def finetune_withSFFT(prompts, model):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.config.use_cache = False  # Set use_cache to False to be compatible with gradient checkpointing

    print(device)

    args = TrainingArguments(
    output_dir="vicuna_instruction_tuning",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=False,
    fp16=True,
    tf32=False,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    disable_tqdm=False
    )

    # Create SFTTrainer and start training
    trainer = SFTTrainer(
        model=model,
        train_dataset=prompts,  # Use prompts as the training dataset
        max_seq_length=1024,
        tokenizer=tokenizer,
        args=args,
        formatting_func=lambda x: x
    )

    # Train the model
    trainer.train()

    # Save the fine-tuned model
#     trainer.save_model()

    return trainer.model


In [None]:
# Step 2: Import necessary libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
from datasets import load_dataset
from random import randrange

# model_name = "lmsys/vicuna-7b-v1.5"  # Vicuna model name
model_name = "facebook/opt-350m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
def fine_tune_model_with_vicuna(prompts,model):
    # Assuming you have imported the Vicuna library and initialized the model


    # Fine-tuning with instruction tuning
    instruction_tuner = pipeline(
        task="text2text-generation",
        model=model,
        tokenizer=tokenizer,
        framework="hf",
        device=0  # Adjust to the appropriate GPU device if available
    )

    # Fine-tune the model with provided prompts
    fine_tuned_model = instruction_tuner(train_texts=prompts)

    return fine_tuned_model


In [None]:
def extract_emotion_and_cause(generated_response):
    # Assuming the format is: "Emotion: <emotion>, Cause: <cause>"
    # Splitting the generated response into emotion and cause
    parts = generated_response.split(',')
    for part in parts:
        if "Emotion:" in part:
            extracted_emotion = part.split(':')[1].strip()
        elif "Cause:" in part:
            extracted_cause = part.split(':')[1].strip()
    return extracted_emotion, extracted_cause


In [None]:
def evaluate_model(test_data, fine_tuned_model):
    # Initialize variables
    emotion_tp, emotion_fp, emotion_fn, emotion_correct = 0, 0, 0, 0
    cause_tp, cause_fp, cause_fn, cause_correct = 0, 0, 0, 0
    pair_tp, pair_fp, pair_fn, pair_correct = 0, 0, 0, 0
    total_instances = len(test_data)

    for instance in test_data:
        # Generate prompt for the instance
        prompt = f"Context: {instance['context']} \nIdentify the emotion and the cause in the conversation. Emotion appears in: '{instance['emotion_utterance']}' and the cause appears in: '{instance['cause_utterance']}'"

        # Generate response using the fine-tuned model
        generated_response = fine_tuned_model(prompt)[0]['generated_text'].strip()

        # Extract emotion and cause from generated response
        generated_emotion, generated_cause = extract_emotion_and_cause(generated_response)

        # Evaluate emotion prediction
        if generated_emotion == instance['emotion']:
            emotion_correct += 1
            emotion_tp += 1
        else:
            emotion_fp += 1
            emotion_fn += 1

        # Evaluate cause prediction
        if generated_cause == f"Utterance {instance['cause_id']}":
            cause_correct += 1
            cause_tp += 1
        else:
            cause_fp += 1
            cause_fn += 1

        # Evaluate pair prediction
        if generated_emotion == instance['emotion'] and generated_cause == f"Utterance {instance['cause_id']}":
            pair_correct += 1
            pair_tp += 1
        else:
            pair_fp += 1
            pair_fn += 1

    # Calculate accuracy, precision, recall, and F1-score for each task
    emotion_accuracy = emotion_correct / total_instances
    emotion_precision = emotion_tp / (emotion_tp + emotion_fp) if emotion_tp + emotion_fp != 0 else 0
    emotion_recall = emotion_tp / (emotion_tp + emotion_fn) if emotion_tp + emotion_fn != 0 else 0
    emotion_f1_score = 2 * (emotion_precision * emotion_recall) / (emotion_precision + emotion_recall) if emotion_precision + emotion_recall != 0 else 0

    cause_accuracy = cause_correct / total_instances
    cause_precision = cause_tp / (cause_tp + cause_fp) if cause_tp + cause_fp != 0 else 0
    cause_recall = cause_tp / (cause_tp + cause_fn) if cause_tp + cause_fn != 0 else 0
    cause_f1_score = 2 * (cause_precision * cause_recall) / (cause_precision + cause_recall) if cause_precision + cause_recall != 0 else 0

    pair_accuracy = pair_correct / total_instances
    pair_precision = pair_tp / (pair_tp + pair_fp) if pair_tp + pair_fp != 0 else 0
    pair_recall = pair_tp / (pair_tp + pair_fn) if pair_tp + pair_fn != 0 else 0
    pair_f1_score = 2 * (pair_precision * pair_recall) / (pair_precision + pair_recall) if pair_precision + pair_recall != 0 else 0

    return emotion_accuracy, emotion_precision, emotion_recall, emotion_f1_score, \
           cause_accuracy, cause_precision, cause_recall, cause_f1_score, \
           pair_accuracy, pair_precision, pair_recall, pair_f1_score



In [None]:
import json
from datasets import Dataset

def load_custom_dataset(file_path):
    dataset = Dataset.from_json(file_path)
    return dataset

In [None]:
training_data_file = 'formatted_train.json'

load_custom_dataset(training_data_file)


Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['cause_utterance', 'emotion_id', 'cause_id', 'emotion', 'context', 'emotion_utterance'],
    num_rows: 7491
})

In [None]:
import json


training_data_file = 'formatted_train.json'
eval_data_file = 'formatted_eval.json'
# Step 1: Load formatted data
#     with open('formatted_train.json', 'r', encoding='utf-8') as f:
#         formatted_data = json.load(f)
#     with open('formatted_eval.json', 'r') as f:
#         eval_data = json.load(f)
train_dataset = load_custom_dataset(training_data_file)
eval_dataset = load_custom_dataset(eval_data_file)

# Step 2: Combine datasets
combined_dataset = {'train': train_dataset, 'eval': eval_dataset}
#     eval_prompts = generate_prompts_with_cause(eval_data)
# Step 2: Generate prompts
#     prompts = generate_prompts_with_cause(formatted_data)
# Step 3: Fine-tune the model with Vicuna
#     trained_model = fine_tune_model_with_vicuna(prompts,model)
formatted_data = [format_instruction(entry) for entry in train_dataset]
formatted_dataset = Dataset.from_dict({
    'prompt': [entry['prompt'] for entry in formatted_data],
    'expected_response': [entry['expected_response'] for entry in formatted_data]
})
fine_tuned_model = finetune_withSFFT(formatted_dataset,model)
# Step 4: Evaluate the fine-tuned model (if needed)
# test_accuracy = evaluate_model(test_data, trained_model)  # Assuming you have test data
# print(f"Model accuracy on test data: {test_accuracy:.2%}")
# Example usage:


cuda


Map:   0%|          | 0/7491 [00:00<?, ? examples/s]

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
emotion_acc, emotion_p, emotion_r, emotion_f1, \
cause_acc, cause_p, cause_r, cause_f1, \
pair_acc, pair_p, pair_r, pair_f1 = evaluate_model(eval_data, fine_tuned_model)
print(f"Emotion Accuracy: {emotion_acc:.4f}, Precision: {emotion_p:.4f}, Recall: {emotion_r:.4f}, F1 Score: {emotion_f1:.4f}")
print(f"Cause Accuracy: {cause_acc:.4f}, Precision: {cause_p:.4f}, Recall: {cause_r:.4f}, F1 Score: {cause_f1:.4f}")
print(f"Pair Accuracy: {pair_acc:.4f}, Precision: {pair_p:.4f}, Recall: {pair_r:.4f}, F1 Score: {pair_f1:.4f}")

AttributeError: 'str' object has no attribute 'size'

In [None]:
instance = eval_data[0]

In [None]:
prompt = f"Context: {instance['context']} \nIdentify the emotion and the cause in the conversation. Emotion appears in: '{instance['emotion_utterance']}' and the cause appears in: '{instance['cause_utterance']}'"
expected_response = f"Emotion: {instance['emotion']} (Utterance {instance['emotion_id']}), Cause: Utterance {instance['cause_id']}"
promptfinal= f"""### Prompt: {prompt}
        ### response: {expected_response}
        """

In [None]:

# Tokenize the prompt
inputs = tokenizer(promptfinal, return_tensors='pt')  # 'pt' creates PyTorch tensors

# Send the input to the same device as the model
device = next(fine_tuned_model.parameters()).device
inputs = {key: value.to(device) for key, value in inputs.items()}

# Make a prediction
with torch.no_grad():
    outputs = fine_tuned_model(**inputs)




In [None]:
outputs

CausalLMOutputWithPast(loss={'logits': tensor([[[ -3.8203,  -2.3184,  10.3594,  ...,  -2.2266,  -4.9844,   0.1771],
         [ -0.5801,   0.5054,   5.1133,  ...,  -0.8896,  -0.7749,  -0.6108],
         [ -9.4219, -11.4531,  -3.2246,  ..., -11.3359, -11.4375,  -9.8047],
         ...,
         [ -4.6250,  -5.5703,  -0.6377,  ...,  -6.6172,  -3.7520,  -5.9961],
         [ -6.1055,  -5.9766,  -2.2402,  ...,  -7.4609,  -5.5664,  -6.6797],
         [ -5.6562,  -5.7266,  -1.6016,  ...,  -6.8633,  -3.1484,  -6.3008]]],
       device='cuda:0')}, logits=tensor([[[ -3.8203,  -2.3184,  10.3594,  ...,  -2.2266,  -4.9844,   0.1771],
         [ -0.5801,   0.5054,   5.1133,  ...,  -0.8896,  -0.7749,  -0.6108],
         [ -9.4219, -11.4531,  -3.2246,  ..., -11.3359, -11.4375,  -9.8047],
         ...,
         [ -4.6250,  -5.5703,  -0.6377,  ...,  -6.6172,  -3.7520,  -5.9961],
         [ -6.1055,  -5.9766,  -2.2402,  ...,  -7.4609,  -5.5664,  -6.6797],
         [ -5.6562,  -5.7266,  -1.6016,  ...,  -6.8

In [None]:
predictions = torch.argmax(outputs.logits, dim=-1)

# Decode token IDs to text
decoded_text = tokenizer.decode(predictions[0], skip_special_tokens=True)
print(decoded_text)

'__ expected_ expected_ expected__ expected expected_ expectedpt_ expected_ expected__ expected___ West__ we__ expected_____ town___? expected _________ expected__pt___ expected_________ least know what__ are on?___ expected in in: '__ what town_ we near__ Freem_  _    __ expectedotive_ expected__ility___ expected_ expectedter____     expected ### expected expected


In [None]:
def perform_inference(model, tokenizer, prompt_text):
    # Tokenize the input text
    inputs = tokenizer(prompt_text, return_tensors='pt', truncation=True, max_length=1024)

    # Send input to the device the model is on
    device = next(model.parameters()).device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate output using the model
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=1024)

    # Decode the generated ids to text
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response_text

In [None]:
sample_instance = {
    'context': "John was visibly upset after the meeting.",
    'emotion_utterance': "He sighed and looked away.",
    'cause_utterance': "The meeting did not go as planned."
}

# Prepare the prompt
prompt_text = format_instruction(sample_instance)

# Perform inference
inference_result = perform_inference(fine_tuned_model, tokenizer, prompt_text)
print("Inference Result:", inference_result)

KeyError: 'emotion'