In [1]:
pip install transformers SentencePiece torch torchvision jsonlines scikit-learn h5py jsonlines nltk numpy


Collecting SentencePiece
  Downloading sentencepiece-0.1.99-cp38-cp38-macosx_11_0_arm64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
Installing collected packages: SentencePiece
Successfully installed SentencePiece-0.1.99
Note: you may need to restart the kernel to use updated packages.


In [None]:
# !git clone https://github.com/soujanyaporia/MUStARD
# %cd MUStARD/

[Errno 2] No such file or directory: 'MUStARD/'
/Users/jingyi/CMU/23s/research/MUStARD


In [2]:
import nltk

nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/jingyi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
!pip install --upgrade --no-cache-dir gdown
!mkdir -p data/features
!gdown -O data/features --id --folder 1Ff1WDObGKqpfbvy7-H1mD8YWvBS-Kf26
!gdown --id 1GYv74vN80iX_IkEmkJhkjDRGxLvraWuZ
!unzip BERT_text_features.zip -d data/

In [11]:
import torch
import torch.nn as nn
from torchvision.models import resnet50
from collections import defaultdict
from config import CONFIG_BY_KEY
from data_loader import DataPreper, DataHelper
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
import os
from transformers import T5ForConditionalGeneration, T5Tokenizer
from utils import gpu_monitor, save_checkpoint, prompt_eng
from tqdm import tqdm  # Import tqdm
import csv


LM_VERSION = 't5-small'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = 42
torch.manual_seed(seed)

class TextFeatureOPTModel(nn.Module):
    def __init__(self, model_name, feature_types, tokenizer, feature_modes):
        super(TextFeatureOPTModel, self).__init__()
        # self.opt_model = AutoModelForCausalLM.from_pretrained(opt_model_name).to(device)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

        self.feature_types = feature_types
        self.feature_modes = feature_modes 
        self.tokenizer = tokenizer

        self.modules = defaultdict(nn.ModuleDict)

        # Initialize modules for different feature types
        for feature_type in feature_types:
            if feature_type == 'video':
                if feature_modes.get(feature_type) == 'raw':
                    self.modules[feature_type]['encoder'] = resnet50(pretrained=True).to(device)
                    self.modules[feature_type]['encoder'].fc = nn.Identity()
                self.modules[feature_type]['embedding_transform'] = nn.Linear(2048, self.model.config.hidden_size).to(device)
        
            elif feature_type == 'audio':
                if feature_modes.get(feature_type) == 'raw':
                    self.modules[feature_type]['encoder'] = nn.Sequential(
                        nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3),
                        nn.ReLU(),
                        nn.MaxPool1d(kernel_size=2),
                    ).to(device)
                self.modules[feature_type]['embedding_transform'] = nn.Linear(283, self.model.config.hidden_size).double().to(device)

    def tokenize(self, text_input):
        return self.tokenizer(text_input, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
        
    def forward(self, text_input_ids, non_text_features, label_ids = None):
        self.model.eval()
        input_embeddings = self.model.get_input_embeddings()
        text_embeddings = input_embeddings(text_input_ids)

        # Process non-text features
        feature_inputs = []
        for i, feature_type in enumerate(self.feature_types):
            mode = self.feature_modes.get(feature_type)

            embedding_transform = self.modules[feature_type]['embedding_transform']
            
            if mode == 'raw':
                encoder = self.modules[feature_type]['encoder']

                with torch.no_grad():
                    feature_input = encoder(non_text_features[i])
                feature_input = torch.flatten(feature_input, start_dim=1)

                feature_embeddings = embedding_transform(feature_input)
                feature_inputs.append(feature_embeddings.unsqueeze(1))
            
            elif mode == 'precomputed':
                if non_text_features[i].dim() == 1:
                    feature_input = non_text_features[i].unsqueeze(0).unsqueeze(0)
                else:
                    feature_input = non_text_features[i].unsqueeze(1)
                                    
                # Directly use the precomputed features
                feature_embeddings = embedding_transform(feature_input)
                feature_inputs.append(feature_embeddings)

        # Concatenate feature embeddings with text embeddings
        combined_embeddings = [text_embeddings] + feature_inputs
        combined_embeddings = torch.cat(combined_embeddings, dim=1)
        
        # print('combined feature shape' + str(combined_embeddings.shape))

        # Handling both training and evaluation
        if label_ids is not None:
            # with torch.no_grad():
            loss = self.model(inputs_embeds=combined_embeddings.float(), labels=label_ids, return_dict=True).loss
                # print(f'output shape: {outputs.logits.shape}')
            return loss
        else:
            with torch.no_grad():
                outputs = self.model.generate(inputs_embeds=combined_embeddings.float(), max_length=50)
            decoded_texts = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
            return decoded_texts

def evaluate_model(model, test_features, test_output, criterion, device):
    model.eval()  # Set the model to evaluation mode
    predictions = []
    actuals = []

    with torch.no_grad():
        # Wrap the range function with tqdm for a progress bar
        progress_bar = tqdm(range(len(test_output)), desc='Evaluating', unit='batch')

        for i in progress_bar:
            text_input_ids = model.tokenize(test_features['text'][i])

            non_text_feature_inputs = []
            for feature_type in list(test_features.keys())[1:]:
                non_text_feature_inputs.append(torch.tensor(test_features[feature_type][i]).to(device))

            predicted = model(text_input_ids, non_text_feature_inputs, label_ids=None)

            predictions.extend(predicted)
            actuals.extend(test_output)

    accuracy = np.mean(np.array(predictions) == np.array(actuals))
    print(f'Test Accuracy: {accuracy:.4f}')
    
    
    # Save predictions and actuals to a file
    with open('predictions_actuals.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Prediction', 'Actual'])
        for pred, act in zip(predictions, actuals):
            writer.writerow([pred, act])

    # # Confusion Matrix
    # print("Confusion Matrix:")
    # print(confusion_matrix(actuals, predictions))

    # # Classification Report
    # print("Classification Report:")
    # print(classification_report(actuals, predictions))

    return accuracy


def train_model(model, train_features, train_output, optimizer, criterion, device, num_epochs, checkpoint_path):
    
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    model.train()
    best_loss = float('inf')
    
    for epoch in range(num_epochs):
        total_loss = 0

        # Wrap the range function with tqdm for a progress bar
        progress_bar = tqdm(range(len(train_output)), desc=f'Epoch {epoch+1}/{num_epochs}', unit='batch')

        for i in progress_bar:
            text_input_ids = model.tokenize(train_features['text'][i])
            label_ids = model.tokenize(train_output[i])

            # Prepare non-text features
            non_text_feature_inputs = []
            if len(train_features.keys()) > 1:
                for feature_type in list(train_features.keys())[1:]:
                    non_text_feature_inputs.append(torch.tensor(train_features[feature_type][i]).to(device))
                    
            optimizer.zero_grad()
            loss = model(text_input_ids, non_text_feature_inputs, label_ids)
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            del text_input_ids, non_text_feature_inputs, label_ids
            torch.cuda.empty_cache()

            # Update progress bar
            progress_bar.set_postfix({'loss': total_loss / (i + 1)})

        average_loss = total_loss / len(train_output)
        print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {average_loss:.4f}')

        # Save checkpoint if it's the best model so far
        if average_loss < best_loss:
            best_loss = average_loss
            checkpoint_filename = os.path.join(checkpoint_path, f'model_checkpoint_epoch_{epoch+1}.pth')
            save_checkpoint(model, optimizer, epoch, checkpoint_filename)

def train_io(config, data, train_index, test_index):
    train_input, train_output = data.get_split(train_index)
    test_input, test_output = data.get_split(test_index)

    datahelper = DataHelper(train_input, train_output, test_input, test_output, config, data)

    train_features = {}
    test_features = {}

    if config.use_target_text:
        if config.use_bert:
            train_features['text'] = datahelper.get_target_bert_feature(mode="train")
            test_features['text'] = datahelper.get_target_bert_feature(mode="test")
        else:
            train_features['text'] = datahelper.vectorize_utterance(mode="train")
            test_features['text'] = datahelper.vectorize_utterance(mode="test")
            
    if config.use_target_video:
        train_features['video'] = datahelper.get_target_video_pool(mode="train")
        test_features['video'] = datahelper.get_target_video_pool(mode="test")
        
    if config.use_target_audio:
        train_features['audio'] = datahelper.get_target_audio_pool(mode="train")
        test_features['audio'] = datahelper.get_target_audio_pool(mode="test")

    # Check if any modality is being used
    if all(len(features) == 0 for features in train_features.values()):
        raise ValueError("Invalid modalities")

    return train_features, train_output, test_features, test_output
    
    
def proprocess_output(train_output, test_output, class_mapping):
    train_output = [class_mapping[i] for i in train_output]
    test_output = [class_mapping[i] for i in test_output]
    return train_output, test_output

def train(config, data):
    all_indices = data.get_all_indices_shuffled()

    split_point = int(len(all_indices) * 0.8)  
    train_index = all_indices[:split_point]
    test_index = all_indices[split_point:]

    # prepare data
    train_features, train_output, test_features, test_output = train_io(config=config, data=data, train_index=train_index, test_index=test_index)
    
    sarcasm_mapping = {
        0: "Non-Sarcastic",
        1: "Sarcastic"
    }
    train_output, test_output = proprocess_output(train_output, test_output, class_mapping =  sarcasm_mapping)

    template = "Examine the input and categorize it as 'Sarcastic' or 'Non-Sarcastic' in the context of binary sarcasm detection: "
    train_features, test_features = prompt_eng(train_features, test_features, template)  # add the instructions and prompts
    non_text_feature_modes = {'video': 'precomputed', 'audio': 'precomputed'}

    # prepare model
    tokenizer = T5Tokenizer.from_pretrained(LM_VERSION)
    model = TextFeatureOPTModel(LM_VERSION, list(non_text_feature_modes.keys()), tokenizer, feature_modes=non_text_feature_modes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    num_epochs = 3
    # train_model(model, train_features, train_output, optimizer, criterion, device, num_epochs, checkpoint_path = 'checkpoints/')
    accuracy = evaluate_model(model, test_features, test_output, criterion, device)
    
if __name__ == "__main__":
    
    torch.cuda.empty_cache()

    config = CONFIG_BY_KEY["tav"]
    
    print("Before running")
    gpu_monitor()
    
    data = DataPreper(config)
    train(config, data)

Before running
CUDA is not available. No GPU detected.
Vocab size: 1692


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Evaluating: 100%|██████████| 138/138 [00:32<00:00,  4.31batch/s]

Test Accuracy: 0.0000



  accuracy = np.mean(np.array(predictions) == np.array(actuals))


In [3]:
# def main():
#     text_input = ["A beautiful sunset over the mountains.", "Delicious food at a local restaurant."]
#     sample_images = torch.rand(2, 3, 224, 224).to(device)

#     # sample_audio = torch.rand(2, 1, audio_length).to(device)  # Move audio tensors to the specified device
#     feature_data = {
#         'video': sample_images,
#         # 'audio': sample_audio
#     }
#     feature_types = list(feature_data.keys())
#     features = list(feature_data.values())

#     tokenizer = AutoTokenizer.from_pretrained(LM_VERSION, use_fast=False)

#     # Tokenize the text input
#     text_input_ids = tokenizer(text_input, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)

#     # Create an instance of TextFeatureOPTModel
#     text_feature_opt_model = TextFeatureOPTModel(LM_VERSION, feature_types, tokenizer).to(device)

#     # Perform inference with both image and audio features
#     outputs = text_feature_opt_model(text_input_ids, features)
#     print(outputs)

# if __name__ == "__main__":
#     main()

In [None]:
!pwd

/content/MUStARD
