In [None]:
pip install transformers torch torchvision jsonlines scikit-learn


In [4]:
conda env create -f environment.yml

Collecting package metadata (repodata.json): | ^C
\ 
Note: you may need to restart the kernel to use updated packages.


In [None]:
# !git clone https://github.com/soujanyaporia/MUStARD
%cd MUStARD/

[Errno 2] No such file or directory: 'MUStARD/'
/Users/jingyi/CMU/23s/research/MUStARD


In [None]:
import nltk

nltk.download("punkt")

!pip install --upgrade --no-cache-dir gdown
!mkdir -p data/features
!gdown -O data/features --id --folder 1Ff1WDObGKqpfbvy7-H1mD8YWvBS-Kf26
!gdown --id 1GYv74vN80iX_IkEmkJhkjDRGxLvraWuZ
!unzip BERT_text_features.zip -d data/

In [5]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from torchvision.models import resnet50
from collections import defaultdict
from config import CONFIG_BY_KEY, Config
from data_loader import DataLoader, DataHelper
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
import os

# Hugging Face identifier of the causal language model used throughout the notebook.
LM_VERSION = 'facebook/opt-1.3b'

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Fixed seed for torch's random number generator.
seed = 42
torch.manual_seed(seed)

class TextFeatureOPTModel(nn.Module):
    """Causal LM whose prompt embeddings are extended with non-text feature embeddings.

    Each non-text feature contributes one extra position that is concatenated
    after the text token embeddings before calling ``generate``.

    Args:
        opt_model_name: Name passed to ``AutoModelForCausalLM.from_pretrained``.
        feature_types: Ordered feature names; ``forward`` pairs ``features[i]``
            with ``feature_types[i]``.
        tokenizer: Object providing ``batch_decode`` for the generated ids.
        feature_modes: Mapping from feature name to ``'raw'`` or ``'precomputed'``.
            Features with any other (or no) mode are ignored by ``forward``.
        feature_dims: Optional mapping from feature name to the width of its
            precomputed vectors. When given for a ``'precomputed'`` feature, a
            linear projection into the LM input-embedding width is created;
            otherwise the precomputed vectors are used as they are and must
            already have that width.
    """

    def __init__(self, opt_model_name, feature_types, tokenizer, feature_modes, feature_dims=None):
        super(TextFeatureOPTModel, self).__init__()
        self.opt_model = AutoModelForCausalLM.from_pretrained(opt_model_name).to(device)
        self.feature_types = feature_types
        self.feature_modes = feature_modes
        self.tokenizer = tokenizer

        feature_dims = feature_dims or {}

        # Feature embeddings are concatenated with the LM's *input* embeddings,
        # so projections must target that width.
        embed_dim = self.opt_model.get_input_embeddings().embedding_dim

        # Held in an nn.ModuleDict (not a plain dict) so the sub-modules are
        # registered: they then appear in parameters()/state_dict() and follow
        # .to(device). The attribute is deliberately not called `modules`,
        # which would shadow nn.Module.modules().
        self.feature_modules = nn.ModuleDict()

        # Initialize modules for different feature types
        for feature_type in feature_types:
            mode = feature_modes.get(feature_type)

            if feature_type == 'video' and mode == 'raw':
                encoder = resnet50(pretrained=True)
                # Drop the classification head so the encoder emits 2048-wide features.
                encoder.fc = nn.Identity()
                self.feature_modules[feature_type] = nn.ModuleDict({
                    'encoder': encoder,
                    'embedding_transform': nn.Linear(2048, embed_dim),
                }).to(device)

            elif feature_type == 'audio' and mode == 'raw':
                # NOTE(review): forward() flattens the encoder output to
                # 64 * pooled_length values per sample, which cannot equal the
                # 32 inputs expected below -- confirm the intended audio shape.
                self.feature_modules[feature_type] = nn.ModuleDict({
                    'encoder': nn.Sequential(
                        nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3),
                        nn.ReLU(),
                        nn.MaxPool1d(kernel_size=2),
                    ),
                    'embedding_transform': nn.Linear(32, embed_dim),
                }).to(device)

            elif mode == 'precomputed' and feature_type in feature_dims:
                self.feature_modules[feature_type] = nn.ModuleDict({
                    'embedding_transform': nn.Linear(feature_dims[feature_type], embed_dim),
                }).to(device)

    def forward(self, text_input_ids, features):
        """Generate text from token ids plus non-text features.

        Args:
            text_input_ids: Token ids fed to the LM's input embedding layer.
            features: Sequence aligned with ``self.feature_types``.

        Returns:
            The decoded generations as returned by ``tokenizer.batch_decode``.

        NOTE(review): generation runs under ``torch.no_grad()`` and the result
        is decoded text, so no gradient reaches the projection layers through
        this method.
        """
        input_embeddings = self.opt_model.get_input_embeddings()
        text_embeddings = input_embeddings(text_input_ids)

        feature_inputs = []

        # Process non-text features
        for i, feature_type in enumerate(self.feature_types):
            mode = self.feature_modes.get(feature_type)

            if mode == 'raw':
                branch = self.feature_modules[feature_type]

                # The encoder is used as a fixed feature extractor.
                with torch.no_grad():
                    encoded = branch['encoder'](features[i])
                encoded = torch.flatten(encoded, start_dim=1)

                feature_embeddings = branch['embedding_transform'](encoded)

            elif mode == 'precomputed':
                if feature_type in self.feature_modules:
                    # Project the precomputed vector into the embedding space.
                    feature_embeddings = self.feature_modules[feature_type]['embedding_transform'](features[i])
                else:
                    # No projection configured: use the precomputed features directly.
                    feature_embeddings = features[i]

            else:
                continue

            # Each feature becomes one extra position along dimension 1.
            feature_inputs.append(feature_embeddings.unsqueeze(1))

        # Concatenate feature embeddings with text embeddings
        combined_embeddings = torch.cat([text_embeddings] + feature_inputs, dim=1)

        with torch.no_grad():
            outputs = self.opt_model.generate(inputs_embeds=combined_embeddings)

        decoded_texts = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return decoded_texts
    
def evaluate_model(model, test_features, test_output, criterion, device):
    """Evaluate ``model`` sample by sample and print loss, accuracy and reports.

    Args:
        model: Callable as ``model(text_input_ids, non_text_feature_inputs)``.
        test_features: Mapping with a ``'text'`` entry plus optional non-text
            entries, each indexable by sample position.
        test_output: Per-sample labels; its length defines the number of samples.
        criterion: Callable as ``criterion(outputs, labels)`` returning a loss
            that supports ``.item()``.
        device: Device the input tensors are moved to.

    Returns:
        Tuple ``(average_loss, accuracy)``.

    NOTE(review): ``torch.max(outputs, 1)`` below requires the model to return
    a score tensor with at least two dimensions -- confirm against the model.
    """
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    predictions = []
    actuals = []

    # Non-text modalities that actually carry data. len() is used instead of
    # truthiness because the feature containers may be numpy arrays, whose
    # truth value is ambiguous; 'text' is excluded by name rather than by
    # its position in the dict.
    active_feature_types = [
        feature_type for feature_type in test_features
        if feature_type != 'text' and len(test_features[feature_type]) > 0
    ]

    with torch.no_grad():
        for i in range(len(test_output)):
            text_input_ids = torch.tensor(test_features['text'][i], dtype=torch.long).to(device)
            non_text_feature_inputs = [
                torch.tensor(test_features[feature_type][i]).to(device)
                for feature_type in active_feature_types
            ]

            labels = torch.tensor(test_output[i], dtype=torch.long).to(device)

            outputs = model(text_input_ids, non_text_feature_inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            # Index of the highest score along dimension 1 is the prediction.
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    average_loss = total_loss / len(test_output)
    accuracy = np.mean(np.array(predictions) == np.array(actuals))
    print(f'Test Loss: {average_loss:.4f}, Test Accuracy: {accuracy:.4f}')

    # Confusion Matrix
    print("Confusion Matrix:")
    print(confusion_matrix(actuals, predictions))

    # Classification Report
    print("Classification Report:")
    print(classification_report(actuals, predictions))

    return average_loss, accuracy

def save_checkpoint(model, optimizer, epoch, filename):
    """Write the epoch number plus model and optimizer state dicts to ``filename``.

    The saved object is a dict with the keys ``'epoch'``, ``'state_dict'`` and
    ``'optimizer'``. No directory is created here; ``filename`` is handed to
    ``torch.save`` as given.
    """
    torch.save(
        dict(
            epoch=epoch,
            state_dict=model.state_dict(),
            optimizer=optimizer.state_dict(),
        ),
        filename,
    )


def train_model(model, train_features, train_output, optimizer, criterion, device, num_epochs, checkpoint_path):
    """Train ``model`` one sample at a time and checkpoint on improved loss.

    Args:
        model: Callable as ``model(text_input_ids, non_text_feature_inputs)``;
            must provide ``train()``.
        train_features: Mapping with a ``'text'`` entry plus optional non-text
            entries, each indexable by sample position.
        train_output: Per-sample labels; its length defines the number of samples.
        optimizer: Object providing ``zero_grad()`` and ``step()``.
        criterion: Callable as ``criterion(outputs, labels)`` returning a loss
            that supports ``.item()`` and ``.backward()``.
        device: Device the input tensors are moved to.
        num_epochs: Number of passes over the samples.
        checkpoint_path: Directory that receives the checkpoint files.
    """
    # checkpoint_path is used as a directory below (os.path.join), so create
    # that directory itself. Taking os.path.dirname() first would create only
    # the parent when the path has no trailing slash.
    os.makedirs(checkpoint_path, exist_ok=True)

    model.train()
    best_loss = float('inf')

    # Non-text modalities that actually carry data. len() is used instead of
    # truthiness because the feature containers may be numpy arrays, whose
    # truth value is ambiguous; 'text' is excluded by name rather than by
    # its position in the dict.
    active_feature_types = [
        feature_type for feature_type in train_features
        if feature_type != 'text' and len(train_features[feature_type]) > 0
    ]

    for epoch in range(num_epochs):
        total_loss = 0
        for i in range(len(train_output)):
            text_input_ids = torch.tensor(train_features['text'][i], dtype=torch.long).to(device)
            non_text_feature_inputs = [
                torch.tensor(train_features[feature_type][i]).to(device)
                for feature_type in active_feature_types
            ]

            labels = torch.tensor(train_output[i], dtype=torch.long).to(device)

            optimizer.zero_grad()
            outputs = model(text_input_ids, non_text_feature_inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        average_loss = total_loss / len(train_output)
        print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {average_loss:.4f}')

        # Save checkpoint if it's the best model so far
        if average_loss < best_loss:
            best_loss = average_loss
            checkpoint_filename = os.path.join(checkpoint_path, f'model_checkpoint_epoch_{epoch+1}.pth')
            save_checkpoint(model, optimizer, epoch, checkpoint_filename)

def train_io(config, data, train_index, test_index):
    """Build per-modality train and test features for one split.

    The split is taken from ``data.get_split`` and the features come from a
    ``DataHelper``. A modality is filled only when its ``config.use_target_*``
    flag is set; otherwise it stays an empty list.

    Returns:
        Tuple ``(train_features, train_output, test_features, test_output)``.

    Raises:
        ValueError: If every training modality ended up empty.
    """
    train_input, train_output = data.get_split(train_index)
    test_input, test_output = data.get_split(test_index)

    helper = DataHelper(train_input, train_output, test_input, test_output, config, data)

    train_features = dict(text=[], video=[], audio=[])
    test_features = dict(text=[], video=[], audio=[])

    def _fill(modality, extractor):
        # Train features are extracted before test features for each modality.
        train_features[modality] = extractor(mode="train")
        test_features[modality] = extractor(mode="test")

    if config.use_target_text:
        # BERT features when requested, otherwise vectorized utterances.
        _fill('text', helper.get_target_bert_feature if config.use_bert else helper.vectorize_utterance)

    if config.use_target_video:
        _fill('video', helper.get_target_video_pool)

    if config.use_target_audio:
        _fill('audio', helper.get_target_audio_pool)

    # At least one modality has to contribute features.
    if not any(len(collected) for collected in train_features.values()):
        raise ValueError("Invalid modalities")

    return train_features, train_output, test_features, test_output


def train(config, data):
    """Split the data 80/20, build the model, train it, then evaluate it.

    Indices come from ``data.get_all_indices_shuffled()``; the first 80% are
    used for training and the remainder for testing. Video and audio are both
    declared as ``'precomputed'`` features, and training runs for one epoch
    with checkpoints written under ``'checkpoints/'``.
    """
    shuffled_indices = data.get_all_indices_shuffled()

    # 80% of the samples for training, the rest for testing.
    cut = int(len(shuffled_indices) * 0.8)
    train_index, test_index = shuffled_indices[:cut], shuffled_indices[cut:]

    train_features, train_output, test_features, test_output = train_io(
        config=config,
        data=data,
        train_index=train_index,
        test_index=test_index,
    )

    non_text_feature_modes = {'video': 'precomputed', 'audio': 'precomputed'}
    feature_names = list(non_text_feature_modes.keys())

    tokenizer = AutoTokenizer.from_pretrained(LM_VERSION, use_fast=False)
    model = TextFeatureOPTModel(
        LM_VERSION,
        feature_names,
        tokenizer,
        feature_modes=non_text_feature_modes,
    ).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    num_epochs = 1

    train_model(
        model,
        train_features,
        train_output,
        optimizer,
        criterion,
        device,
        num_epochs,
        checkpoint_path='checkpoints/',
    )
    # The evaluation metrics are printed by evaluate_model; the return value is unused here.
    evaluate_model(model, test_features, test_output, criterion, device)
    

# Entry point: run the full pipeline when this code is executed as the main module.
if __name__ == "__main__":
    # Configuration registered under the "tav" key.
    # NOTE(review): "tav" presumably selects text + audio + video -- confirm in config.py.
    config = CONFIG_BY_KEY["tav"]
    # Project data loader built from that configuration.
    data = DataLoader(config)
    # Split, train and evaluate.
    train(config, data)

Vocab size: 1721
(552, 2048)
(552, 283)


pytorch_model.bin:  59%|█████▉    | 1.56G/2.63G [10:04<06:53, 2.59MB/s]


KeyboardInterrupt: 

In [None]:
pip install h5py jsonlines nltk numpy


Collecting h5py
  Downloading h5py-3.10.0-cp38-cp38-macosx_11_0_arm64.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: h5py
Successfully installed h5py-3.10.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# def main():
#     text_input = ["A beautiful sunset over the mountains.", "Delicious food at a local restaurant."]
#     sample_images = torch.rand(2, 3, 224, 224).to(device)

#     # sample_audio = torch.rand(2, 1, audio_length).to(device)  # Move audio tensors to the specified device
#     feature_data = {
#         'video': sample_images,
#         # 'audio': sample_audio
#     }
#     feature_types = list(feature_data.keys())
#     features = list(feature_data.values())

#     tokenizer = AutoTokenizer.from_pretrained(LM_VERSION, use_fast=False)

#     # Tokenize the text input
#     text_input_ids = tokenizer(text_input, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)

#     # Create an instance of TextFeatureOPTModel
#     text_feature_opt_model = TextFeatureOPTModel(LM_VERSION, feature_types, tokenizer).to(device)

#     # Perform inference with both image and audio features
#     outputs = text_feature_opt_model(text_input_ids, features)
#     print(outputs)

# if __name__ == "__main__":
#     main()

In [None]:
# Print the current working directory via the shell.
!pwd

/content/MUStARD
