In [2]:
pip install transformers torch torchvision jsonlines scikit-learn


Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/12/dd/f17b11a93a9ca27728e12512d167eb1281c151c4c6881d3ab59eb58f4127/transformers-4.35.2-py3-none-any.whl.metadata
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.5/123.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting jsonlines
  Obtaining dependency information for jsonlines from https://files.pythonhosted.org/packages/f8/62/d9ba6323b9202dd2fe166beab8a86d29465c41a0288cbe229fac60c1ab8d/jsonlines-4.0.0-py3-none-any.whl.metadata
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Obtaining dependency information for huggingface-hub<1.0,>=0.16.4 from https://files.pythonhosted.org/packages/05/09/1945ca6ba3ad8ad6e2872ba682ce8d68c5e63c8e55458ed8ab4885709f1d/huggingface_hub-0.19.4-py3-none-any.whl.metadat

In [None]:
# !git clone https://github.com/soujanyaporia/MUStARD
# %cd MUStARD/

[Errno 2] No such file or directory: 'MUStARD/'
/Users/jingyi/CMU/23s/research/MUStARD


In [4]:
import nltk

nltk.download("punkt")



[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
!pip install --upgrade --no-cache-dir gdown
!mkdir -p data/features
!gdown -O data/features --id --folder 1Ff1WDObGKqpfbvy7-H1mD8YWvBS-Kf26
!gdown --id 1GYv74vN80iX_IkEmkJhkjDRGxLvraWuZ
!unzip BERT_text_features.zip -d data/

In [6]:
import torch
import torch.nn as nn
from torchvision.models import resnet50
from collections import defaultdict
from config import CONFIG_BY_KEY
from data_loader import DataPreper, DataHelper
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
import os
from torch.utils.data import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer


def gpu_monitor():

    # Check if CUDA is available
    if torch.cuda.is_available():
        # Get the ID of the current GPU
        device_id = torch.cuda.current_device()

        # Get the name of the current GPU
        gpu_name = torch.cuda.get_device_name(device_id)

        # Get the total memory of the current GPU
        total_memory = torch.cuda.get_device_properties(device_id).total_memory

        # Convert bytes to megabytes
        total_memory_in_MB = total_memory / (1024**2)

        # Get the current memory usage
        current_memory_allocated = torch.cuda.memory_allocated(device_id)
        current_memory_allocated_in_MB = current_memory_allocated / (1024**2)

        # Get the current memory cached
        current_memory_cached = torch.cuda.memory_reserved(device_id)
        current_memory_cached_in_MB = current_memory_cached / (1024**2)

        # Calculate free memory
        free_memory_in_MB = total_memory_in_MB - current_memory_allocated_in_MB

        print(f"GPU: {gpu_name}")
        print(f"Total GPU Memory: {total_memory_in_MB:.2f} MB")
        print(f"Currently Allocated Memory: {current_memory_allocated_in_MB:.2f} MB")
        print(f"Currently Cached Memory: {current_memory_cached_in_MB:.2f} MB")
        print(f"Free Memory: {free_memory_in_MB:.2f} MB")
    else:
        print("CUDA is not available. No GPU detected.")



class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.text_features = features['text']
        self.video_features = features['video']
        self.audio_features = features['audio']
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.text_features[idx]
        video = self.video_features[idx]
        audio = self.audio_features[idx]
        label = self.labels[idx]
        return text, video, audio, label

LM_VERSION = 't5-small'
# 'facebook/opt-2.7b'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = 42
torch.manual_seed(seed)

class TextFeatureOPTModel(nn.Module):
    def __init__(self, model_name, feature_types, tokenizer, feature_modes):
        super(TextFeatureOPTModel, self).__init__()
        # self.opt_model = AutoModelForCausalLM.from_pretrained(opt_model_name).to(device)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

        self.feature_types = feature_types
        self.feature_modes = feature_modes 
        self.tokenizer = tokenizer

        self.modules = defaultdict(nn.ModuleDict)

        # Initialize modules for different feature types
        for feature_type in feature_types:
            if feature_type == 'video':
                if feature_modes.get(feature_type) == 'raw':
                    self.modules[feature_type]['encoder'] = resnet50(pretrained=True).to(device)
                    self.modules[feature_type]['encoder'].fc = nn.Identity()
                self.modules[feature_type]['embedding_transform'] = nn.Linear(2048, self.model.config.hidden_size).to(device)
        
            elif feature_type == 'audio':
                if feature_modes.get(feature_type) == 'raw':
                    self.modules[feature_type]['encoder'] = nn.Sequential(
                        nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3),
                        nn.ReLU(),
                        nn.MaxPool1d(kernel_size=2),
                    ).to(device)
                self.modules[feature_type]['embedding_transform'] = nn.Linear(283, self.model.config.hidden_size).double().to(device)

    def forward(self, text_input, non_text_features):
        print("NEW DP")
        text_input_ids = self.tokenizer(text_input, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
        input_embeddings = self.model.get_input_embeddings()
        text_embeddings = input_embeddings(text_input_ids)

        feature_inputs = []
        for i in non_text_features:
            print(i.shape)

        # Process non-text features
        for i, feature_type in enumerate(self.feature_types):
            mode = self.feature_modes.get(feature_type)

            embedding_transform = self.modules[feature_type]['embedding_transform']
            
            if mode == 'raw':
                encoder = self.modules[feature_type]['encoder']

                with torch.no_grad():
                    feature_input = encoder(non_text_features[i])
                feature_input = torch.flatten(feature_input, start_dim=1)

                feature_embeddings = embedding_transform(feature_input)
                feature_inputs.append(feature_embeddings.unsqueeze(1))
            
            elif mode == 'precomputed':
                if non_text_features[i].dim() == 1:
                    feature_input = non_text_features[i].unsqueeze(0).unsqueeze(0)
                else:
                    feature_input = non_text_features[i].unsqueeze(1)
                    
                print(feature_input.shape)
                
                # Directly use the precomputed features
                feature_embeddings = embedding_transform(feature_input)
                feature_inputs.append(feature_embeddings)

        # Concatenate feature embeddings with text embeddings
        
        combined_embeddings = [text_embeddings] + feature_inputs
        for i in combined_embeddings:
            print(i.shape)
        combined_embeddings = torch.cat(combined_embeddings, dim=1)

        with torch.no_grad():
            outputs = self.model.generate(inputs_embeds=combined_embeddings.float(), max_length = 50)

        decoded_texts = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        print(decoded_texts)
        return decoded_texts
    
def evaluate_model(model, test_features, test_output, criterion, device):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    predictions = []
    actuals = []

    with torch.no_grad():
        for i in range(len(test_output)):
            text_input = test_features['text'][i]
            non_text_feature_inputs = []

            # Check and process each type of feature
            for feature_type in list(test_features.keys())[1:]:
                if test_features[feature_type]:
                    non_text_feature_inputs.append(torch.tensor(test_features[feature_type][i]).to(device))
                else:
                    pass

            labels = torch.tensor(test_output[i], dtype=torch.long).to(device)

            outputs = model(text_input, non_text_feature_inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.cpu().numpy())
            actuals.extend(labels.cpu().numpy())
            

    average_loss = total_loss / len(test_output)
    accuracy = np.mean(np.array(predictions) == np.array(actuals))
    print(f'Test Loss: {average_loss:.4f}, Test Accuracy: {accuracy:.4f}')

    # Confusion Matrix
    print("Confusion Matrix:")
    print(confusion_matrix(actuals, predictions))

    # Classification Report
    print("Classification Report:")
    print(classification_report(actuals, predictions))

    return average_loss, accuracy

def save_checkpoint(model, optimizer, epoch, filename):
    # Create directory if it does not exist
    # os.makedirs(os.path.dirname(filename), exist_ok=True)
    state = {
        'epoch': epoch,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(state, filename)

def train_model(model, train_features, train_output, optimizer, criterion, device, num_epochs, checkpoint_path):
    
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    model.train()
    best_loss = float('inf')
    
    # train_dataset = CustomDataset(train_features, train_output)
    # train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    for epoch in range(num_epochs):
        total_loss = 0
        for i in range(len(train_output)):
            text_input = train_features['text'][i]
            non_text_feature_inputs = []
            print(len(train_features.keys()))
            if len(train_features.keys()) > 1:
                for feature_type in list(train_features.keys())[1:]:
                    non_text_feature_inputs.append(torch.tensor(train_features[feature_type][i]).to(device))
                    
            labels = torch.tensor(train_output[i], dtype=torch.long).to(device)
            
            optimizer.zero_grad()
            outputs = model(text_input, non_text_feature_inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            del text_input, non_text_feature_inputs, labels
            torch.cuda.empty_cache()

        average_loss = total_loss / len(train_output)
        print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {average_loss:.4f}')

        # Save checkpoint if it's the best model so far
        if average_loss < best_loss:
            best_loss = average_loss
            checkpoint_filename = os.path.join(checkpoint_path, f'model_checkpoint_epoch_{epoch+1}.pth')
            save_checkpoint(model, optimizer, epoch, checkpoint_filename)


def train_io(config, data, train_index, test_index):
    train_input, train_output = data.get_split(train_index)
    test_input, test_output = data.get_split(test_index)

    datahelper = DataHelper(train_input, train_output, test_input, test_output, config, data)

    train_features = {'text': [], 'video': [], 'audio': []}
    test_features = {'text': [], 'video': [], 'audio': []}

    if config.use_target_text:
        if config.use_bert:
            train_features['text'] = datahelper.get_target_bert_feature(mode="train")
            test_features['text'] = datahelper.get_target_bert_feature(mode="test")
        else:
            train_features['text'] = datahelper.vectorize_utterance(mode="train")
            test_features['text'] = datahelper.vectorize_utterance(mode="test")

    if config.use_target_video:
        train_features['video'] = datahelper.get_target_video_pool(mode="train")
        test_features['video'] = datahelper.get_target_video_pool(mode="test")
        
    if config.use_target_audio:
        train_features['audio'] = datahelper.get_target_audio_pool(mode="train")
        test_features['audio'] = datahelper.get_target_audio_pool(mode="test")

    # Check if any modality is being used
    if all(len(features) == 0 for features in train_features.values()):
        raise ValueError("Invalid modalities")

    return train_features, train_output, test_features, test_output


def train(config, data):

    
    all_indices = data.get_all_indices_shuffled()

    split_point = int(len(all_indices) * 0.8)  # Example: 80% for training, 20% for testing
    train_index = all_indices[:split_point]
    test_index = all_indices[split_point:]

    gpu_monitor()

    train_features, train_output, test_features, test_output = train_io(config=config, data=data, train_index=train_index, test_index=test_index)
    non_text_feature_modes = {'video': 'precomputed', 'audio': 'precomputed'}

    # tokenizer = AutoTokenizer.from_pretrained(LM_VERSION, use_fast=False)
    tokenizer = T5Tokenizer.from_pretrained(LM_VERSION)
    print("load tokenizer")
    gpu_monitor()

    model = TextFeatureOPTModel(LM_VERSION, list(non_text_feature_modes.keys()), tokenizer, feature_modes=non_text_feature_modes).to(device)
    print("load model")
    gpu_monitor()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    num_epochs = 1
    
    train_model(model, train_features, train_output, optimizer, criterion, device, num_epochs, checkpoint_path = 'checkpoints/')
    average_loss, accuracy = evaluate_model(model, test_features, test_output, criterion, device)
    
if __name__ == "__main__":
    
    torch.cuda.empty_cache()

    config = CONFIG_BY_KEY["tav"]
    # print("before data")
    # gpu_monitor()
    data = DataPreper(config)
    train(config, data)

GPU: Tesla T4
Total GPU Memory: 15102.06 MB
Currently Allocated Memory: 945.72 MB
Currently Cached Memory: 968.00 MB
Free Memory: 14156.34 MB
Vocab size: 1700
load tokenizer
GPU: Tesla T4
Total GPU Memory: 15102.06 MB
Currently Allocated Memory: 945.72 MB
Currently Cached Memory: 968.00 MB
Free Memory: 14156.34 MB
load model
GPU: Tesla T4
Total GPU Memory: 15102.06 MB
Currently Allocated Memory: 1181.79 MB
Currently Cached Memory: 1204.00 MB
Free Memory: 13920.27 MB
3
NEW DP
torch.Size([2048])
torch.Size([283])
torch.Size([1, 1, 2048])
torch.Size([1, 1, 283])
torch.Size([1, 38, 512])
torch.Size([1, 1, 512])
torch.Size([1, 1, 512])
["-, in about seven months you're gonna have something that you're gonna love more than any guy you've ever been out with."]


TypeError: cross_entropy_loss(): argument 'input' (position 1) must be Tensor, not list

In [None]:
pip install h5py jsonlines nltk numpy


In [3]:
# def main():
#     text_input = ["A beautiful sunset over the mountains.", "Delicious food at a local restaurant."]
#     sample_images = torch.rand(2, 3, 224, 224).to(device)

#     # sample_audio = torch.rand(2, 1, audio_length).to(device)  # Move audio tensors to the specified device
#     feature_data = {
#         'video': sample_images,
#         # 'audio': sample_audio
#     }
#     feature_types = list(feature_data.keys())
#     features = list(feature_data.values())

#     tokenizer = AutoTokenizer.from_pretrained(LM_VERSION, use_fast=False)

#     # Tokenize the text input
#     text_input_ids = tokenizer(text_input, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)

#     # Create an instance of TextFeatureOPTModel
#     text_feature_opt_model = TextFeatureOPTModel(LM_VERSION, feature_types, tokenizer).to(device)

#     # Perform inference with both image and audio features
#     outputs = text_feature_opt_model(text_input_ids, features)
#     print(outputs)

# if __name__ == "__main__":
#     main()

In [None]:
!pwd

/content/MUStARD
