In [1]:
from models import *
from utils import *
from tqdm.notebook import tqdm
import pickle

In [2]:
# Show the kernel's working directory: the relative "data/..." and "weights/..."
# paths used by later cells are resolved against it.
!pwd

/DATA/sarmistha_2221cs21/basha/multi_label_classifier/Repo


# Create Models 
    > evs is the video segment encoder (EncodeVideoSegment)
    > main_model is the MultiLabelVideoClassifier

In [3]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine (or CI runner) without CUDA instead of failing on the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Encoder applied to each video's segments before the main classifier sees them.
evs = EncodeVideoSegment().to(device)

# NOTE(review): 512 is passed positionally; it presumably is the hidden size
# (the printed module shows 512-dim layers) -- confirm against `models`.
main_model = MultiLabelVideoClassifier(512).to(device)


In [4]:
# Display the module tree of the video segment encoder.
evs

EncodeVideoSegment(
  (position_embeddings): Embedding(130, 512)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-7): 8 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
)

In [5]:
# Display the module tree of the multi-label classifier.
main_model

MultiLabelVideoClassifier(
  (position_embeddings): Embedding(1026, 512)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-15): 16 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (project): Linear(in_features=512, out_features=512, bias=True)
  (classification_layer): Linear(in_features=512, out_features=15, bias=True)
)

# Load datasets

In [6]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # SECURITY: unpickling can execute arbitrary code; only load files you trust.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Label tables for the two splits (shown as DataFrames by the cells below).
train_labels = _load_pickle("data/train_dataset.pkl")
test_labels = _load_pickle("data/test_dataset.pkl")

In [7]:
# Preview the first rows of the training label table.
train_labels.head()

Unnamed: 0,Video ID,Transcript,Multi Hot Label,Complaint
0,3fo85SENXC0.mp3,"Hi, my name is Larry Mentorank. I'm a certifi...","[0, 0, 0, 1, 0]","[0, 0, 0, 0, 0]"
1,3fo85SENXC0.mp3,"Hi, my name is Larry Mentorank. I'm a certifi...","[0, 0, 0, 1, 0]","[0, 0, 0, 0, 0]"
2,1703547247898644932.mp3,"Welcome back to Wellness Warriors Hub, the pl...","[0, 1, 0, 1, 0]","[0, 1, 0, 0, 0]"
3,S8rVtPKDFbo.mp3,All these talks that will be in focus on the ...,"[0, 1, 0, 0, 0]","[0, 0, 0, 0, 0]"
4,CMKsS9E0Mpc.mp3,Today we're talking about the Capital One 360...,"[1, 0, 0, 1, 0]","[0, 0, 0, 0, 0]"


In [8]:
# Missing-value count per column, for the train split and then the test split.
train_labels.isna().sum(),test_labels.isna().sum()

(Video ID           0
 Transcript         0
 Multi Hot Label    0
 Complaint          0
 dtype: int64,
 Video ID           0
 Transcript         0
 Multi Hot Label    0
 Complaint          0
 dtype: int64)

In [9]:
from collections import Counter

# Count how often each video ID occurs in the training table.
video_id_counts = Counter(train_labels["Video ID"])

# IDs that occur more than once; every row carrying such an ID is dropped
# (no copy of a repeated ID is kept).
ids = [video_id for video_id, occurrences in video_id_counts.items() if occurrences > 1]
is_repeated = train_labels["Video ID"].isin(ids)
train_labels = train_labels[~is_repeated]

# Report the dropped IDs without their 4-character file extension.
print("Videos with issues ", [video_id[:-4] for video_id in ids])

Videos with issues  ['3fo85SENXC0', 'zMWkItNZQSc', 'CxGNCi7Jh2L', '5N4fkoiUy5Y']


In [10]:
def _index_by_mp4_id(labels):
    """Return a copy of ``labels`` indexed by "Video ID" with a ".mp4" extension.

    The last four characters of every ID (the original extension, e.g. ".mp3")
    are replaced by ".mp4" so the IDs match the keys of the feature dictionary
    loaded later. The input frame is not modified.

    If the frame is already indexed by "Video ID" (the cell was run before),
    it is returned unchanged, which makes the cell safe to re-run.
    """
    if labels.index.name == "Video ID" and "Video ID" not in labels.columns:
        return labels
    # Work on a copy: `train_labels` is a filtered view of the loaded table, and
    # assigning a column on it directly can trigger SettingWithCopyWarning.
    relabelled = labels.copy()
    relabelled["Video ID"] = relabelled["Video ID"].apply(lambda x: x[:-4] + ".mp4")
    return relabelled.set_index("Video ID")


train_labels = _index_by_mp4_id(train_labels)
test_labels = _index_by_mp4_id(test_labels)

In [11]:
# Build video-ID -> value lookups for both label columns of each split.
# `dict(series)` is used on purpose (rather than `.to_dict()`) to keep the
# original lookup semantics for every index key.
train_labels_dict, train_complaint_dict = (
    dict(train_labels[column]) for column in ("Multi Hot Label", "Complaint")
)
test_labels_dict, test_complaint_dict = (
    dict(test_labels[column]) for column in ("Multi Hot Label", "Complaint")
)

In [12]:
# Load the pre-computed features. The next cell iterates this object with
# `.items()` and looks the keys up in the label dictionaries, so it is a
# mapping keyed by video ID.
# SECURITY: unpickling can execute arbitrary code; only load files you trust.
with open("data/video_audio_clip_features.pkl", "rb") as f:
    dataset = pickle.load(f)

In [13]:
def _build_split(features, labels_by_id, complaints_by_id):
    """Collect the samples of one split from the feature mapping.

    Args:
        features: mapping of video ID -> feature data for that video.
        labels_by_id: mapping of video ID -> multi-hot label (list of ints).
        complaints_by_id: mapping of video ID -> complaint flags (list of ints).

    Returns:
        A list of ``[video_data, target, video_id]`` entries, in the iteration
        order of ``features``. ``target`` is the element-wise sum of the label
        and complaint tensors.

    Videos that belong to the split but have empty (falsy) feature data are
    skipped and their ID is printed so they can be investigated.
    """
    samples = []
    for video_id, video_data in features.items():
        if video_id not in labels_by_id:
            continue  # video belongs to another split (or has no label)
        if not video_data:
            print(video_id)
            continue
        target = torch.tensor(labels_by_id[video_id]) + torch.tensor(
            complaints_by_id[video_id]
        )
        samples.append([video_data, target, video_id])
    return samples


# Same procedure for both splits; previously this was two copy-pasted loops.
train_dataset = _build_split(dataset, train_labels_dict, train_complaint_dict)
test_dataset = _build_split(dataset, test_labels_dict, test_complaint_dict)

1706053016808456553.mp4


In [14]:
# Spot check: length of the feature data (element 0) of training sample 50.
# Requires at least 51 training samples.
len(train_dataset[50][0])

11

In [15]:
# Learning rates, one per model: the segment encoder uses a 10x larger rate
# than the main classifier.
EVS_LEARNING_RATE = 1e-5
MAIN_LEARNING_RATE = 1e-6

# Shared loss plus a separate AdamW optimizer for each model.
loss = nn.CrossEntropyLoss()
evs_optimizer = torch.optim.AdamW(params=evs.parameters(), lr=EVS_LEARNING_RATE)
main_optimizer = torch.optim.AdamW(
    params=main_model.parameters(), lr=MAIN_LEARNING_RATE
)

In [16]:
# Evaluation helper (project-defined) wired to both models, the device and the
# shared loss function.
# NOTE(review): the effect of `multi_model=True` is defined inside `Evaluate` --
# confirm there.
evaluate = Evaluate(
    main_model, evs_model=evs, device=device, loss_function=loss, multi_model=True
)
# NOTE(review): `mapper` is not referenced by any other cell shown here.
mapper = Mapper()

In [17]:
# Smoke-test the evaluation path on the first 12 test samples before training.
# The four returned values are only unpacked here, not inspected.
aspect_score, complaint_score, loss_value, data = evaluate.eval(test_dataset[:12])

In [18]:
# Training loop: one pass over `train_dataset` per epoch, followed by an
# evaluation on the test and train splits. The models with the best test aspect
# score so far are saved under "weights/".
#
# NOTE: the recorded run of this cell ended with a CUDA OutOfMemoryError on a
# GPU shared with other processes. Evaluation is therefore wrapped in
# `torch.no_grad()` below so no autograd state is kept while scoring.
batch_size = 8
epochs = 100
best_score = -1e10
scores = []  # per epoch: [train aspect, test aspect, train complaint, test complaint]
loss_cache = []  # per epoch: [train loss, test loss]
for epoch in tqdm(range(epochs)):
    # (Re-)enable training mode every epoch: the evaluation at the end of the
    # previous epoch (and the smoke test before training) may leave the models
    # in eval mode, which would silently disable dropout while training.
    evs.train()
    main_model.train()

    for i in range(0, len(train_dataset), batch_size):
        samples = train_dataset[i : i + batch_size]  # slice once per batch
        batch = [x[0] for x in samples]
        labels = torch.stack([x[1] for x in samples])

        # Each element of a sample's feature data is indexed as
        # (video part, audio part): y[0] / y[1].
        audio_data = [torch.stack([y[1] for y in x]).squeeze(1) for x in batch]
        video_data = [[y[0] for y in x] for x in batch]
        # A video part may be a list of tensors; stack those into one tensor.
        video_data = [
            [torch.stack(y).squeeze(1) if isinstance(y, list) else y for y in x]
            for x in video_data
        ]

        # Encode each video's segments, then pad videos and audio to a batch.
        video_data = [evs(*pad(x, evs, device=device)) for x in video_data]
        video_data, v_mask = pad(video_data, main_model, device=device)
        audio_data, a_mask = pad(audio_data, main_model, device=device)
        # NOTE(review): only `a_mask` is passed on; `v_mask` is unused --
        # confirm both masks are expected to be identical.
        final_data = video_data + audio_data
        label_output = main_model(final_data, attention_mask=a_mask)

        # The classifier output is reshaped to 3-way logits per label slot;
        # targets are label + complaint (see dataset construction).
        l1 = loss(label_output.view(-1, 3), labels.to(device).view(-1))
        main_optimizer.zero_grad()
        evs_optimizer.zero_grad()
        print(f"\r{epoch}/{epochs} Loss = {l1.item()}", end=" ")
        l1.backward()
        main_optimizer.step()
        evs_optimizer.step()

    # Score with dropout disabled and without building autograd graphs.
    evs.eval()
    main_model.eval()
    with torch.no_grad():
        test_aspect_score, test_complaint_score, test_loss_value, test_data = (
            evaluate.eval(test_dataset)
        )
        train_aspect_score, train_complaint_score, train_loss_value, train_data = (
            evaluate.eval(train_dataset)
        )
    scores.append(
        [
            train_aspect_score,
            test_aspect_score,
            train_complaint_score,
            test_complaint_score,
        ]
    )
    # Checkpoint on the best test aspect score. The whole module objects are
    # pickled (not just state_dicts), so loading requires the same class code.
    if best_score < test_aspect_score:
        best_score = test_aspect_score
        torch.save(evs,"weights/evs_model_V2.pth")
        torch.save(main_model,"weights/main_model_v2.pth")
    loss_cache.append([train_loss_value, test_loss_value])
    print(
        f"""

{"*"*90}
Train Aspect scores = {train_aspect_score}
Train Complaint score = {train_complaint_score}
Test Aspect score = {test_aspect_score}
Test Complaint score = {test_complaint_score}
{"*"*90}

          """
    )

  0%|          | 0/100 [00:00<?, ?it/s]

0/100 Loss = 1.117842197418213  

OutOfMemoryError: CUDA out of memory. Tried to allocate 18.00 MiB. GPU 0 has a total capacty of 23.68 GiB of which 15.00 MiB is free. Process 2770165 has 12.59 GiB memory in use. Process 3484835 has 6.36 GiB memory in use. Including non-PyTorch memory, this process has 4.71 GiB memory in use. Of the allocated memory 4.29 GiB is allocated by PyTorch, and 183.33 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF