In [210]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils import data
import glob
from collections import Counter
!pip3 install pickle5
import pickle5 as pickle
torch.manual_seed(0)



<torch._C.Generator at 0x7f29a44d18f0>

# Data Loading Method

In [211]:
# Mount Google Drive (Colab only); every path configured further down in this
# notebook lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [212]:
def get_data(path, num_train=50, num_val=7):
    """Load every ``*.pkl`` movie file under ``path`` and split it into train/val/test.

    Files are visited in sorted filename order. The first ``num_train`` files
    form the training split, the next ``num_val`` files the validation split and
    every remaining file the test split. The defaults reproduce the original
    hard-coded split (files 0-49 / 50-56 / 57+).

    For each movie the 'place', 'cast', 'action' and 'audio' tensors are
    concatenated along dim 1 into one feature matrix, and a single 0 is appended
    to the ground-truth boundary labels so that there is one label per feature row.

    Args:
        path: Directory containing the pickled movie dictionaries.
        num_train: Number of leading files assigned to the training split.
        num_val: Number of files after the training files assigned to validation.

    Returns:
        Three tuples ``(X, Y, lengths)`` for train, validation and test, where
        ``X`` is a list of feature tensors, ``Y`` a list of label tensors and
        ``lengths`` a list with ``len(movie['place'])`` for every movie.

    Raises:
        AssertionError: If a movie ends up with a different number of labels
            than feature rows.
    """
    train = ([], [], [])
    val = ([], [], [])
    test = ([], [], [])
    val_end = num_train + num_val

    for e, file in enumerate(sorted(glob.glob(path + '/*.pkl'))):
        # NOTE(security): unpickling executes arbitrary code; only point this at
        # trusted data files.
        with open(file, 'rb') as f:
            movie = pickle.load(f)

        att = torch.cat((movie['place'], movie['cast'],
                         movie['action'], movie['audio']), dim=1)
        # The ground truth is one element shorter than the feature matrix (see
        # the assert below), so pad it with a trailing "no boundary" label.
        target = torch.cat((movie['scene_transition_boundary_ground_truth'].long(),
                            torch.tensor([0])), dim=0)
        assert len(target) == len(att)

        if e < num_train:
            split = train
        elif e < val_end:
            split = val
        else:
            split = test

        split_X, split_Y, split_lengths = split
        split_X.append(att)
        split_Y.append(target)
        split_lengths.append(len(movie['place']))

    return (train, val, test)

# PATHS

In [213]:
# Directory scanned for the per-movie *.pkl files (read by get_data and dump_predictions).
data_path = '/content/drive/MyDrive/eluvio_data/data'
# Checkpoint loaded when load_pretrained_model is True.
model_path = '/content/drive/MyDrive/eluvio_data/saved_models/model_new6.pt'
# Directory prefix the prediction pickles are written to; dump_predictions
# concatenates the file name directly onto it, so keep the trailing slash.
dump_path = '/content/drive/MyDrive/eluvio_data/data_pred/'

# Train, Validation, Test Split

In [214]:
# Load all movies once and unpack each split into its feature list, label list
# and per-movie length list.
train_data, val_data, test_data = get_data(data_path)

trainX, trainY, train_movie_lengths = train_data
valX, valY, val_movie_lengths = val_data
testX, testY, test_movie_lengths = test_data

# Hyperparameters and Parameters




In [215]:
# Number of consecutive shots labelled per window (also the LSTM sequence length).
sequence_length = 20
# Number of windows grouped into one batch by RandomBatchSampler.
batch_size = 8
# Number of epochs passed to train_loop.
epochs = 20

In [216]:
# Extra shots taken before / after each window so boundary features can look
# at neighbouring shots.
num_context_shots_before = 1
num_context_shots_after = 2
# Number of rows in one model input window (labelled shots plus context shots).
new_sequence_length = sequence_length + num_context_shots_before + num_context_shots_after
# Number of shots covered by BNet's temporal convolution kernel.
num_context_shots = num_context_shots_before + num_context_shots_after + 1

In [217]:
# Width of the leading 'place' slice of every feature row.
place_feat_dim = 2048
# Width of each of the three remaining slices (cast, action, audio).
other_feat_dim = 512
# Total width of one concatenated feature row.
feat_dim = place_feat_dim + (3 * other_feat_dim)

# True: load the checkpoint at model_path; False: train from scratch.
load_pretrained_model = True
# Number of output channels of the Conv2d layers in Cos and BNet.
num_out_channels = 512

In [218]:
# Row offsets, relative to a window's first labelled shot, of every row fed to
# the model: the context shots before, the labelled shots, the context shots after.
ctx = np.arange(-num_context_shots_before,
                sequence_length + num_context_shots_after)
ctx

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21])

# Data Preprocessing

In [219]:
def get_padded_tensors_idx(X, Y, test_mode=False):
    """Pad every movie, concatenate all movies and compute each window's start row.

    Relies on the module-level ``sequence_length``, ``num_context_shots_before``,
    ``num_context_shots_after`` and ``new_sequence_length``.

    NOTE: the lists ``X`` and ``Y`` are modified in place (their elements are
    replaced by the padded / truncated tensors) before being concatenated.

    Args:
        X: List of per-movie feature tensors; each is padded along its
            second-to-last dimension (the pad spec has four entries).
        Y: List of per-movie label tensors, padded along their last dimension.
        test_mode: When False, each movie is cut/padded to a whole number of
            windows plus context rows. When True, no shot is dropped and each
            movie is padded with ``new_sequence_length`` extra rows instead.

    Returns:
        ``(X, Y, idx)``: the concatenated feature tensor, the concatenated label
        tensor and a NumPy array holding the row index of the first labelled
        shot of every window.
    """
    idx = []
    start = 0

    if test_mode is False:
        for i in range(len(X)):
            # Shots left over after the last complete window of this movie.
            rem_len = len(X[i]) % sequence_length
            if rem_len < num_context_shots_after:
                # Too few leftover shots to serve as trailing context: pad so the
                # movie holds (len - rem_len) + before + after rows.
                X[i] = F.pad(X[i], (0, 0, 0, num_context_shots_before + num_context_shots_after - rem_len),
                            mode='constant', value=0)
                # -100 matches the default ignore_index of nn.CrossEntropyLoss.
                Y[i] = F.pad(Y[i], (0, num_context_shots_before + num_context_shots_after - rem_len),
                            mode='constant', value=-100)
            else:
                # Keep only as many leftover shots as are needed for trailing
                # context; any further leftover shots are dropped.
                X[i] = X[i][:len(X[i]) - rem_len + num_context_shots_after]
                Y[i] = Y[i][:len(Y[i]) - rem_len + num_context_shots_after]

                # Then append num_context_shots_before padding rows, giving the
                # same total row count as the branch above.
                X[i] = F.pad(X[i], (0, 0, 0, num_context_shots_before),
                            mode='constant', value=0)
                Y[i] = F.pad(Y[i], (0, num_context_shots_before),
                            mode='constant', value=-100)

            # Window starts are spaced sequence_length rows apart and stop before
            # the trailing context/padding rows of this movie.
            end = start + len(X[i]) - \
                (num_context_shots_before + num_context_shots_after)
            idx.extend(np.arange(start, end, sequence_length))
            # The next movie begins right after this movie's trailing rows.
            start = end + (num_context_shots_before + num_context_shots_after)
    else:
        # Pad each movie with one full input window of zeros / ignored labels so
        # a window starting at any real shot stays inside the padded tensor.
        pad_spec_X = (0, 0, 0, new_sequence_length)
        pad_spec_Y = (0, new_sequence_length)
        start = 0
        for i in range(len(X)):
            padded_data = F.pad(X[i], pad_spec_X,
                                mode='constant', value=0)
            X[i] = padded_data
            Y[i] = F.pad(Y[i], pad_spec_Y, mode='constant', value=-100)
            # `end` is the index one past the movie's last real shot.
            end = start + len(padded_data) - new_sequence_length
            idx.extend(np.arange(start, end, step=sequence_length))
            start = end + new_sequence_length


    X = torch.cat((X))
    Y = torch.cat((Y))

    # Prepend num_context_shots_before rows so the very first window also has
    # leading context, and shift every start index by the same amount.
    X = F.pad(X, (0, 0, num_context_shots_before, 0),
                mode='constant', value=0)
    Y = F.pad(Y, (num_context_shots_before, 0),
                mode='constant', value=-100)
    idx = np.array(idx) + num_context_shots_before

    return(X, Y, idx)

In [220]:
# Turn each split's list of movies into one padded tensor plus window start indices.
# NOTE: this rebinds trainX/valX/testX from lists to tensors, so the cell is not
# meant to be re-run without first re-running the split cell above.
trainX, trainY, train_idx = get_padded_tensors_idx(trainX, trainY)
valX, valY, val_idx = get_padded_tensors_idx(valX, valY)
testX, testY, test_idx = get_padded_tensors_idx(testX, testY, test_mode=True)

# Data Set and Sampler

In [221]:
class MovieDataset(data.Dataset):
    """Dataset that returns a whole batch of windows for an array of start rows.

    ``__getitem__`` is called with an array of window start indices (one batch
    produced by the sampler) rather than a single integer.
    """

    def __init__(self, X, Y, length_data):
        self.X, self.Y = X, Y
        self.batch_size = batch_size
        self.len = length_data

    def __getitem__(self, index):
        # Column vector of start rows so that broadcasting against the offset
        # vectors yields one row of indices per window.
        starts = np.expand_dims(index, axis=1)
        window_rows = starts + ctx
        label_rows = (starts + np.arange(sequence_length)).flatten()

        features = self.X[window_rows, :].reshape(-1,
                                                  new_sequence_length,
                                                  feat_dim)
        labels = self.Y[label_rows]
        return (features, labels)

    def __len__(self):
        return self.len

In [222]:
class RandomBatchSampler(data.sampler.Sampler):
    """Sampler that yields whole batches of window start indices.

    Each item produced by ``__iter__`` is a slice of ``ids`` with up to
    ``batch_size`` entries; the final batch may be shorter.

    Args:
        batch_size: Maximum number of start indices per yielded batch.
        ids: NumPy array of window start indices.
        test_mode: When exactly ``False`` the indices are shuffled with
            ``torch.randperm`` on every iteration; otherwise they are yielded
            in their stored order.
    """

    def __init__(self, batch_size, ids, test_mode=False):
        self.batch_size = batch_size
        self.ids = ids
        self.length = self.ids.shape[0]
        self.test_mode = test_mode

    def __iter__(self):
        if self.test_mode is False:
            # Draw the permutation eagerly so the torch RNG is consumed when
            # the iterator is created, not when the first batch is requested.
            order = torch.randperm(self.ids.shape[0]).tolist()
            ordered_ids = self.ids[order]
        else:
            ordered_ids = self.ids
        return iter([ordered_ids[i:i + self.batch_size]
                     for i in range(0, len(ordered_ids), self.batch_size)])

    def __len__(self):
        # Ceiling division: __iter__ also yields the trailing partial batch, so
        # the reported length must count it (floor division under-counted and
        # skewed every "average per batch" computed from len(loader)).
        return (self.length + self.batch_size - 1) // self.batch_size

# Train  Dataloader and Dataset

In [223]:
# The custom sampler already yields a full batch of start indices, so the
# DataLoader itself uses batch_size=1; the extra leading dimension it adds is
# removed with squeeze(0) in the training loop.
sampler_train = RandomBatchSampler(batch_size=batch_size, ids=train_idx)
train_set = MovieDataset(trainX, trainY, len(train_idx))
train_loader = data.DataLoader(train_set,
                               batch_size=1,
                               num_workers=2,
                               sampler=sampler_train,
)

# Val  Dataloader and Dataset

In [224]:
# Same construction as the training loader. test_mode is left at its default,
# so validation batches are drawn in shuffled order.
sampler_val = RandomBatchSampler(batch_size=batch_size, ids=val_idx)
val_set = MovieDataset(valX, valY, len(val_idx))
val_loader = data.DataLoader(val_set,
                            batch_size=1,
                            num_workers=2,
                            sampler=sampler_val,
                            pin_memory=True
)

# Test Dataloader and Dataset

In [225]:
# One window per batch, in stored order (test_mode=True disables shuffling),
# so unpad_predictions can map the outputs back onto the movies.
sampler_test = RandomBatchSampler(batch_size=1, ids=test_idx, test_mode=True)
test_set = MovieDataset(testX, testY, len(test_idx))
test_loader = data.DataLoader(test_set,
                              batch_size=1,
                              num_workers=2,
                              sampler=sampler_test,
                              pin_memory=True
                             )

# Model

In [226]:
class Cos(nn.Module):
    """Cosine similarity between the shots before and after each candidate boundary.

    A temporal convolution with a kernel of ``shot_num // 2`` shots summarises
    consecutive shots. The summary ``shot_num // 2`` steps earlier is then
    compared with the later one, channel by channel, over the feature axis.

    Args:
        shot_num: Number of shots in the context window. Defaults to the
            module-level ``num_context_shots``.
        channel: Number of convolution output channels. Defaults to the
            module-level ``num_out_channels``.
    """

    def __init__(self, shot_num=None, channel=None):
        super(Cos, self).__init__()
        self.shot_num = num_context_shots if shot_num is None else shot_num
        self.channel = num_out_channels if channel is None else channel
        self.conv1 = nn.Conv2d(in_channels=1,
                               out_channels=self.channel,
                               kernel_size=(self.shot_num // 2, 1))
        self.bn = nn.BatchNorm2d(num_features=self.channel)

    def forward(self, x):  # [batch_size, seq_len + padding, feat_dim]
        half = self.shot_num // 2

        # batch_size, 1, seq_len + padding, feat_dim
        x = x.unsqueeze(dim=1)

        # batch_size, channel, steps, feat_dim
        x = F.relu(self.bn(self.conv1(x)))

        # batch_size, steps, channel, feat_dim
        x = x.permute(0, 2, 1, 3)

        # Use an explicit `half`: `-self.shot_num // 2` parses as
        # `(-self.shot_num) // 2`, which differs from `-(self.shot_num // 2)`
        # for odd shot counts and made the two halves different lengths.
        part1 = x[:, :-half, :, :]
        part2 = x[:, half:, :, :]
        part1 = part1.reshape(-1, self.channel, part1.shape[-1])
        part2 = part2.reshape(-1, self.channel, part2.shape[-1])

        # batch_size * (steps - half), channel
        # No bare .squeeze() here: both tensors are already 3-D and squeezing
        # could silently drop a real size-1 dimension.
        return F.cosine_similarity(part1, part2, dim=2)

In [227]:
class BNet(nn.Module):
    """Boundary feature extractor for one modality.

    Concatenates a max-pooled temporal-convolution summary of each shot
    neighbourhood with the before/after cosine similarity produced by ``Cos``.
    """

    def __init__(self):
        super().__init__()
        self.shot_num = num_context_shots
        self.channel = num_out_channels
        self.conv1 = nn.Conv2d(1, self.channel,
                               kernel_size=(num_context_shots, 1))
        self.bn = nn.BatchNorm2d(num_out_channels)
        self.max3d = nn.MaxPool3d(kernel_size=(1, self.channel, 1))
        self.cos = Cos()

    def forward(self, x):  # [batch_size, seq_len + num_context_shots - 1, feat_dim]
        # batch_size, num_out_channels, seq_len, feat_dim
        feats = self.conv1(x.unsqueeze(dim=1))
        feats = F.relu(self.bn(feats))

        # batch_size, seq_len, 1, num_out_channels, feat_dim
        feats = feats.permute(0, 2, 1, 3).unsqueeze(dim=2)

        # Max over the channel axis -> batch_size, seq_len, 1, 1, feat_dim
        pooled = self.max3d(feats)

        # batch_size * seq_len, feat_dim
        pooled = pooled.squeeze().reshape(-1, pooled.shape[-1])

        # batch_size * seq_len, num_out_channels
        similarity = self.cos(x)

        # batch_size * seq_len, feat_dim + num_out_channels
        return torch.cat((pooled, similarity), dim=1)

In [228]:
class LGSSone(nn.Module):
    """Single-modality branch: BNet boundary features -> Bi-LSTM -> linear layer.

    Args:
        mode: One of ``"place"``, ``"cast"``, ``"act"`` or ``"aud"``. It selects
            the raw feature width (``place_feat_dim`` for "place",
            ``other_feat_dim`` for the others) used to size the LSTM input.

    Raises:
        ValueError: If ``mode`` is not one of the supported names.
    """

    def __init__(self, mode="place"):
        super(LGSSone, self).__init__()
        self.seq_len = sequence_length
        self.num_layers = 2
        self.lstm_hidden_size = 512
        self.bidirectional = True

        if mode == "place":
            raw_feat_dim = place_feat_dim
        elif mode in ("cast", "act", "aud"):
            raw_feat_dim = other_feat_dim
        else:
            # Previously an unknown mode fell through silently and only failed
            # later with an AttributeError on the missing input_dim.
            raise ValueError(
                f"Unknown mode {mode!r}; expected 'place', 'cast', 'act' or 'aud'")

        # BNet emits the pooled features concatenated with the similarity vector.
        self.input_dim = raw_feat_dim + num_out_channels
        self.bnet = BNet()

        self.dropout = nn.Dropout(p=0.2)
        self.lstm = nn.LSTM(input_size=self.input_dim,
                            hidden_size=self.lstm_hidden_size,
                            num_layers=self.num_layers,
                            batch_first=True,
                            bidirectional=self.bidirectional)

        # A bidirectional LSTM concatenates both directions' hidden states.
        num_directions = 2 if self.bidirectional else 1
        self.fc1 = nn.Linear(in_features=self.lstm_hidden_size * num_directions,
                             out_features=100)

    def forward(self, x):
        # batch_size * seq_len, input_dim
        x = self.bnet(x)
        # batch_size, seq_len, input_dim
        x = x.reshape(-1, self.seq_len, x.shape[-1])
        self.lstm.flatten_parameters()
        out, _ = self.lstm(x)

        # batch_size, seq_len, 100
        out = F.relu(self.dropout(self.fc1(out)))
        return out

In [229]:
class LGSS(nn.Module):
    """Four-branch scene boundary classifier (place, cast, action, audio).

    The input rows are split into the four modality slices, each slice is
    instance-normalised and passed through its own ``LGSSone`` branch, and the
    concatenated branch outputs are classified per shot into two classes.
    """

    def __init__(self):
        super().__init__()
        self.seq_len = sequence_length
        self.mode = ['place', 'cast', 'act', 'aud']

        # One branch per modality, registered as bnet_<name>; the shared
        # instance norm is created together with the 'place' branch.
        for branch in self.mode:
            setattr(self, 'bnet_' + branch, LGSSone(branch))
            if branch == 'place':
                self.instancenorm = nn.InstanceNorm1d(new_sequence_length)

        self.fc1 = nn.Linear(4 * 100, 100)
        self.fc2 = nn.Linear(100, 2)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out',
                                        nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, inp):
        # Leading place_feat_dim columns are the place features ...
        place_feat = inp[:, :, :place_feat_dim]

        # ... and the remainder holds three equally wide slices.
        others = inp[:, :, place_feat_dim:].reshape(
            -1, new_sequence_length, 3, other_feat_dim)
        cast_feat, act_feat, aud_feat = others.unbind(dim=-2)

        # Normalisation is kept out of the autograd graph.
        with torch.no_grad():
            place_feat, cast_feat, act_feat, aud_feat = [
                self.instancenorm(feat)
                for feat in (place_feat, cast_feat, act_feat, aud_feat)
            ]

        branch_outputs = (
            self.bnet_place(place_feat),
            self.bnet_cast(cast_feat),
            self.bnet_act(act_feat),
            self.bnet_aud(aud_feat),
        )
        fused = torch.cat(branch_outputs, dim=-1)

        hidden = F.relu(self.fc1(fused))
        # One row of two class scores per labelled shot.
        return self.fc2(hidden).reshape(-1, 2)

In [230]:
# Build the model on the GPU (this notebook requires CUDA throughout).
model = LGSS().cuda()

# Per-class loss weights for class 0 and class 1 (presumably chosen to counter
# class imbalance - TODO confirm how these values were derived).
weights = [0.0768, 0.4]
class_weights = torch.FloatTensor(weights).cuda()
criterion = nn.CrossEntropyLoss(weight=class_weights)

optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
# Multiply the learning rate by 0.2 every 3 scheduler steps (one step per epoch).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=2e-1)

# Helper Method

In [231]:
# Calculates Accuracy
def get_num_correct(preds, labels):
    """Return ``(number of correct predictions, number of labels)``.

    The predicted class of each row of ``preds`` is its arg-max over dim 1.
    """
    predicted_classes = preds.argmax(dim=1)
    num_hits = predicted_classes.eq(labels).sum().item()
    return (num_hits, len(labels))

# Validation Loop

In [232]:
def val_loop():
    """Evaluate the global ``model`` on ``val_loader`` and print accuracy and mean loss.

    The model is switched to eval mode and left there; ``train_loop`` switches
    it back at the start of every epoch.
    """
    print("Validation...")
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    num_batches = 0

    with torch.no_grad():
        for inp, label in val_loader:
            # The DataLoader adds a leading dimension of size 1 around the
            # batch already assembled by the sampler/dataset.
            inp = inp.squeeze(0).cuda()
            label = label.squeeze().cuda()
            pred = model(inp)

            loss = criterion(pred, label)

            total_loss += float(loss.item())
            n_correct, n_labels = get_num_correct(pred, label)
            total_correct += n_correct
            total_samples += n_labels
            num_batches += 1
            del loss
            torch.cuda.empty_cache()

    # Average over the batches actually seen: len(val_loader) can be smaller
    # than the number of iterated batches when the last batch is partial.
    total_loss = total_loss / max(num_batches, 1)
    val_acc = 100 * (total_correct / max(total_samples, 1))

    print(
        f"Vaidation Stats:\n"
        f"Accuracy: {val_acc:.2f}%",
        f"Average Loss Per Batch: {total_loss}"
    )

# Train Loop

In [233]:
def train_loop(num_epochs):
    """Train the global ``model`` for ``num_epochs`` epochs.

    After every epoch the learning-rate scheduler is stepped, a checkpoint is
    written to Drive, ``val_loop`` is run and the epoch's training accuracy and
    mean loss are printed.

    Args:
        num_epochs: Number of passes over ``train_loader``.
    """
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        total_correct = 0
        total_samples = 0
        num_batches = 0

        for inp, label in train_loader:
            optimizer.zero_grad()

            # The DataLoader adds a leading dimension of size 1 around the
            # batch already assembled by the sampler/dataset.
            inp = inp.squeeze(0).cuda()
            label = label.squeeze().cuda()
            pred = model(inp)
            loss = criterion(pred, label)

            loss.backward()
            optimizer.step()

            with torch.no_grad():
                total_loss += float(loss.item())
                n_correct, n_labels = get_num_correct(pred, label)
                total_correct += n_correct
                total_samples += n_labels
            num_batches += 1
            del loss
            torch.cuda.empty_cache()
        scheduler.step()

        torch.save(model.state_dict(),
                   '/content/drive/MyDrive/eluvio_data/saved_models/model' + str(epoch + 1) + '.pt')

        val_loop()

        # Average over the batches actually seen: len(train_loader) can be
        # smaller than the number of iterated batches when the last batch is partial.
        total_loss = total_loss / max(num_batches, 1)
        train_acc = 100 * (total_correct / max(total_samples, 1))

        print(
            f"Training Stats:\n"
            f"Accuracy: {train_acc:.2f}%",
            f"Average Loss Per Batch: {total_loss}"
        )


In [234]:
# Either train from scratch or restore the checkpoint stored at model_path.
if load_pretrained_model is False:
    print('Training...')
    train_loop(epochs)
else:
    print("Loading Pretrained Model")
    # Loads the saved state dict into the model built above.
    model.load_state_dict(torch.load(model_path))
    print("Pretrained Model Loaded")

Loading Pretrained Model
Pretrained Model Loaded


In [51]:
# Boundary statistics on the validation set:
#   cnt1 - shots whose label is 1
#   cnt2 - shots with label 1 that are also predicted as 1
#   cnt3 - shots predicted as 1
#   cnt4 - shots predicted as 1 whose label is 1 (same quantity as cnt2)
cnt1 = 0
cnt2 = 0
cnt3 = 0
cnt4 = 0
with torch.no_grad():
    model.eval()
    for inp, label in val_loader:
        inp = inp.squeeze(0).cuda()
        label = label.squeeze().cuda()

        pred = model(inp)

        is_boundary = label == 1
        predicted_boundary = F.softmax(pred, dim=1).argmax(dim=1) == 1
        hits = (is_boundary & predicted_boundary).sum().item()

        cnt1 += is_boundary.sum().item()
        cnt2 += hits
        cnt3 += predicted_boundary.sum().item()
        cnt4 += hits

# Test Set Prediction and Dumping

In [177]:
def dump_predictions(scene_transition_pred, first_test_index=57):
    """Write one prediction pickle per test movie into ``dump_path``.

    The movie files under the global ``data_path`` are visited in sorted order;
    files before ``first_test_index`` are skipped. For every remaining file a
    dictionary with the IMDB id, the ground truth, the shot end frames and the
    predictions is pickled under the same file name inside ``dump_path``.

    Args:
        scene_transition_pred: Sequence of per-movie prediction tensors, in the
            same order as the test files. The last element of each tensor is
            dropped before saving.
        first_test_index: Position (in sorted file order) of the first test
            movie. The default matches the original hard-coded split.

    Raises:
        AssertionError: If the number of predictions differs from the length of
            the movie's stored 'scene_transition_boundary_prediction'.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    for e, file in enumerate(sorted(glob.glob(data_path + '/*.pkl'))):
        if e < first_test_index:
            continue

        # NOTE(security): unpickling executes arbitrary code; only use trusted files.
        with open(file, 'rb') as f:
            movie = pickle.load(f)

        new_dict = {}
        new_dict['imdb_id'] = movie['imdb_id']
        new_dict['scene_transition_boundary_ground_truth'] = \
            movie['scene_transition_boundary_ground_truth'].cpu().numpy()
        new_dict['shot_end_frame'] = movie['shot_end_frame'].cpu().numpy()
        # Drop the final entry so the prediction count matches the stored one
        # (checked by the assert below).
        new_dict['scene_transition_boundary_prediction'] = \
            scene_transition_pred[e - first_test_index][:-1].cpu().numpy()
        assert (len(new_dict['scene_transition_boundary_prediction'])
                == len(movie['scene_transition_boundary_prediction']))

        # Join with os.path so dump_path works with or without a trailing slash.
        out_path = os.path.join(dump_path, os.path.basename(file))
        with open(out_path, 'wb') as handle:
            pickle.dump(new_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [178]:
def unpad_predictions(predictions, movie_lengths):
    """Regroup per-window predictions into one tensor per movie.

    Each movie consumes as many consecutive entries of ``predictions`` as are
    needed to cover ``movie_len`` shots in chunks of ``sequence_length`` (one
    extra chunk when the length is not an exact multiple). The concatenated
    chunks are then truncated to ``movie_len`` entries.

    Args:
        predictions: List of per-window prediction tensors in movie order.
        movie_lengths: Number of shots of every movie.

    Returns:
        List with one 1-D prediction tensor per movie.
    """
    per_movie = []
    cursor = 0
    for movie_len in movie_lengths:
        full_chunks, leftover = divmod(movie_len, sequence_length)
        num_chunks = full_chunks + (1 if leftover != 0 else 0)
        stop = cursor + num_chunks
        per_movie.append(torch.cat(predictions[cursor:stop])[:movie_len])
        cursor = stop

    return per_movie

In [179]:
def test_loop():
    """Run the global ``model`` over ``test_loader`` and collect boundary probabilities.

    Returns:
        List with one tensor per test batch holding the softmax probability of
        class 1 for every row of the model output. The tensors stay on the GPU.
    """
    scene_transition_pred = []
    model.eval()
    with torch.no_grad():
        # Labels are not needed for prediction, so they are neither squeezed
        # nor moved to the GPU.
        for inp, _ in test_loader:
            inp = inp.squeeze(0).cuda()
            pred = model(inp)
            scene_transition_pred.append(F.softmax(pred, dim=1)[:, 1])
    return scene_transition_pred


In [180]:
# Predict on the test windows, regroup the outputs per movie, then write one
# prediction pickle per test movie.
scene_transition_pred = unpad_predictions(test_loop(), test_movie_lengths)
dump_predictions(scene_transition_pred)

In [208]:
# Execute the separate evaluation notebook; its printed output lists the
# prediction files under data_pred and the resulting scores.
%run '/content/drive/MyDrive/Colab Notebooks/eval.ipynb'

# of IMDB IDs: 7
/content/drive/MyDrive/eluvio_data/data_pred/tt1205489.pkl
/content/drive/MyDrive/eluvio_data/data_pred/tt1375666.pkl
/content/drive/MyDrive/eluvio_data/data_pred/tt1412386.pkl
/content/drive/MyDrive/eluvio_data/data_pred/tt1707386.pkl
/content/drive/MyDrive/eluvio_data/data_pred/tt2024544.pkl
/content/drive/MyDrive/eluvio_data/data_pred/tt2488496.pkl
/content/drive/MyDrive/eluvio_data/data_pred/tt2582846.pkl
Scores: {
    "AP": 0.519881324814132,
    "mAP": 0.5327515530520018,
    "Miou": 0.528036302457737,
    "Precision": 0.4403790682782162,
    "Recall": 0.6041040503907537,
    "F1": 0.5020978697516447
}
