In [1]:
import json, pickle, os
import pandas as pd
from PIL import Image
import requests
from torch.autograd import Variable
import torchvision.models as models
import torchvision.transforms as transforms
from torch import nn
import torch
import gc, sys, psutil
import numpy as np


In [2]:
# Load the annotation file. JSON text is UTF-8 by specification, so the encoding is
# pinned explicitly instead of relying on the platform's locale default.
with open('./videodatainfo_2017_ustc.json', 'r', encoding='utf-8') as f:
    parsed = json.load(f)

In [3]:
# Show the top-level sections of the annotation file.
print(parsed.keys())
# Tabulate the 'sentences' section; the displayed head shows the columns
# caption, sen_id and video_id.
data = pd.DataFrame(parsed['sentences'])
data.head()

dict_keys(['info', 'videos', 'sentences'])


Unnamed: 0,caption,sen_id,video_id
0,a cartoon animals runs through an ice cave in ...,0,video2960
1,a cartoon character runs around inside of a vi...,1,video2960
2,a character is running in the snow,2,video2960
3,a person plays a video game centered around ic...,3,video2960
4,a person plays online and records themselves,4,video2960


In [4]:
def create_caption_dict(frame=None)->tuple:
    '''
    Groups the caption table by video.

    Builds two dictionaries keyed by video_id, in order of first appearance:
    1.) captions_dict: {video_id: [caption_1, caption_2, caption_3 ....], ..}
    2.) sentence_ids: {video_id: [sen_id_1, sen_id_2, ....], ..} where each sen_id is
        the id attached to the caption at the same position.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table with `video_id`, `caption` and `sen_id` columns. When omitted, the
        notebook-level `data` frame is used, so the zero-argument call keeps working.

    Returns
    -------
    tuple
        (captions_dict, sentence_ids)
    '''
    if frame is None:
        frame = data

    captions_dict = {}
    sentence_ids = {}

    # One groupby pass replaces two full-table boolean filters per video.
    # sort=False keeps the videos in order of first appearance (as Series.unique() did);
    # rows inside a group keep their original order. Rows whose video_id is missing
    # are skipped by groupby's default handling of NaN keys.
    for video_id, rows in frame.groupby('video_id', sort=False):
        captions_dict[video_id] = list(rows['caption'])
        sentence_ids[video_id] = list(rows['sen_id'])

    return captions_dict, sentence_ids

In [5]:
# Read captions_dict back from 'captions_dict.pickle'.
# NOTE(review): the file is written by the pickle.dump cell further down, so on a fresh
# kernel this cell only works if that file already exists on disk.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('captions_dict.pickle', 'rb') as handle:
    captions_dict = pickle.load(handle)

In [14]:
captions_dict = create_caption_dict()[0]

In [17]:
# Persist captions_dict to 'captions_dict.pickle' using the highest pickle protocol
# supported by this Python version.
with open('captions_dict.pickle', 'wb') as handle:
    pickle.dump(captions_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
def gather_text(captions_dict:dict)->list:
    '''
    Flattens the captions of every video into a single list.

    Each caption has its punctuation characters removed and is wrapped as
    '<sos> ... <eos>'. Captions keep the iteration order of captions_dict and the
    order inside each video's list.
    '''

    def _strip(text):
        # Keep only the characters that are not in string.punctuation.
        return ''.join(filter(lambda ch: ch not in punctuation, text))

    return ['<sos> ' + _strip(raw) + ' <eos>'
            for video_captions in captions_dict.values()
            for raw in video_captions]

In [7]:
from collections import Counter
from string import punctuation

In [8]:
caption_text = gather_text(captions_dict)

In [9]:
caption_text[0:10]

['<sos> a cartoon animals runs through an ice cave in a video game <eos>',
 '<sos> a cartoon character runs around inside of a video game <eos>',
 '<sos> a character is running in the snow <eos>',
 '<sos> a person plays a video game centered around ice age the movie <eos>',
 '<sos> a person plays online and records themselves <eos>',
 '<sos> a scene from the ice age video game is shown <eos>',
 '<sos> a video game character is jumping about in a cave <eos>',
 '<sos> a video game of a little animal running through an ice tunnel <eos>',
 '<sos> a video game of a small animal <eos>',
 '<sos> a video shows gameplay from ice age <eos>']

In [10]:
def create_vocab(caption_text:list)->tuple:
    '''
    Builds a frequency-ranked vocabulary from whitespace-tokenised captions.

    Returns (vocab, vocab_to_int, int_to_vocab):
      vocab        - the distinct words, most frequent first
      vocab_to_int - word -> integer id, ids starting at 1
      int_to_vocab - the inverse mapping, integer id -> word
    '''
    counts = Counter(word for text in caption_text for word in text.split())

    # most_common() ranks by descending count; words with equal counts stay in
    # first-seen order.
    vocab = [word for word, _ in counts.most_common()]

    vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))
    int_to_vocab = dict(zip(vocab_to_int.values(), vocab_to_int.keys()))

    return vocab, vocab_to_int, int_to_vocab

In [11]:
vocab, vocab_to_int, int_to_vocab = create_vocab(caption_text)
#vocab_to_int

In [13]:
def caption_to_int(caption_text:list, vocab_to_int:dict)->list:
    '''
    Turns every caption string into the list of integer ids of its
    whitespace-separated words, looked up in vocab_to_int.

    A word that is missing from vocab_to_int raises KeyError.
    '''
    return [[vocab_to_int[token] for token in sentence.split()]
            for sentence in caption_text]

In [42]:
seq_len = 20
def pad_sequences(sequences=None, max_len=None)->list:
    '''
    Pads sequences/captions with 0's on the LEFT if they are shorter than max_len,
    else truncates them to their first max_len ids.

    Note that Video.pad_captions pads on the right instead; this function keeps its
    original left-padding.

    Parameters
    ----------
    sequences : list of list of int, optional
        Numericalized captions. Defaults to the notebook-level `captions_to_int`.
    max_len : int, optional
        Target length of every returned sequence. Defaults to the notebook-level
        `seq_len`.

    Returns
    -------
    list
        One list of exactly max_len ids per input sequence.
    '''
    if sequences is None:
        sequences = captions_to_int
    if max_len is None:
        max_len = seq_len

    padded_captions = []
    for ids in sequences:
        ids = np.array(ids)
        # Compare against max_len rather than a literal 20, so a different target
        # length no longer asks np.pad for a negative width.
        if len(ids) > max_len:
            ids = ids[:max_len]
        else:
            ids = np.pad(ids,(max_len-len(ids),0),'constant')

        padded_captions.append(list(ids))

    return padded_captions

In [58]:
# `b` is not defined anywhere in this notebook; use the vocabulary mapping built by create_vocab.
captions_to_int = caption_to_int(caption_text, vocab_to_int)

In [69]:
#captions_to_int

In [57]:
with open('vocab_to_int.pickle', 'wb') as handle:
    pickle.dump(vocab_to_int, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
class Video(object):
    '''
    One video of the dataset together with its captions and the data derived from them.

    Attributes are filled in step by step:
      video_id, captions, sentence_ids - set by __init__
      caption_tokens                   - set by captions_to_token()
      padded_captions                  - set by pad_captions()
      output_captions                  - set by calculate_output_captions()
      frames                           - set by get_video_frames()
      vgg_tensor                       - set by get_vgg_tensor()
    '''

    def __init__(self, video_id, captions=None, sentence_ids=None):
        '''
        video_id:     identifier; also the name of the folder under ./frames/.
        captions:     list of caption strings (a fresh empty list when omitted).
        sentence_ids: list of ids attached to the captions (a fresh empty list when omitted).
        '''
        self.video_id = video_id
        # None sentinels instead of `[]` defaults: a literal list default is created once
        # and would be shared by every instance constructed without that argument.
        self.captions = [] if captions is None else captions
        self.sentence_ids = [] if sentence_ids is None else sentence_ids

    def captions_to_token(self, vocab_to_int):
        '''
        Stores in self.caption_tokens one list of integer ids per caption, looking up
        each whitespace-separated word in vocab_to_int. Unknown words raise KeyError.
        '''
        self.caption_tokens = [
            [vocab_to_int[word] for word in caption.split()]
            for caption in self.captions
        ]

    def pad_captions(self, seq_len):
        '''
        Stores in self.padded_captions every token list brought to exactly seq_len ids:
        longer lists are cut to their first seq_len ids, shorter ones get 0's appended
        on the right. Requires captions_to_token() to have been called.
        '''
        self.padded_captions = []
        for ids in self.caption_tokens:
            ids = np.array(ids)
            if len(ids) > seq_len:
                ids = ids[:seq_len]
            else:
                ids = np.pad(ids,(0,seq_len-len(ids)),'constant')

            self.padded_captions.append(list(ids))

    def calculate_output_captions(self):
        '''
        Stores and returns self.output_captions: each padded caption shifted left by one
        position with a 0 appended, so the length is unchanged.
        Presumably these are the next-token targets for the padded inputs - confirm with
        the training code. Requires pad_captions() to have been called.
        '''
        self.output_captions = []

        for ids in self.padded_captions:
            # Slicing copies, so self.padded_captions is left untouched.
            shifted = ids[1:]
            shifted.append(0)
            self.output_captions.append(shifted)

        return self.output_captions

    def get_video_frames(self, transform, num_frames=32):
        '''
        Loads every image in ./frames/<video_id>/, applies `transform` to each and stores
        the stacked result in self.frames.

        Videos with fewer than num_frames images are zero-padded at the end up to
        num_frames; videos with more keep all of their frames (no truncation, as before).
        A folder without images yields an all-zero (num_frames, 3, 224, 224) tensor
        instead of failing inside torch.stack.
        '''
        path = './frames/'+self.video_id+'/'
        frame_features = []
        # os.listdir returns entries in arbitrary order; sort for a deterministic frame
        # order. NOTE(review): this is a lexicographic sort and assumes frame file names
        # are zero-padded - confirm against the frame extraction step.
        for frame in sorted(os.listdir(path)):
            # The context manager releases the file handle as soon as the frame is transformed.
            with Image.open(path+frame) as img:
                frame_features.append(transform(img))

        if not frame_features:
            self.frames = torch.zeros(num_frames, 3, 224, 224)
            return

        stacked = torch.stack(frame_features, dim=0)
        missing = num_frames - len(frame_features)
        if missing > 0:
            # Pad with the per-frame shape actually produced by `transform`
            # (3x224x224 with the notebook's pipeline).
            pad = torch.zeros(missing, *stacked.shape[1:], dtype=stacked.dtype)
            stacked = torch.cat([stacked, pad], dim=0)
        self.frames = stacked

    def get_vgg_tensor(self, transforms, model):
        '''
        Runs `model` once on all transformed frames of this video (one batch, gradients
        disabled), stores the CPU result in self.vgg_tensor and returns it.

        transforms: callable applied to each opened image (note: the name shadows the
                    torchvision.transforms module inside this method).
        model:      torch module; it is moved to the GPU when one is available and is
                    moved back to the CPU before returning.

        Raises RuntimeError from torch.stack when the frame folder is empty.
        NOTE(review): the model's train/eval mode is not changed here; if it contains
        dropout or batch-norm layers the caller probably wants model.eval() first.
        '''
        path = './frames/'+self.video_id+'/'
        vgg_features = []
        # Sorted for a deterministic frame order (lexicographic - see get_video_frames).
        for frame in sorted(os.listdir(path)):
            with Image.open(path+frame) as img:
                vgg_features.append(transforms(img))

        # Fall back to the CPU instead of crashing in .cuda() on machines without a GPU.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        feats = torch.stack(vgg_features, dim=0).to(device)
        model = model.to(device)

        with torch.no_grad():
            output = model(feats)

        # Drop the input batch before moving results back.
        del feats
        gc.collect()
        model = model.cpu()
        # The original read an undefined local `vgg_tensor` here and raised NameError.
        self.vgg_tensor = output.cpu()

        return self.vgg_tensor

    def __str__(self):
        return '{}'.format(self.video_id)
    

In [13]:
def get_transforms():
    '''
    Builds the frame preprocessing pipeline: resize to 224x224, convert to a tensor,
    then normalise each channel with fixed mean/std constants.
    '''
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    steps = [
        transforms.Resize((224,224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]

    return transforms.Compose(steps)

transform = get_transforms()



In [None]:
def get_vgg_tensor(video_id, transforms, model):
    '''
    Runs `model` once on all transformed frames of a video and returns the result on the CPU.

    Parameters
    ----------
    video_id : str
        Name of the folder under ./frames/ holding the video's frame images.
    transforms : callable
        Applied to each opened image; must return tensors of one common shape
        (note: the name shadows the torchvision.transforms module inside this function).
    model : torch.nn.Module
        Applied to the stacked frames with gradients disabled. It is moved to the GPU
        when one is available and moved back to the CPU before returning.

    Returns
    -------
    torch.Tensor
        Model output for the whole frame batch, on the CPU.

    Raises
    ------
    RuntimeError
        From torch.stack when the frame folder contains no images.

    NOTE(review): the model's train/eval mode is not changed here; if it contains dropout
    or batch-norm layers the caller probably wants model.eval() first.
    '''
    path = './frames/'+video_id+'/'
    vgg_features = []
    # os.listdir returns entries in arbitrary order; sort for a deterministic frame order.
    # NOTE(review): lexicographic sort - assumes zero-padded frame file names; confirm.
    for frame in sorted(os.listdir(path)):
        # The context manager releases the file handle as soon as the frame is transformed.
        with Image.open(path+frame) as img:
            vgg_features.append(transforms(img))

    # Fall back to the CPU instead of crashing in .cuda() on machines without a GPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    feats = torch.stack(vgg_features, dim=0).to(device)
    model = model.to(device)

    with torch.no_grad():
        vgg_tensor = model(feats)

    # Drop the input batch before moving results back.
    del feats
    gc.collect()
    model = model.cpu()
    vgg_tensor = vgg_tensor.cpu()

    return vgg_tensor
        

In [14]:
def clean_caption(captions:list):
    '''
    Returns a new list in which every caption has its punctuation characters removed
    and is wrapped as '<sos> ... <eos>'.
    '''

    def _wrap(raw):
        kept = (ch for ch in raw if ch not in punctuation)
        return '<sos> ' + ''.join(kept) + ' <eos>'

    return list(map(_wrap, captions))
        

In [15]:
def create_video_objects(num_objects)->list:
    '''
    Builds Video objects for the ids 'video0' .. 'video<num_objects-1>'.

    For each id the matching rows of the notebook-level `data` frame supply the captions
    (passed through clean_caption) and the sentence ids.
    '''
    video_objects = []
    for index in range(num_objects):
        video_id = 'video'+str(index)
        rows = data[data['video_id'] == video_id]

        video_objects.append(
            Video(video_id=video_id,
                  captions=clean_caption(list(rows.caption)),
                  sentence_ids=list(rows.sen_id))
        )

    return video_objects

In [16]:
video_objects = create_video_objects(64)

In [17]:
len(video_objects)

64

In [18]:
with open('video_objects.pickle', 'wb') as handle:
    pickle.dump(video_objects, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
from torch.utils.data import Dataset, DataLoader

In [20]:
class VideoDataset(Dataset):
    '''
    Dataset over pre-built Video objects.

    On construction every video is tokenised, padded and has its frames loaded, so
    indexing afterwards only reads attributes already stored on the Video objects.
    Item `index` is the pair (frames tensor, tensor of padded captions) of that video.
    '''

    def __init__(self, video_objects, vocab_to_int, transform, seq_len=20):
        '''
        video_objects: sequence of objects exposing captions_to_token, pad_captions and
                       get_video_frames (the notebook's Video class).
        vocab_to_int:  word -> id mapping handed to captions_to_token.
        transform:     per-frame transform handed to get_video_frames.
        seq_len:       caption length handed to pad_captions (default 20, the value that
                       was previously hard-coded).
        '''
        self.video_objects = video_objects
        self.seq_len = seq_len
        self.call_funcs(vocab_to_int, transform)

    def call_funcs(self, vocab_to_int, transform):
        '''Runs the tokenise -> pad -> load-frames steps on every video, in that order.'''
        for video in self.video_objects:
            video.captions_to_token(vocab_to_int)
            video.pad_captions(self.seq_len)
            video.get_video_frames(transform)

    def __len__(self):
        # Use the list owned by this dataset, not the notebook-level `video_objects` global.
        return len(self.video_objects)

    def __getitem__(self, index):
        # Same fix as __len__: index the dataset's own list.
        video = self.video_objects[index]
        return video.frames, torch.tensor(video.padded_captions)
        
        

In [21]:
vid_data = VideoDataset(video_objects, vocab_to_int, transform)

In [22]:
dl = DataLoader(vid_data, batch_size=16, shuffle=False)

In [23]:
# Serialise the DataLoader object itself to 'dl64.pickle'.
# NOTE(review): `dl` wraps vid_data, which holds every Video object including its loaded
# frames tensor, so this file presumably gets very large - confirm that pickling the
# loader (rather than re-creating it from video_objects) is intended.
with open('dl64.pickle', 'wb') as handle:
    pickle.dump(dl, handle)

In [46]:
# Collect the size of the first dimension of the frames tensor for dataset items 0..63
# (vid_data[i][0] is the frames element of item i).
l = []
for i in range(64):
    l.append(vid_data[i][0].shape[0])

In [27]:
x = next(iter(dl))

In [28]:
len(x)

2

In [29]:
x[0].shape

torch.Size([16, 32, 3, 224, 224])

In [30]:
x[1].shape

torch.Size([16, 20, 20])

In [78]:
x[1][0] # 20 captions for 0th video

tensor([[   2,    1,   25,    4,   46,    3,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   2,    1,   45,    4,   55,    3,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   2,    1,    7,  289,    1,  246,  109,    5, 3772,    3,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   2,    1,    7,  289,   59,    5,  120,    6,   19, 2841,    3,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   2,    1,    7,   88,    1,   25,    3,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   2,    1,    7,    4,   88,    1,   25,    3,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   2,    1,    7,    4,   88,   59,    1,  120,    3,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   2,    1,    7, 

In [69]:
# x[1] has shape (16, 20, 20): batch, captions per video, padded caption length.
# permute(1,2,0) moves the batch axis last, so xp[k] holds caption k of every video
# in the batch as a (caption length, batch) slice.
xp = x[1].permute(1,2,0)

In [70]:
xp.shape

torch.Size([20, 20, 16])

In [59]:
xp[0].shape #seqlen, bs

torch.Size([20, 16])

In [80]:
for i in range(20):
    print(xp[i][:,0])

tensor([ 2,  1, 25,  4, 46,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=torch.int32)
tensor([ 2,  1, 45,  4, 55,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=torch.int32)
tensor([   2,    1,    7,  289,    1,  246,  109,    5, 3772,    3,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0], dtype=torch.int32)
tensor([   2,    1,    7,  289,   59,    5,  120,    6,   19, 2841,    3,    0,
           0,    0,    0,    0,    0,    0,    0,    0], dtype=torch.int32)
tensor([ 2,  1,  7, 88,  1, 25,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=torch.int32)
tensor([ 2,  1,  7,  4, 88,  1, 25,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=torch.int32)
tensor([  2,   1,   7,   4,  88,  59,   1, 120,   3,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0], dtype=torch.int32)
tensor([  2,   1,   7,   4,  88,   6,   1,  25,  71, 449,   9,   1, 332,   3,
          

In [83]:
xp[0]

tensor([[    2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2],
        [    1,     6,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,  1665],
        [   25,     1, 10453,   257,    24,   338,   175,     7,   952,     7,
             7,   400,     7,  1292,   312,    13],
        [    4,    92,    38,   383,    65,     4,     8,  1178,    58,   214,
           393,   197,     4,     7,    27,  2010],
        [   46,     1,     1,     4,     1,    20,     1,    10,    44,   942,
            23,     9,  2936,  2929,     1,    10],
        [    3,    12,  1402,    40,    43,    14,   368,     1,     8,  1442,
           391,     5,    16,    14,   971,   211],
        [    0,   485,     3,  2796,    70,     1,    13,   175, 17017,    10,
             3,  4326,    11,  3713,    54,     9],
        [    0,    84,     0,     6,     3,   959,    20,   215,     6,     1,
    

In [85]:
# x[0] has shape (16, 32, 3, 224, 224): batch first, then the 32 (padded) frames per video.
# Swap the first two axes so the frame axis comes first.
xf = x[0].permute(1,0,2,3,4)

In [86]:
xf.shape

torch.Size([32, 16, 3, 224, 224])