In [2]:
import json, pickle, os
import pandas as pd
from PIL import Image
import requests
from torch.autograd import Variable
import torchvision.models as models
import torchvision.transforms as transforms
from torch import nn
import torch
import gc, sys, psutil
import numpy as np


In [2]:
# Load the caption annotations. JSON is UTF-8 by specification, so the encoding
# is stated explicitly instead of relying on the platform default (which is a
# legacy code page on Windows and can fail on non-ASCII captions).
with open('./videodatainfo_2017_ustc.json', 'r', encoding='utf-8') as f:
    parsed = json.load(f)

In [3]:
print(parsed.keys())
data = pd.DataFrame(parsed['sentences'])
data.head()

dict_keys(['info', 'videos', 'sentences'])


Unnamed: 0,caption,sen_id,video_id
0,a cartoon animals runs through an ice cave in ...,0,video2960
1,a cartoon character runs around inside of a vi...,1,video2960
2,a character is running in the snow,2,video2960
3,a person plays a video game centered around ic...,3,video2960
4,a person plays online and records themselves,4,video2960


In [3]:
class Video(object):
    """One video together with its captions and the data derived from them.

    Attributes are filled in progressively by the methods below:
    ``caption_tokens`` (captions_to_token), ``padded_captions`` (pad_captions),
    ``output_captions`` (calculate_output_captions) and ``vgg_tensor``
    (get_vgg_tensor).
    """

    def __init__(self, video_id, captions=None, sentence_ids=None):
        """Store the video id and its caption / sentence-id lists.

        Args:
            video_id: identifier of the video; also the name of its frame
                directory under ``./frames/``.
            captions: list of caption strings (defaults to a new empty list).
            sentence_ids: list of sentence ids (defaults to a new empty list).
        """
        self.video_id = video_id
        # A literal [] default would be created once and shared by every Video
        # built without explicit lists, so each instance gets its own list.
        self.captions = [] if captions is None else captions
        self.sentence_ids = [] if sentence_ids is None else sentence_ids

    def captions_to_token(self, vocab_to_int):
        """Convert every caption into a list of integer word ids.

        Captions are split on whitespace and each word is looked up in
        ``vocab_to_int``.

        Raises:
            KeyError: if a word is missing from ``vocab_to_int``.
        """
        self.caption_tokens = [
            [vocab_to_int[word] for word in caption.split()]
            for caption in self.captions
        ]
        return self.caption_tokens

    def pad_captions(self, seq_len):
        """Truncate or right-pad (with 0) every token list to ``seq_len`` items.

        Requires ``captions_to_token`` to have been called first.
        """
        self.padded_captions = []
        for ids in self.caption_tokens:
            # dtype=int keeps an empty caption from turning into a float array.
            ids = np.array(ids, dtype=int)
            if len(ids) > seq_len:
                ids = ids[:seq_len]
            else:
                ids = np.pad(ids, (0, seq_len - len(ids)), 'constant')

            self.padded_captions.append(list(ids))

        return self.padded_captions

    def calculate_output_captions(self):
        """Build the target sequences: each padded caption shifted left by one.

        The first id is dropped and a 0 is appended, so every output has the
        same length as its padded caption. ``padded_captions`` is not modified.
        Requires ``pad_captions`` to have been called first.
        """
        self.output_captions = []

        for ids in self.padded_captions:
            shifted = ids[1:]  # slicing copies, the padded list stays intact
            shifted.append(0)
            self.output_captions.append(shifted)

        return self.output_captions

    def get_vgg_tensor(self, transforms, model):
        """Run every frame of this video through ``model`` in one batch.

        Frames are read from ``./frames/<video_id>/``, passed through
        ``transforms`` and stacked along dimension 0.

        Args:
            transforms: callable applied to each PIL image; must return a
                tensor (all frames need the same shape to be stacked).
            model: ``nn.Module`` applied to the stacked batch.

        Returns:
            The model output on the CPU; also stored in ``self.vgg_tensor``.

        Raises:
            RuntimeError: if the frame directory contains no files.
        """
        frame_dir = './frames/' + self.video_id + '/'
        # os.listdir gives no ordering guarantee; sort so the frame sequence is
        # deterministic. The sort is lexicographic -- zero-pad frame file names
        # if numeric order matters.
        frame_names = sorted(os.listdir(frame_dir))
        if not frame_names:
            raise RuntimeError('no frames found in {}'.format(frame_dir))

        frames = []
        for frame_name in frame_names:
            with Image.open(frame_dir + frame_name) as img:
                frames.append(transforms(img))

        # Use the GPU when there is one instead of failing on CPU-only machines.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # no_grad() does not switch off Dropout; eval mode does. Remember the
        # previous mode so the caller's model is handed back unchanged.
        was_training = model.training
        model.eval()
        model.to(device)
        try:
            with torch.no_grad():
                feats = torch.stack(frames, dim=0).to(device)
                self.vgg_tensor = model(feats).cpu()
                del feats
        finally:
            # As before, the model always ends up back on the CPU.
            model.cpu()
            model.train(was_training)
            gc.collect()

        return self.vgg_tensor

    def __str__(self):
        return '{}'.format(self.video_id)
    

In [None]:
class Model(nn.Module):
    """Placeholder for the captioning network.

    No layers or forward computation are defined yet; the skeleton only keeps
    the cell syntactically valid until the architecture is written.
    """

    def __init__(self):
        # nn.Module must be initialised before sub-modules can be registered.
        super().__init__()

    def forward(self, x):
        raise NotImplementedError('Model.forward is not implemented yet')

In [5]:
def get_vgg_tensor(video_id, transforms, model):
    """Run every frame of one video through ``model`` in a single batch.

    Frames are read from ``./frames/<video_id>/``, passed through
    ``transforms`` and stacked along dimension 0.

    Args:
        video_id: name of the frame directory under ``./frames/``.
        transforms: callable applied to each PIL image; must return a tensor
            (all frames need the same shape to be stacked).
        model: ``nn.Module`` applied to the stacked batch.

    Returns:
        The model output, moved to the CPU.

    Raises:
        RuntimeError: if the frame directory contains no files.
    """
    frame_dir = './frames/' + video_id + '/'
    # os.listdir gives no ordering guarantee; sort so the frame sequence is
    # deterministic. The sort is lexicographic -- zero-pad frame file names if
    # numeric order matters.
    frame_names = sorted(os.listdir(frame_dir))
    if not frame_names:
        raise RuntimeError('no frames found in {}'.format(frame_dir))

    frames = []
    for frame_name in frame_names:
        with Image.open(frame_dir + frame_name) as img:
            frames.append(transforms(img))

    # Use the GPU when there is one instead of failing on CPU-only machines.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # no_grad() does not switch off Dropout; eval mode does. Remember the
    # previous mode so the caller's model is handed back unchanged.
    was_training = model.training
    model.eval()
    model.to(device)
    try:
        with torch.no_grad():
            feats = torch.stack(frames, dim=0).to(device)
            vgg_tensor = model(feats).cpu()
            del feats
    finally:
        # As before, the model always ends up back on the CPU.
        model.cpu()
        model.train(was_training)
        gc.collect()

    return vgg_tensor
        

In [6]:
def get_transforms_vgg():
    """Build the image preprocessing pipeline and a truncated VGG-16.

    Returns:
        (transform, vgg): ``transform`` resizes an image to 224x224, converts
        it to a tensor and normalises it with the mean/std given below;
        ``vgg`` is a pretrained VGG-16 whose classifier keeps only its first
        four layers, returned in eval mode.
    """
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    vgg = models.vgg16(pretrained=True)
    # Keep classifier layers 0-3 only, dropping the rest of the head.
    vgg.classifier = nn.Sequential(*[vgg.classifier[i] for i in range(4)])

    # The truncated classifier still contains a Dropout layer, which randomly
    # zeroes activations in training mode even under torch.no_grad(); eval mode
    # makes the extracted features deterministic.
    vgg.eval()

    return transform, vgg
    

In [7]:
transform, model = get_transforms_vgg()

In [8]:
l = get_vgg_tensor('video0', transform, model)

In [9]:
l.device


device(type='cpu')

In [11]:
def memReport():
    """Print the type and size of every tensor tracked by the garbage collector."""
    tracked_tensors = (
        candidate for candidate in gc.get_objects() if torch.is_tensor(candidate)
    )
    for tensor in tracked_tensors:
        print(type(tensor), tensor.size())
    
def cpuStats():
    """Print the Python version, CPU load, system memory and this process's memory use."""
    print(sys.version)
    print(psutil.cpu_percent())
    print(psutil.virtual_memory())  # system-wide physical memory

    this_process = psutil.Process(os.getpid())
    # First field of memory_info(), scaled from bytes to GiB.
    # NOTE(review): presumably the resident set size -- confirm against psutil docs.
    used_gb = this_process.memory_info()[0] / 2. ** 30
    print('memory GB:', used_gb)



In [12]:
cpuStats()

3.6.3 (v3.6.3:2c5fed8, Oct  3 2017, 18:11:49) [MSC v.1900 64 bit (AMD64)]
4.4
svmem(total=17046654976, available=7899828224, percent=53.7, used=9146826752, free=7899828224)
memory GB: 2.3004684448242188


In [13]:
memReport()

<class 'torch.nn.parameter.Parameter'> torch.Size([4096, 25088])
<class 'torch.nn.parameter.Parameter'> torch.Size([4096])
<class 'torch.nn.parameter.Parameter'> torch.Size([4096, 4096])
<class 'torch.nn.parameter.Parameter'> torch.Size([4096])
<class 'torch.Tensor'> torch.Size([32, 4096])
<class 'torch.nn.parameter.Parameter'> torch.Size([64, 3, 3, 3])
<class 'torch.nn.parameter.Parameter'> torch.Size([64])
<class 'torch.nn.parameter.Parameter'> torch.Size([64, 64, 3, 3])
<class 'torch.nn.parameter.Parameter'> torch.Size([64])
<class 'torch.nn.parameter.Parameter'> torch.Size([128, 64, 3, 3])
<class 'torch.nn.parameter.Parameter'> torch.Size([128])
<class 'torch.nn.parameter.Parameter'> torch.Size([128, 128, 3, 3])
<class 'torch.nn.parameter.Parameter'> torch.Size([128])
<class 'torch.nn.parameter.Parameter'> torch.Size([256, 128, 3, 3])
<class 'torch.nn.parameter.Parameter'> torch.Size([256])
<class 'torch.nn.parameter.Parameter'> torch.Size([256, 256, 3, 3])
<class 'torch.nn.paramet

In [9]:
def create_video_objects(num_videos=1):
    """Create ``Video`` objects for ``video0`` .. ``video<num_videos - 1>``.

    Captions and sentence ids are taken from the notebook-level ``data`` frame
    (columns ``video_id``, ``caption``, ``sen_id``).

    Args:
        num_videos: how many videos to build, counting up from ``video0``.
            Defaults to 1, the value that used to be hard-coded.

    Returns:
        List of ``Video`` objects in id order.
    """
    video_objects = []
    for i in range(num_videos):
        video_id = 'video' + str(i)
        # Filter the frame once and read both columns from the same slice.
        rows = data[data['video_id'] == video_id]
        video_objects.append(
            Video(
                video_id=video_id,
                captions=list(rows.caption),
                sentence_ids=list(rows.sen_id),
            )
        )

    return video_objects

In [10]:
def create_caption_dict(sentences=None)->tuple:
    """Group captions and sentence ids by video.

    Args:
        sentences: DataFrame with ``video_id``, ``caption`` and ``sen_id``
            columns. Defaults to the notebook-level ``data`` frame.

    Returns:
        (captions_dict, sentence_ids): two dicts keyed by video id, holding
        that video's captions and sentence ids in row order. Keys follow the
        order in which each video id first appears.
    """
    if sentences is None:
        sentences = data

    captions_dict = {}
    sentence_ids = {}

    # One grouping pass instead of re-filtering the whole frame per video;
    # sort=False keeps first-appearance order, like Series.unique().
    for video_id, rows in sentences.groupby('video_id', sort=False):
        # Results go into the dicts that are returned (the old code wrote the
        # captions to an unrelated name, `captions`).
        captions_dict[video_id] = list(rows['caption'])
        sentence_ids[video_id] = list(rows['sen_id'])

    return captions_dict, sentence_ids

In [10]:
# Cache the captions on disk as captions.pickle.
# NOTE(review): `captions` is not assigned in any cell visible here, so this
# fails on a fresh kernel. Presumably it should be the caption dict built by
# create_caption_dict() -- confirm before relying on Restart & Run All.
with open('captions.pickle', 'wb') as handle:
    pickle.dump(captions, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# Reload the cached captions into `captions_dict`.
# pickle.load can execute arbitrary code while unpickling, so only load a
# captions.pickle that this notebook wrote itself.
with open('captions.pickle', 'rb') as handle:
    captions_dict = pickle.load(handle)

In [16]:
def gather_text(captions_dict:dict)->list:
    """Flatten all captions into one list, with punctuation characters removed.

    Captions are visited in the dict's iteration order, then in list order
    within each entry; the keys themselves are not used.
    """
    def _without_punctuation(sentence):
        return ''.join(symbol for symbol in sentence if symbol not in punctuation)

    return [
        _without_punctuation(sentence)
        for sentences in captions_dict.values()
        for sentence in sentences
    ]

In [17]:
from collections import Counter
from string import punctuation

In [18]:
caption_text = gather_text(captions_dict)

In [13]:
def create_vocab(caption_text):
    """Build a frequency-ranked vocabulary from whitespace-split captions.

    Returns:
        (vocab, vocab_to_int, int_to_vocab): ``vocab`` lists the distinct
        words, most frequent first (ties keep first-seen order);
        ``vocab_to_int`` maps each word to its 1-based rank and
        ``int_to_vocab`` is the inverse mapping. Id 0 is never assigned.
    """
    counts = Counter(
        token for sentence in caption_text for token in sentence.split()
    )
    vocab = sorted(counts, key=counts.get, reverse=True)

    int_to_vocab = dict(enumerate(vocab, 1))
    vocab_to_int = {token: rank for rank, token in int_to_vocab.items()}

    return vocab, vocab_to_int, int_to_vocab

In [19]:
a,b,c = create_vocab(caption_text)

In [23]:
def caption_to_int(caption_text, vocab_to_int):
    """Map each caption to the list of ids of its whitespace-separated words.

    Raises KeyError when a word is missing from ``vocab_to_int``.
    """
    return [
        [vocab_to_int[token] for token in sentence.split()]
        for sentence in caption_text
    ]

In [1]:
seq_len = 20


def pad_sequences(captions=None, max_len=None):
    """Truncate or left-pad (with 0) every tokenised caption to a fixed length.

    Args:
        captions: iterable of id lists. Defaults to the notebook-level
            ``captions_to_int``.
            NOTE(review): no visible cell assigns that global -- confirm it is
            created from caption_to_int(...) before calling without arguments.
        max_len: target length. Defaults to the notebook-level ``seq_len``.

    Returns:
        List of id lists, each exactly ``max_len`` long. Zeros are inserted at
        the front; note that Video.pad_captions pads at the end instead.
    """
    if captions is None:
        captions = captions_to_int
    if max_len is None:
        max_len = seq_len

    padded_captions = []
    for ids in captions:
        # dtype=int keeps an empty caption from turning into a float array.
        ids = np.array(ids, dtype=int)
        # Truncation and padding share one length; the old code truncated at a
        # literal 20 while padding to seq_len.
        if len(ids) > max_len:
            ids = ids[:max_len]
        else:
            ids = np.pad(ids, (max_len - len(ids), 0), 'constant')

        padded_captions.append(list(ids))

    return padded_captions