In [1]:
import os
import sys
import json
import numpy as np
import torch
from torch import nn
from torch import optim
from torch.optim import lr_scheduler

from opts import parse_opts
from model import generate_model
from mean import get_mean, get_std
from spatial_transforms import (
    Compose, Normalize, Scale, CenterCrop, CornerCrop, MultiScaleCornerCrop,
    MultiScaleRandomCrop, RandomHorizontalFlip, ToTensor)
from temporal_transforms import LoopPadding, TemporalRandomCrop
from target_transforms import ClassLabel, VideoID
from target_transforms import Compose as TargetCompose
from dataset import get_training_set, get_validation_set, get_test_set
from utils import Logger
from train import train_epoch
from validation import val_epoch
import test
import collections
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from joblib import dump, load
from sklearn import preprocessing
from scipy import stats
from sklearn.metrics import accuracy_score

from models import *

In [2]:
os.environ['CUDA_VISIBLE_DEVICES']='2'

In [3]:
activation = {}
def get_activation(name):
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

In [15]:
from models.resnext import get_fine_tuning_parameters
from models import resnet_caffe
#model_name = 'resnet-18-kinetics'
#model_name = 'resnext-101-64f-kinetics'
#model_name = 'resnext-101-64f-kinetics-ucf101_split1'
#model_name = 'resnext-101-64f-kinetics-hmdb51_split1'
#model_name = 'resnext-101-kinetics-ucf101_split1'
#model_name = 'resnext-101-kinetics-hmdb51_split1'
#model_name = 'c3d_resnet18_ucf101_r2_ft_iter_20000.torchmodel' ## convert from caffe
#model_name = 'c3d'
# model_name = 'c3d_ft_30'
#model_name = 'I3D'
model_name = 'resnext-101-kinetics'
# model_name = 'densenet-121-kinetics_new'
# model = resnet_caffe.resnet18(
#             num_classes=101,
#             shortcut_type='B',
#             #cardinality=32,
#             sample_size=112,
#             sample_duration=16)
# model = resnet.resnet18(
#             num_classes=400,
#             shortcut_type='A',
#             #cardinality=32,
#             sample_size=112,
#             sample_duration=16)

model = resnext.resnet101(
            num_classes=400,
            shortcut_type='B',
            cardinality=32,
            sample_size=112,
            sample_duration=16)

#model = C3D.C3D(num_classes=487)

#model = I3D.InceptionI3d()

#model = densenet.densenet121(sample_size=112, sample_duration=16, num_classes=400)

model = model.cuda()
model = nn.DataParallel(model, device_ids=None)
pretrain = torch.load('./models/'+model_name+'.pth')
#print(pretrain.keys())
#model.module.fc = nn.Linear(model.module.fc.in_features, 101)
#model.module.fc = model.module.fc.cuda()
model.load_state_dict(pretrain['state_dict'])
#model.load_state_dict(pretrain)


IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [16]:
## change state dict
# pretrain = torch.load('./models/'+model_name+'.pth')
# new_dict = {}
# for key, value in pretrain['state_dict'].items():
#     key = key.replace('.1', '_1')
#     key = key.replace('.2', '_2')
#     new_dict[key] = value
# pretrain['state_dict'] = new_dict
# torch.save(pretrain, './models/'+model_name+'_new.pth')
from torchsummary import summary
#summary(model, (3, 64, 224, 224))
summary(model, (3, 16, 112, 112))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv3d-1       [-1, 64, 16, 56, 56]          65,856
       BatchNorm3d-2       [-1, 64, 16, 56, 56]             128
              ReLU-3       [-1, 64, 16, 56, 56]               0
         MaxPool3d-4        [-1, 64, 8, 28, 28]               0
            Conv3d-5       [-1, 128, 8, 28, 28]           8,192
       BatchNorm3d-6       [-1, 128, 8, 28, 28]             256
              ReLU-7       [-1, 128, 8, 28, 28]               0
            Conv3d-8       [-1, 128, 8, 28, 28]          13,824
       BatchNorm3d-9       [-1, 128, 8, 28, 28]             256
             ReLU-10       [-1, 128, 8, 28, 28]               0
           Conv3d-11       [-1, 256, 8, 28, 28]          32,768
      BatchNorm3d-12       [-1, 256, 8, 28, 28]             512
           Conv3d-13       [-1, 256, 8, 28, 28]          16,384
      BatchNorm3d-14       [-1, 256, 8,

In [7]:
from datasets.ucf101 import UCF101
root_path = '/data/chengcheng/ucf101/'
video_path = root_path + 'frames/'
annotation_path = root_path+'annotation/ucf101_01.json'
#sample_size = 112 ## for c3d res3d
sample_size = 224 ## for i3d 
#sample_duration = 16 ##for c3d res3d
sample_duration = 64 ##for i3d
norm_value = 1
mean = get_mean(norm_value, dataset='kinetics')
std = get_std(norm_value)
norm_method = Normalize(mean, [1,1,1])
# norm_method = Normalize(mean, std)

batch_size = 64
n_threads = 6

spatial_transform = Compose([
    Scale(sample_size),
    CornerCrop(sample_size, 'c'),
    ToTensor(norm_value), norm_method
])
# spatial_transform = Compose([
#     Scale(sample_size),
#     CornerCrop(sample_size, 'c'),
#     ToTensor(norm_value)
# ])
temporal_transform = LoopPadding(sample_duration)
target_transform = VideoID()

# training_data = UCF101(
#     video_path,
#     annotation_path,
#     'training',
#     0,
#     spatial_transform=spatial_transform,
#     temporal_transform=temporal_transform,
#     target_transform=target_transform)

training_data = UCF101(
    video_path,
    annotation_path,
    'training',
    0,
    spatial_transform=spatial_transform,
    temporal_transform=temporal_transform,
    target_transform=target_transform,
    sample_duration=16)


train_loader = torch.utils.data.DataLoader(
    training_data,
    batch_size=batch_size,
    shuffle=False,
    num_workers=n_threads,
    pin_memory=True)

val_data = UCF101(
    video_path,
    annotation_path,
    'validation',
    0,
    spatial_transform=spatial_transform,
    temporal_transform=temporal_transform,
    target_transform=target_transform,
    sample_duration=16)


val_loader = torch.utils.data.DataLoader(
    val_data,
    batch_size=batch_size,
    shuffle=False,
    num_workers=n_threads,
    pin_memory=True)


dataset loading [0/9537]
dataset loading [1000/9537]
dataset loading [2000/9537]
dataset loading [3000/9537]
dataset loading [4000/9537]
dataset loading [5000/9537]
dataset loading [6000/9537]
dataset loading [7000/9537]
dataset loading [8000/9537]
dataset loading [9000/9537]
115550
dataset loading [0/3783]
dataset loading [1000/3783]
dataset loading [2000/3783]
dataset loading [3000/3783]
45106


In [6]:
from datasets.hmdb51 import HMDB51
root_path = '/data/chengcheng/hmdb51/'
video_path = root_path + 'frames/'
annotation_path = root_path+'annotation/hmdb51_1.json'
#sample_size = 112 ## for c3d res3d
sample_size = 224 ## for i3d 
#sample_duration = 16 ##for c3d res3d
sample_duration = 64 ##for i3d
norm_value = 1
mean = get_mean(norm_value, dataset='kinetics')
std = get_std(norm_value)
#norm_method = Normalize(mean, [1,1,1])
norm_method = Normalize(mean, std)

batch_size = 32
n_threads = 6

spatial_transform = Compose([
    Scale(sample_size),
    CornerCrop(sample_size, 'c'),
    ToTensor(norm_value), norm_method
])
# spatial_transform = Compose([
#     Scale(sample_size),
#     CornerCrop(sample_size, 'c'),
#     ToTensor(norm_value)
# ])
temporal_transform = LoopPadding(sample_duration)
target_transform = VideoID()

# training_data = UCF101(
#     video_path,
#     annotation_path,
#     'training',
#     0,
#     spatial_transform=spatial_transform,
#     temporal_transform=temporal_transform,
#     target_transform=target_transform)

training_data = HMDB51(
    video_path,
    annotation_path,
    'training',
    0,
    spatial_transform=spatial_transform,
    temporal_transform=temporal_transform,
    target_transform=target_transform,
    sample_duration=sample_duration)


train_loader = torch.utils.data.DataLoader(
    training_data,
    batch_size=batch_size,
    shuffle=False,
    num_workers=n_threads,
    pin_memory=True)

val_data = HMDB51(
    video_path,
    annotation_path,
    'validation',
    0,
    spatial_transform=spatial_transform,
    temporal_transform=temporal_transform,
    target_transform=target_transform,
    sample_duration=sample_duration)


val_loader = torch.utils.data.DataLoader(
    val_data,
    batch_size=batch_size,
    shuffle=False,
    num_workers=n_threads,
    pin_memory=True)

dataset loading [0/3570]
dataset loading [1000/3570]
dataset loading [2000/3570]
dataset loading [3000/3570]
dataset loading [0/1530]
dataset loading [1000/1530]


In [7]:
print(len(train_loader))
# for i, (inputs, target) in enumerate(train_loader):
#     print(inputs.shape)
#     break

229


In [14]:
## for c3d
model.module.fc6.register_forward_hook(get_activation('fc6'))
model.module.fc7.register_forward_hook(get_activation('fc7'))
model.module.pool5.register_forward_hook(get_activation('pool5'))
model.eval()
fc6 = []
fc6_val = []
fc7 = []
fc7_val = []
pool5 = []
pool5_val = []
targets = []
targets_val = []
with torch.no_grad():
    for i, (inputs, target) in enumerate(train_loader):
        if len(target) != batch_size:
            output = model(inputs)
            fc6.append(activation['fc6'].view(len(target), -1).cpu())
            fc7.append(activation['fc7'].view(len(target), -1).cpu())
            pool5.append(activation['pool5'].view(len(target), -1).cpu())
            targets.append(target)
            break
        output = model(inputs)
        fc6.append(activation['fc6'].view(batch_size, -1).cpu())
        fc7.append(activation['fc7'].view(len(target), -1).cpu())
        pool5.append(activation['pool5'].view(len(target), -1).cpu())
        targets.append(target)
        
    for i, (inputs, target) in enumerate(val_loader):
        if len(target) != batch_size:
            output = model(inputs)
            fc6_val.append(activation['fc6'].view(len(target), -1).cpu())
            fc7_val.append(activation['fc7'].view(len(target), -1).cpu())
            pool5_val.append(activation['pool5'].view(len(target), -1).cpu())
            targets.append(target)
            break
        output = model(inputs)
        fc6_val.append(activation['fc6'].view(batch_size, -1).cpu())
        fc7_val.append(activation['fc7'].view(len(target), -1).cpu())
        pool5_val.append(activation['pool5'].view(len(target), -1).cpu())
        targets_val.append(target)

In [15]:
## for c3d fc6 fc7 and pool5
fc6 = np.concatenate([i.numpy() for i in fc6], axis=0)
np.save(model_name+'_fc6_hmdb_train.npy', fc6)
fc7 = np.concatenate([i.numpy() for i in fc7], axis=0)
np.save(model_name+'_fc7_hmdb_train.npy', fc7)
pool5 = np.concatenate([i.numpy() for i in pool5], axis=0)
np.save(model_name+'_pool5_hmdb_train.npy', pool5)
fc6_val = np.concatenate([i.numpy() for i in fc6_val], axis=0)
np.save(model_name+'_fc6_hmdb_val.npy', fc6_val)
fc7_val = np.concatenate([i.numpy() for i in fc7_val], axis=0)
np.save(model_name+'_fc7_hmdb_val.npy', fc7_val)
pool5_val = np.concatenate([i.numpy() for i in pool5_val], axis=0)
np.save(model_name+'_pool5_hmdb_val.npy', pool5_val)

In [6]:
## for i3d last conv: but the dims is 40w too large, ignore it for now
# model.module.Mixed_5c.register_forward_hook(get_activation('5c'))
# mixed_5c = []
# targets = []
# mixed_5c_val = []
# targets_val = []
# with torch.no_grad():
#     for i, (inputs, target) in enumerate(train_loader):
#         if len(target) != batch_size:
#             break
#         output = model(inputs)
#         mixed_5c.append(activation['5c'].view(batch_size, -1).cpu())
#         targets.append(target)
        
#     for i, (inputs, target) in enumerate(val_loader):
#         if len(target) != batch_size:
#             break
#         output = model(inputs)
#         mixed_5c_val.append(activation['5c'].view(batch_size, -1).cpu())
#         targets_val.append(target)

In [11]:
print(avg_pool.shape)

(7297, 2048)


In [7]:
## for i3d avg_pool
model.module.avg_pool.register_forward_hook(get_activation('avg_pool'))
model.eval()
avg_pool = []
targets = []
avg_pool_val = []
targets_val = []
with torch.no_grad():
    for i, (inputs, target) in enumerate(train_loader):
        if len(target) != batch_size:
            output = model(inputs)
            avg_pool.append(activation['avg_pool'].view(len(target), -1).cpu())
            targets.append(target)
            break
        output = model(inputs)
        avg_pool.append(activation['avg_pool'].view(batch_size, -1).cpu())
        targets.append(target)
        
    for i, (inputs, target) in enumerate(val_loader):
        if len(target) != batch_size:
            output = model(inputs)
            avg_pool_val.append(activation['avg_pool'].view(len(target), -1).cpu())
            targets_val.append(target)
            break
        output = model(inputs)
        avg_pool_val.append(activation['avg_pool'].view(batch_size, -1).cpu())
        targets_val.append(target)

In [9]:
## for densenet features
model.module.features.register_forward_hook(get_activation('avg_pool'))
avg_pool = []
targets = []
avg_pool_val = []
targets_val = []
with torch.no_grad():
    for i, (inputs, target) in enumerate(train_loader):
        if len(target) != batch_size:
            output = model(inputs)
            avg_pool.append(activation['avg_pool'].view(len(target), -1).cpu())
            targets.append(target)
            break
        output = model(inputs)
        avg_pool.append(activation['avg_pool'].view(batch_size, -1).cpu())
        targets.append(target)
        
    for i, (inputs, target) in enumerate(val_loader):
        if len(target) != batch_size:
            output = model(inputs)
            avg_pool_val.append(activation['avg_pool'].view(len(target), -1).cpu())
            targets_val.append(target)
            break
        output = model(inputs)
        avg_pool_val.append(activation['avg_pool'].view(batch_size, -1).cpu())
        targets_val.append(target)

In [10]:
print(avg_pool[0])

tensor([[1.0726, 0.3272, 0.6502,  ..., 0.2205, 0.1618, 0.0413],
        [0.8820, 0.7313, 0.7571,  ..., 0.1168, 0.1274, 0.0613],
        [0.3677, 0.2491, 0.2381,  ..., 0.1679, 0.4759, 0.4003],
        ...,
        [0.6967, 0.6182, 0.6850,  ..., 0.0406, 0.0202, 0.0512],
        [0.7084, 0.7825, 0.8169,  ..., 0.2203, 0.1137, 0.0997],
        [0.7456, 0.2729, 0.3258,  ..., 0.0000, 0.0000, 0.0170]])


In [8]:
model.module.layer3.register_forward_hook(get_activation('fc6'))
tmp = []
with torch.no_grad():
    for i,(inputs, target) in enumerate(val_loader):
        print(inputs[0].shape)
        output = model(inputs)
        print(output)
        print(np.argmax(output.cpu().numpy(), axis = 1))
        print(target)
        tmp.append(activation['fc6'].cpu())
        if i == 0:
            break

torch.Size([3, 16, 112, 112])
tensor([[ 0.6114,  2.3453, -1.3323,  ...,  0.4873, -1.5079, -0.6875],
        [ 0.6616,  2.5497, -0.5557,  ...,  0.6992, -1.2938, -1.0833],
        [-0.5493,  1.7295, -0.6552,  ...,  0.0281, -1.1301,  1.2492],
        ...,
        [ 1.5558,  1.1590, -1.1916,  ..., -1.0635, -0.8214, -1.4435],
        [ 1.8603,  1.0700, -1.1427,  ..., -1.0343, -1.1321, -0.3943],
        [ 0.4386, -0.3195, -0.6254,  ...,  0.2447, -0.6758, -3.0588]],
       device='cuda:0')
[127 265  86 277 244 234 186  67 125 294  63 247 159  34 186 186 186 186
 159 233 344 197 197 115 116 197   1 139 224  24 224 224 365 353  37 379
 115 224 365  37 382 224 352  14 277  14 352 365 224 109 107 378 378 316
 378 316 316 316 126 316 316 316 123 316 112  36 123 128  36 123 116 365
 109   3   3   3 233 256 306 120 260 260   5 197 398 398 398 284 252 396
 396 159  34  60 245 369 159  99  90 252 252 261 261 262  59 253  60  18
  18 245 262 325 325 346 346 346 346  69  69  69 346 346 346 346 129 360
 

In [8]:
model.module.avgpool.register_forward_hook(get_activation('fc6'))
model.eval()
tmp = []
with torch.no_grad():
    for i,(inputs, target) in enumerate(val_loader):
        #print(inputs)
        #print(inputs[0].shape)
        output = model(inputs)
        #print(output)
        print(np.argmax(output.cpu().numpy(), axis = 1))
        #print(target)
        tmp.append(activation['fc6'].cpu())
        if i == 100:
            break

[99 99 99  0  0  0 90 90 99 99 99 90 59 99 99 59 90 99 59 77  0 98 98  0
 99 98 99 98 98 99 98 98 99 99 98 99 99 99 99 99 99 90 59 98 99 59 90 99
 99 59 59 33 98 33 33 38  0  0  0 98  0  0  0 70]
[98 99 99 99 33 99  0  0 99  0 99 99 99 99 90 90 90 90 90 90 90 90 90 90
 90 90 99 99 90 90 90 99 90 90 90 90 90 90 90 90 99 99 90 99 99 80 40 59
 99 90 99 99 99 99 99 99 99 90 99 99 80 80 98 80]
[98 80 99 99 98 80 98  0  0 99  0 98  0 98 98 98 98 80 80 99 99 80 99 99
 34  0 99 34 80 59 59 59 80 59  0 59 59 59 99 59 80 99  0 99 80 99 80 99
 99  0  0  0 59 80 99 99 99 99 99 99 99 99 99 99]
[99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 55 99 99 99
 99 55 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
 99 99 99 99 99 99 99 99 99 99 99 55 33 70 70 70]
[70 59 70 59 70 70 70 70 70 70 70 70 70 70 70 70 70  0 70 70 70 70  0  0
 70 70 70 70 70 70 70 59 59 70 59 59 70 70 70 59 70 59 70 70 70 90  0 59
 59 59 70 70 59 59 59 59 70 59 70  0 70 59 59 59]
[59 59 59 70 70 70 7

[ 99  14  99 100  90  99  14  14  14  14  90  90  14  14  80  14  80  14
  99  14  14  14  14  80  80  80  14  90  80  14  90  14  14  14  80  80
  80  14  80  14  90  80  27   4  27  27 100 100 100 100 100  14  40  40
  40 100   9  47 100  47  14  27  47  55]
[89 47 47 47 55 75 99 27 47 47 40 14 39 14 47 47 14 80 14 14 40 39 14 39
 40 40 40 39 80 80 80 14 40 80 40 80 80 14 80 14 14 14 80 14 14 14 40 40
 40 14 14 80 80 40 14 80 40 80 14 39 80 14 40 39]
[40 40 80 80 80 80 40 14 80 80 80 80 80 27 14 14 27 14 14 99 14 14 14 14
 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14
 14 14 14 14 14 14 14 14 14 14 14 14 14 90 14 14]
[14 14 14 14 14 14 14 14 14 14 46 14 16 55 14 14 14 14 14 14 14 14 90 14
 14 55 14 14 90 14 90 14 14 14 14 14 14 80 85 80  2 90 80 99 14 40 40 40
 40 40 40 90 90 40 90 90 40 40 80 40 80 40 40 80]
[80 80 40 80 80 80 51 80 80 80 80 80 90 80 80 27 14 14 27 14 14 99 14 47
 14 14 14 14 14 21 99 47 99 21 21 21 90 90 99 47 21 14  4  4 99 99 14 21
 43 4

[99 99 80 65 90 55 55 55 59 38 14 90 90 70 90 55 90 14 14 90 14 99 55 40
 90 90 90 90 40 40 40 40 55 90 24 90 70 40 40 90 99 40 90 70 70 14 14 90
 70 14 90 90 55 59 55 40 40 40 70 90 99 55 14 99]
[99 99 90 90 98 14 70 14 99 99 14 70 70 70 14 14 98 70 90 90 99 14 99 98
 99 99 99 14 14 99 14 90 55 90 14 90 90 14 14 14 90 90 99 99 14 59 59 59
 59 59 76  9  9 59 40 55 55 55 55 59 59 70 14 70]
[80 38 38 55 59 59 38 59 59 55 99 55 62 59 59 40 99 14 80 14 80 99 99 99
 99 99 80 80 80 80 80 80 80 80 14 99 14 80 21 80 80 80 80 80 80 99 80 80
 80 80 40 59 99 99 90 99 99 99 99 99 90 90 99 90]
[80 55 59 46 59  0 14 59 14 59 55 55 14 46 46 21 14 14 21 14 14 59 99 55
 55 59 98 98 59 99 98 14 98 14 14 90 99 99 99 14 99 99 98 99 99 99 99 59
 14 99 99 65 90 98 14 14 59 14 14 59 59 59 59 80]
[ 14  99  34  80  55  55  59  80  80  38  80  80  59  80  14  59  34  59
  59  34  59  99  99  14  99  99  99  99  14  99  99  99  99  99  99  99
  99  99  99  90 100  99  99  34  99  99 100  14  99 100 100  99  99  

In [9]:
for i in tmp:
    print(i.shape)
tmp2 = np.concatenate([i.numpy() for i in tmp], axis=0)
print(tmp2.shape)

torch.Size([128, 256, 2, 7, 7])
(128, 256, 2, 7, 7)


In [8]:
print(avg_pool[0].shape)

torch.Size([32, 7168])


In [13]:
## for dense3d 1024*9
#avg_pool = np.concatenate([i.numpy() for i in avg_pool], axis=0)
np.save(model_name+'_avg_pool_train.npy', avg_pool)
#avg_pool_val = np.concatenate([i.numpy() for i in avg_pool_val], axis=0)
np.save(model_name+'_avg_pool_val.npy', avg_pool_val)

In [9]:
## for i3d 1024*7
avg_pool = np.concatenate([i.numpy() for i in avg_pool], axis=0)
np.save(model_name+'_hmdb_avgpool_7168_train.npy', avg_pool)
avg_pool_val = np.concatenate([i.numpy() for i in avg_pool_val], axis=0)
np.save(model_name+'_hmdb_avgpool_7168_val.npy', avg_pool_val)

In [9]:
## for i3d 1024*4 4096
avg_pool = np.concatenate([i.numpy() for i in avg_pool], axis=0)
np.save(model_name+'_avg_pool_nonorm_train.npy', avg_pool)
avg_pool_val = np.concatenate([i.numpy() for i in avg_pool_val], axis=0)
np.save(model_name+'_avg_pool_nonorm_val.npy', avg_pool_val)

In [10]:
## for c3d 
fc6 = np.concatenate([i.numpy() for i in fc6], axis=0)
np.save(model_name+'_fc6_train.npy', fc6)
fc6_val = np.concatenate([i.numpy() for i in fc6_val], axis=0)
np.save(model_name+'_fc6_val.npy', fc6_val)

In [11]:
print(fc6.shape)
# print(avg_pool.shape)

(115550, 4096)


In [6]:
## for r3d layer4 and avgpool
### temp change as layer 4 and layer 3 and avgpool
model.module.avgpool.register_forward_hook(get_activation('avgpool'))
model.module.layer4.register_forward_hook(get_activation('layer4'))
model.module.layer3.register_forward_hook(get_activation('layer3'))
model.eval()
avgpool = []
layer4 = []
layer3 = []
targets = []
avgpool_val = []
layer4_val = []
layer3_val = []
targets_val = []
with torch.no_grad():
    for i, (inputs, target) in enumerate(train_loader):
        if len(target) != batch_size:
            output = model(inputs)
            avgpool.append(activation['avgpool'].view(len(target), -1).cpu())
            layer4.append(activation['layer4'].view(len(target), -1).cpu())
            layer3.append(activation['layer3'].view(len(target), -1).cpu())
            targets.append(target)
            break
        output = model(inputs)
        avgpool.append(activation['avgpool'].view(batch_size, -1).cpu())
        layer4.append(activation['layer4'].view(batch_size, -1).cpu())
        layer3.append(activation['layer3'].view(len(target), -1).cpu())
        targets.append(target)
        
    for i, (inputs, target) in enumerate(val_loader):
        if len(target) != batch_size:
            output = model(inputs)
            avgpool_val.append(activation['avgpool'].view(len(target), -1).cpu())
            layer4_val.append(activation['layer4'].view(len(target), -1).cpu())
            layer3_val.append(activation['layer3'].view(len(target), -1).cpu())
            targets_val.append(target)
            break
        output = model(inputs)
        avgpool_val.append(activation['avgpool'].view(batch_size, -1).cpu())
        layer4_val.append(activation['layer4'].view(batch_size, -1).cpu())
        layer3_val.append(activation['layer3'].view(len(target), -1).cpu())
        targets_val.append(target)
        
# print(model(x))
# print(activation['avgpool'].shape)
# print(activation['layer4'].shape)

In [8]:
## for r3d layer3
model.module.layer3.register_forward_hook(get_activation('layer4'))
layer4 = []
targets = []
layer4_val = []
targets_val = []
with torch.no_grad():
    for i, (inputs, target) in enumerate(train_loader):
        if len(target) != batch_size:
            break
        output = model(inputs)
        layer4.append(activation['layer4'].view(batch_size, -1).cpu())
        targets.append(target)
        
    for i, (inputs, target) in enumerate(val_loader):
        if len(target) != batch_size:
            break
        output = model(inputs)
        layer4_val.append(activation['layer4'].view(batch_size, -1).cpu())
        targets_val.append(target)

In [14]:
print(avg_pool.shape)

(7297, 2048)


In [8]:
## temp for layer3 and layer2 of resnet18
avgpool = np.concatenate([i.numpy() for i in avgpool], axis=0)
np.save(model_name+'_hmdb_avgpool_train.npy', avgpool)
layer4 = np.concatenate([i.numpy() for i in layer4], axis=0)
np.save(model_name+'_hmdb_layer4_train.npy', layer4)
layer3 = np.concatenate([i.numpy() for i in layer3], axis=0)
np.save(model_name+'_hmdb_layer3_train.npy', layer3)
avgpool_val = np.concatenate([i.numpy() for i in avgpool_val], axis=0)
np.save(model_name+'_hmdb_avgpool_val.npy', avgpool_val)
layer4_val = np.concatenate([i.numpy() for i in layer4_val], axis=0)
np.save(model_name+'_hmdb_layer4_val.npy', layer4_val)
layer3_val = np.concatenate([i.numpy() for i in layer3_val], axis=0)
np.save(model_name+'_hmdb_layer3_val.npy', layer3_val)

In [9]:
avgpool = np.concatenate([i.numpy() for i in avgpool], axis=0)
np.save(model_name+'_avgpool_train.npy', avgpool)
layer4 = np.concatenate([i.numpy() for i in layer4], axis=0)
np.save(model_name+'_layer4_train.npy', layer4)
layer3 = np.concatenate([i.numpy() for i in layer3], axis=0)
np.save(model_name+'_layer3_train.npy', layer3)
avgpool_val = np.concatenate([i.numpy() for i in avgpool_val], axis=0)
np.save(model_name+'_avgpool_val.npy', avgpool_val)
layer4_val = np.concatenate([i.numpy() for i in layer4_val], axis=0)
np.save(model_name+'_layer4_val.npy', layer4_val)
layer3_val = np.concatenate([i.numpy() for i in layer3_val], axis=0)
np.save(model_name+'_layer3_val.npy', layer3_val)

In [9]:
layer4 = np.concatenate([i.numpy() for i in layer4], axis=0)
np.save(model_name+'_layer3_train.npy', layer4)

layer4_val = np.concatenate([i.numpy() for i in layer4_val], axis=0)
np.save(model_name+'_layer3_val.npy', layer4_val)

In [10]:
print(avgpool_val.shape)

(45106, 2048)


In [11]:
targets = np.concatenate(np.array(targets), axis=0)
np.save('class_names_hmdb_train_f64.npy', targets)
targets_val = np.concatenate(np.array(targets_val), axis=0)
np.save('class_names_hmdb_val_f64.npy', targets_val)

In [12]:
print(targets.shape)

(7297,)


In [13]:
### get class index for hmdb
label = ["brush_hair", "cartwheel", "catch", "chew", "clap", "climb", "climb_stairs", "dive", "draw_sword", "dribble", "drink", "eat", "fall_floor", "fencing", "flic_flac", "golf", "handstand", "hit", "hug", "jump", "kick", "kick_ball", "kiss", "laugh", "pick", "pour", "pullup", "punch", "push", "pushup", "ride_bike", "ride_horse", "run", "shake_hands", "shoot_ball", "shoot_bow", "shoot_gun", "sit", "situp", "smile", "smoke", "somersault", "stand", "swing_baseball", "sword", "sword_exercise", "talk", "throw", "turn", "walk", "wave"]
name2idx = {}
index = 1
for class_label in label:
    name2idx[class_label] = index
    index += 1
print(name2idx)

{'brush_hair': 1, 'cartwheel': 2, 'catch': 3, 'chew': 4, 'clap': 5, 'climb': 6, 'climb_stairs': 7, 'dive': 8, 'draw_sword': 9, 'dribble': 10, 'drink': 11, 'eat': 12, 'fall_floor': 13, 'fencing': 14, 'flic_flac': 15, 'golf': 16, 'handstand': 17, 'hit': 18, 'hug': 19, 'jump': 20, 'kick': 21, 'kick_ball': 22, 'kiss': 23, 'laugh': 24, 'pick': 25, 'pour': 26, 'pullup': 27, 'punch': 28, 'push': 29, 'pushup': 30, 'ride_bike': 31, 'ride_horse': 32, 'run': 33, 'shake_hands': 34, 'shoot_ball': 35, 'shoot_bow': 36, 'shoot_gun': 37, 'sit': 38, 'situp': 39, 'smile': 40, 'smoke': 41, 'somersault': 42, 'stand': 43, 'swing_baseball': 44, 'sword': 45, 'sword_exercise': 46, 'talk': 47, 'throw': 48, 'turn': 49, 'walk': 50, 'wave': 51}


In [17]:
### get class index
label_file = '/data/chengcheng/ucf101/annotation/classInd.txt'
name2idx = {}
content = [line.rstrip('\n') for line in open(label_file)]
for i, line in enumerate(content):
    index = int(line.split(' ')[0])
    name = line.split(' ')[1]
    name2idx[name] = index
print(name2idx)

{'ApplyEyeMakeup': 1, 'ApplyLipstick': 2, 'Archery': 3, 'BabyCrawling': 4, 'BalanceBeam': 5, 'BandMarching': 6, 'BaseballPitch': 7, 'Basketball': 8, 'BasketballDunk': 9, 'BenchPress': 10, 'Biking': 11, 'Billiards': 12, 'BlowDryHair': 13, 'BlowingCandles': 14, 'BodyWeightSquats': 15, 'Bowling': 16, 'BoxingPunchingBag': 17, 'BoxingSpeedBag': 18, 'BreastStroke': 19, 'BrushingTeeth': 20, 'CleanAndJerk': 21, 'CliffDiving': 22, 'CricketBowling': 23, 'CricketShot': 24, 'CuttingInKitchen': 25, 'Diving': 26, 'Drumming': 27, 'Fencing': 28, 'FieldHockeyPenalty': 29, 'FloorGymnastics': 30, 'FrisbeeCatch': 31, 'FrontCrawl': 32, 'GolfSwing': 33, 'Haircut': 34, 'Hammering': 35, 'HammerThrow': 36, 'HandStandPushups': 37, 'HandstandWalking': 38, 'HeadMassage': 39, 'HighJump': 40, 'HorseRace': 41, 'HorseRiding': 42, 'HulaHoop': 43, 'IceDancing': 44, 'JavelinThrow': 45, 'JugglingBalls': 46, 'JumpingJack': 47, 'JumpRope': 48, 'Kayaking': 49, 'Knitting': 50, 'LongJump': 51, 'Lunges': 52, 'MilitaryParade': 

In [14]:
print(targets[0:10])

['21_turn_h_cm_np1_fr_goo_12' '21_turn_h_cm_np1_fr_goo_12'
 '21_turn_h_nm_np1_fr_goo_14' '21_turn_h_nm_np1_fr_goo_14'
 '21_turn_h_nm_np1_fr_goo_4' '21_turn_h_nm_np1_fr_goo_4'
 '50_FIRST_DATES_turn_f_cm_np1_fr_med_11'
 '50_FIRST_DATES_turn_f_nm_np1_le_bad_9'
 'A_Beautiful_Mind_2_turn_h_cm_np1_ri_med_5'
 'A_Beautiful_Mind_2_turn_h_cm_np1_ri_med_5']


In [15]:
### for hmdb
data = json.load(open('/data/chengcheng/hmdb51/annotation/hmdb51_1.json'))
filename2label = {}
for key, value in data['database'].items():
    filename2label[key] = value['annotations']['label']
print(filename2label)

{'21_turn_h_cm_np1_fr_goo_12': 'turn', '21_turn_h_nm_np1_fr_goo_14': 'turn', '21_turn_h_nm_np1_fr_goo_4': 'turn', '50_FIRST_DATES_turn_f_cm_np1_fr_med_11': 'turn', '50_FIRST_DATES_turn_f_nm_np1_le_bad_9': 'turn', 'A_Beautiful_Mind_1_turn_h_nm_np1_ri_goo_10': 'turn', 'A_Beautiful_Mind_1_turn_u_cm_np1_ri_goo_9': 'turn', 'A_Beautiful_Mind_2_turn_h_cm_np1_ri_med_5': 'turn', 'A_Beautiful_Mind_2_turn_u_cm_np1_fr_goo_1': 'turn', 'A_Beautiful_Mind_2_turn_u_cm_np2_ba_med_4': 'turn', 'A_Beautiful_Mind_5_turn_h_cm_np1_fr_goo_9': 'turn', 'A_Beautiful_Mind_5_turn_u_cm_np1_le_med_2': 'turn', 'AgentCodyBanks_turn_h_cm_np1_ba_goo_8': 'turn', 'AgentCodyBanks_turn_h_nm_np1_fr_med_2': 'turn', 'AgentCodyBanks_turn_u_cm_np1_le_med_7': 'turn', 'HP_PRISONER_OF_AZKABAN_turn_u_nm_np1_fr_med_21': 'turn', 'Italian_Job_2_turn_h_nm_np1_ri_med_7': 'turn', 'Italian_Job_2_turn_u_nm_np1_ba_med_4': 'turn', 'Juno_turn_u_cm_np1_ba_med_3': 'turn', 'LONGESTYARD_turn_u_nm_np1_ba_med_11': 'turn', 'LivefreeorDieHard_turn_u_cm

In [16]:
# targets_idx = [name2idx[name.split('_')[1]] for name in targets]
# print(targets_idx[0:10])
# np.save('idx.npy', targets_idx)
# targets_val_idx = [name2idx[name.split('_')[1]] for name in targets_val]
# print(targets_val_idx[0:10])
# np.save('idx_val.npy', targets_val_idx)
targets_idx = [name2idx[filename2label[name]] for name in targets]
print(targets_idx[0:10])
np.save('idx_hmdb_f64.npy', targets_idx)
targets_val_idx = [name2idx[filename2label[name]] for name in targets_val]
print(targets_val_idx[0:10])
np.save('idx_hmdb_val_f64.npy', targets_val_idx)

[49, 49, 49, 49, 49, 49, 49, 49, 49, 49]
[49, 49, 49, 49, 49, 49, 49, 49, 49, 49]


In [17]:
print(len(targets_val_idx))

3061


In [21]:
### average layer activation to get video representation
#targets_idx = np.load('./idx_val.npy')
#clip_features = np.load('./resnext-101-kinetics-ucf101_split1_avgpool_val.npy')
### for i3d avg_pool
previous_name = targets[0]
previous_label = targets_idx[0]
video_label = []
video_reps = []
features_tmp = []
for label, name, clip_feature in zip(targets_idx, targets, fc6):
    #print(name, previous_name)
    if name != previous_name:
        previous_name = name
        video_label.append(previous_label)
        previous_label = label
        video_reps.append(np.average(np.array(features_tmp), axis = 0))
        features_tmp = []
        features_tmp.append(clip_feature)
    else:
        features_tmp.append(clip_feature)

In [18]:
### for r3d avgpool
previous_name = targets[0]
previous_label = targets_idx[0]
video_label = []
video_reps = []
features_tmp = []
for label, name, clip_feature in zip(targets_idx, targets, avgpool):
    #print(name, previous_name)
    if name != previous_name:
        previous_name = name
        video_label.append(previous_label)
        previous_label = label
        video_reps.append(np.average(np.array(features_tmp), axis = 0))
        features_tmp = []
        features_tmp.append(clip_feature)
    else:
        features_tmp.append(clip_feature)

In [22]:
### svm for clip-aggregated feature
X = preprocessing.normalize(np.array(video_reps), norm='l2')
y = np.array(video_label)
print(X.shape, y.shape)
clf = LinearSVC(C=10, random_state=0, tol=1e-5)
#clf = SVC()
clf.fit(X, y)

(9534, 4096) (9534,)


LinearSVC(C=10, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=0, tol=1e-05,
          verbose=0)

In [27]:
print(len(video_reps_val))

3778


In [23]:
previous_name = targets_val[0]
previous_label = targets_val_idx[0]
video_label_val = []
video_reps_val = []
features_tmp = []
for label, name, clip_feature in zip(targets_val_idx, targets_val, fc6_val):
    #print(name, previous_name)
    if name != previous_name:
        previous_name = name
        video_label_val.append(previous_label)
        previous_label = label
        video_reps_val.append(np.average(np.array(features_tmp), axis = 0))
        features_tmp = []
        features_tmp.append(clip_feature)
    else:
        features_tmp.append(clip_feature)

In [24]:
X_val = preprocessing.normalize(np.array(video_reps_val), norm='l2')
y_val = np.array(video_label_val)
predict = clf.predict(X_val)
accuracy_score(predict, y_val)
###0.7706955831790532 for c3d fc6
###0.818037556202063 for i3d avgpool
###0.8400317292437863 for i3d avgpool 4096 (change stride from (1,1,1) to (2,1,1)) note: 0.858011634056055 when set C=10 for svm
###0.8419798835362625 for layer4 of r3d-18 , 0.8364213869772367 for avgpool r3d-18, 0.9025 for avgpool rx3d-101_ft， 0.9018 for layer4 rx3d-101_ft
###0.8817 for avgpool rx3d101.
###0.5947591318157756 for features of densenet 121

0.7706955831790532

In [12]:
print(X_val)

NameError: name 'X_val' is not defined

In [17]:
def getVideoFeature(clip_features, targets, targets_idx):
    previous_name = targets[0]
    previous_label = targets_idx[0]
    video_label = []
    video_reps = []
    features_tmp = []
    for label, name, clip_feature in zip(targets_idx, targets, clip_features):
        #print(name, previous_name)
        if name != previous_name:
            previous_name = name
            video_label.append(previous_label)
            previous_label = label
            video_reps.append(np.average(np.array(features_tmp), axis = 0))
            features_tmp = []
            features_tmp.append(clip_feature)
        else:
            features_tmp.append(clip_feature)
    return np.array(video_label), np.array(video_reps)

In [18]:
### feature fusion: concatenate
### c3d fc6 and i3d avgpool
c3d_clip_features = np.load('./resnet-18-kinetics_avgpool_train.npy')
c3d_clip_features_val = np.load('./resnet-18-kinetics_avgpool_val.npy')
c3d_targets = np.load('./I3D_class_names_train.npy')
c3d_targets_val = np.load('./I3Dclass_names_val.npy')
c3d_targets_idx = np.load('./idx.npy')
c3d_targets_idx_val = np.load('./idx_val.npy')
c3d_video_label, c3d_video_reps = getVideoFeature(c3d_clip_features, c3d_targets, c3d_targets_idx)
c3d_video_label_val, c3d_video_reps_val = getVideoFeature(c3d_clip_features_val, c3d_targets_val, c3d_targets_idx_val)

In [22]:
c3d_clip_features = np.load('./resnet-18-kinetics_layer4_train.npy')
c3d_clip_features_val = np.load('./resnet-18-kinetics_layer4_val.npy')
c3d_targets = np.load('./I3D_class_names_train.npy')
c3d_targets_val = np.load('./I3Dclass_names_val.npy')
c3d_targets_idx = np.load('./idx.npy')
c3d_targets_idx_val = np.load('./idx_val.npy')
c3d_video_label, c3d_video_reps = getVideoFeature(c3d_clip_features, c3d_targets, c3d_targets_idx)
c3d_video_label_val, c3d_video_reps_val = getVideoFeature(c3d_clip_features_val, c3d_targets_val, c3d_targets_idx_val)

In [19]:
i3d_video_label = np.array(video_label)
i3d_video_reps = np.array(video_reps)
i3d_video_label_val = np.array(video_label_val)
i3d_video_reps_val = np.array(video_reps_val)

In [23]:
## feature concatenate
X = preprocessing.normalize(np.concatenate((i3d_video_reps,c3d_video_reps), axis = 1), norm='l2')
y = c3d_video_label
print(X.shape, y.shape)
clf = LinearSVC(random_state=0, tol=1e-5)
clf.fit(X, y)

(9529, 10240) (9529,)


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=0, tol=1e-05,
          verbose=0)

In [51]:
## feature addition
X = preprocessing.normalize(np.add(i3d_video_reps[0:-1],c3d_video_reps), norm='l2')
y = c3d_video_label
print(X.shape, y.shape)
clf = LinearSVC(random_state=0, tol=1e-5)
clf.fit(X, y)

(9534, 4096) (9534,)


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=0, tol=1e-05,
          verbose=0)

In [24]:
X_val = preprocessing.normalize(np.concatenate((i3d_video_reps_val,c3d_video_reps_val), axis = 1), norm='l2')
y_val = c3d_video_label_val
predict = clf.predict(X_val)
accuracy_score(predict, y_val)

0.8710958178930651

In [52]:
X_val = preprocessing.normalize(np.add(i3d_video_reps_val[0:-1],c3d_video_reps_val), norm='l2')
y_val = c3d_video_label_val
predict = clf.predict(X_val)
accuracy_score(predict, y_val)

0.7058979106056599

In [21]:
### for validation, we poll the video label from multipul clips 
predict_clip = clf.predict(preprocessing.normalize(layer4_val))
previous_name = targets_val[0]
previous_label = targets_val_idx[0]
video_label_val = []
video_predict_val = []
### aggregate clip label to video label
pred_tmp = []
for label, name, pred in zip(targets_val_idx, targets_val, predict_clip):
    #print(name, previous_name)
    if name != previous_name:
        previous_name = name
        video_label_val.append(previous_label)
        previous_label = label
        video_predict_val.append(stats.mode(pred_tmp)[0][0])
        pred_tmp = []
        pred_tmp.append(pred)
    else:
        pred_tmp.append(pred)

In [22]:
accuracy_score(video_predict_val, y_val)

0.3575966119640021

In [None]:
targets_arr = np.concatenate(np.array(targets), axis=0)
print(targets_arr[0], targets_arr.shape)

In [23]:
np.save('./clip_names_val.npy', targets_arr)

In [14]:
np.save('./resnext-101-kinetics-ucf101_split1_avgpool_video_train.npy', np.array(video_reps))
np.save('./video_label.npy', np.array(video_label))

In [None]:
### svm for clip-aggregated feature
X = preprocessing.normalize(np.load('./resnext-101-kinetics-ucf101_split1_avgpool_video_train.npy'))
y = np.load('./video_label.npy')
print(X.shape, y.shape)
#clf = LinearSVC(random_state=0, tol=1e-5)
clf = SVC(kernel='linear')
clf.fit(X, y)

(9536, 2048) (9536,)


In [15]:
dump(clf, 'linearsvc_rx3d101_avgpool_2048_video.joblib') 

['linearsvc_rx3d101_avgpool_2048_video.joblib']

In [17]:
avgpool_normalized = preprocessing.normalize(np.load('./resnext-101-kinetics-ucf101_split1_avgpool_val.npy'))
predict = clf.predict(avgpool_normalized)

In [27]:
video_predict = clf.predict(preprocessing.normalize(np.array(video_reps)))
print(video_predict[0:100])

[71 71 71 71 71 24 71 71 71 71 71 71 71 71 71 71 71 71 24 71 24 71 71 71
 71 71 71 71 71 71 71 71 71 24 71 71 71 71 71 71 71 71 71 24 59 71 71 71
 71 24 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71
 71 71 71 78 71 24 71 71 71 71 24 71 71 71 71 71 71 71 24 71 71 71 71 71
 71 71 71 71]


In [28]:
accuracy_score(video_label, video_predict)

0.01850872554204125

In [35]:
print(video_predict[0:200])

[71 71 71 71 71 24 71 71 71 71 71 71 71 71 71 71 71 71 24 71 24 71 71 71
 71 71 71 71 71 71 71 71 71 24 71 71 71 71 71 71 71 71 71 24 59 71 71 71
 71 24 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71
 71 71 71 78 71 24 71 71 71 71 24 71 71 71 71 71 71 71 24 71 71 71 71 71
 71 71 71 71 71 71 71 71 71 24 24 59 71 71 71 71 71 71 71 71 71 24 71 71
 71 24 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71 71
 71 71 71 71 71 71 71 71 24 24 71 71 71 71 24 71 71 71 71 71 71 71 71 71
 71 71 71 71 71 71 71 71 71 71 71 24 71 71 24 71 71 24 71 71 71 71 71 71
 71 71 71 71 71 71 71 71]


In [19]:
print(predict[0:100])

[24 24 24 24 10 24 24 24 24 24 24 24 10 10 78 24 59 78 10 24 24 24 24 24
 78 24 78 24 24 24 24 24 24 78 24 78 24 24 71 24 24 24 24 10 10 10 24 24
 78 24 10 78 78 78 78 10 78 78 24 24 24 24 24 24 24 24 24 24 24 24 24 24
 78 78 78 24 24 10 24 78 10 24 24 24 24 24 24 24 10 24 24 24 24 24 24 78
 10 24 24 24]


In [13]:
print(np.array(video_reps).shape)
print(np.array(video_label).shape)
print(video_reps[0])

(9536, 2048)
(9536,)
[0.32373673 0.32624754 0.23443274 ... 0.3154096  0.31098387 0.31159514]


In [9]:
avgpool_arr = np.concatenate([i.numpy() for i in avgpool], axis=0)
print(avgpool_arr.shape)

layer4_arr = np.concatenate([i.numpy() for i in layer4], axis=0)
print(layer4_arr.shape)

(45106, 2048)


In [11]:
np.save('resnext-101-kinetics-ucf101_split1_avgpool_val.npy', avgpool_arr)
np.save('resnext-101-kinetics-ucf101_split1_layer4_val.npy', layer4_arr)

In [12]:
label_file = '/data/chengcheng/ucf101/annotation/classInd.txt'
name2idx = {}
content = [line.rstrip('\n') for line in open(label_file)]
for i, line in enumerate(content):
    index = int(line.split(' ')[0])
    name = line.split(' ')[1]
    name2idx[name] = index
print(name2idx)

{'ApplyEyeMakeup': 1, 'ApplyLipstick': 2, 'Archery': 3, 'BabyCrawling': 4, 'BalanceBeam': 5, 'BandMarching': 6, 'BaseballPitch': 7, 'Basketball': 8, 'BasketballDunk': 9, 'BenchPress': 10, 'Biking': 11, 'Billiards': 12, 'BlowDryHair': 13, 'BlowingCandles': 14, 'BodyWeightSquats': 15, 'Bowling': 16, 'BoxingPunchingBag': 17, 'BoxingSpeedBag': 18, 'BreastStroke': 19, 'BrushingTeeth': 20, 'CleanAndJerk': 21, 'CliffDiving': 22, 'CricketBowling': 23, 'CricketShot': 24, 'CuttingInKitchen': 25, 'Diving': 26, 'Drumming': 27, 'Fencing': 28, 'FieldHockeyPenalty': 29, 'FloorGymnastics': 30, 'FrisbeeCatch': 31, 'FrontCrawl': 32, 'GolfSwing': 33, 'Haircut': 34, 'Hammering': 35, 'HammerThrow': 36, 'HandStandPushups': 37, 'HandstandWalking': 38, 'HeadMassage': 39, 'HighJump': 40, 'HorseRace': 41, 'HorseRiding': 42, 'HulaHoop': 43, 'IceDancing': 44, 'JavelinThrow': 45, 'JugglingBalls': 46, 'JumpingJack': 47, 'JumpRope': 48, 'Kayaking': 49, 'Knitting': 50, 'LongJump': 51, 'Lunges': 52, 'MilitaryParade': 

In [26]:
targets_arr = np.concatenate(np.array(targets), axis=0)
targets_idx = [name2idx[name.split('_')[1]] for name in targets_arr]
print(targets_idx)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [11]:
#print(model(x))
print(activation['avgpool'].shape)
print(activation['layer4'].shape)

torch.Size([32, 2048, 1, 1, 1])
torch.Size([32, 2048, 1, 4, 4])


In [None]:
print('test')

In [3]:
avgpool_arr = np.load('./resnext-101-kinetics-ucf101_split1_avgpool_train.npy')
print(avgpool_arr[0])

[0.         0.19841485 0.6230076  ... 0.49189273 0.15967341 1.2884941 ]


In [7]:
avgpool_normalized = preprocessing.normalize(avgpool_arr)
print(avgpool_normalized[0])

[0.         0.00967616 0.03038241 ... 0.02398829 0.00778684 0.06283641]


In [27]:
np.save('idx_val.npy', targets_idx)

In [None]:
### svm 
X = preprocessing.normalize(np.load('./resnext-101-kinetics-ucf101_split1_avgpool_train.npy'))
y = np.load('./idx.npy')
#clf = LinearSVC(random_state=0, tol=1e-5)
clf = SVC(kernel='linear')
clf.fit(X, y)

In [13]:
#dump(clf, 'linearsvc_avgpool.joblib') 
clf = load('./linearsvc_avgpool.joblib') 

In [28]:
avgpool_normalized = preprocessing.normalize(np.load('./resnext-101-kinetics-ucf101_split1_avgpool_val.npy'))
predict = clf.predict(avgpool_normalized)

In [29]:
print(predict[0:100])

[20 78  1 48 63 66 95 78 78 65 48 48 63  1 63 62 65 60 78 59  1  1  1  1
 60  1 63  1 95 95 65 65 78 95 78 65 60 20 63 60 66 60 48  1 65  1 20 78
 95 63 66 63  6  1 20  1  1  1 63 63 63 65 63 48 13 13 13  1  1 13 78  1
 78  1 59 20 59 12 65 78 62 44 71 12 44 27 74 48 27 27 27 48 44 60 50 78
 50  1 25  1]


In [46]:
previous_name = targets_arr[0]
previous_label = targets_idx[0]
video_label = []
video_predict = []
### aggregate clip label to video label
pred_tmp = []
for label, name, pred in zip(targets_idx, targets_arr, predict):
    #print(name, previous_name)
    if name != previous_name:
        previous_name = name
        video_label.append(previous_label)
        previous_label = label
        video_predict.append(stats.mode(pred_tmp)[0][0])
        pred_tmp = []
        pred_tmp.append(pred)
    else:
        pred_tmp.append(pred)

In [34]:
stats.mode([1,2,2,4,4,4])[0]

array([4])

In [48]:
print(len(video_label))
print(len(video_predict))
print(len(targets_idx))
print(len(predict))

3782
3782
45106
45106


In [47]:
print(video_label)
print(video_predict)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 

In [50]:
accuracy_score(video_label, video_predict)

0.19725013220518245

In [2]:
caffe_model = torch.load('./models/c3d_resnet18_ucf101_r2_ft_iter_20000.caffemodel.pt')
#print(tmp_model['bn_conv1.weight'].shape)
# print(tmp_model.keys())
# print(tmp_model['res2a_branch2a.weight'].shape)
# print(tmp_model['res2a_branch1.weight'].shape)
# print(len(tmp_model.keys()))
caffe_dict = caffe_model.keys()
torch_dict = {}
for i, key in enumerate(caffe_dict):
    reminder = i % 5
    conv1_layer = ['conv1.weight', 'bn1.running_mean', 'bn1.running_var', 'bn1.weight', 'bn1.bias']
    downsample_layer = ['0.weight', '1.running_mean', '1.running_var', '1.weight', '1.bias']
    conv2_layer = ['conv2.weight', 'bn2.running_mean', 'bn2.running_var', 'bn2.weight', 'bn2.bias']
    # if reminder == 0: ## conv weight
    if key.find('conv1') != -1: ## first layer
        new_key = 'module.' + conv1_layer[reminder]
    elif key.find('fc8-new') != -1:
        new_key = 'module.fc.'+key.split('.')[1]
    else:
        if reminder == 0:
            layer_idx = int(key[3])-1
        elif reminder == 1 or reminder == 2:
            layer_idx = int(key[2])-1
        else:
            layer_idx = int(key[5])-1
            
        if key.find('branch1') != -1: ## downsample
            new_key = 'module.layer{}.0.downsample.{}'.format(layer_idx, downsample_layer[reminder])
        elif key.find('a_branch2a') != -1:
            new_key = 'module.layer{}.0.{}'.format(layer_idx, conv1_layer[reminder])
        elif key.find('a_branch2b') != -1:
            new_key = 'module.layer{}.0.{}'.format(layer_idx, conv2_layer[reminder])
        elif key.find('b_branch2a') != -1:
            new_key = 'module.layer{}.1.{}'.format(layer_idx, conv1_layer[reminder])
        elif key.find('b_branch2b') != -1:
            new_key = 'module.layer{}.1.{}'.format(layer_idx, conv2_layer[reminder])
    torch_dict[new_key] = caffe_model[key]
    
print(torch_dict.keys())
    

dict_keys(['module.conv1.weight', 'module.bn1.running_mean', 'module.bn1.running_var', 'module.bn1.weight', 'module.bn1.bias', 'module.layer1.0.downsample.0.weight', 'module.layer1.0.downsample.1.running_mean', 'module.layer1.0.downsample.1.running_var', 'module.layer1.0.downsample.1.weight', 'module.layer1.0.downsample.1.bias', 'module.layer1.0.conv1.weight', 'module.layer1.0.bn1.running_mean', 'module.layer1.0.bn1.running_var', 'module.layer1.0.bn1.weight', 'module.layer1.0.bn1.bias', 'module.layer1.0.conv2.weight', 'module.layer1.0.bn2.running_mean', 'module.layer1.0.bn2.running_var', 'module.layer1.0.bn2.weight', 'module.layer1.0.bn2.bias', 'module.layer1.1.conv1.weight', 'module.layer1.1.bn1.running_mean', 'module.layer1.1.bn1.running_var', 'module.layer1.1.bn1.weight', 'module.layer1.1.bn1.bias', 'module.layer1.1.conv2.weight', 'module.layer1.1.bn2.running_mean', 'module.layer1.1.bn2.running_var', 'module.layer1.1.bn2.weight', 'module.layer1.1.bn2.bias', 'module.layer2.0.downsamp

In [3]:
# res_model = torch.load('./models/resnet-18-kinetics.pth')
# print(res_model['state_dict'].keys())
# print(res_model['state_dict']['module.bn1.running_mean'])

In [2]:
# for caffe, pyt in zip(tmp_model.keys(), res_model['state_dict'].keys()):
#     print(tmp_model[caffe].shape, res_model['state_dict'][pyt].shape)

In [7]:
from models import resnet_caffe
model = resnet_caffe.resnet18(sample_size=112, sample_duration=16, shortcut_type='B', num_classes=101)
model = model.cuda()
model = nn.DataParallel(model)
model.eval()
model.load_state_dict(torch_dict)
#print(model.state_dict().keys())
# for i, layer in enumerate(model.state_dict().keys()):
#      print(i, layer)

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [8]:
x = torch.randn((1,3,16,112,112))
model(x)

tensor([[-130.8975,  -25.8179,   43.4690,   36.3603,  -29.6598,  -39.2980,
           -5.9699,   26.8309,   17.7373,  -36.1411,   59.9528,  -26.2589,
            9.7741,   80.8986,  -39.8367,   34.7911,  -37.2695,  -20.7076,
         -151.1088,  -56.3450,    1.1867,   97.9389,    1.2005,   30.1975,
          -50.9447,   44.1759,  -44.9805,   42.7735,   11.0413,  -45.4526,
          -93.0874,   18.9617,  -47.5712,   99.7916,   63.6919,  -23.5682,
           -6.7417,   47.0223,   -3.9296,   85.5262,   24.3903,   19.2774,
           52.8577,   33.8127,   45.5813,  -56.3989,   58.5577,  -45.1510,
           -8.5163,  -66.8661,  -12.0211,   52.1630,   98.9316,   57.1439,
           61.8596,  -78.9579,  -48.3015,  202.6144,   12.6722,  -72.2948,
           64.1149,  -29.5486,  -92.5494, -109.3122,   -3.4007,  -53.8989,
          -37.7446,   25.3518,  -41.7136,  -26.5310,   33.7696,  197.7914,
            7.7638,   20.6414,   99.2388,   22.6202,  -20.4523, -101.1449,
          119.3585,   60.

In [6]:
model(x)

tensor([[ 0.1217,  1.0752,  0.0675,  0.0295, -0.3817,  1.3798,  0.6026, -0.3443,
         -0.6521, -0.6106, -0.6721,  0.3470,  0.4814,  0.0223,  0.7652, -0.4864,
          0.7846, -0.6006,  0.2291,  0.0336,  0.4571,  0.1665, -0.9561, -0.1742,
          0.4742, -0.7311,  0.2004,  0.1071, -0.0400, -0.3949, -0.2355,  0.2926,
         -0.5792, -0.8404,  0.4926, -0.2026,  0.4633,  0.3535, -0.7756, -0.6819,
         -0.0411,  0.3419, -0.2130,  0.4013,  0.4002,  0.5178,  0.1446,  0.2262,
          0.6753,  1.0599, -0.0809,  0.3762, -0.0428,  0.0939, -0.2009,  0.1004,
          0.2245,  0.2697, -0.5784, -0.7713,  0.5592, -0.0611, -0.2215,  0.7363,
         -0.3917,  0.5239, -0.4431,  0.5771, -0.2535,  1.2788,  0.0825,  0.0817,
         -0.8194, -0.3176,  0.5321, -0.4596, -0.2984,  0.2149, -0.8052,  0.6766,
         -0.3005,  0.0413, -0.0307,  0.1323,  0.2245, -0.3893,  0.5780,  0.4382,
         -0.8076, -0.4176, -0.3427,  0.0354, -0.3200, -0.1954, -0.0390, -0.8472,
          0.0788, -0.0088, -

In [9]:
torch.save(model.state_dict(), './models/c3d_resnet18_ucf101_r2_ft_iter_20000.torchmodel.pth')

In [16]:
print(model)

DataParallel(
  (module): ResNet(
    (conv1): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace)
    (maxpool): MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace)
        (conv2): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (bn2): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (downsample): Sequential(
          (0): Conv3d(64, 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
          (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, af