In [1]:
import os
import sys
import json
import numpy as np
import torch
from torch import nn
from torch import optim
from torch.optim import lr_scheduler

from opts import parse_opts
from mean import get_mean, get_std
from spatial_transforms import (
    Compose, Normalize, Scale, CenterCrop, CornerCrop, MultiScaleCornerCrop,
    MultiScaleRandomCrop, RandomHorizontalFlip, ToTensor)
from temporal_transforms import LoopPadding, TemporalRandomCrop
from target_transforms import ClassLabel, VideoID
from target_transforms import Compose as TargetCompose
from dataset import get_training_set, get_validation_set, get_test_set
from utils import Logger
from train import train_epoch
from validation import val_epoch
import test
import collections
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from joblib import dump, load
from sklearn import preprocessing
from scipy import stats
from sklearn.metrics import accuracy_score

from models import *

In [2]:
# Expose only GPUs 0 and 1 of the server to this process; adjust the list to
# match however many GPU devices you have available.
os.environ.update(CUDA_VISIBLE_DEVICES='0,1')

### Set hyperparameters 



In [3]:

import easydict

# Every hyperparameter and path used by the notebook lives in this single
# attribute-style dict so later cells can read them as `opt.<name>`.
opt = easydict.EasyDict({
    "result_path": 'results1',      # Result directory path
    "dataset": 'ucf101-music',      # Used dataset
    "n_classes": 9,                 # Number of classes for fine-tuning
    "sample_size": 112,             # Height and width of inputs
    "sample_duration": 16,          # Temporal duration of inputs
    "initial_scale": 1.0,           # Initial scale for multiscale cropping
    "n_scales": 5,                  # Number of scales for multiscale cropping
    "scale_step": 0.84089641525,    # Scale step for multiscale cropping
    "train_crop": 'corner',         # Spatial cropping method in training. Corner is selection from 4 corners and 1 center.
    "learning_rate": 0.1,           # Initial learning rate (divided by 10 while training by lr scheduler)
    "momentum": 0.9,                # Momentum
    "dampening": 0.9,               # Dampening of SGD
    "weight_decay": 0.001,          # Weight decay in SGD
    "mean_dataset": 'kinetics',     # Dataset for mean values of mean subtraction
    "no_mean_norm": False,          # If true, inputs are not normalized by mean
    "std_norm": False,              # If true, inputs are normalized by standard deviation
    "nesterov": False,              # Nesterov momentum
    "optimizer": 'sgd',             # Optimizer type
    "lr_patience": 10,              # Patience of LR scheduler. See documentation of Reduce on Loss Plateau Decay (ReduceLROnPlateau). https://www.deeplearningwizard.com/deep_learning/boosting_models_pytorch/lr_scheduling/#reduce-on-loss-plateau-decay
    "batch_size": 32,               # Batch size
    "n_epochs": 2,                  # Fine-tuning epochs
    "begin_epoch": 1,               # Training begins at this epoch. Previous trained model indicated by resume_path is loaded.
    "n_val_samples": 3,             # Number of validation samples for each activity
    "ft_begin_index": 5,            # Begin block index of fine-tuning
    "scale_in_test": 1.0,           # Spatial scale in test
    "crop_position_in_test": 'c',   # Cropping method: center cropping
    "no_softmax_in_test": False,    # If true, output for each clip is not normalized using softmax.
    "no_cuda": False,               # If true, cuda is not used.
    "n_threads": 4,                 # Number of threads for multi-thread loading
    "checkpoint": 2,                # Trained model is saved every this many epochs
    "no_hflip": False,              # If true, horizontal flipping is not performed
    "norm_value": 1,                # If 1, range of inputs is [0-255]. If 255, range of inputs is [0-1].
    "model": 'resnet',              # Model family name; only used to build opt.arch below
    "pretained_model_name": 'resnext-101-kinetics',  # Checkpoint file stem (key spelling kept: it is read below)
    "model_depth": 101,             # Depth of resnet
    "resnet_shortcut": 'B',         # Shortcut type of resnet, to align the dimensionality for skip connection
    "wide_resnet_k": 2,             # Wide resnet k
    "resnext_cardinality": 32,      # ResNeXt cardinality
    "manual_seed": 1,               # Manually set random seed
    'test_subset': 'test',          # Used test subset name
})

# Derived settings: architecture tag and the data / annotation / checkpoint locations.
opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
opt.root_path = '/data/qq/CSCE689/'
opt.video_path = opt.root_path + 'video/UCF-music/'
opt.annotation_path = opt.root_path + 'video/UCF-music-annotation/ucf101_music_with_testing.json'
opt.pretrained_model_path = opt.root_path + 'pretrain/' + opt.pretained_model_name + '.pth'

# Apply the configured seed; previously opt.manual_seed was declared but never
# used, so weight initialisation differed from run to run.
torch.manual_seed(opt.manual_seed)


### Load pretrained model

In [4]:
from models import resnext

# Build the ResNeXt-101 network with an opt.n_classes-way output, move it to the
# GPU, and wrap it in nn.DataParallel in one expression. The wrapper gives every
# parameter name a 'module.' prefix, which is the naming the pretrained
# checkpoint keys use (the alternative would be stripping 'module.' from them).
model = nn.DataParallel(
    resnext.resnet101(
        num_classes=opt.n_classes,
        shortcut_type=opt.resnet_shortcut,
        cardinality=opt.resnext_cardinality,
        sample_size=opt.sample_size,
        sample_duration=opt.sample_duration,
    ).cuda(),
    device_ids=None,
)


  m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')


In [5]:
# check model structure: leaving `model` as the cell's last expression makes the
# notebook display the nested module tree
model

DataParallel(
  (module): ResNeXt(
    (conv1): Conv3d(3, 64, kernel_size=(7, 7, 7), stride=(1, 2, 2), padding=(3, 3, 3), bias=False)
    (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): ResNeXtBottleneck(
        (conv1): Conv3d(64, 128, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
        (bn1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), groups=32, bias=False)
        (bn2): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv3d(128, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
        (bn3): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (rel

In [6]:
# summarize model input and output
from torchsummary import summary

# Input is (channels, frames, height, width). Take the clip length and frame
# size from opt so the summary always matches the model built above; 3 is the
# number of input channels of the first Conv3d layer.
summary(model, (3, opt.sample_duration, opt.sample_size, opt.sample_size))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv3d-1       [-1, 64, 16, 56, 56]          65,856
            Conv3d-2       [-1, 64, 16, 56, 56]          65,856
       BatchNorm3d-3       [-1, 64, 16, 56, 56]             128
       BatchNorm3d-4       [-1, 64, 16, 56, 56]             128
              ReLU-5       [-1, 64, 16, 56, 56]               0
              ReLU-6       [-1, 64, 16, 56, 56]               0
         MaxPool3d-7        [-1, 64, 8, 28, 28]               0
         MaxPool3d-8        [-1, 64, 8, 28, 28]               0
            Conv3d-9       [-1, 128, 8, 28, 28]           8,192
           Conv3d-10       [-1, 128, 8, 28, 28]           8,192
      BatchNorm3d-11       [-1, 128, 8, 28, 28]             256
             ReLU-12       [-1, 128, 8, 28, 28]               0
      BatchNorm3d-13       [-1, 128, 8, 28, 28]             256
             ReLU-14       [-1, 128, 8,

### Load pretrained weight

In [9]:
# Read the checkpoint from the (customizable) path configured in opt
pretrain = torch.load(opt.pretrained_model_path)
pretrain_dict = pretrain['state_dict']

# Discard the final fully connected layer of the checkpoint: that layer is the
# one being fine-tuned, so its pretrained weights must not be loaded
for fc_key in ('module.fc.weight', 'module.fc.bias'):
    del pretrain_dict[fc_key]

# Overlay the remaining pretrained tensors on the model's current state and load
# the merged dict; the call stays last so the cell displays the key-matching report
model_dict = model.state_dict()
model_dict.update(pretrain_dict)
model.load_state_dict(model_dict)

<All keys matched successfully>

In [4]:
# load preprocessed video frames and annotation
from datasets.ucf101 import UCF101

# Normalisation statistics come from the dataset named in opt.mean_dataset
# rather than a hard-coded 'kinetics', so the config cell stays the single source.
mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
std = get_std(opt.norm_value)  # computed for reference; not passed to Normalize below
norm_method = Normalize(mean, [1,1,1])


# Per-frame preprocessing: rescale, take the centre crop ('c'), convert to a
# tensor and normalise.
spatial_transform = Compose([
    Scale(opt.sample_size),
    CornerCrop(opt.sample_size, 'c'),
    ToTensor(opt.norm_value), norm_method
])

temporal_transform = LoopPadding(opt.sample_duration)
target_transform = ClassLabel() # VideoID()

# get training data
# NOTE(review): the positional 0 is presumably the "samples per video" argument
# of UCF101 — confirm against datasets/ucf101.py.
training_data = UCF101(
    opt.video_path,
    opt.annotation_path,
    'training',
    0,
    spatial_transform=spatial_transform,
    temporal_transform=temporal_transform,
    target_transform=target_transform,
    sample_duration=opt.sample_duration)

# wrap training data
# Shuffle the training clips every epoch: iterating them in dataset order gives
# SGD strongly correlated mini-batches.
train_loader = torch.utils.data.DataLoader(
    training_data,
    batch_size=opt.batch_size,
    shuffle=True,
    num_workers=opt.n_threads,
    pin_memory=False) # True


# get validation data
val_data = UCF101(
    opt.video_path,
    opt.annotation_path,
    'validation',
    0,
    spatial_transform=spatial_transform,
    temporal_transform=temporal_transform,
    target_transform=target_transform,
    sample_duration=opt.sample_duration)

# wrap validation data (order is irrelevant for evaluation, so no shuffling)
val_loader = torch.utils.data.DataLoader(
    val_data,
    batch_size=opt.batch_size,
    shuffle=False,
    num_workers=opt.n_threads,
    pin_memory=False)

dataset loading [0/911]
dataset loading [0/207]


In [14]:
# Display the class-index -> class-name mapping of the validation split
val_data.class_names

{0: 'PlayingCello',
 1: 'PlayingDaf',
 2: 'PlayingDhol',
 3: 'PlayingFlute',
 4: 'PlayingGuitar',
 5: 'PlayingPiano',
 6: 'PlayingSitar',
 7: 'PlayingTabla',
 8: 'PlayingViolin'}

### Fine-tune the pretrained model on the target UCF-101 dataset

In [9]:
# Classification loss, moved to the GPU unless CUDA is disabled in opt.
criterion = nn.CrossEntropyLoss()
if not opt.no_cuda:
    criterion = criterion.cuda()


from models.resnext import get_fine_tuning_parameters

# Select the parameters handed to the optimizer, starting at block
# opt.ft_begin_index. Pass model.parameters() instead to optimise every layer.
# NOTE(review): get_fine_tuning_parameters is defined outside this notebook —
# confirm which layers ft_begin_index=5 actually leaves trainable.
parameters = get_fine_tuning_parameters(model, opt.ft_begin_index)

# torch.optim.SGD rejects Nesterov momentum combined with non-zero dampening,
# so dampening is forced to 0 whenever opt.nesterov is enabled.
dampening = 0 if opt.nesterov else opt.dampening

optimizer = optim.SGD(
            parameters,
            lr=opt.learning_rate,
            momentum=opt.momentum,
            dampening=dampening,
            weight_decay=opt.weight_decay,
            nesterov=opt.nesterov)

# Lower the learning rate once the validation loss has stopped decreasing for
# opt.lr_patience epochs.
scheduler = lr_scheduler.ReduceLROnPlateau(
            optimizer, 'min', patience=opt.lr_patience)

# The log files below are written inside opt.result_path; create the directory
# first so a fresh checkout does not fail on a missing folder.
os.makedirs(opt.result_path, exist_ok=True)

train_logger = Logger(
    os.path.join(opt.result_path, 'train.log'),
    ['epoch', 'loss', 'acc', 'lr'])
train_batch_logger = Logger(
    os.path.join(opt.result_path, 'train_batch.log'),
    ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr'])
val_logger = Logger(
            os.path.join(opt.result_path, 'val.log'), ['epoch', 'loss', 'acc'])

# One training pass and one validation pass per epoch; the validation loss
# drives the plateau scheduler.
for i in range(opt.begin_epoch, opt.n_epochs + 1):

    train_epoch(i, train_loader, model, criterion, optimizer, 
                opt, train_logger, train_batch_logger)
    
    validation_loss = val_epoch(i, val_loader, model, criterion, 
                                opt, val_logger)
    
    scheduler.step(validation_loss)

train at epoch 1
Epoch: [1][1/492]	Time 2.308 (2.308)	Data 1.808 (1.808)	Loss 2.2647 (2.2647)	Acc 0.156 (0.156)
Epoch: [1][2/492]	Time 0.602 (1.455)	Data 0.129 (0.969)	Loss 7.7761 (5.0204)	Acc 0.000 (0.078)
Epoch: [1][3/492]	Time 0.485 (1.132)	Data 0.024 (0.654)	Loss 7.7552 (5.9320)	Acc 0.500 (0.219)
Epoch: [1][4/492]	Time 0.482 (0.969)	Data 0.020 (0.495)	Loss 15.9185 (8.4286)	Acc 0.000 (0.164)
Epoch: [1][5/492]	Time 0.476 (0.871)	Data 0.022 (0.401)	Loss 22.9557 (11.3340)	Acc 0.000 (0.131)
Epoch: [1][6/492]	Time 0.480 (0.806)	Data 0.016 (0.336)	Loss 26.7363 (13.9011)	Acc 0.000 (0.109)
Epoch: [1][7/492]	Time 0.468 (0.757)	Data 0.016 (0.291)	Loss 6.7254 (12.8760)	Acc 0.344 (0.143)
Epoch: [1][8/492]	Time 0.487 (0.724)	Data 0.025 (0.257)	Loss 19.8920 (13.7530)	Acc 0.000 (0.125)
Epoch: [1][9/492]	Time 0.624 (0.712)	Data 0.031 (0.232)	Loss 3.8206 (12.6494)	Acc 0.000 (0.111)
Epoch: [1][10/492]	Time 0.491 (0.690)	Data 0.023 (0.211)	Loss 24.7607 (13.8605)	Acc 0.000 (0.100)
Epoch: [1][11/492]	Ti

KeyboardInterrupt: 

In [10]:
# We skip the full training run here because it takes too long, and instead load the weights saved after 50 epochs.

### Test

In [5]:
from datasets.ucf101 import UCF101

# Same normalisation as for training, with the dataset name taken from opt
# instead of a hard-coded 'kinetics'.
mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
std = get_std(opt.norm_value)  # computed for reference; not passed to Normalize below
norm_method = Normalize(mean, [1,1,1])


# Test-time preprocessing: rescale, crop at the position configured in
# opt.crop_position_in_test ('c' = centre), convert to a tensor and normalise.
spatial_transform = Compose([
    Scale(opt.sample_size),
    CornerCrop(opt.sample_size, opt.crop_position_in_test),
    ToTensor(opt.norm_value), norm_method
])

temporal_transform = LoopPadding(opt.sample_duration)
# Targets are video identifiers rather than class labels in this cell.
target_transform = VideoID() # ClassLabel()



# get test data
test_data = UCF101(
    opt.video_path,
    opt.annotation_path,
    'testing',
    0,
    spatial_transform=spatial_transform,
    temporal_transform=temporal_transform,
    target_transform=target_transform,
    sample_duration=opt.sample_duration)


# wrap test data (kept in dataset order)
test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=opt.batch_size,
    shuffle=False,
    num_workers=opt.n_threads,
    pin_memory=False)


dataset loading [0/149]


In [None]:

# load best weight (we can also refit the model on the combined train-val dataset,
# but here we simply load the weight and do the final testing)
# The checkpoint path is built from opt.result_path so it follows the configured
# result directory instead of a hard-coded './results1'.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
pretrain = torch.load(os.path.join(opt.result_path, 'save_50.pth'))
model.load_state_dict(pretrain['state_dict'])


from testing import final_test
# final_test returns two values; this cell only stores them.
test_results, all_output_buffer = final_test(test_loader, model, opt, test_data.class_names)

In [10]:
# NOTE(review): the divisor 16 presumably mirrors opt.sample_duration — confirm
# what this ratio is meant to report before relying on it
len(training_data)/16

983.5625

In [None]:
# The training and testing results can be checked in the saved files; we list them in the report.