In [1]:
import os
import sys
import json
import numpy as np
import torch
from torch import nn
from torch import optim
from torch.optim import lr_scheduler

from opts import parse_opts
from mean import get_mean, get_std
from spatial_transforms import (
    Compose, Normalize, Scale, CenterCrop, CornerCrop, MultiScaleCornerCrop,
    MultiScaleRandomCrop, RandomHorizontalFlip, ToTensor)
from temporal_transforms import LoopPadding, TemporalRandomCrop
from target_transforms import ClassLabel, VideoID
from target_transforms import Compose as TargetCompose
from dataset import get_training_set, get_validation_set, get_test_set
from utils import Logger
from train import train_epoch
from validation import val_epoch
import test
import collections
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from joblib import dump, load
from sklearn import preprocessing
from scipy import stats
from sklearn.metrics import accuracy_score

from models import *

In [6]:
import easydict
opt = easydict.EasyDict({
    "result_path": 'results2',
    "dataset": 'ucf101-music', # 'ucf101',
    "n_classes": 9, 
    "sample_size": 112,
    "sample_duration": 16,
    "initial_scale": 1.0,
    "n_scales": 5,
    "scale_step": 0.84089641525,
    "train_crop": 'corner',
    "learning_rate": 0.1,
    "momentum": 0.9,
    "dampening": 0.9,
    "weight_decay": 0.001,
    "mean_dataset": 'activitynet',
    "no_mean_norm": False,
    "std_norm": False,
    "nesterov": False,
    "optimizer": 'sgd',
    "lr_patience": 10,
    "batch_size": 16,
    "n_epochs": 2,
    "begin_epoch": 1,
    "n_val_samples": 3,
    "ft_begin_index": 5,
    "scale_in_test": 1.0,
    "crop_position_in_test": 'c',
    "no_softmax_in_test": False,
    "no_cuda": False,
    "n_threads": 4,
    "checkpoint": 2,
    "no_hflip": False,
    "norm_value": 1,
    "model": 'resnet',
    "pretained_model_name": 'resnext-101-kinetics',
    "model_depth": 101,
    "resnet_shortcut": 'B',
    "wide_resnet_k": 2,
    "resnext_cardinality": 32,
    "manual_seed": 1,
    'test_subset': 'test',
})
opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
opt.root_path = '/data/qq/CSCE689/'
opt.video_path = opt.root_path + 'video/UCF-101-jpg/'
opt.annotation_path = opt.root_path + 'video/UCF-music-annotation/ucf_binary_music_annotation.json'

In [7]:
# use two gpu devices on the server, you can customize it depending on how many available gpu devices you have
os.environ['CUDA_VISIBLE_DEVICES']='0'

In [8]:
from models import resnext

# construct model architecture
model = resnext.resnet101(
            num_classes=opt.n_classes,
            shortcut_type=opt.resnet_shortcut,
            cardinality=opt.resnext_cardinality,
            sample_size=opt.sample_size,
            sample_duration=opt.sample_duration)

model = model.cuda()
# wrap the current model again in nn.DataParallel / or we can just remove the .module keys.
model = nn.DataParallel(model, device_ids=None)

# load best weight (we can also refit the model on the combined train-val dataset, 
# but here we simple load the weight and do the final testing)
pretrain = torch.load('./results1/save_50.pth')
model.load_state_dict(pretrain['state_dict'])


<All keys matched successfully>

In [9]:
from datasets.ucf101 import UCF101

mean = get_mean(opt.norm_value, dataset='kinetics')
std = get_std(opt.norm_value)
norm_method = Normalize(mean, [1,1,1])


spatial_transform = Compose([
    Scale(opt.sample_size),
    CornerCrop(opt.sample_size, 'c'),
    ToTensor(opt.norm_value), norm_method
])

temporal_transform = LoopPadding(opt.sample_duration)
target_transform = VideoID() # ClassLabel()



# get test data
test_data = UCF101(
    opt.video_path,
    opt.annotation_path,
    'testing',
    0,
    spatial_transform=spatial_transform,
    temporal_transform=temporal_transform,
    target_transform=target_transform,
    sample_duration=16)


# wrap test data
test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=opt.batch_size,
    shuffle=False,
    num_workers=opt.n_threads,
    pin_memory=False)


dataset loading [0/1600]
dataset loading [1000/1600]


In [90]:
examples = ['v_BasketballDunk_g01_c02', 
            'v_Drumming_g05_c06', 
            'v_Rafting_g07_c03', 
            'v_PlayingFlute_g07_c04',
            'v_PlayingPiano_g01_c02']

## Examples

In [103]:
tvns = np.load(opt.root_path + 'feature_ucf_all/class_names_ucf_test.npy')
avi_path = "/data/qq/CSCE689/video/UCF-101"
clip_duration_dict = {}
real_prediction_dict = {}
import os
from moviepy.editor import VideoFileClip
for tvn in examples:
    print()
    clip = VideoFileClip(os.path.join(avi_path, tvn[2:-8], tvn + ".avi"))
    clip_duration_dict[tvn] = [clip.duration]
#     real_prediction_dict[tvn] = test_results['results'][tvn][0]








In [104]:
clip_duration_dict

{'v_BasketballDunk_g01_c02': [2.6],
 'v_Drumming_g05_c06': [8.44],
 'v_Rafting_g07_c03': [7.28],
 'v_PlayingFlute_g07_c04': [16.28],
 'v_PlayingPiano_g01_c02': [7.37]}

In [105]:
# with open('./hw6_results/clip_duration_dict_example.json', 'w') as fp:
#     json.dump(clip_duration_dict, fp)

In [106]:
# predict
clf = load('./hw6_results/logistic_ucf.joblib') 

x_test_1 = np.load(opt.root_path + 'feature_ucf_all/resnext101_avgpool_train.npy')
x_test_2 = np.load(opt.root_path + 'feature_ucf_all/resnet50_avgpool_train.npy')
x_test = np.concatenate([x_test_1, x_test_2], axis=1)
y_test = np.load(opt.root_path + 'feature_ucf_all/class_names_ucf_test.npy')
y_pred_test_raw = clf.predict(x_test)
y_pred_test_prob_raw = clf.predict_proba(x_test)


# get ground-truth split
# name_to_int = {v:k for k,v in test_data.class_names.items()}
name_to_int = {'Yes': 1, 'No': 0}


split_idx = []
for idx, y_name in enumerate(y_test):
    if idx == 0 or y_name != y_test[idx-1]:
        split_idx.append(idx)

y_pred_test, y_pred_test_prob, y_pred_test_final = {}, {}, {}
for i, split in enumerate(split_idx):
    if i < len(split_idx) - 1:
        y_pred_test[y_test[split]] = y_pred_test_raw[split:split_idx[i+1]]
        y_pred_test_prob[y_test[split]] = y_pred_test_prob_raw[split:split_idx[i+1]]
        y_pred_test_final[y_test[split]] = np.argmax(np.mean(y_pred_test_prob_raw[split:split_idx[i+1]], axis=0))
        
        

### Figure

In [108]:
import matplotlib.pyplot as plt
import numpy as np

music = ['PlayingCello',  'PlayingDhol',   'PlayingGuitar',  'PlayingSitar',  'PlayingViolin', 
         'PlayingDaf',    'PlayingFlute',  'PlayingPiano',   'PlayingTabla']

for tvn in examples:
    interval = clip_duration_dict[tvn][0]/list(y_test).count(tvn)
    x = np.arange(0, clip_duration_dict[tvn][0], interval) + interval
    idx = 0 if tvn in music else 1
    y = 1-np.array([pred[idx] for pred in y_pred_test_prob[tvn]])  # np.arange(len(test_data.class_names) + 1)    
    x = x[:len(y)]
    plt.plot(x, y)
    plt.ylim([-0.1, 1.1])
    plt.xlabel ('time/sec')
    plt.ylabel ('pred score for ground truth label')
    plt.title("Ground Truth Label:  " + tvn[2:-8]  + "\n Model Avg. Predict Score:  " + str(np.mean(y))) # str(real_prediction_dict[tvn]['score'])
    plt.savefig("./hw6_results/fig_one_label/" + tvn, bbox_inches='tight')
    plt.close()
#     break

### Json

In [109]:
import numpy as np


music = ['PlayingCello',  'PlayingDhol',   'PlayingGuitar',  'PlayingSitar',  'PlayingViolin', 
         'PlayingDaf',    'PlayingFlute',  'PlayingPiano',   'PlayingTabla']



timeTrueLabel = {}
for tvn in examples:
    interval = clip_duration_dict[tvn][0]/list(y_test).count(tvn)
    x = np.arange(0, clip_duration_dict[tvn][0], interval) + interval
    idx = 0 if tvn in music else 1
    y_one_label = 1-np.array([pred[idx] for pred in y_pred_test_prob[tvn]]) 
    x = x[:len(y_one_label)]    
    timeTrueLabel[tvn] = {tvn[2:-8]: [[str(time), str(y_one_label[idx])] for idx, time in enumerate(x)]}

In [110]:

with open('./hw6_results/fig_one_label/example.json', 'w') as fp:
    json.dump(timeTrueLabel, fp)



## All others

### Extract clip duration

In [None]:
tvns = np.load(opt.root_path + 'feature_ucf_all/class_names_ucf_test.npy')
avi_path = "/data/qq/CSCE689/video/UCF-101"
clip_duration_dict = {}
real_prediction_dict = {}
import os
from moviepy.editor import VideoFileClip
i = 0
for tvn in tvns:
    i += 1
    if i % 100 == 0:
        print(i)
    clip = VideoFileClip(os.path.join(avi_path, tvn[2:-8], tvn + ".avi"))
    clip_duration_dict[tvn] = [clip.duration]
#     real_prediction_dict[tvn] = test_results['results'][tvn][0]

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600


In [None]:
with open('./hw6_results/clip_duration_dict.json', 'w') as fp:
    json.dump(clip_duration_dict, fp)
    
    

In [14]:
# predict
clf = load('./hw6_results/logistic_ucf.joblib') 

x_test_1 = np.load(opt.root_path + 'feature_ucf_all/resnext101_avgpool_train.npy')
x_test_2 = np.load(opt.root_path + 'feature_ucf_all/resnet50_avgpool_train.npy')
x_test = np.concatenate([x_test_1, x_test_2], axis=1)
y_test = np.load(opt.root_path + 'feature_ucf_all/class_names_ucf_test.npy')
y_pred_test_raw = clf.predict(x_test)
y_pred_test_prob_raw = clf.predict_proba(x_test)


# get ground-truth split
# name_to_int = {v:k for k,v in test_data.class_names.items()}
name_to_int = {'Yes': 1, 'No': 0}


split_idx = []
for idx, y_name in enumerate(y_test):
    if idx == 0 or y_name != y_test[idx-1]:
        split_idx.append(idx)

y_pred_test, y_pred_test_prob, y_pred_test_final = {}, {}, {}
for i, split in enumerate(split_idx):
    if i < len(split_idx) - 1:
        y_pred_test[y_test[split]] = y_pred_test_raw[split:split_idx[i+1]]
        y_pred_test_prob[y_test[split]] = y_pred_test_prob_raw[split:split_idx[i+1]]
        y_pred_test_final[y_test[split]] = np.argmax(np.mean(y_pred_test_prob_raw[split:split_idx[i+1]], axis=0))
        
        

### Generate prediction plot for each video -- HW6 ensemble ResNext-101 + ResNet 50 + logistic Regression

In [52]:
import matplotlib.pyplot as plt
import numpy as np

music = ['PlayingCello',  'PlayingDhol',   'PlayingGuitar',  'PlayingSitar',  'PlayingViolin', 
         'PlayingDaf',    'PlayingFlute',  'PlayingPiano',   'PlayingTabla']

for tvn in y_pred_test_prob:
    interval = clip_duration_dict[tvn][0]/list(y_test).count(tvn)
    x = np.arange(0, clip_duration_dict[tvn][0], interval) + interval
    idx = 0 if tvn in music else 1
    y = 1-np.array([pred[idx] for pred in y_pred_test_prob[tvn]])  # np.arange(len(test_data.class_names) + 1)    
    x = x[:len(y)]
    plt.plot(x, y)
    plt.ylim([-0.1, 1.1])
    plt.xlabel ('time/sec')
    plt.ylabel ('pred score for ground truth label')
    plt.title("Ground Truth Label:  " + tvn[2:-8]  + "\n Model Avg. Predict Score:  " + str(np.mean(y))) # str(real_prediction_dict[tvn]['score'])
    plt.savefig("./hw6_results/fig_one_label/" + tvn, bbox_inches='tight')
    plt.close()
#     break

### Generate json file for each video -- HW6 ensemble ResNext-101 + ResNet 50 + logistic Regression

In [44]:
import numpy as np


timeTrueLabel = {}
for tvn in y_pred_test_prob:
    interval = clip_duration_dict[tvn][0]/list(y_test).count(tvn)
    x = np.arange(0, clip_duration_dict[tvn][0], interval) + interval
    idx = 0 if tvn in music else 1
    y_one_label = 1-np.array([pred[idx] for pred in y_pred_test_prob[tvn]]) 
    x = x[:len(y_one_label)]    
    timeTrueLabel[tvn] = {tvn[2:-8]: [[str(time), str(y_one_label[idx])] for idx, time in enumerate(x)]}

In [47]:
with open('./hw6_results/fig_one_label/timeLabel.json', 'w') as fp:
    json.dump(timeTrueLabel, fp)