In [1]:
import argparse
import os

import cPickle as pkl

import h5py
import numpy as np
import pandas as pd


from sklearn.preprocessing import LabelBinarizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize


from sparseprop.feature import C3D as FeatHelper
from sparseprop.utils import get_typical_durations
from sparseprop.train import learn_class_independent_model
from sparseprop.train import learn_class_induced_model

In [2]:
train_filename = '/mnt/data/Proposals/labels.csv'
feature_filename = '/mnt/data/c3d_features/feature.c3d.hdf5'
model_filename= 'output.pkl'
dict_size=256, 
dict_type='induced'
dataset_filename=None
verbose=True

In [3]:
###########################################################################
    # Prepare input/output files.
    ###########################################################################
    # Reading training file.
if not os.path.exists(train_filename):
    raise RuntimeError('Please provide a valid train file: not exists')
train_df = pd.read_csv(train_filename, sep=',')
rfields = ['video-name', 'f-init', 'n-frames', 'video-frames', 'label-idx']
efields = np.unique(train_df.columns)
if not all([field in efields for field in rfields]):
    raise RuntimeError('Please provide a valid train file: bad formatting')
# Feature file sanity check.
with h5py.File(feature_filename) as fobj:
    # Checks that feature file contains all the videos in train_filename.
    evideos = fobj.keys()
    rvideos = np.unique(train_df['video-name'].values)
    if not all([x in evideos for x in rvideos]):
        raise RuntimeError(('Please provide a valid feature file: '
                            'some videos are missing.'))


In [4]:
train_df['n-frames']

0          3698
1           796
2          2491
3           407
4          1956
5          1247
6          2724
7           172
8         49142
9          3327
10          720
11       138163
12         3672
13         1216
14       165892
15          455
16           65
17          702
18         5563
19        86682
20         2368
21         5456
22          717
23         4801
24         1342
25          448
26         5889
27         3396
28          157
29         4675
          ...  
39590      2739
39591       756
39592       648
39593     13530
39594      1045
39595     15342
39596      1713
39597       979
39598       548
39599       488
39600     49397
39601       348
39602       157
39603       614
39604       879
39605      2444
39606       752
39607       116
39608       122
39609     37763
39610      1313
39611      2183
39612       281
39613       669
39614       240
39615       735
39616     34075
39617       205
39618      3719
39619         0
Name: n-frames, Length: 

In [5]:
def load_dataset(df, hdf5_filename, n_clusters=256, 
                 output_filename=None, verbose=True):

    # Avoid re-computing if dataset exists.
    if output_filename:
        if os.path.exists(output_filename):
            with open(output_filename, 'rb') as fobj:
                return pkl.load(fobj)
    
    # Iterate over each annotation instance and load its features.
    video_lst, label_lst, feat_lst = [], [], []
    feat_obj = FeatHelper(hdf5_filename)
    feat_obj.open_instance()
    for k, row in df.iterrows():
      
        this_feat = feat_obj.read_feat(row['video-name'],
                                        row['f-init'],
                                     int(row['n-frames']))
        feat_lst.append(this_feat)
        label_lst.append(np.repeat(row['label-idx'], this_feat.shape[0]))
        video_lst.append(np.repeat(row['video-name'], this_feat.shape[0]))
#         except:
#             if verbose:
#                 print ('Warning: instance from video '
#                        '{} was discarded.').format(row['video-name'])
    
    # Stack features in a matrix.
    feat_stack = np.vstack(feat_lst)
    
    # Compute KMeans centers.
    km = KMeans(n_clusters=n_clusters, n_jobs=-1)
    n_samples = np.minimum(1e4, feat_stack.shape[0])
    sidx = np.random.permutation(np.arange(feat_stack.shape[0]))[:n_samples]
    km.fit(feat_stack[sidx, :])
    
    # Pack dataset in a dictionary.
    dataset = {'feat': feat_stack,
               'label': np.hstack(label_lst),
               'video-name': np.hstack(video_lst),
               'centers': km.cluster_centers_}
    
    # Save if desired.
    if output_filename:
        with open(output_filename, 'wb') as fobj:
            pkl.dump(dataset, fobj)
            
    return dataset



In [6]:

###########################################################################
    # Preprocessing.
###########################################################################
if verbose:
    print '[Preprocessing] Starting to preprocess the dataset...'
# Remove ambiguous segments in train dataframe.
train_df = train_df[train_df['label-idx']!=-1].reset_index(drop=True)
# Get dataset.
dataset = load_dataset(train_df, feature_filename, n_clusters=dict_size, 
                       output_filename=dataset_filename, verbose=verbose)


[Preprocessing] Starting to preprocess the dataset...
[1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433, 449, 465, 481, 497, 513, 529, 545, 561, 577, 593, 609, 625, 641, 657, 673, 689, 705, 721, 737, 753, 769]


ValueError: Index (257) out of range (0-250)

In [None]:
dataset['durations'] = get_typical_durations(train_df['n-frames'])
# Normalize KMeans centers.
dataset['centers'] = normalize(dataset['centers'], axis=1, norm='l2')
dataset['feat'] = normalize(dataset['feat'], axis=1, norm='l2')
# Unifying matrix definitions.
X, D_0 = dataset['feat'], dataset['centers']
Y = LabelBinarizer().fit_transform(dataset['label'])
if verbose:
    print '[Preprocessing] Dataset sucessfully loaded and pre-proccessed.'
