In [1]:
import argparse
import os

import cPickle as pkl

import h5py
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelBinarizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize


from sparseprop.feature import C3D as FeatHelper
from sparseprop.utils import get_typical_durations
from sparseprop.train import learn_class_independent_model
from sparseprop.train import learn_class_induced_model

In [2]:
# CSV file with the training annotations; read with pandas in the
# input-validation cell.
train_filename = '/mnt/data/Proposals/labels.csv'
# HDF5 file holding the per-video features; opened with h5py for the sanity
# check and handed to load_dataset.
feature_filename = '/mnt/data/c3d_features/feature.c3d.hdf5'
# NOTE(review): not referenced in the visible cells; presumably the pickle
# file where the learned model is written -- confirm against the training cell.
model_filename = 'output.pkl'

In [3]:
def load_dataset(df, hdf5_filename, n_clusters=256,
                 output_filename=None, verbose=True):
    """Load dataset containing trimmed instances.

    Features of every annotated instance are read through ``FeatHelper``,
    stacked into a single matrix, and KMeans centers are fitted on a random
    subset of at most 10,000 feature rows.

    Parameters
    ----------
    df : DataFrame
        Dataframe containing the annotations info. It must
        contain the following fields: 'video-name', 'f-init', 'n-frames',
        'label-idx'.
    hdf5_filename : str
        String containing the path to the HDF5 file containing
        the features for each video. The HDF5 file must contain
        a group for each video where the id of the group is the name
        of the video; and each group must contain a dataset containing
        the features.
    n_clusters : int, optional
        Number of cluster for KMeans
    output_filename : str, optional
        String containing the path to a pickle file where the dataset
        will be stored. If the file exists, the function will skip
        the re-compute of the dataset and return its content as-is.
    verbose : bool, optional
        Activates verbosity (a warning is printed per discarded instance).

    Returns
    -------
    dataset : dict
        Dictionary containing the packed dataset containing the following
        keys: 'feat' [ndarray containing the feature matrix]
              'label' [1darray with one label per feature row]
              'video-name' [1darray with one video name per feature row]
              'centers' [ndarray containing the KMeans centers]

    Raises
    ------
    ValueError
        If no instance could be loaded (empty dataframe or every instance
        was discarded).

    Notes
    -----
    The KMeans subset is drawn from the global NumPy random state; seed it
    before calling if reproducible centers are required.
    """
    # Avoid re-computing if dataset exists.
    # NOTE(security): unpickling can execute arbitrary code -- only point
    # `output_filename` at cache files produced by this function.
    if output_filename and os.path.exists(output_filename):
        with open(output_filename, 'rb') as fobj:
            return pkl.load(fobj)

    # Iterate over each annotation instance and load its features.
    video_lst, label_lst, feat_lst = [], [], []
    feat_obj = FeatHelper(hdf5_filename)
    feat_obj.open_instance()
    try:
        for _, row in df.iterrows():
            try:
                this_feat = feat_obj.read_feat(row['video-name'],
                                               int(row['f-init']),
                                               int(row['n-frames']))
                n_rows = this_feat.shape[0]
                this_label = np.repeat(row['label-idx'], n_rows)
                this_video = np.repeat(row['video-name'], n_rows)
            except Exception:
                # Best-effort loading: skip instances that cannot be read,
                # but let KeyboardInterrupt/SystemExit propagate.
                if verbose:
                    msg = ('Warning: instance from video '
                           '{} was discarded.').format(row['video-name'])
                    print(msg)
                continue
            # Append only once all three pieces exist, so the lists can
            # never get out of sync.
            feat_lst.append(this_feat)
            label_lst.append(this_label)
            video_lst.append(this_video)
    finally:
        # NOTE(review): assumes the helper exposes `close_instance()` as the
        # counterpart of `open_instance()`; guarded so a helper without it
        # still works -- confirm against sparseprop.feature.
        close_instance = getattr(feat_obj, 'close_instance', None)
        if callable(close_instance):
            close_instance()

    if not feat_lst:
        raise ValueError('No features could be loaded: the dataframe is '
                         'empty or every instance was discarded.')

    # Stack features in a matrix.
    feat_stack = np.vstack(feat_lst)

    # Compute KMeans centers on a random subset of at most 10,000 rows.
    km = KMeans(n_clusters=n_clusters, n_jobs=-1)
    n_samples = min(10000, feat_stack.shape[0])
    sidx = np.random.permutation(feat_stack.shape[0])[:n_samples]
    km.fit(feat_stack[sidx, :])

    # Pack dataset in a dictionary.
    dataset = {'feat': feat_stack,
               'label': np.hstack(label_lst),
               'video-name': np.hstack(video_lst),
               'centers': km.cluster_centers_}

    # Save if desired.
    if output_filename:
        with open(output_filename, 'wb') as fobj:
            pkl.dump(dataset, fobj)

    return dataset


In [4]:
# Passed to load_dataset as n_clusters, i.e. the number of KMeans centers.
dict_size=256
# NOTE(review): not used in the visible cells; presumably chooses between the
# class-independent and class-induced training routines -- confirm.
dict_type='induced'
# Forwarded to load_dataset as output_filename; None disables the pickle cache.
dataset_filename=None
# Enables progress messages and discarded-instance warnings.
verbose=True

In [5]:
###########################################################################
# Prepare input/output files.
###########################################################################
# Reading training file.
if not os.path.exists(train_filename):
    raise RuntimeError('Please provide a valid train file: not exists')
train_df = pd.read_csv(train_filename, sep=',')
# Columns the rest of the notebook relies on.
rfields = ['video-name', 'f-init', 'n-frames', 'video-frames', 'label-idx']
efields = np.unique(train_df.columns)
if not all(field in efields for field in rfields):
    raise RuntimeError('Please provide a valid train file: bad formatting')
# Feature file sanity check.
if not os.path.exists(feature_filename):
    raise RuntimeError('Please provide a valid feature file: not exists')
# Open read-only: older h5py releases default to append mode, which opens the
# file for writing and silently creates an empty one when the path is wrong.
with h5py.File(feature_filename, 'r') as fobj:
    # Checks that feature file contains all the videos in train_filename.
    # Materialised as a set so the names stay usable after the file is closed
    # and each membership test is O(1).
    evideos = set(fobj.keys())
    rvideos = np.unique(train_df['video-name'].values)
    if not all(x in evideos for x in rvideos):
        raise RuntimeError(('Please provide a valid feature file: '
                            'some videos are missing.'))


In [6]:
###########################################################################
# Preprocessing.
###########################################################################
# print is called with a single parenthesised argument so the cell runs
# unchanged on both Python 2 and Python 3.
if verbose:
    print('[Preprocessing] Starting to preprocess the dataset...')
# Remove ambiguous segments in train dataframe (those labelled -1).
train_df = train_df[train_df['label-idx']!=-1].reset_index(drop=True)
# Get dataset (features, labels, video names and KMeans centers).
dataset = load_dataset(train_df, feature_filename, n_clusters=dict_size,
                       output_filename=dataset_filename, verbose=verbose)
dataset['durations'] = get_typical_durations(np.array(train_df['n-frames']))
# L2-normalize each row of the KMeans centers and of the feature matrix.
dataset['centers'] = normalize(dataset['centers'], axis=1, norm='l2')
dataset['feat'] = normalize(dataset['feat'], axis=1, norm='l2')
# Unifying matrix definitions: X holds the features, D_0 the normalized
# centers and Y the binarized labels.
X, D_0 = dataset['feat'], dataset['centers']
Y = LabelBinarizer().fit_transform(dataset['label'])
if verbose:
    print('[Preprocessing] Dataset sucessfully loaded and pre-proccessed.')

[Preprocessing] Starting to preprocess the dataset...
[Preprocessing] Dataset sucessfully loaded and pre-proccessed.
