In [1]:
import sys, os

# Put the project's site-packages directory at the front of sys.path so that
# modules found there take precedence for every import that follows.
env_root = '/N/project/baby_vision_curriculum/pythonenvs/hfenv/lib/python3.10/site-packages/'
sys.path.insert(0, env_root)

In [2]:
import numpy as np
# import torch, torchvision
# from torchvision import transforms as tr
from tqdm import tqdm
from pathlib import Path
# import math
import argparse
import pandas as pd
import warnings
from joblib import Parallel, delayed
from copy import deepcopy

In [3]:
from sklearn import svm, preprocessing
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
def get_separability_score(df, label, method='sgd', ret_preds=False, n_jobs=60,
                           random_state=0):
    """Fit a linear probe on the embedding columns and report its accuracy.

    Every column whose name contains ``'dim'`` is used as a feature; the
    ``label`` column is integer-encoded and used as the target. The rows are
    split 67/33 into train and test with a fixed split seed (42).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the ``label`` column.
    label : str
        Name of the column to predict.
    method : {'sgd', 'svm'}
        ``'sgd'`` uses ``SGDClassifier``, ``'svm'`` uses ``LinearSVC``; both
        are preceded by a ``StandardScaler``.
    ret_preds : bool
        When true, the test predictions and test targets are returned as well.
    n_jobs : int
        Forwarded to ``SGDClassifier``; ignored for ``'svm'``.
    random_state : int or None
        Seed handed to the classifier. The SGD branch previously had no seed,
        which made scores differ from run to run; pass ``None`` to get the
        unseeded behaviour back.

    Returns
    -------
    (train_score, test_score) or (train_score, test_score, preds, y_test)
        Scores are the values of ``clf.score`` on the train and test split.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'sgd'`` nor ``'svm'``.
    """
    # Validate the method before doing any work so a typo fails immediately.
    if method == 'svm':
        classifier = LinearSVC(random_state=random_state, tol=1e-4)
    elif method == 'sgd':
        classifier = SGDClassifier(max_iter=5000, tol=1e-4, n_jobs=n_jobs,
                                   random_state=random_state)
    else:
        raise ValueError(f"unknown method {method!r}; expected 'sgd' or 'svm'")

    y = preprocessing.LabelEncoder().fit_transform(df[label])

    X_cols = [col for col in df.columns if 'dim' in col]
    X = df[X_cols]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

    clf = make_pipeline(StandardScaler(), classifier)
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    if ret_preds:
        preds = clf.predict(X_test)
        return train_score, test_score, preds, y_test
    return train_score, test_score

In [6]:
class ToyBoxEval():
    """Label extraction and linear-probe scoring for ToyBox embedding CSVs.

    File names are expected to follow the pattern
    ``<category>_<id>_<camera>_<transformation>.<ext>``, for example
    ``truck_30_pivothead_rzminus.mp4``.
    """

    # (row position, corrected file name) pairs applied by fix_fnames.
    _FNAME_FIXES = (
        (2163, 'giraffe_02_pivothead_rzplus.mp4'),
        (1851, 'duck_06_pivothead_rxminus.mp4'),
        (751, 'car_03_pivothead_rzminus.mp4'),
    )

    def __init__(self):
        pass

    @staticmethod
    def _split_fname(fname):
        """Drop everything from the first '.' on and split the rest on '_'."""
        return fname.split('.')[0].split('_')

    def get_labels(self, fname):
        """Return ``(category, identity, transformation)`` for a file name.

        Example: ``truck_30_pivothead_rzminus.mp4`` ->
        ``('truck', 'truck30', 'rzminus')``.
        """
        fnparts = ToyBoxEval._split_fname(fname)
        category = fnparts[0]
        return category, category + fnparts[1], fnparts[3]

    def get_categorylabel(self, fname):
        """Return the first underscore-separated field of the file name."""
        return ToyBoxEval._split_fname(fname)[0]

    def get_identitylabel(self, fname):
        """Return the first two fields of the file name joined together."""
        fnparts = ToyBoxEval._split_fname(fname)
        return fnparts[0] + fnparts[1]

    def get_translabel(self, fname):
        """Return the fourth underscore-separated field of the file name."""
        return ToyBoxEval._split_fname(fname)[3]

    def add_labels_to_df(self, df, labels):
        """Add the requested label columns, derived from ``df['fnames']``.

        ``labels`` may contain any of ``'category'``, ``'identity'`` and
        ``'transformation'``. ``df`` is modified in place and returned.
        """
        if 'category' in labels:
            df['category'] = df['fnames'].apply(self.get_categorylabel)
        if 'identity' in labels:
            df['identity'] = df['fnames'].apply(self.get_identitylabel)
        if 'transformation' in labels:
            df['transformation'] = df['fnames'].apply(self.get_translabel)
        return df

    def fix_fnames(self, df):
        """Overwrite three known-bad file names in the first column of ``df``.

        The corrections are positional, so they are only applied when the
        frame is long enough to contain every patched row; shorter (or
        column-less) frames are returned unchanged instead of raising
        ``IndexError``. ``df`` is modified in place and returned.
        """
        n_rows, n_cols = df.shape
        last_row = max(row for row, _ in ToyBoxEval._FNAME_FIXES)
        if n_cols == 0 or n_rows <= last_row:
            return df
        for row, fname in ToyBoxEval._FNAME_FIXES:
            df.iloc[row, 0] = fname
        return df

    def eval_embs(self, fp, method='sgd', ret_preds=False):
        """Score category and transformation separability for one CSV.

        Returns a dict with keys ``'category'`` and ``'transformation'``,
        each holding the ``(train_score, test_score)`` pair returned by
        ``get_separability_score``. ``ret_preds`` is accepted but not used.
        """
        df = pd.read_csv(fp)
        # Fix 3 file names (needed for older files).
        df = self.fix_fnames(df)
        df = self.add_labels_to_df(df, ['category', 'identity', 'transformation'])

        scores = {}
        scores['category'] = get_separability_score(df, 'category', method=method)
        scores['transformation'] = get_separability_score(df, 'transformation', method=method)
        return scores

    def proc_fp(self, fp, score_type, n_jobs=80):
        """Return the SGD-probe test score for ``score_type`` on one CSV.

        ``score_type`` is the label column to predict (one of the columns
        added by ``add_labels_to_df``); ``n_jobs`` is forwarded to
        ``get_separability_score``.
        """
        df = pd.read_csv(fp)
        df = self.fix_fnames(df)
        df = self.add_labels_to_df(df, ['category', 'identity', 'transformation'])
        # Only the test score is needed, so the predictions are not requested.
        _, test_score = get_separability_score(df, score_type, method='sgd',
                                               n_jobs=n_jobs)
        return test_score

In [7]:
class SSv2Eval():
    """Category labelling and linear-probe scoring for SSv2 embedding CSVs."""

    def __init__(self, label_path=None):
        """Load the label table and index it by its ``fname`` column.

        ``label_path`` is the CSV to read; when omitted, the project's
        ``train_easy10.csv`` is used.
        """
        default_path = '/N/project/baby_vision_curriculum/benchmarks/ssv2/easy_labels/train_easy10.csv'
        csv_path = default_path if label_path is None else label_path
        label_table = pd.read_csv(csv_path)
        self.labels_df = label_table.set_index('fname')

    def get_categorylabel(self, fname):
        """Look up the ``label`` of ``<fname>.webm`` in the label table."""
        key = str(fname) + '.webm'
        return self.labels_df.loc[key, 'label']

    def add_labels_to_df(self, df, labels):
        """Add a ``category`` column when ``'category'`` is requested.

        ``df`` is modified in place and returned.
        """
        if 'category' in labels:
            df['category'] = df['fnames'].apply(self.get_categorylabel)
        return df

    def proc_fp(self, fp, score_type, n_jobs=80):
        """Return the SGD-probe test score for one embedding CSV.

        Only ``score_type == 'category'`` is supported; anything else raises
        ``ValueError`` before the file is read.
        """
        if score_type != 'category':
            raise ValueError
        labelled = self.add_labels_to_df(pd.read_csv(fp), ['category'])
        _train, test_score, _preds, _targets = get_separability_score(
            labelled, score_type, method='sgd', ret_preds=True, n_jobs=n_jobs)
        return test_score

In [8]:
class Cifar10Eval():
    """Linear-probe scoring for CIFAR-10 embedding CSVs."""

    def __init__(self):
        """Nothing to set up; the evaluator keeps no state."""

    def add_labels_to_df(self, df):
        """Copy ``df['fnames']`` into a new ``category`` column.

        The ``fnames`` values are used directly as the class label. ``df`` is
        modified in place and returned.
        """
        df['category'] = df['fnames']
        return df

    def proc_fp(self, fp, score_type, n_jobs=80):
        """Return the SGD-probe test score for one embedding CSV.

        Only ``score_type == 'category'`` is supported; anything else raises
        ``ValueError`` before the file is read.
        """
        if score_type != 'category':
            raise ValueError
        labelled = self.add_labels_to_df(pd.read_csv(fp))
        _train, test_score, _preds, _targets = get_separability_score(
            labelled, score_type, method='sgd', ret_preds=True, n_jobs=n_jobs)
        return test_score

In [9]:
class UCF101Eval():
    """Linear-probe scoring for UCF101 embedding CSVs."""

    def __init__(self):
        """Nothing to set up; the evaluator keeps no state."""

    def add_labels_to_df(self, df):
        """Copy ``df['fnames']`` into a new ``category`` column.

        The ``fnames`` values are used directly as the class label. ``df`` is
        modified in place and returned.
        """
        df['category'] = df['fnames']
        return df

    def proc_fp(self, fp, score_type, n_jobs=80):
        """Return the SGD-probe test score for one embedding CSV.

        Only ``score_type == 'category'`` is supported; anything else raises
        ``ValueError`` before the file is read.
        """
        if score_type != 'category':
            raise ValueError
        labelled = self.add_labels_to_df(pd.read_csv(fp))
        _train, test_score, _preds, _targets = get_separability_score(
            labelled, score_type, method='sgd', ret_preds=True, n_jobs=n_jobs)
        return test_score

In [10]:
def get_traingroups(curr, stage):
    """Return the group tags a curriculum has covered after ``stage`` stages.

    ``'dev'`` walks through ``g0, g1, g2`` and ``'adev'`` through
    ``g2, g1, g0``; the result is the first ``2*stage`` characters of that
    sequence (e.g. ``('dev', 2) -> 'g0g1'``). Any other curriculum gives
    ``'na'``.
    """
    group_orders = (('dev', 'g0g1g2'), ('adev', 'g2g1g0'))
    for name, order in group_orders:
        if curr == name:
            # Each group tag is two characters wide.
            return order[:2 * stage]
    return 'na'
    
def parse_fname(fp):
    """Turn an embedding file path into a dict of run tags.

    The file stem must consist of exactly seven underscore-separated fields,
    e.g. ``embeddings_adev_1_g2_default_0_246.csv``:
    prefix, curriculum, stage, current group, condition, fold, seed.
    The prefix, current group and fold are not used.

    Returns a dict with the keys ``'Curriculum'``, ``'Stage'`` (int),
    ``'Condition'``, ``'Seed'`` (str), ``'Train Groups'`` and ``'data_id'``.
    """
    fields = Path(fp).stem.split('_')
    _prefix, curr, stage_field, _current_gr, cond, _fold, seed = fields
    stage = int(stage_field)
    return {
        'Curriculum': curr,
        'Stage': stage,
        'Condition': cond,
        'Seed': seed,
        'Train Groups': get_traingroups(curr, stage),
        'data_id': '_'.join((curr, seed, cond)),
    }

In [11]:
def proc_fp_model(fp, stage, num_ep, iter_per_ep):
    """Load one training-log CSV and tag it with the run info from its name.

    The first CSV column is dropped, the frame is passed through
    ``edit_epoch`` and the tags from ``parse_fname(fp)`` are added as constant
    columns. The log columns are then renamed to display names and two
    "Loss Reduction" columns are added, each the difference between a row's
    loss and the next row's loss.

    NOTE(review): ``edit_epoch`` is not defined in the visible part of this
    notebook -- confirm it exists before calling this function.
    """
    raw_log = pd.read_csv(fp).iloc[:, 1:]
    # Note the argument order: iter_per_ep comes before num_ep here.
    log_df = edit_epoch(raw_log, stage, iter_per_ep, num_ep)

    for tag, value in parse_fname(fp).items():
        log_df[tag] = value

    display_names = {
        'epoch': 'Epoch',
        'iteration': 'Iteration',
        'train_loss': 'Train Loss',
        'val_loss': 'Validation Loss',
    }
    log_df = log_df.rename(columns=display_names)

    for loss_col in ('Train Loss', 'Validation Loss'):
        # periods=-1: current row minus the following row.
        log_df[loss_col + ' Reduction'] = log_df[loss_col].diff(periods=-1)
    return log_df

def job_proc_file(fp, ds_task, evaluator, iter_per_stage):
    """Score a single embedding file; used as the unit of work for joblib.

    Returns a copy of the tag dict from ``parse_fname(fp)`` extended with the
    score (stored under the key ``ds_task``) and an ``'Iteration'`` entry
    equal to ``iter_per_stage * Stage``. The evaluator is run with
    ``n_jobs=1``.
    """
    tags = parse_fname(fp)
    score = evaluator.proc_fp(fp, ds_task, n_jobs=1)
    tags[ds_task] = score
    tags['Iteration'] = iter_per_stage * tags['Stage']
    return deepcopy(tags)
    
def proc_result_folder(root, ds_task, iter_per_stage, n_jobs=1):
    """Score every embedding CSV in ``root`` and collect the results.

    Parameters
    ----------
    root : str or path-like
        Folder containing the embedding ``.csv`` files.
    ds_task : str
        One of ``'ssv2'``, ``'tb_cat'``, ``'tb_trans'``, ``'cifar10'``,
        ``'ucf101'``; selects the evaluator and the label to predict.
    iter_per_stage : int
        Number of training iterations per stage; used to fill the
        ``'Iteration'`` column (``iter_per_stage * Stage``).
    n_jobs : int
        With ``n_jobs > 1`` the files are processed in parallel via joblib,
        for any ``ds_task``; otherwise they are processed one by one.

    Returns
    -------
    pandas.DataFrame
        One row per file: the tags from ``parse_fname``, the test score in a
        column named after the predicted label, and ``'Iteration'``.

    Raises
    ------
    ValueError
        If ``ds_task`` is not one of the supported names.
    """
    if ds_task == 'ssv2':
        evaluator, label = SSv2Eval(), 'category'
    elif ds_task == 'tb_cat':
        evaluator, label = ToyBoxEval(), 'category'
    elif ds_task == 'tb_trans':
        evaluator, label = ToyBoxEval(), 'transformation'
    elif ds_task == 'cifar10':
        evaluator, label = Cifar10Eval(), 'category'
    elif ds_task == 'ucf101':
        evaluator, label = UCF101Eval(), 'category'
    else:
        raise ValueError(f"unknown ds_task {ds_task!r}")

    # Read from the `root` argument (not a notebook global) and sort so the
    # row order does not depend on the file system's listing order.
    csv_names = sorted(fname for fname in os.listdir(root)
                       if Path(fname).suffix == '.csv')
    fpathlist = [os.path.join(root, fname) for fname in csv_names]

    if n_jobs > 1:
        record_list = Parallel(n_jobs=n_jobs)(
            delayed(job_proc_file)(fp, label, evaluator, iter_per_stage)
            for fp in tqdm(fpathlist)
        )
    else:
        record_list = []
        for fp in tqdm(fpathlist):
            record = parse_fname(fp)
            record[label] = evaluator.proc_fp(fp, label)
            record['Iteration'] = iter_per_stage * record['Stage']
            record_list.append(record)
    return pd.DataFrame.from_records(record_list)

In [25]:
# NOTE(review): in the visible code `fpathlist` is only assigned as a local
# inside proc_result_folder, so this cell presumably relied on a global left
# over from an earlier session -- TODO confirm or remove.
len(fpathlist)

72

In [14]:
# Score the SSv2 embeddings of one training run.
# learner/date/ds_task select the benchmark folder; num_ep * iter_per_ep is
# the number of iterations per stage used to fill the 'Iteration' column.
learner = 'generative/v3/'#'contrastive/v1/'#'predictive/v1/'#
ds_task = 'ssv2'
date = 'aug11'#'aug1'#'jul315'#
num_ep,iter_per_ep = 20, 1500 #5, 2000#2, 5000

emb_root = '/N/project/baby_vision_curriculum/trained_models/'+learner+date+'/benchmarks/'+ds_task+'/'
# emb_root = '/N/project/baby_vision_curriculum/trained_models/generative/v3/jul28dev/benchmarks/ssv2/'

iter_per_stage = num_ep*iter_per_ep

df_ss = proc_result_folder(emb_root, ds_task, iter_per_stage)
# df_ss.to_csv(date+'_'+ds_task+'score.csv', index=False)

100%|███████████████████████████████████████████| 48/48 [20:14<00:00, 25.31s/it]


In [15]:
# Write the scores to '<date>_<ds_task>score.csv' (relative path), using the
# current values of the globals `date` and `ds_task`.
df_ss.to_csv(date+'_'+ds_task+'score.csv', index=False)

In [16]:
# numeric_only=True: the frame also holds string columns (Seed, Train Groups,
# data_id) that cannot be averaged; without it pandas >= 2.0 raises TypeError.
df_ss.groupby(['Stage', 'Condition', 'Curriculum']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,category,Iteration
Stage,Condition,Curriculum,Unnamed: 3_level_1,Unnamed: 4_level_1
0,default,adev,0.260404,0.0
0,default,adult,0.251717,0.0
0,default,dev,0.264141,0.0
0,default,rnd,0.262121,0.0
1,default,adev,0.293131,30000.0
1,default,adult,0.417879,30000.0
1,default,dev,0.458485,30000.0
1,default,rnd,0.413333,30000.0
2,default,adev,0.48697,60000.0
2,default,adult,0.5,60000.0


In [12]:
# Score the CIFAR-10 embeddings of one training run.
# learner/date/task_dir select the benchmark folder; num_ep * iter_per_ep is
# the number of iterations per stage used to fill the 'Iteration' column.
learner = 'predictive/v1/'#'contrastive/v1/'#'generative/v3/'#
ds_task = 'cifar10'
task_dir = ds_task#'toybox'
date='aug81'#'aug73'#'jul315'#'aug1'#
num_ep,iter_per_ep = 2, 5000 #1, 1500 #5, 2000#

emb_root = '/N/project/baby_vision_curriculum/trained_models/'+learner+date+'/benchmarks/'+task_dir+'/'
# emb_root = '/N/project/baby_vision_curriculum/trained_models/generative/v3/jul28dev/benchmarks/ssv2/'

iter_per_stage = num_ep*iter_per_ep

df_cf = proc_result_folder(emb_root, ds_task, iter_per_stage)
# df_cf.to_csv(date+'_'+ds_task+'score.csv', index=False)

100%|███████████████████████████████████████████| 24/24 [03:26<00:00,  8.62s/it]


In [13]:
# numeric_only=True: the frame also holds string columns (Seed, Train Groups,
# data_id) that cannot be averaged; without it pandas >= 2.0 raises TypeError.
df_cf.groupby(['Stage', 'Condition', 'Curriculum']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,category,Iteration
Stage,Condition,Curriculum,Unnamed: 3_level_1,Unnamed: 4_level_1
0,default,adev,0.392648,0.0
0,default,dev,0.390012,0.0
1,default,adev,0.410628,10000.0
1,default,dev,0.436332,10000.0
2,default,adev,0.421503,20000.0
2,default,dev,0.437487,20000.0
3,default,adev,0.415382,30000.0
3,default,dev,0.42908,30000.0


In [14]:
# Write the scores to '<date>_<ds_task>score.csv' (relative path), using the
# current values of the globals `date` and `ds_task`.
df_cf.to_csv(date+'_'+ds_task+'score.csv', index=False)

In [17]:
# learner = 'generative/v3/'#'predictive/v1/'
ds_task = 'tb_cat'
task_dir = 'toybox'
# date='jul315'#'aug1'
# num_ep,iter_per_ep = 5, 2000#2, 5000

emb_root = '/N/project/baby_vision_curriculum/trained_models/'+learner+date+'/benchmarks/'+task_dir+'/'
# emb_root = '/N/project/baby_vision_curriculum/trained_models/generative/v3/jul28dev/benchmarks/ssv2/'

iter_per_stage = num_ep*iter_per_ep

df_tbcat = proc_result_folder(emb_root, ds_task, iter_per_stage)
df_tbcat.to_csv(date+'_'+ds_task+'score.csv', index=False)

100%|███████████████████████████████████████████| 48/48 [01:27<00:00,  1.83s/it]


In [18]:
# numeric_only=True: the frame also holds string columns (Seed, Train Groups,
# data_id) that cannot be averaged; without it pandas >= 2.0 raises TypeError.
df_tbcat.groupby(['Stage', 'Condition', 'Curriculum']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,category,Iteration
Stage,Condition,Curriculum,Unnamed: 3_level_1,Unnamed: 4_level_1
0,default,adev,0.251524,0.0
0,default,adult,0.253399,0.0
0,default,dev,0.259728,0.0
0,default,rnd,0.246601,0.0
1,default,adev,0.304735,30000.0
1,default,adult,0.406001,30000.0
1,default,dev,0.460853,30000.0
1,default,rnd,0.398031,30000.0
2,default,adev,0.427332,60000.0
2,default,adult,0.45007,60000.0


In [19]:
# learner = 'generative/v3/'#'predictive/v1/'
ds_task = 'tb_trans'
task_dir = 'toybox'
# date='jul315'#'aug1'
# num_ep,iter_per_ep = 5, 2000#2, 5000

emb_root = '/N/project/baby_vision_curriculum/trained_models/'+learner+date+'/benchmarks/'+task_dir+'/'
# emb_root = '/N/project/baby_vision_curriculum/trained_models/generative/v3/jul28dev/benchmarks/ssv2/'

iter_per_stage = num_ep*iter_per_ep

df_tbtrans = proc_result_folder(emb_root, ds_task, iter_per_stage)
df_tbtrans.to_csv(date+'_'+ds_task+'score.csv', index=False)

100%|███████████████████████████████████████████| 48/48 [01:27<00:00,  1.83s/it]


In [20]:
# numeric_only=True: the frame also holds string columns (Seed, Train Groups,
# data_id) that cannot be averaged; without it pandas >= 2.0 raises TypeError.
df_tbtrans.groupby(['Stage', 'Condition', 'Curriculum']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,transformation,Iteration
Stage,Condition,Curriculum,Unnamed: 3_level_1,Unnamed: 4_level_1
0,default,adev,0.439522,0.0
0,default,adult,0.418425,0.0
0,default,dev,0.440225,0.0
0,default,rnd,0.427098,0.0
1,default,adev,0.466948,30000.0
1,default,adult,0.576184,30000.0
1,default,dev,0.628223,30000.0
1,default,rnd,0.595874,30000.0
2,default,adev,0.656353,60000.0
2,default,adult,0.670417,60000.0


In [21]:
# learner = 'generative/v3/'#'predictive/v1/'
ds_task = 'ucf101'
task_dir = ds_task
# date='jul315'#'aug1'
# num_ep,iter_per_ep = 5, 2000#2, 5000

emb_root = '/N/project/baby_vision_curriculum/trained_models/'+learner+date+'/benchmarks/'+task_dir+'/'
# emb_root = '/N/project/baby_vision_curriculum/trained_models/generative/v3/jul28dev/benchmarks/ssv2/'

iter_per_stage = num_ep*iter_per_ep

df_ucf = proc_result_folder(emb_root, ds_task, iter_per_stage, n_jobs=72)
# df_ucf.to_csv(date+'_'+ds_task+'score.csv', index=False)

100%|██████████████████████████████████████████| 48/48 [00:00<00:00, 634.62it/s]


In [24]:
# Write the scores to '<date>_<ds_task>score.csv' (relative path), using the
# current values of the globals `date` and `ds_task`.
df_ucf.to_csv(date+'_'+ds_task+'score.csv', index=False)

In [25]:
# numeric_only=True: the frame also holds string columns (Seed, Train Groups,
# data_id) that cannot be averaged; without it pandas >= 2.0 raises TypeError.
df_ucf.groupby(['Stage', 'Condition', 'Curriculum']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,category,Iteration
Stage,Condition,Curriculum,Unnamed: 3_level_1,Unnamed: 4_level_1
0,default,adev,0.165593,0.0
0,default,adult,0.156211,0.0
0,default,dev,0.16126,0.0
0,default,rnd,0.156136,0.0
1,default,adev,0.208214,30000.0
1,default,adult,0.403483,30000.0
1,default,dev,0.503745,30000.0
1,default,rnd,0.386886,30000.0
2,default,adev,0.471585,60000.0
2,default,adult,0.525014,60000.0
