In [1]:
from data import load_complexity, load_sam_features, load_fcclip_features, load_additional_features
from baseline_predictions import load_visc_handcrafted, load_visc_nn, load_ic9600_nn, load_savoias_nn, load_rsivl_handcrafted
from categories import scenes_art_split
from transformations import get_transforms, get_sqrt_transforms
from constants import DATASET_NAMES
from cross_validation import cross_validate
from util import results_to_mean_confidence_interval
from linear_regression import line_regression

In [2]:
import json
import pickle
import pandas as pd

# Load Data

In [3]:
def join_data():
    """Load every feature/prediction table and join them per dataset on ``filename``.

    Returns:
        tuple: ``(data_full, data_test)``, two dicts keyed by the entries of
        ``DATASET_NAMES``. Each value is a DataFrame produced by an inner join
        (on ``filename``) of the per-dataset tables. ``data_test`` additionally
        joins the ``load_ic9600_nn()`` table, so it keeps only filenames that
        also appear there; ``data_full`` leaves that table out.
    """
    complexity = load_complexity()
    sam = load_sam_features()
    fcclip = load_fcclip_features()
    additional = load_additional_features()
    visc_handcrafted = load_visc_handcrafted()
    visc_nn = load_visc_nn()
    ic9600_nn = load_ic9600_nn()
    savoias_nn = load_savoias_nn()
    rsivl_handcrafted = load_rsivl_handcrafted()

    # Column order of the joined frames follows the order of these lists.
    sources_full = [
        complexity, sam, fcclip, additional,
        visc_handcrafted, visc_nn, savoias_nn, rsivl_handcrafted,
    ]
    sources_test = [
        complexity, sam, fcclip, additional,
        visc_handcrafted, visc_nn, ic9600_nn, savoias_nn, rsivl_handcrafted,
    ]

    def _join_on_filename(sources, name):
        # Inner join keeps only the filenames present in every source table.
        indexed = [source[name].set_index("filename") for source in sources]
        return pd.concat(indexed, axis=1, join='inner').reset_index()

    # Two versions of the dataframe dictionary; per the original note, data_test
    # only contains the test images for the ic9600 dataset.
    data_full = {}
    data_test = {}
    for name in DATASET_NAMES:
        data_full[name] = _join_on_filename(sources_full, name)
        data_test[name] = _join_on_filename(sources_test, name)

    return data_full, data_test

def num_check(data, test=False):
    """Assert that every joined dataset has the expected number of rows.

    Args:
        data: dict mapping dataset name to a sized object (``len()`` is used).
        test: when truthy, ``ic9600`` is expected to hold 2825 rows instead
            of 9425; all other expectations are the same.

    Raises:
        AssertionError: on the first dataset whose length differs.
    """
    expected_sizes = {
        "ic9600": 2825 if test else 9425,
        "rsivl": 49,
        "visc": 800,
        "sav_int": 100,
        "sav_obj": 200,
        "sav_sce": 200,
        "sav_art": 420,
        "sav_sup": 100,
    }
    # Checked in insertion order, ic9600 first.
    for name, size in expected_sizes.items():
        assert len(data[name]) == size

In [4]:
# Build both joined dictionaries, then assert the expected per-dataset row counts
# (test=True switches the expected ic9600 count to the smaller test-only figure).
data_full, data_test = join_data()
num_check(data_full, test=False)
num_check(data_test, test=True)

  labels['ic9600'] = pd.concat([pd.read_csv(fn, names=["filename", "complexity"], header=None, delimiter=r"  ") for fn in ic9600])
  labels['ic9600'] = pd.concat([pd.read_csv(fn, names=["filename", "complexity"], header=None, delimiter=r"  ") for fn in ic9600])


# Label Subcategories
(for stratified sampling of train-test sets)

In [5]:
def add_ic9600_subcat(data):
    """Add a ``subcat`` column to ``data["ic9600"]``.

    The value is the part of each ``filename`` before the first underscore.
    The frame is modified in place; nothing is returned.
    """
    ic9600 = data["ic9600"]
    filename_parts = ic9600["filename"].str.split('_', expand=True)
    ic9600["subcat"] = filename_parts[0]

def add_visc_subcat(data, category_map_path="/ptmp/tshen/shared/VISCHEMA_SUN/file_categories.json"):
    """Add a ``subcat`` column to ``data["visc"]`` from a filename-to-category JSON map.

    Args:
        data: dict whose ``"visc"`` entry is a DataFrame with a ``filename`` column.
            The frame is modified in place.
        category_map_path: path to a JSON object keyed by filename. Defaults to
            the location the notebook has always read from.

    Raises:
        KeyError: if a filename in the frame is missing from the JSON map.
    """
    # Context manager so the file handle is closed as soon as the map is parsed.
    with open(category_map_path) as category_file:
        visc_cat_map = json.load(category_file)
    data["visc"]['subcat'] = data["visc"]['filename'].apply(lambda x: visc_cat_map[x])

def add_savoias_subcat(data):
    """Tag the two Savoias frames with a constant ``subcat`` column.

    ``data["sav_obj"]`` gets ``"object"`` and ``data["sav_sce"]`` gets
    ``"scene"``. Both frames are modified in place; nothing is returned.
    """
    for key, label in (("sav_obj", "object"), ("sav_sce", "scene")):
        data[key]["subcat"] = label

In [6]:
# Add the `subcat` column to both dictionaries. Each helper mutates the frames in place,
# so it has to be called once for data_full and once for data_test.
add_ic9600_subcat(data_full)
add_ic9600_subcat(data_test)
add_visc_subcat(data_full)
add_visc_subcat(data_test)
add_savoias_subcat(data_full)
add_savoias_subcat(data_test)

# Combine Datasets into Splits/Categories

In [7]:
def check_full_split(data, test=False):
    """Assert the row count of every split in ``data``.

    Args:
        data: dict mapping split name to a sized object (``len()`` is used).
        test: when truthy, the two ic9600 splits are expected to hold
            1823 / 357 rows instead of 5955 / 1200. All other expectations
            are identical for both settings.

    Raises:
        AssertionError: on the first split whose length differs.
    """
    ic9600_sce_rows, ic9600_paint_rows = (1823, 357) if test else (5955, 1200)
    expected_rows = [
        ('rsivl', 49),
        ('visc', 800),
        ('sav_int', 100),
        ('sav_art', 420),
        ('sav_sup', 100),
        ('sav_obj_sce', 400),
        ('ic9600_sce', ic9600_sce_rows),
        ('ic9600_paint', ic9600_paint_rows),
    ]
    for split, rows in expected_rows:
        assert len(data[split]) == rows

In [8]:
# Regroup both dictionaries into the splits checked by check_full_split.
# `cats` (second return value for data_full) is reused by the single-split
# prediction loop further down; the second return value for data_test is discarded.
data_full, cats = scenes_art_split(data_full)
data_test, _ = scenes_art_split(data_test)
check_full_split(data_full, test=False)
check_full_split(data_test, test=True)

# Cross Validated Linear Regression

In [9]:
# Collects the return value of each run_regression call, keyed by split name.
dataset_result = {}

In [10]:
def run_regression(data, c, N, M, ic9600_test):
    """Cross-validate one split on a transformed copy of its frame.

    Args:
        data: dict of DataFrames keyed by split name.
        c: key of the split to evaluate; also passed on to ``cross_validate``.
        N, M, ic9600_test: forwarded unchanged to ``cross_validate``
            (their meaning is defined there).

    Returns:
        Whatever ``cross_validate`` returns for this split.
    """
    # Work on a copy so the transform columns never leak into ``data``.
    frame = data[c].copy()
    # Called for its effect on ``frame``; the return value is not used.
    get_transforms(frame)
    scores = cross_validate(frame, c, N=N, M=M, ic9600_test=ic9600_test)
    return scores

In [11]:
# Runs on data_test: (split name, M). N is 3 for every run; M is 20 for rsivl,
# 1 for the two ic9600 splits and 2 for everything else.
test_runs = [
    ("rsivl", 20),
    ("sav_obj_sce", 2),
    ("ic9600_sce", 1),
    ("sav_art", 2),
    ("sav_sup", 2),
    ("ic9600_paint", 1),
    ("visc", 2),
    ("sav_int", 2),
]
for split_name, m_value in test_runs:
    dataset_result[split_name] = run_regression(data_test, split_name, N=3, M=m_value, ic9600_test=True)

# The two ic9600 splits are run again on data_full and stored under separate keys.
full_runs = [
    ("ic9600_sce_full", "ic9600_sce"),
    ("ic9600_paint_full", "ic9600_paint"),
]
for result_key, split_name in full_runs:
    dataset_result[result_key] = run_regression(data_full, split_name, N=3, M=1, ic9600_test=False)

Running dataset rsivl, CV is stratified: False
Running dataset sav_obj_sce, CV is stratified: True
Running dataset ic9600_sce, CV is stratified: True
Running dataset sav_art, CV is stratified: False
Running dataset sav_sup, CV is stratified: False
Running dataset ic9600_paint, CV is stratified: False
Running dataset visc, CV is stratified: True
Running dataset sav_int, CV is stratified: False
Running dataset ic9600_sce, CV is stratified: True
Running dataset ic9600_paint, CV is stratified: False


In [12]:
# Summarise the collected cross-validation results. The reporting loop below reads the
# result as nested dicts: dataset -> model -> metric -> 2-tuple
# (presumably mean and confidence interval, going by the helper's name; confirm in util).
results_stats = results_to_mean_confidence_interval(dataset_result)

In [19]:
print("Reporting Spearman correlations.")

# results_stats is walked as dataset -> model -> metric -> 2-tuple; only the first
# element of the "spearman_test" entry is printed for each model.
for dataset_name, per_model in results_stats.items():
    print("\nDATASET: {}".format(dataset_name))
    for model_name, per_metric in per_model.items():
        for metric_name, (first_value, _second_value) in per_metric.items():
            if metric_name != "spearman_test":
                continue
            print(model_name, first_value)

Reporting Spearman correlations.

DATASET: rsivl
sqrt_seg_64points 0.7847252942047462
sqrt_num_classes 0.7039928495023174
sqrt_seg_64points + sqrt_num_classes 0.827797811959287
sqrt_seg_64points_x_sqrt_num_classes 0.8363858962435959
sqrt_seg_64points_x_sqrt_num_classes + sqrt_seg_64points + sqrt_num_classes 0.8276833877282753
M1 + M2 + M3 + M4 + M5 + M6 + M7 + M9 + M10 + M11 0.6609087354387864
M1 + M2 + M3 + M4 + M5 + M6 + M7 + M9 + M10 + M11 + M8 0.7055646756834092
M5 + M10 0.6646779875011899
M5 + M10 + M8 0.7671808469657597
visc_symmetry + visc_clutter 0.6789085692845037
visc_nn 0.5039195637115834
savoias_nn 0.7162380618820479
ic9600_nn 0.8253291820719589
visc_symmetry 0.6969636222132699
sqrt_seg_64points + sqrt_num_classes + visc_symmetry 0.8387124113523815

DATASET: sav_obj_sce
sqrt_seg_64points 0.6531690624677315
sqrt_num_classes 0.7486879444336547
sqrt_seg_64points + sqrt_num_classes 0.784937287146239
sqrt_seg_64points_x_sqrt_num_classes 0.7934479267315896
sqrt_seg_64points_x_sqr

# Predictions from Single Split

In [None]:
# Fit "sqrt_seg_64points + sqrt_num_classes" on each category and predict on the same
# frame (the frame is passed to line_regression twice), then store predictions and errors.
for k in cats:
    frame = data_full[k]
    get_sqrt_transforms(frame)
    result = line_regression(
        "complexity",
        "sqrt_seg_64points + sqrt_num_classes",
        frame,
        frame,
        return_preds=True,
    )
    predictions = result['predictions']
    frame["predictions"] = predictions
    # error = prediction minus the recorded complexity
    frame["error"] = predictions - frame['complexity']

# Save Data for Analysis

In [None]:
# Persist the per-category frames (including the predictions/error columns added above).
# The context manager closes the file, so the pickle is fully flushed to disk
# before the cell finishes.
with open("/ptmp/tshen/shared/Results/results.p", "wb") as results_file:
    pickle.dump(data_full, results_file)