In [1]:
import json
import os
import pickle
import sys
from collections import defaultdict

import torch
import tqdm
import tqdm.notebook
import yaml
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Put the parent of the current working directory on sys.path so that the
# import below can be resolved (presumably the script lives there — confirm).
base_dir = os.path.join(os.getcwd(), '..')
sys.path.append(base_dir)

# Project script, referred to as `main` in the rest of the notebook
# (e.g. main.make_data). Must come after the sys.path change above.
import run_FaIR_experiment as main

### Cross-validated grid search over RF hyperparameters

In [2]:
# Read the FaIR experiment configuration and override the dataset settings
with open('../config/runs/FaIR_experiment.yaml', "r") as config_file:
    cfg = yaml.safe_load(config_file)
cfg['data'].update({
    'Xtrain_path': '../data/FaIR/Xtrain.pt',
    'Ytrain_path': '../data/FaIR/Ytrain.pt',
    'seed': 2,
    'n': 400,
    'semi_prop': 400,
})
data = main.make_data(cfg=cfg)

# Shift the labelled and semi-supervised samples by data.mu_X / data.mu_Y
# (presumably the feature/target means — confirm against make_data).
Xtrain, Ytrain = data.Xtrain - data.mu_X, data.Ytrain - data.mu_Y
Xsemitrain = data.Xsemitrain - data.mu_X

In [5]:
# Restore the random forest stored in FaIR-RF.pkl (the file the commented-out
# grid-search cell writes its best estimator to).
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
with open('FaIR-RF.pkl', mode='rb') as model_file:
    rf = pickle.load(model_file)
print(rf)

RandomForestRegressor(max_depth=5, min_samples_leaf=10, n_estimators=200,
                      random_state=2000)


In [3]:
# # Or redo the grid search (uncomment this cell; it overwrites FaIR-RF.pkl and FaIR-RF_best_params.json)
# param_grid = {'n_estimators': [200, 300, 400],
#               'max_depth': [5, 10, 20],
#               'min_samples_split': [2, 5, 10],
#               'min_samples_leaf': [2, 5, 10]}
# rf = RandomForestRegressor(random_state=2000)
# rf_cv_search = GridSearchCV(rf, param_grid, n_jobs=4)
# rf_cv_search.fit(Xtrain, Ytrain.flatten())

# print(rf_cv_search.best_score_)
# print(rf_cv_search.best_params_)

# with open('FaIR-RF.pkl', 'wb') as f:
#     pickle.dump(rf_cv_search.best_estimator_, f)
# with open('FaIR-RF_best_params.json', 'w') as f:
#     json.dump(rf_cv_search.best_params_, f)
# with open('FaIR-RF.pkl', 'rb') as f:
#     rf = pickle.load(f)

0.02112980691849651
{'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 200}


### Run with multiple seeds for dataset

In [6]:
# Start again from the configuration on disk, so overrides made on `cfg` in
# earlier cells (seed, n, semi_prop) no longer apply.
with open('../config/runs/FaIR_experiment.yaml', "r") as config_file:
    cfg = yaml.safe_load(config_file)
cfg['data'].update(Xtrain_path='../data/FaIR/Xtrain.pt',
                   Ytrain_path='../data/FaIR/Ytrain.pt')

# Held-out split, shifted by the mu_X / mu_Y of the `data` object that an
# earlier cell created (this cell fails on a kernel where `data` is undefined).
Xtest = torch.load('../data/FaIR/Xval.pt') - data.mu_X
Ytest = torch.load('../data/FaIR/Yval.pt').squeeze() - data.mu_Y

# Mean squared target: the MSE of an all-zero prediction, used to normalise MSEs
zero_mse = Ytest.square().mean()

lr = LinearRegression()

In [5]:
def run(seed):
    """Refit the models on the dataset drawn with `seed` and score them on the test split.

    Writes `seed` into the global `cfg`, rebuilds the dataset with
    `main.make_data`, refits the global forest `rf` on the shifted training
    data and fits the global `lr` to the forest's predictions on the
    semi-supervised inputs, using only their last `d_X2` columns.

    Returns a dict of Python floats with the keys 'mse-baseline', 'mse-after',
    'snr-baseline', 'snr-after', 'corr-baseline' and 'corr-after'. "baseline"
    is the forest alone; "after" is the forest minus the linear fit.
    """
    cfg['data']['seed'] = seed
    dataset = main.make_data(cfg=cfg)
    d_X2 = dataset.d_X2
    X_labelled = dataset.Xtrain - dataset.mu_X
    Y_labelled = dataset.Ytrain - dataset.mu_Y
    X_semi = dataset.Xsemitrain - dataset.mu_X

    # Both estimators are notebook globals and are refit in place on every call.
    rf.fit(X_labelled, Y_labelled.flatten())
    lr.fit(X_semi[:, -d_X2:], rf.predict(X_semi))

    pred_baseline = torch.from_numpy(rf.predict(Xtest))
    pred_after = pred_baseline - lr.predict(Xtest[:, -d_X2:])

    def relative_mse(pred):
        # Test MSE divided by zero_mse (the mean squared test target).
        return torch.square(pred - Ytest.flatten()).mean() / zero_mse

    def snr(pred):
        # 10 * log10 of (sum of squared targets / sum of squared residuals).
        residual_energy = torch.square(Ytest.squeeze() - pred).sum()
        return 10 * torch.log10(torch.square(Ytest).sum() / residual_energy)

    def corr(pred):
        # Off-diagonal entry of the 2x2 correlation matrix of (target, prediction).
        return torch.corrcoef(torch.stack([Ytest.squeeze(), pred]))[0, 1]

    return {
        'mse-baseline': relative_mse(pred_baseline).item(),
        'mse-after': relative_mse(pred_after).item(),
        'snr-baseline': snr(pred_baseline).item(),
        'snr-after': snr(pred_after).item(),
        'corr-baseline': corr(pred_baseline).item(),
        'corr-after': corr(pred_after).item(),
    }

In [6]:
# Evaluate seeds 1..100 and gather every metric into its own list
metric_names = ('mse-baseline', 'mse-after',
                'snr-baseline', 'snr-after',
                'corr-baseline', 'corr-after')
scores = defaultdict(list)
for seed in tqdm.notebook.tqdm(range(1, 101)):
    run_scores = run(seed)
    for metric in metric_names:
        scores[metric].append(run_scores[metric])

# Write the collected metrics to disk as YAML
dump_dir = '../experiments/data/outputs/seeds/FaIR_experiment/RF/'
os.makedirs(dump_dir, exist_ok=True)
dump_path = os.path.join(dump_dir, 'scores.metrics')
with open(dump_path, 'w') as metrics_file:
    yaml.dump(dict(scores), metrics_file)

  0%|          | 0/100 [00:00<?, ?it/s]

### Vary number of training samples

In [9]:
# Fresh copy of the experiment configuration; only the data paths are overridden
with open('../config/runs/FaIR_experiment.yaml', "r") as config_file:
    cfg = yaml.safe_load(config_file)
for key, path in (('Xtrain_path', '../data/FaIR/Xtrain.pt'),
                  ('Ytrain_path', '../data/FaIR/Ytrain.pt')):
    cfg['data'][key] = path

# Test split, shifted by mu_X / mu_Y of whatever `data` is bound to when this
# cell runs (it is created in an earlier cell of the notebook).
Xtest = torch.load('../data/FaIR/Xval.pt') - data.mu_X
Ytest = torch.load('../data/FaIR/Yval.pt').squeeze() - data.mu_Y

# MSE of predicting zero everywhere; the runs below divide their MSEs by it
zero_mse = Ytest.square().mean()

lr = LinearRegression()

In [10]:
def run(n_train, seed):
    """Score the forest with and without the linear correction for one (n_train, seed) pair.

    Stores `n_train` under cfg['data']['n'] and `seed` under cfg['data']['seed']
    in the global config, rebuilds the dataset via `main.make_data`, then refits
    the global `rf` and `lr` in place.

    Returns {'baseline': ..., 'after': ...}: test MSEs divided by the global
    `zero_mse`, as Python floats.
    """
    cfg['data'].update(n=n_train, seed=seed)
    dataset = main.make_data(cfg=cfg)
    x2_cols = slice(-dataset.d_X2, None)  # the trailing d_X2 feature columns

    X_fit = dataset.Xtrain - dataset.mu_X
    Y_fit = dataset.Ytrain - dataset.mu_Y
    X_semi = dataset.Xsemitrain - dataset.mu_X

    rf.fit(X_fit, Y_fit.flatten())
    # Regress the forest's own predictions on the trailing columns only
    lr.fit(X_semi[:, x2_cols], rf.predict(X_semi))

    pred_baseline = torch.from_numpy(rf.predict(Xtest))
    pred_after = pred_baseline - lr.predict(Xtest[:, x2_cols])

    def relative_mse(pred):
        return torch.square(pred - Ytest.flatten()).mean() / zero_mse

    return {'baseline': relative_mse(pred_baseline).item(),
            'after': relative_mse(pred_after).item()}

In [11]:
# Sweep the number of training samples, 40 seeds per setting
scores = {column: [] for column in ('baseline', 'after', 'n_train', 'seed')}
for n_train in [10, 20, 40, 60, 80, 100, 150, 200]:
    for seed in tqdm.notebook.tqdm(range(1, 41)):
        mse = run(n_train, seed)
        row = {'baseline': mse['baseline'], 'after': mse['after'],
               'seed': seed, 'n_train': n_train}
        for column, value in row.items():
            scores[column].append(value)

# Save the long-format results as YAML
dump_dir = '../experiments/data/outputs/n_train/FaIR_experiment/RF/'
os.makedirs(dump_dir, exist_ok=True)
dump_path = os.path.join(dump_dir, 'scores.metrics')
with open(dump_path, 'w') as metrics_file:
    yaml.dump(dict(scores), metrics_file)

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

### Vary number of semi-supervised samples

In [13]:
# Re-read the configuration so the `n` override from the previous sweep is dropped
with open('../config/runs/FaIR_experiment.yaml', "r") as config_file:
    cfg = yaml.safe_load(config_file)
data_cfg = cfg['data']
data_cfg['Xtrain_path'] = '../data/FaIR/Xtrain.pt'
data_cfg['Ytrain_path'] = '../data/FaIR/Ytrain.pt'

# Test inputs/targets minus mu_X / mu_Y of the `data` object from an earlier cell
Xtest = torch.load('../data/FaIR/Xval.pt') - data.mu_X
Ytest = torch.load('../data/FaIR/Yval.pt').squeeze() - data.mu_Y

# Reference error of the all-zero predictor, used as the MSE denominator below
zero_mse = Ytest.square().mean()

lr = LinearRegression()

In [14]:
def run(semi_prop, seed):
    """Score the forest with and without the linear correction for one (semi_prop, seed) pair.

    Writes both arguments into the global cfg['data'], rebuilds the dataset via
    `main.make_data` and refits the global `rf` and `lr` in place.

    Returns {'baseline': ..., 'after': ...}: test MSEs divided by the global
    `zero_mse`, as Python floats.
    """
    cfg['data']['semi_prop'] = semi_prop
    cfg['data']['seed'] = seed
    dataset = main.make_data(cfg=cfg)
    mu_X, mu_Y, d_X2 = dataset.mu_X, dataset.mu_Y, dataset.d_X2

    X_semi = dataset.Xsemitrain - mu_X
    rf.fit(dataset.Xtrain - mu_X, (dataset.Ytrain - mu_Y).flatten())
    # Linear fit of the forest's predictions on the last d_X2 columns
    lr.fit(X_semi[:, -d_X2:], rf.predict(X_semi))

    pred_baseline = torch.from_numpy(rf.predict(Xtest))
    pred_after = pred_baseline - lr.predict(Xtest[:, -d_X2:])

    target = Ytest.flatten()
    result = {}
    for label, pred in (('baseline', pred_baseline), ('after', pred_after)):
        result[label] = (torch.square(pred - target).mean() / zero_mse).item()
    return result

In [15]:
# Sweep the semi_prop setting, 40 seeds per value
scores = dict(baseline=[], after=[], semi_prop=[], seed=[])
for semi_prop in [0, 25, 50, 100, 200, 400, 600, 800, 1000]:
    for seed in tqdm.notebook.tqdm(range(1, 41)):
        mse = run(semi_prop, seed)
        for column in ('baseline', 'after'):
            scores[column].append(mse[column])
        scores['seed'].append(seed)
        scores['semi_prop'].append(semi_prop)

# Store the sweep results as YAML
dump_dir = '../experiments/data/outputs/semi_prop/FaIR_experiment/RF/'
os.makedirs(dump_dir, exist_ok=True)
dump_path = os.path.join(dump_dir, 'scores.metrics')
with open(dump_path, 'w') as metrics_file:
    yaml.dump(dict(scores), metrics_file)

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

### Vary dimensionality of $X_2$

In [14]:
# Switch to the mvn experiment configuration and its data generator
with open('../config/runs/mvn_experiment.yaml', "r") as config_file:
    cfg = yaml.safe_load(config_file)
data = main.make_data(cfg=cfg, builder=main.mvn.build_data_generator)

# Keyword arguments forwarded to RandomForestRegressor in the runs below
with open('RF_best_params.json', 'r') as params_file:
    rf_params = json.loads(params_file.read())

In [15]:
def run(d_X2, seed):
    """Score a fresh forest with and without the ridge correction for one (d_X2, seed) pair.

    Writes `d_X2` and `seed` into the global cfg['data'], builds the dataset
    with the mvn data generator and draws a test set of
    cfg['evaluation']['n_test'] samples from it.

    Unlike the FaIR runs, this one builds its own estimators on every call: a
    RandomForestRegressor configured from the global `rf_params`, and a Ridge
    regressor fitted to the forest's predictions on the semi-supervised inputs,
    using only their last `d_X2` columns.

    Returns {'baseline': ..., 'after': ...}: unnormalised test MSEs as Python floats.
    """
    cfg['data']['d_X2'] = d_X2
    cfg['data']['seed'] = seed
    data = main.make_data(cfg=cfg, builder=main.mvn.build_data_generator)
    # The test draw uses seed + 1, i.e. a different seed than the training data.
    Xtest, Ytest = data.generate(n=cfg['evaluation']['n_test'], seed=seed + 1)

    local_rf = RandomForestRegressor(random_state=2000, **rf_params)
    # Ridge comes from sklearn.linear_model and must be imported in the
    # notebook's import cell; without that import this line raises NameError.
    lr = Ridge(alpha=1.)

    local_rf.fit(data.Xtrain, data.Ytrain.flatten())
    lr.fit(data.Xsemitrain[:, -data.d_X2:], local_rf.predict(data.Xsemitrain))

    pred_baseline = torch.from_numpy(local_rf.predict(Xtest))
    # Subtract the ridge fit of the forest's predictions on the last d_X2 columns
    pred_after = pred_baseline - lr.predict(Xtest[:, -data.d_X2:])

    baseline_mse = torch.square(pred_baseline - Ytest.flatten()).mean()
    after_mse = torch.square(pred_after - Ytest.flatten()).mean()
    return {'baseline': baseline_mse.item(), 'after': after_mse.item()}

In [16]:
# Sweep the d_X2 setting from 1 to 8, 40 seeds per value
scores = {column: [] for column in ('baseline', 'after', 'd_X2', 'seed')}
for d_X2 in [1, 2, 3, 4, 5, 6, 7, 8]:
    for seed in tqdm.notebook.tqdm(range(1, 41)):
        mse = run(d_X2, seed)
        row = (('baseline', mse['baseline']), ('after', mse['after']),
               ('seed', seed), ('d_X2', d_X2))
        for column, value in row:
            scores[column].append(value)

# Save the sweep results as YAML
dump_dir = '../experiments/data/outputs/d_X2/mvn_experiment/RF/'
os.makedirs(dump_dir, exist_ok=True)
dump_path = os.path.join(dump_dir, 'scores.metrics')
with open(dump_path, 'w') as metrics_file:
    yaml.dump(dict(scores), metrics_file)

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]