In [1]:
import json
import os
import pickle
import sys
from collections import defaultdict

import torch
import tqdm
# `import tqdm` alone does not load the `notebook` submodule that the
# experiment loops access as `tqdm.notebook.tqdm`, so import it explicitly.
import tqdm.notebook
import yaml
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

# Put the parent of the current working directory on the import path
# (presumably where run_mvn_experiment.py lives -- confirm for your checkout).
base_dir = os.path.join(os.getcwd(), '..')
sys.path.append(base_dir)

# Must stay below the sys.path change above, otherwise the module is not found.
import run_mvn_experiment as main

### Cross-validated grid search for RF hyperparameters

In [42]:
# Read the experiment configuration from disk, then override the data settings
# before building the dataset used in this section.
with open('../config/runs/mvn_experiment.yaml', "r") as f:
    cfg = yaml.safe_load(f)
cfg['data'].update({'seed': 2, 'n': 400, 'semi_prop': 400})
data = main.make_data(cfg=cfg, builder=main.mvn.build_data_generator)

In [43]:
# Load pretrained model
# NOTE: unpickling can execute arbitrary code, so only load a file that was
# produced by a trusted run of this notebook.
# NOTE(review): the commented-out grid-search cell below writes 'RF.pkl' to the
# working directory, whereas this cell reads 'sandbox/RF.pkl' -- confirm which
# location is intended.
with open('sandbox/RF.pkl', 'rb') as f:
    rf = pickle.load(f)
print(rf)

RandomForestRegressor(max_depth=20, min_samples_leaf=2, n_estimators=400,
                      random_state=2000)


In [4]:
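# # Or redo the grid search (slow). Uncommenting this cell writes RF.pkl and
# # RF_best_params.json to the working directory.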
# param_grid = {'n_estimators': [300, 400, 500],
#               'max_depth': [10, 20, 30],
#               'min_samples_split': [2, 5, 10],
#               'min_samples_leaf': [2, 5, 10]}
# rf = RandomForestRegressor(random_state=2000)
# rf_cv_search = GridSearchCV(rf, param_grid, n_jobs=4)
# rf_cv_search.fit(data.Xtrain, data.Ytrain.flatten())

# print(rf_cv_search.best_score_)
# print(rf_cv_search.best_params_)

# with open('RF.pkl', 'wb') as f:
#     pickle.dump(rf_cv_search.best_estimator_, f)
# with open('RF_best_params.json', 'w') as f:
#     json.dump(rf_cv_search.best_params_, f)
# with open('RF.pkl', 'rb') as f:
#     rf = pickle.load(f)


KeyboardInterrupt



In [46]:
def eval_most_gain(cfg, data, rf):
    """Compute the "most gain" score of a fitted regressor.

    Draws inputs with ``data.generate(..., most_gain=True)``, predicts on every
    generated row, averages the predictions over the leading axis of the
    generated tensor and returns the mean of the squared averages.

    Args:
        cfg: configuration mapping; reads ``cfg['evaluation']['n_test_gain']``,
            ``cfg['evaluation']['seed']`` and ``cfg['evaluation']['n_gain']``.
        data: object whose ``generate(n, seed, most_gain, most_gain_samples)``
            returns a pair ``(X, _)`` with ``X`` a torch tensor.
        rf: fitted model whose ``predict`` returns a NumPy array with one value
            per input row.

    Returns:
        A 0-dim float32 torch tensor.
    """
    X, _ = data.generate(n=cfg['evaluation']['n_test_gain'],
                         seed=cfg['evaluation']['seed'],
                         most_gain=True,
                         most_gain_samples=cfg["evaluation"]["n_gain"])
    # Collapse every leading axis so the model receives a plain 2-D matrix.
    pred = torch.from_numpy(rf.predict(X.view(-1, X.size(-1)))).float()
    # Undo the flattening before averaging. The group count is read from X
    # itself; the previous code used a global `d` that no cell defines, which
    # raises NameError on a fresh kernel.
    # NOTE(review): assumes the leading axis of X is the one to average over
    # (the `most_gain_samples` draws) -- confirm against the data generator.
    pred_avg = pred.reshape(X.size(0), -1).mean(dim=0)
    most_gain = torch.square(pred_avg).mean()
    return most_gain

### Run with multiple seeds for dataset

In [47]:
# Fresh linear model plus a clean copy of the on-disk configuration, so the
# overrides applied to `cfg` earlier in the notebook do not carry over.
lr = LinearRegression()
with open('../config/runs/mvn_experiment.yaml', "r") as f:
    cfg = yaml.safe_load(f)
data = main.make_data(cfg=cfg, builder=main.mvn.build_data_generator)

In [48]:
def run(seed):
    """Score the forest with and without the linear correction for one seed.

    Side effects: stores ``seed`` in the notebook-level ``cfg`` and refits the
    notebook-level ``rf`` and ``lr`` estimators in place.

    Args:
        seed: value written to ``cfg['data']['seed']``; the test set is
            generated with ``seed + 1``.

    Returns:
        dict of floats: ``'baseline'`` (test MSE of ``rf``), ``'after'`` (test
        MSE once the ``lr`` prediction on the last ``d_X2`` columns is
        subtracted) and ``'most_gain'`` (value from ``eval_most_gain``).
    """
    cfg['data']['seed'] = seed
    dataset = main.make_data(cfg=cfg, builder=main.mvn.build_data_generator)
    X_eval, Y_eval = dataset.generate(n=cfg['evaluation']['n_test'], seed=seed + 1)
    targets = Y_eval.flatten()

    def last_block(X):
        # Keep only the trailing d_X2 columns.
        return X[:, -dataset.d_X2:]

    def mse(pred):
        return torch.square(pred - targets).mean()

    # Fit the forest on (Xtrain, Ytrain), then regress its predictions on the
    # trailing columns of Xsemitrain.
    rf.fit(dataset.Xtrain, dataset.Ytrain.flatten())
    lr.fit(last_block(dataset.Xsemitrain), rf.predict(dataset.Xsemitrain))

    forest_pred = torch.from_numpy(rf.predict(X_eval))
    corrected_pred = forest_pred - lr.predict(last_block(X_eval))

    results = {
        'baseline': mse(forest_pred).item(),
        'after': mse(corrected_pred).item(),
    }
    results['most_gain'] = eval_most_gain(cfg, dataset, rf).item()
    return results

In [53]:
# Repeat the experiment for dataset seeds 1..100 and collect each metric in
# its own list.
scores = {'baseline': [], 'after': [], 'most_gain': []}
for seed in tqdm.notebook.tqdm(range(1, 101)):
    mse = run(seed)
    for metric, values in scores.items():
        values.append(mse[metric])

# Persist the collected metrics as YAML.
dump_dir = '../experiments/data/outputs/seeds/mvn_experiment/RF/'
os.makedirs(dump_dir, exist_ok=True)
dump_path = os.path.join(dump_dir, 'scores.metrics')
with open(dump_path, 'w') as f:
    yaml.dump(scores, f)

  0%|          | 0/100 [00:00<?, ?it/s]

### Vary number of training samples

In [54]:
# Load config file
with open('../config/runs/mvn_experiment.yaml', "r") as f:
    cfg = yaml.safe_load(f)
data = main.make_data(cfg=cfg, builder=main.mvn.build_data_generator)
lr = LinearRegression()

In [55]:
def run(n_train, seed):
    """Score the forest with and without the linear correction for one setting.

    Side effects: stores ``n_train`` and ``seed`` in the notebook-level ``cfg``
    and refits the notebook-level ``rf`` and ``lr`` estimators in place.

    Args:
        n_train: value written to ``cfg['data']['n']``.
        seed: value written to ``cfg['data']['seed']``; the test set is
            generated with ``seed + 1``.

    Returns:
        dict of floats: ``'baseline'`` (test MSE of ``rf``), ``'after'`` (test
        MSE once the ``lr`` prediction on the last ``d_X2`` columns is
        subtracted) and ``'most_gain'`` (value from ``eval_most_gain``).
    """
    cfg['data'].update({'n': n_train, 'seed': seed})
    dataset = main.make_data(cfg=cfg, builder=main.mvn.build_data_generator)
    X_eval, Y_eval = dataset.generate(n=cfg['evaluation']['n_test'], seed=seed + 1)
    targets = Y_eval.flatten()

    def last_block(X):
        # Keep only the trailing d_X2 columns.
        return X[:, -dataset.d_X2:]

    def mse(pred):
        return torch.square(pred - targets).mean()

    # Fit the forest on (Xtrain, Ytrain), then regress its predictions on the
    # trailing columns of Xsemitrain.
    rf.fit(dataset.Xtrain, dataset.Ytrain.flatten())
    lr.fit(last_block(dataset.Xsemitrain), rf.predict(dataset.Xsemitrain))

    forest_pred = torch.from_numpy(rf.predict(X_eval))
    corrected_pred = forest_pred - lr.predict(last_block(X_eval))

    results = {
        'baseline': mse(forest_pred).item(),
        'after': mse(corrected_pred).item(),
    }
    results['most_gain'] = eval_most_gain(cfg, dataset, rf).item()
    return results

In [56]:
# Sweep the training-set size, with seeds 1..40 per size, recording one row
# per (n_train, seed) pair across the parallel lists in `scores`.
scores = {'baseline': [], 'after': [], 'n_train': [], 'seed': [], 'most_gain': []}
for n_train in [10, 20, 40, 60, 80, 100, 150, 200]:
    for seed in tqdm.notebook.tqdm(range(1, 41)):
        mse = run(n_train, seed)
        row = {**mse, 'seed': seed, 'n_train': n_train}
        for column, values in scores.items():
            values.append(row[column])

# Persist the collected metrics as YAML.
dump_dir = '../experiments/data/outputs/n_train/mvn_experiment/RF/'
os.makedirs(dump_dir, exist_ok=True)
dump_path = os.path.join(dump_dir, 'scores.metrics')
with open(dump_path, 'w') as f:
    yaml.dump(scores, f)

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

### Vary number of semi-supervised samples

In [57]:
# Load config file
with open('../config/runs/mvn_experiment.yaml', "r") as f:
    cfg = yaml.safe_load(f)
data = main.make_data(cfg=cfg, builder=main.mvn.build_data_generator)
lr = LinearRegression()

In [58]:
def run(semi_prop, seed):
    """Score the forest with and without the linear correction for one setting.

    Side effects: stores ``semi_prop`` and ``seed`` in the notebook-level
    ``cfg`` and refits the notebook-level ``rf`` and ``lr`` estimators in place.

    Args:
        semi_prop: value written to ``cfg['data']['semi_prop']``.
        seed: value written to ``cfg['data']['seed']``; the test set is
            generated with ``seed + 1``.

    Returns:
        dict of floats: ``'baseline'`` (test MSE of ``rf``), ``'after'`` (test
        MSE once the ``lr`` prediction on the last ``d_X2`` columns is
        subtracted) and ``'most_gain'`` (value from ``eval_most_gain``).
    """
    cfg['data'].update({'semi_prop': semi_prop, 'seed': seed})
    dataset = main.make_data(cfg=cfg, builder=main.mvn.build_data_generator)
    X_eval, Y_eval = dataset.generate(n=cfg['evaluation']['n_test'], seed=seed + 1)
    targets = Y_eval.flatten()

    def last_block(X):
        # Keep only the trailing d_X2 columns.
        return X[:, -dataset.d_X2:]

    def mse(pred):
        return torch.square(pred - targets).mean()

    # Fit the forest on (Xtrain, Ytrain), then regress its predictions on the
    # trailing columns of Xsemitrain.
    rf.fit(dataset.Xtrain, dataset.Ytrain.flatten())
    lr.fit(last_block(dataset.Xsemitrain), rf.predict(dataset.Xsemitrain))

    forest_pred = torch.from_numpy(rf.predict(X_eval))
    corrected_pred = forest_pred - lr.predict(last_block(X_eval))

    results = {
        'baseline': mse(forest_pred).item(),
        'after': mse(corrected_pred).item(),
    }
    results['most_gain'] = eval_most_gain(cfg, dataset, rf).item()
    return results

In [59]:
# Sweep the `semi_prop` setting, with seeds 1..40 per value, recording one row
# per (semi_prop, seed) pair across the parallel lists in `scores`.
scores = {'baseline': [], 'after': [], 'semi_prop': [], 'seed': [], 'most_gain': []}
for semi_prop in [0, 25, 50, 100, 200, 400, 600, 800, 1000]:
    for seed in tqdm.notebook.tqdm(range(1, 41)):
        mse = run(semi_prop, seed)
        row = {**mse, 'seed': seed, 'semi_prop': semi_prop}
        for column, values in scores.items():
            values.append(row[column])

# Persist the collected metrics as YAML.
dump_dir = '../experiments/data/outputs/semi_prop/mvn_experiment/RF/'
os.makedirs(dump_dir, exist_ok=True)
dump_path = os.path.join(dump_dir, 'scores.metrics')
with open(dump_path, 'w') as f:
    yaml.dump(scores, f)

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

### Vary dimensionality of $X_2$

In [60]:
# Load config file
with open('../config/runs/mvn_experiment.yaml', "r") as f:
    cfg = yaml.safe_load(f)
data = main.make_data(cfg=cfg, builder=main.mvn.build_data_generator)
with open('RF_best_params.json', 'r') as f:
    rf_params = json.load(f)

In [65]:
def run(d_X2, seed):
    """Score a newly built forest with and without the linear correction.

    Unlike the earlier ``run`` variants, this one creates its own
    ``RandomForestRegressor`` (from ``rf_params``) and ``LinearRegression`` on
    every call instead of refitting the notebook-level estimators.

    Side effects: stores ``d_X2`` and ``seed`` in the notebook-level ``cfg``.

    Args:
        d_X2: value written to ``cfg['data']['d_X2']``.
        seed: value written to ``cfg['data']['seed']``; the test set is
            generated with ``seed + 1``.

    Returns:
        dict of floats: ``'baseline'`` (test MSE of the forest), ``'after'``
        (test MSE once the linear prediction on the last ``d_X2`` columns is
        subtracted) and ``'most_gain'`` (value from ``eval_most_gain``).
    """
    cfg['data'].update({'d_X2': d_X2, 'seed': seed})
    dataset = main.make_data(cfg=cfg, builder=main.mvn.build_data_generator)
    X_eval, Y_eval = dataset.generate(n=cfg['evaluation']['n_test'], seed=seed + 1)
    targets = Y_eval.flatten()

    def last_block(X):
        # Keep only the trailing d_X2 columns.
        return X[:, -dataset.d_X2:]

    def mse(pred):
        return torch.square(pred - targets).mean()

    forest = RandomForestRegressor(random_state=2000, **rf_params)
    linear = LinearRegression()

    # Fit the forest on (Xtrain, Ytrain), then regress its predictions on the
    # trailing columns of Xsemitrain.
    forest.fit(dataset.Xtrain, dataset.Ytrain.flatten())
    linear.fit(last_block(dataset.Xsemitrain), forest.predict(dataset.Xsemitrain))

    forest_pred = torch.from_numpy(forest.predict(X_eval))
    corrected_pred = forest_pred - linear.predict(last_block(X_eval))

    results = {
        'baseline': mse(forest_pred).item(),
        'after': mse(corrected_pred).item(),
    }
    results['most_gain'] = eval_most_gain(cfg, dataset, forest).item()
    return results

In [66]:
# Sweep the `d_X2` setting, with seeds 1..40 per value, recording one row per
# (d_X2, seed) pair across the parallel lists in `scores`.
scores = {'baseline': [], 'after': [], 'd_X2': [], 'seed': [], 'most_gain': []}
for d_X2 in [1, 2, 3, 4, 5, 6, 7, 8]:
    for seed in tqdm.notebook.tqdm(range(1, 41)):
        mse = run(d_X2, seed)
        row = {**mse, 'seed': seed, 'd_X2': d_X2}
        for column, values in scores.items():
            values.append(row[column])

# Persist the collected metrics as YAML.
dump_dir = '../experiments/data/outputs/d_X2/mvn_experiment/RF/'
os.makedirs(dump_dir, exist_ok=True)
dump_path = os.path.join(dump_dir, 'scores.metrics')
with open(dump_path, 'w') as f:
    yaml.dump(scores, f)

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]