In [1]:
# Enable IPython's autoreload extension in mode 2, so imported project modules
# are re-imported before each cell runs and edits to them are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt

from dataloader.rosen import RosenData
from visualize import get_vae, get_model, set_random, build_estimator
# Shared experiment settings for the Rosenbrock part of the notebook.
# Only n_dim, data_size, data_split, use_cache, random_seed, layers and
# model_path are read by the cells visible here.
# NOTE(review): 'estimator', 'retrain' and 'epochs' disagree with the values
# hard-coded further down ('mcdue', retrain=False, epochs=30_000) - confirm
# which ones are intended.
config = dict(
    estimator='nngp',
    random_seed=43,
    # Dataset shape: passed to RosenData together with data_split.
    n_dim=10,
    data_size=2000,
    data_split=[0.2, 0.8, 0, 0],
    update_size=100,
    al_iterations=10,
    verbose=True,
    use_cache=True,
    # Layer sizes handed to get_model.
    layers=[10, 128, 64, 32, 1],
    patience=5,
    retrain=True,
    model_path='model/data/rosen_visual.ckpt',
    epochs=20_000,
)


### UQ estimation on 10-dimensional Rosenbrock function data

In [16]:
# Build the Rosenbrock dataset from the shared notebook config.
rosen = RosenData(
    config['n_dim'], config['data_size'], config['data_split'],
    use_cache=config['use_cache'])

x_pool, y_pool = rosen.dataset('pool')
x_train, y_train = rosen.dataset('train')
# NOTE(review): the validation arrays are read from the 'train' split, so
# x_val/y_val are the same data as x_train/y_train and every "validation"
# metric below is an in-sample number. The last two entries of
# config['data_split'] are 0, which may be why no separate split is used -
# confirm this is intentional.
x_val, y_val = rosen.dataset('train')

# Seed before the model is created so the run is repeatable.
set_random(config['random_seed'])

# The original call listed the two dataset tuples after keyword arguments,
# which Python rejects ("positional argument follows keyword argument"), so
# the cell could not run at all. Passing every optional argument by keyword
# makes the call valid regardless of parameter order.
# NOTE(review): the keyword names `train_set` / `val_set` are assumed, since
# visualize.get_model is not visible from this notebook - confirm them against
# its signature.
# NOTE(review): retrain=False and epochs=30_000 are hard-coded here although
# config carries 'retrain': True and 'epochs': 20_000 - confirm which applies.
model = get_model(
    config['layers'],
    retrain=False,
    model_path=config['model_path'],
    train_set=(x_train, y_train),
    val_set=(x_val, y_val),
    epochs=30_000)

In [18]:
# Spot check: print the last `num` model outputs side by side with the
# corresponding targets (model output first, target second).
num = 10
prediction = model(x_val).cpu().numpy()
for predicted, target in zip(prediction[-num:], y_val[-num:]):
    print(predicted, target)


In [57]:
# Estimate per-point uncertainty with the 'mcdue' estimator and plot it
# against the model's relative prediction error on the validation arrays.
# NOTE(review): config['estimator'] is 'nngp' but 'mcdue' is hard-coded here -
# confirm which estimator this plot is meant to show.
estimator = build_estimator('mcdue', model)
estimations = estimator.estimate(x_val, x_train, y_train)
predictions = model(x_val).cpu().numpy()

# Symmetric relative error. The original denominator `predictions + y_val`
# becomes zero or negative whenever a prediction and its target have opposite
# signs, which produces infinite or negative "errors". Using the magnitudes
# keeps the ratio in [0, 1]; the floor avoids 0/0 when both values are zero.
error_scale = np.abs(predictions) + np.abs(y_val)
errors = np.abs(predictions - y_val) / np.maximum(error_scale, np.finfo(float).eps)

fig, ax = plt.subplots(figsize=(12, 9))
ax.set_title('Uncertainty vs. relative error')
ax.set_ylabel('Uncertainty')
ax.set_xlabel('Error')
ax.scatter(errors, estimations);


In [58]:
from analysis.metrics import uq_accuracy

# From here on `errors` holds the absolute prediction error; this rebinds the
# relative error computed for the scatter plot above.
errors = np.absolute(predictions - y_val)

# Score the uncertainty estimates with uq_accuracy at this percentile and
# report the result.
percentile = 0.10
acc = uq_accuracy(estimations, errors, percentile)
print(f"Accuracy on worst {percentile*100}%: {acc}")

In [131]:
from analysis.metrics import dcg, ndcg, uq_ndcg

# uq_ndcg score of the estimator's uncertainties against the absolute errors.
estimator_score = uq_ndcg(errors, estimations)
print(estimator_score)

# Reference point: the same metric for uniformly random "uncertainties" of the
# same shape. The baseline is drawn only after the first score is printed,
# exactly as before, so the order of random-number consumption is unchanged.
random_baseline = np.random.rand(*estimations.shape)
baseline_score = uq_ndcg(errors, random_baseline)
print(baseline_score)


In [50]:
# Largest value in `errors`, shown as the cell output.
errors.max()

### UQ estimation on blog feedback data

In [8]:
from dataloader.blog_feedback import BlogFeedbackData
from sklearn.model_selection import train_test_split
import torch

In [6]:
# Load the 'train' and 'test' splits of the BlogFeedback data, in that order.
# Note that x_train / y_train are rebound here; they previously held the
# Rosenbrock training arrays used earlier in the notebook.
bf_data = BlogFeedbackData()
(x_train, y_train), (x_test, y_test) = (
    bf_data.dataset(split_name) for split_name in ('train', 'test')
)

In [9]:
# The test split doubles as the validation set for the blog-feedback model.
# NOTE(review): these arrays are later passed to model.fit as its second
# argument; if fit uses them for early stopping, scores on the test split are
# optimistically biased - confirm.
x_val = x_test
y_val = y_test

In [10]:
from model.mlp import MLP

# Train the blog-feedback MLP, or restore it from a checkpoint.
# Set `retrain` to False to load the weights stored at `model_path` instead of
# fitting from scratch.
layers = [280, 256, 128, 64, 1]
retrain = True
model_path = 'model/data/blog.ckpt'
model = MLP(layers, l2_reg=1e-3)
if retrain:
    model.fit((x_train, y_train), (x_val, y_val), epochs=30000, patience=10)
    # Persist the fitted weights so later runs can skip training.
    torch.save(model.state_dict(), model_path)
else:
    model.load_state_dict(torch.load(model_path))

# Put the model in inference mode on both paths. Previously eval() was only
# called after loading a checkpoint, so a freshly trained model was left in
# whatever mode fit() ended in and the two branches could behave differently
# in the prediction cells that follow.
model.eval()


In [11]:
# Spot check on the blog-feedback model: print the last ten targets next to
# the matching model outputs (target first, model output second).
predictions = model(x_val).cpu().numpy()
for target, predicted in zip(y_val[-10:], predictions[-10:]):
    print(target, predicted)

# estimator = build_estimator('nngp', model)
# estimations = estimator.estimate(x_val, x_train, y_train)

# error = np.abs(predictions-y_val)/(predictions+y_val)
# plt.figure(figsize=(12, 9))
# plt.ylabel('Uncertainty')
# plt.xlabel('Error')
# plt.scatter(error, estimations)


In [15]:
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 03 22:13:19 2016
TODO - Reduce Dimensionality of data and preprocess data to decrease error
And test on the individual files provided
@author: Rupak Chakraborty
"""
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import BayesianRidge
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Baseline comparison: fit several scikit-learn regressors on the raw
# BlogFeedback training CSV and print hold-out error metrics for each.
filename = "dataloader/data/blog_feedback/blogData_train.csv"

# Fixed seed (same value as config['random_seed']) so the shuffle, the
# train/test split and the randomized ensemble models give the same numbers on
# every run. Previously all three were unseeded, so the printed metrics changed
# between executions.
baseline_seed = 43

data = pd.read_csv(filename, header=None)
shuffle_rng = np.random.RandomState(baseline_seed)
data = data.iloc[shuffle_rng.permutation(len(data))]

# The last column is the regression target; pop() returns it and removes it
# from the feature frame in one step.
output_variables = data.pop(data.columns[-1])

train_data, test_data, train_output, test_output = train_test_split(
    data.values, output_variables.values,
    test_size=0.3, random_state=baseline_seed)

rf = RandomForestRegressor(n_estimators=101, random_state=baseline_seed)
ada = AdaBoostRegressor(n_estimators=101, random_state=baseline_seed)
bagging = BaggingRegressor(n_estimators=101, random_state=baseline_seed)
gradBoost = GradientBoostingRegressor(n_estimators=101, random_state=baseline_seed)
bayes = BayesianRidge()  # takes no random_state

regressors = [rf, ada, bagging, gradBoost, bayes]
regressor_names = ["Random Forests", "Adaboost", "Bagging", "Gradient Boosting", "Bayesian Ridge"]

for regressor, regressor_name in zip(regressors, regressor_names):
    regressor.fit(train_data, train_output)
    predicted_values = regressor.predict(test_data)

    print("--------------------------------\n")
    print("Mean Absolute Error for ",regressor_name," : ",metrics.mean_absolute_error(test_output,predicted_values))
    print("Median Absolute Error for ",regressor_name, " : ",metrics.median_absolute_error(test_output,predicted_values))
    print("Mean Squared Error for ",regressor_name, " : ",metrics.mean_squared_error(test_output,predicted_values))
    print("R2 score for ",regressor_name, " : ",metrics.r2_score(test_output,predicted_values))
    print("--------------------------------\n")
