In [1]:
from scipy.io import arff
import numpy as np
import pandas as pd
import os
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import make_scorer
from collections import defaultdict
from scipy.stats import rankdata
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.stats import pearsonr
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [2]:
class surrogate:

    def __init__(self, ntrees=16):
        self.models = {}
        self.ntrees = ntrees

    def get_data(self, data, relevant_params):
        task_ids = data["task_id"].unique()
        data = data[relevant_params + ["task_id"] + ["y"]]
        self.data_dict = defaultdict()
        for task_id in task_ids:
            X_task = data.loc[data["task_id"] == task_id]
            y_task = np.array(X_task["y"], dtype=np.float)
            X_task.drop(["y", "task_id"], 1, inplace=True)
            categorical_names = X_task.select_dtypes(include=['object']).columns
            categorical_ids = [X_task.columns.get_loc(colname) for colname in categorical_names]
            self.data_dict[task_id] = (X_task.as_matrix(), y_task, categorical_ids)
        return task_ids

    def split_data(self, X, y, test_size = 0.15, seed = 42):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
        return X_train, X_test, y_train, y_test

    def train_model(self, X, y, categoricals, task_id):
        if task_id in self.models:
            return self.models[task_id]
        clf = Pipeline(
            steps=[('encoder', sklearn.preprocessing.OneHotEncoder(
                categorical_features=list(categoricals), handle_unknown='ignore')),
                    ('classifier', RandomForestRegressor(n_estimators=self.ntrees))])
        clf.fit(X, y)
        self.models[task_id] = clf
        return clf

    def average_rank(self, val_task_id):
        X_val, y_val, cat_val = self.data_dict[val_task_id]
        total_ranks = np.zeros(len(X_val))
        for task_id in self.data_dict:
            if task_id == val_task_id:
                continue
            X_train, y_train, cat_train = self.data_dict[task_id]
            clf = self.train_model(X_train, y_train, cat_train, task_id)
            y_val_hat = clf.predict(X_val)
            task_ranks = rankdata(y_val_hat, method='average')
            total_ranks = np.add(total_ranks, task_ranks)
        total_ranks = np.divide(total_ranks, len(X_val))
        return total_ranks

    def custom_scorer(self, y, y_hat):
        return pearsonr(y, y_hat)[0]

    def train_surrogate(self, filename="libsvm_svc.arff", relevant_params=["C", "gamma"]):
        data, meta = arff.loadarff(filename)
        df = pd.DataFrame(data)
        print("Printing sample data")
        print(df[:5])
        print("Data info")
        print(meta)
        task_ids = self.get_data(df, relevant_params)
        for task_id in task_ids:
            X, y, categoricals = self.data_dict[task_id]
            clf = self.train_model(X, y, categoricals, task_id)
            # y_hat = cross_val_predict(clf, X, y, cv=5)
            scores = cross_val_score(clf, X, y, cv=5, scoring=make_scorer(self.custom_scorer))
            # spearman = pearsonr(y, y_hat)
            print("Task %d; Pearson Spearman Correlation: %0.4f (+/- %0.4f)" %(task_id, scores.mean(), scores.std() * 2))
            average_rank = self.average_rank(task_id)
            ar_spearman = pearsonr(y, average_rank)
            print("Task %d; Average Rank: Pearson Spearman Correlation: %f" % (task_id, ar_spearman[0]))


In [3]:
s = surrogate()
s.train_surrogate()

Printing sample data
            C     coef0  degree     gamma      kernel max_iter shrinking  \
0  162.734738  0.471605     3.0  0.327973     b'poly'    b'-1'   b'True'   
1    9.845979  0.368980     4.0  0.004604     b'poly'    b'-1'   b'True'   
2   84.691746  0.002010     NaN  0.000094  b'sigmoid'    b'-1'  b'False'   
3    0.127243       NaN     NaN  0.025133      b'rbf'    b'-1'  b'False'   
4  953.723552  0.273139     NaN  0.034387  b'sigmoid'    b'-1'  b'False'   

           strategy       tol         y  task_id  
0           b'mean'  0.001108  0.993116      3.0  
1         b'median'  0.000089  0.993742      3.0  
2  b'most_frequent'  0.002602  0.939925      3.0  
3           b'mean'  0.000268  0.930538      3.0  
4  b'most_frequent'  0.015420  0.522528      3.0  
Data info
Dataset: openml-meta-flow-7707
	C's type is numeric
	coef0's type is numeric
	degree's type is numeric
	gamma's type is numeric
	kernel's type is nominal, range is ('poly', 'sigmoid', 'rbf')
	max_iter's typ

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Task 3; Pearson Spearman Correlation: 0.2920 (+/- 0.0907)
Task 3; Average Rank: Pearson Spearman Correlation: 0.375926
Task 6; Pearson Spearman Correlation: 0.3086 (+/- 0.1710)
Task 6; Average Rank: Pearson Spearman Correlation: 0.333934
Task 11; Pearson Spearman Correlation: 0.3308 (+/- 0.1372)
Task 11; Average Rank: Pearson Spearman Correlation: 0.385014
Task 12; Pearson Spearman Correlation: 0.2949 (+/- 0.2146)
Task 12; Average Rank: Pearson Spearman Correlation: 0.389577
Task 14; Pearson Spearman Correlation: 0.3945 (+/- 0.1413)
Task 14; Average Rank: Pearson Spearman Correlation: 0.502668
Task 15; Pearson Spearman Correlation: 0.4188 (+/- 0.2680)
Task 15; Average Rank: Pearson Spearman Correlation: 0.359106
Task 16; Pearson Spearman Correlation: 0.5879 (+/- 0.1402)
Task 16; Average Rank: Pearson Spearman Correlation: 0.545469
Task 18; Pearson Spearman Correlation: 0.4827 (+/- 0.1036)
Task 18; Average Rank: Pearson Spearman Correlation: 0.345527
Task 20; Pearson Spearman Correlatio

Task 9964; Average Rank: Pearson Spearman Correlation: 0.657215
Task 9967; Pearson Spearman Correlation: 0.2665 (+/- 0.1792)
Task 9967; Average Rank: Pearson Spearman Correlation: 0.349061
Task 9968; Pearson Spearman Correlation: 0.2593 (+/- 0.3634)
Task 9968; Average Rank: Pearson Spearman Correlation: 0.388650
Task 9970; Pearson Spearman Correlation: 0.4899 (+/- 0.0817)
Task 9970; Average Rank: Pearson Spearman Correlation: -0.150618
Task 9971; Pearson Spearman Correlation: 0.3721 (+/- 0.2771)
Task 9971; Average Rank: Pearson Spearman Correlation: 0.023438
Task 9976; Pearson Spearman Correlation: 0.4092 (+/- 0.2101)
Task 9976; Average Rank: Pearson Spearman Correlation: 0.439282
Task 9977; Pearson Spearman Correlation: 0.3380 (+/- 0.0648)
Task 9977; Average Rank: Pearson Spearman Correlation: 0.358506
Task 9978; Pearson Spearman Correlation: 0.1030 (+/- 0.1990)
Task 9978; Average Rank: Pearson Spearman Correlation: 0.029724
Task 9979; Pearson Spearman Correlation: 0.3464 (+/- 0.2018)

In [4]:
# y = alpha * hyperparameter_1 ^ 2 + beta * hyperparameter_2 ^ 2 + gamma * hyperparameter_1 * hyperparameter_2 + delta
# hyperparameter_1 = gamma
# hyperparameter_2 = complexity (C)
class simplified_surrogate:

    def __init__(self, surrogate):
        self.surrogate = surrogate
        self.models = {}

    def train_model(self):
        for task_id in self.surrogate.data_dict:
            X, y, _ = self.surrogate.data_dict[task_id]
            X = np.append(X, X ** 2, axis = 1)
            model = LinearRegression()
            model.fit(X, y)
            # coeff = model.coef_
            self.models[task_id] = model
            print("Task %d; Pearson Spearman Correlation: %f" % (task_id, pearsonr(y, model.predict(X))[0]))

    def plot_curve(self, task_id):
        X, y, _ = self.surrogate.data_dict[task_id]
        model = self.models[task_id]
        y_hat = model.predict(np.append(X, X ** 2, axis = 1))
        plt.plot(y_hat, c="blue")
        plt.plot(y, c="red")
        plt.axis("equal")

In [5]:
s_ = simplified_surrogate(s)
s_.train_model()

Task 3; Pearson Spearman Correlation: 0.361622
Task 6; Pearson Spearman Correlation: 0.209943
Task 11; Pearson Spearman Correlation: 0.360331
Task 12; Pearson Spearman Correlation: 0.291551
Task 14; Pearson Spearman Correlation: 0.331363
Task 15; Pearson Spearman Correlation: 0.511048
Task 16; Pearson Spearman Correlation: 0.316331
Task 18; Pearson Spearman Correlation: 0.225590
Task 20; Pearson Spearman Correlation: 0.316019
Task 21; Pearson Spearman Correlation: 0.355955
Task 22; Pearson Spearman Correlation: 0.516778
Task 23; Pearson Spearman Correlation: 0.370407
Task 24; Pearson Spearman Correlation: 0.474911
Task 28; Pearson Spearman Correlation: 0.524861
Task 29; Pearson Spearman Correlation: 0.702276
Task 31; Pearson Spearman Correlation: 0.324804
Task 32; Pearson Spearman Correlation: 0.279044
Task 36; Pearson Spearman Correlation: 0.249617
Task 37; Pearson Spearman Correlation: 0.604665
Task 41; Pearson Spearman Correlation: 0.466328
Task 43; Pearson Spearman Correlation: 0.5