In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, ClassifierMixin

import re

In [132]:
# Dataset selector: these two names are substituted into the CSV path
# "../datasets/<folderName>/<fileName>.csv" when the data is loaded.
fileName = 'delta_ail_test'
folderName = 'delta_ail'

In [133]:
# Number of ordinal classes handed to the models below (labels are expected
# to be 0 .. k-1 once the target has been shifted).
k = 10

# Load the selected dataset; the path is relative to the notebook's directory.
data = pd.read_csv(
    "../datasets/{}/{}.csv".format(folderName, fileName)
)

In [134]:
# Feature matrix: every column except the target column 'Goal'.
x = np.array(data.drop(columns='Goal'))
# Target vector: 'Goal' shifted down by one so that class indices start at 0
# (presumably the CSV stores 1-based labels -- verify against the dataset).
y = np.array(data['Goal'] - 1)

In [135]:
class _ConstantSplit:
    """Fallback tree node that routes every sample to one fixed side.

    Used where a real binary classifier cannot be trained because the
    training data for that node contains fewer than two distinct targets.
    """

    def __init__(self, side):
        # 0 -> always go to the lower half, 1 -> always go to the upper half.
        self.side = side

    def predict(self, X):
        return np.full(X.shape[0], self.side, dtype=int)


class DCModel(BaseEstimator, ClassifierMixin):
    """Divide-and-conquer classifier over the ordered labels 0 .. k-1.

    The label range is split recursively at its midpoint, forming a binary
    tree stored heap-style in ``self.nodes`` (root at index 1, children of
    ``v`` at ``2*v`` and ``2*v + 1``). Every internal node holds a binary
    classifier trained only on samples whose label lies in the node's range;
    it predicts whether the label falls in the upper half (``> midpoint``).
    Prediction walks the tree from the root until a single label remains.

    Parameters
    ----------
    k : int
        Number of classes; labels passed to ``fit`` must lie in ``[0, k-1]``.

    Notes
    -----
    The node classifiers use ``SGDClassifier(loss='log')``. scikit-learn
    renamed this loss to ``'log_loss'`` in 1.1 and removed the old spelling
    in 1.3, so newer installations need the literal updated.
    """

    def __init__(self, k):
        self.k = k
        self.nodes = [None] * (4 * k + 1)

    def __fit_node_on_range(self, l, r, X, y):
        """Train the node that splits the label range ``[l, r]`` at its midpoint.

        Returns a fitted binary classifier (target 1 means "label > midpoint"),
        or a ``_ConstantSplit`` when the samples in range do not contain both
        sides of the split.
        """
        in_range = np.logical_and(y >= l, y <= r)
        m = (l + r) // 2
        x_in_range = X[in_range]
        y_in_range = np.where(y[in_range] > m, 1, 0)

        # SGDClassifier refuses to fit a target with fewer than two classes,
        # which happens when a sub-range is absent or one-sided in the
        # training data. Route deterministically instead of crashing:
        # towards the only side seen, or to the lower half if nothing was seen.
        sides = np.unique(y_in_range)
        if sides.size < 2:
            return _ConstantSplit(int(sides[0]) if sides.size else 0)
        return SGDClassifier(loss='log').fit(x_in_range, y_in_range)

    def fit(self, X, y):
        """Fit one binary classifier per internal node of the label tree.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,), integer labels in ``[0, k-1]``

        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # Re-allocate on every fit so the tree always matches the current k
        # (e.g. after set_params) and never keeps nodes from an earlier fit.
        self.nodes = [None] * (4 * self.k + 1)

        def build(v, l, r):
            if l == r:
                # Leaf: a single label, nothing to decide.
                return
            self.nodes[v] = self.__fit_node_on_range(l, r, X, y)
            m = (l + r) // 2
            build(2 * v, l, m)
            build(2 * v + 1, m + 1, r)

        build(1, 0, self.k - 1)
        return self

    def predict(self, X):
        """Predict an integer label in ``[0, k-1]`` for every row of ``X``.

        Returns
        -------
        numpy.ndarray of int, shape (n_samples,)
        """
        X = np.asarray(X)

        def run_dc(x, v, l, r):
            n = x.shape[0]
            if l == r:
                return np.full(n, l, dtype=int)
            if n == 0:
                # No sample was routed here; keep the dtype consistent.
                return np.empty(0, dtype=int)
            m = (l + r) // 2
            goes_right = self.nodes[v].predict(x) == 1
            goes_left = ~goes_right
            # Integer buffer: the outputs are class indices, not floats.
            pred = np.empty(n, dtype=int)
            pred[goes_left] = run_dc(x[goes_left], 2 * v, l, m)
            pred[goes_right] = run_dc(x[goes_right], 2 * v + 1, m + 1, r)
            return pred

        return run_dc(X, 1, 0, self.k - 1)
        

In [136]:
class _ConstantProba:
    """Stand-in binary model whose positive-class probability is a constant.

    Used for thresholds where the training targets contain fewer than two
    classes, so no real classifier can be fitted.
    """

    def __init__(self, p_positive):
        self.p_positive = p_positive

    def predict_proba(self, X):
        n = X.shape[0]
        # Same layout as a fitted binary classifier: column 0 = P(class 0),
        # column 1 = P(class 1).
        return np.column_stack((np.full(n, 1.0 - self.p_positive),
                                np.full(n, self.p_positive)))


class FHModel(BaseEstimator, ClassifierMixin):
    """Ordinal classifier built from ``k - 1`` threshold models.

    Model ``i`` estimates ``P(y > i)`` for ``i = 0 .. k-2``. At prediction
    time the per-class probabilities are recovered as
    ``P(y = i) = P(y > i-1) - P(y > i)`` (with ``P(y > -1) = 1`` and
    ``P(y > k-1) = 0``) and the most probable class is returned.
    (The name suggests the Frank & Hall ordinal decomposition -- the code
    matches that scheme.)

    Parameters
    ----------
    k : int
        Number of classes; labels passed to ``fit`` are expected in ``[0, k-1]``.

    Notes
    -----
    The threshold models use ``SGDClassifier(loss='log')``. scikit-learn
    renamed this loss to ``'log_loss'`` in 1.1 and removed the old spelling
    in 1.3, so newer installations need the literal updated.
    """

    def __init__(self, k):
        self.k = k
        self.models = [None] * (k - 1)

    def fit(self, X, y):
        """Fit the ``k - 1`` binary "is the label greater than i?" models.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,), integer labels

        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # Re-allocate on every fit so the list always matches the current k.
        self.models = [None] * (self.k - 1)
        for i in range(self.k - 1):
            y_relative = np.where(y > i, 1, 0)
            present = np.unique(y_relative)
            if present.size < 2:
                # SGDClassifier cannot be fitted on a single-class target.
                # Fall back to the empirical answer: P(y > i) is 1 if every
                # training label exceeds i, otherwise 0 (also for empty data).
                p_greater = float(present[0]) if present.size else 0.0
                self.models[i] = _ConstantProba(p_greater)
            else:
                self.models[i] = SGDClassifier(loss='log').fit(X, y_relative)
        return self

    def predict(self, X):
        """Return the most probable class index for every row of ``X``.

        Returns
        -------
        numpy.ndarray of int, shape (n_samples,)
        """
        X = np.asarray(X)
        n = X.shape[0]
        if n == 0:
            return np.empty(0, dtype=np.intp)
        # exceed[i] holds P(y > i) for every sample.
        exceed = [model.predict_proba(X)[:, 1] for model in self.models]
        # Column i of `upper` is P(y > i-1), column i of `lower` is P(y > i);
        # their difference is P(y = i). Building both with column_stack keeps
        # the arrays 2-D even when there are no threshold models (k == 1).
        upper = np.column_stack([np.ones(n)] + exceed)
        lower = np.column_stack(exceed + [np.zeros(n)])
        return np.argmax(upper - lower, axis=1)

In [137]:
def MSE(y, y_pred):
    """Mean squared error between true and predicted ordinal class indices.

    Computes ``mean((y - y_pred) ** 2)``. This equals weighting each cell
    ``(i, j)`` of the confusion matrix by ``(i - j) ** 2``, but it does not
    depend on which labels happen to occur in ``y`` / ``y_pred`` and needs no
    knowledge of the total number of classes.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        True class indices.
    y_pred : array-like of shape (n_samples,)
        Predicted class indices (integer-valued floats are accepted).

    Returns
    -------
    float
        The average squared distance between true and predicted indices.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    y_true = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_pred, dtype=float)
    if y_true.shape != y_hat.shape:
        raise ValueError(
            "y and y_pred must have the same shape, got {} and {}".format(
                y_true.shape, y_hat.shape))
    if y_true.size == 0:
        raise ValueError("MSE is undefined for empty input")
    # Work on the label values themselves: a confusion matrix is indexed by
    # the sorted labels that are present, so its rows/columns only coincide
    # with the label values when every class 0..k-1 shows up.
    return float(np.mean((y_true - y_hat) ** 2))
# Wrap MSE as a scikit-learn scorer. With greater_is_better=False the scorer
# reports the negated MSE (higher is better), so results must be sign-flipped
# before being shown as errors.
score = make_scorer(score_func=MSE, greater_is_better=False)

In [138]:
# Running sums of the mean cross-validated score for each approach:
# plain multiclass SGD (ovr), the divide-and-conquer tree (dc) and the
# threshold-based ordinal model (fh).
MSE_ovr = MSE_dc = MSE_fh = 0
for i in range(10):
    print('Starting test number {}'.format(i))
    # Fresh, unfitted estimators for every repetition.
    dc, fh, ovr = DCModel(k), FHModel(k), SGDClassifier(loss='log')
    # Evaluated lazily in the order ovr -> dc -> fh while unpacking.
    MSE_ovr, MSE_dc, MSE_fh = (
        total + cross_val_score(model, x, y, scoring=score, cv=10).mean()
        for total, model in ((MSE_ovr, ovr), (MSE_dc, dc), (MSE_fh, fh))
    )
# The scorer yields negated MSE, hence the sign flip; printed order is
# ovr, fh, dc, each averaged over the 10 repetitions.
print(-MSE_ovr / 10, -MSE_fh / 10, -MSE_dc / 10)

Starting test number 0
Starting test number 1
Starting test number 2
Starting test number 3
Starting test number 4
Starting test number 5
Starting test number 6
Starting test number 7
Starting test number 8
Starting test number 9
4.5218090399798285 4.420866157397922 4.873901598720394
