### Testing for Co-regression with co-training

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor

In [67]:
n_neighbors = 3
estimator1 = KNeighborsRegressor(n_neighbors=n_neighbors)
estimator2 = KNeighborsRegressor(n_neighbors=n_neighbors)

X1 = np.zeros((100,2))
X1[:,0] = np.linspace(0,1,100)
X1[:,1] = np.linspace(10,15,100)
X2 = X1.copy()
X2[:,0] = X2[:,0] + 0.1
X2[:,1] = np.linspace(-100,-92,100)


y = X1[:,0]

y[np.arange(0,100,2)] = np.nan

print(y)

ctr = CTRegressor(random_state=2)
ctr.fit([X1, X2], y, unlabeled_pool_size=10, num_iter=2)

[        nan  0.01010101         nan  0.03030303         nan  0.05050505
         nan  0.07070707         nan  0.09090909         nan  0.11111111
         nan  0.13131313         nan  0.15151515         nan  0.17171717
         nan  0.19191919         nan  0.21212121         nan  0.23232323
         nan  0.25252525         nan  0.27272727         nan  0.29292929
         nan  0.31313131         nan  0.33333333         nan  0.35353535
         nan  0.37373737         nan  0.39393939         nan  0.41414141
         nan  0.43434343         nan  0.45454545         nan  0.47474747
         nan  0.49494949         nan  0.51515152         nan  0.53535354
         nan  0.55555556         nan  0.57575758         nan  0.5959596
         nan  0.61616162         nan  0.63636364         nan  0.65656566
         nan  0.67676768         nan  0.6969697          nan  0.71717172
         nan  0.73737374         nan  0.75757576         nan  0.77777778
         nan  0.7979798          nan  0.81818182    

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [60]:
class CTRegressor():
    def __init__(
                 self,
                 estimator1=None,
                 estimator2=None,
                 k_neighbors = 3,
                 random_state=0
                 ):

        # initialize a BaseCTEstimator object
        #super().__init__(KNeighborsRegressor(n_neighbors=k_neighbors), KNeighborsRegressor(n_neighbors=k_neighbors), random_state)
        self.random_state = random_state

        # if not given, set classifiers as gaussian naive bayes estimators
        if estimator1 is None:
            self.estimator1 = KNeighborsRegressor(n_neighbors=k_neighbors)
        if estimator2 is None:
            self.estimator2 = KNeighborsRegressor(n_neighbors=k_neighbors)

        self.n_views_ = 2  # only 2 view learning supported currently

        self.class_name = "CTRegressor"
        
        self.k_neighbors_ = 3

    # requires Labeled sets for each view, U' shared by both views
    def fit(
            self,
            Xs,
            y,
            p=1,
            n=1,
            unlabeled_pool_size=50,
            num_iter=50
            ):
        # split data
        # fit each estimator to Labeled set
        # for num_iter:
            # for each view:
                # for each sample in U' for that view:
                    # regress (y_hat = h(x))
                    # find the k nearest examples in L to x (a set called Omega)
                    # fit a new kNN to the labeled set with the addition of this sample
                    # compute and store deltaMSE (MSE of the old regressor on each sample in Omega, minus MSE of new regressor on each sample in Omega)
                # if there exists deltaMSE > 0
                # then pick this example (and its regression) to add to the labeled set of other view, remove from U'
            # if didn't add a new sample to either set, then exit

        # final regressor = .5*h1(x) + h2(x)

        print('here')
        y = np.array(y)
        
        self.p_, self.n_ = p, n
        
        np.random.seed(self.random_state)
        
        self.unlabeled_pool_size_ = unlabeled_pool_size
        self.num_iter_ = num_iter

        # extract the multiple views given
        X1 = Xs[0]
        X2 = Xs[1]

        # the full set of unlabeled samples
        U = [i for i, y_i in enumerate(y) if np.isnan(y_i)]
        
        # shuffle unlabeled_pool data for easy random access
        np.random.shuffle(U)

        # the small pool of unlabled samples to draw from in training
        unlabeled_pool = U[-min(len(U), self.unlabeled_pool_size_):]
        

        # the labeled samples
        L = [i for i, y_i in enumerate(y) if ~np.isnan(y_i)]

        # remove the pool from overall unlabeled data
        U = U[:-len(unlabeled_pool)]

        it = 0
        
        print("starting")
        print(len(L))
        print(len(U))
        
        while it < self.num_iter_ and U:
            it += 1
            
            print("unlabeled pool")
            print(unlabeled_pool)
            print("labeled")
            print(X1[L])

            # fit each model to its respective view
            self.estimator1.fit(X1[L], y[L])
            self.estimator2.fit(X2[L], y[L])
            
            y_hat1 = self.estimator1.predict(X1[unlabeled_pool])
            y_hat2 = self.estimator2.predict(X2[unlabeled_pool])
            
            neighbors1 = (self.estimator1.kneighbors(X1[unlabeled_pool], n_neighbors=self.k_neighbors_))[1]
            neighbors2 = (self.estimator1.kneighbors(X2[unlabeled_pool], n_neighbors=self.k_neighbors_))[1]
            
            # find sample in each view which lowers the MSE the most
            delta_MSE1 = []
            for sample, (u, neigh) in enumerate(zip(unlabeled_pool, neighbors1)):
                new_L = L.copy()
                new_L.append(u)
                new_y = np.concatenate((y[L].copy(), np.array(y_hat1[sample]).reshape(1,)))
                new_estimator = KNeighborsRegressor(n_neighbors=self.k_neighbors_)
                new_estimator.fit(X1[new_L], new_y)
                delta_MSE1.append(self.estimate_delta_MSE_(self.estimator1, new_estimator, (X1[L])[neigh], (y[L])[neigh]))
                print(delta_MSE1[-1])
            
            best_delta_idx = np.argmax(delta_MSE1)
            now_labeled = []
            add_labels = []
            if delta_MSE1[best_delta_idx] > 0:
                now_labeled.append(unlabeled_pool[best_delta_idx])
                add_labels.append(y_hat1[best_delta_idx])
                print("Added best from view 1")
            
            delta_MSE2 = []
            for sample, (u, neigh) in enumerate(zip(unlabeled_pool, neighbors2)):
                new_L = L.copy()
                new_L.append(u)
                new_y = np.concatenate((y[L].copy(), np.array(y_hat2[sample]).reshape(1,)))
                new_estimator = KNeighborsRegressor(n_neighbors=self.k_neighbors_)
                new_estimator.fit(X2[new_L], new_y)
#                 # debugging
#                 if sample == 2:
#                     delta_MSE2.append(2)
#                 else:
#                     delta_MSE2.append(.5)
                delta_MSE2.append(self.estimate_delta_MSE_(self.estimator2, new_estimator, (X2[L])[neigh], (y[L])[neigh]))   
            
            # find top 2 in case overlap with view 1 selection
            best_delta_idx = np.argsort(delta_MSE2)[-2:][::-1]
            print(best_delta_idx)
            if delta_MSE2[best_delta_idx[0]] > 0:
                if best_delta_idx[0] not in now_labeled:
                    print("adding best")
                    now_labeled.append(unlabeled_pool[best_delta_idx[0]])
                    add_labels.append(y_hat1[best_delta_idx[0]])
                elif best_delta_idx[1] > 0:
                    print("adding second best because overlap")
                    now_labeled.append(unlabeled_pool[best_delta_idx[1]])
                    add_labels.append(y_hat1[best_delta_idx[1]])
            elif delta_MSE2[best_delta_idx[1]] > 0:
                print("adding second best")
                now_labeled.append(unlabeled_pool[best_delta_idx[1]])
                add_labels.append(y_hat1[best_delta_idx[1]])
            
            print(add_labels)
            print(now_labeled)
                    
            # create new labels for new additions to the labeled group
            for x, y_hat in zip(now_labeled, add_labels):
                print(x)
                y[x] = y_hat
                L.extend([x])

            # remove newly labeled samples from unlabeled_pool
            unlabeled_pool = [elem for elem in unlabeled_pool
                              if not (elem in now_labeled)]

            # add new elements to unlabeled_pool
            add_counter = 0
            while add_counter != len(now_labeled) and U:
                add_counter += 1
                unlabeled_pool.append(U.pop())

        print("ending")
        print(len(L))
        print(len(U))
        # fit the overall model on fully "labeled" data
        self.estimator1.fit(X1[L], y[L])
        self.estimator2.fit(X2[L], y[L])
            
    def estimate_delta_MSE_(self, old_estimator, new_estimator, X, y):
        """
        Estimate the decrease in MSE of the new estimator based on a small
        sample of neighbors.
        
        Parameters
        ----------
        old_estimator: estimator object
            The current estimator trained on less data.
        
        new_estimator: estimator object
            The new estimator trained with additional data.
            
        X : array-like, shape (n_samples, n_features)
            The truly labeled data that the old estimator was trained on.
        
        y : array-like, shape (n_samples,)
            The labels for the samples in X.
        
        Returns
        -------
        delta_MSE : float
            
        
        """
        
        # estimate the change in MSE
        y_hat_old = old_estimator.predict(X)
        y_hat_new = new_estimator.predict(X)
        
        print("errors")
        print(np.sum((y-y_hat_old)**2))
        print(np.sum((y-y_hat_new)**2))
        
        return np.sum((y-y_hat_old)**2 - (y-y_hat_new)**2)
            
        
    

In [65]:
xx = [-1,3,-2,5]
print(np.argsort(xx)[-2:][::-1])
xx = np.array([[0,1,2], [1,2,3]])
print(xx.shape)
for samp in xx:
    print(samp)
    
y = np.zeros(10,)
y2 = np.ones(10,)
print(type(np.sum((y-y2)**2)))

[3 1]
(2, 3)
[0 1 2]
[1 2 3]
<class 'numpy.float64'>
