# Evaluate the imputation method

In [58]:
import numpy as np
import sklearn
from nmf_with_missing_values import nmf_with_missing_values
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import KNeighborsRegressor as kNN
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (mean_squared_error,
                             mean_absolute_error,
                             explained_variance_score,
                             r2_score,
                            )
from tqdm import tqdm
%matplotlib inline

In [59]:
#load_data
# np.load on an .npz archive returns a lazily-read NpzFile that keeps the file
# handle open; the context manager closes it as soon as the three arrays have
# been materialised in memory.
with np.load('../data/filtered_data_with_missing_20_percent.npz') as npz_archive:
    missing_mask = npz_archive['missing_mask']              # marks the entries treated as missing below
    data = npz_archive['data']                              # ground-truth values (kept for evaluation)
    region_of_interest = npz_archive['region_of_interest']  # element [0] is used as the voxel template

In [60]:
#preprocess generate the new data
# Work on a separate copy so that `data` keeps the ground truth, then overwrite
# every entry selected by `missing_mask` with the sentinel value -1.
new_data = np.copy(data, order='C')
new_data[missing_mask] = -1

## Section 1: nearest neighbors

In [61]:
#define_model

class kNN_imputation:
    '''Impute missing voxels of 3d images by nearest-neighbour regression.

    For each image a regressor is fitted on the (x, y, z) coordinates of the
    observed voxels inside the template and is then used to predict the value
    of every missing voxel inside the template.  A voxel counts as missing
    when its value is negative (the notebook marks them with -1).
    '''

    def __init__(self,
                 n_neighbors = 1,
                 weights = 'uniform',
                 metric = 'euclidean',
                 n_jobs = 3):
        ''' Init function of kNN_imputation

        Input:
          n_neighbors, weights, metric, n_jobs : forwarded unchanged to the
            kNN regressor constructor.  n_jobs defaults to 3, the value that
            used to be hard-coded.
        '''
        self.model = kNN(n_neighbors=n_neighbors, metric=metric, weights = weights, n_jobs=n_jobs)

    def fit_transform(self, X, template, inplace = False):
        '''Impute every image of a stack, one image at a time.

        Input:
          X : 4d array (image index on the first axis), negative values are missing.
          template : 3d array, voxels with template > 0 are of interest.
          inplace : if True the imputed values are written into X itself,
                    otherwise X is left untouched and a copy is filled.
        Output:
          The 4d array with the imputed values (X itself when inplace is True).
        '''
        Y = X if inplace else np.copy(X)
        for ind in tqdm(range(X.shape[0])):
            # Y[ind] is a view of Y, so the in-place call below fills Y directly.
            self.fit_transform_one_img(Y[ind,:,:,:], template, inplace=True)
        return Y

    def fit_transform_one_img(self, X, template, inplace = False):
        '''Impute the missing voxels of a single image.

        Input:
          X : 3d array, negative values are missing.
          template : 3d array of the same shape as X, voxels with template > 0
                     are of interest.  Voxels outside the template are never
                     read for training and never modified.
          inplace : if True the imputed values are written into X itself,
                    otherwise into a copy.
        Output:
          The 3d array with the imputed values (X itself when inplace is True).
          It is returned unchanged when the template selects no voxel or when
          no selected voxel is missing.
        Raises:
          ValueError : if template and X differ in shape, or if there are
                       missing voxels but no observed voxel to learn from.
        '''
        template = np.asarray(template)
        if template.shape != X.shape:
            raise ValueError('template shape {} does not match image shape {}'.format(
                template.shape, X.shape))

        Y = X if inplace else np.copy(X)

        # Coordinates of the voxels of interest, in the same C (x, then y, then z)
        # order that nested loops over the three axes would produce.
        coords = np.argwhere(template > 0)
        if coords.shape[0] == 0:
            return Y  # nothing is of interest, so nothing to impute

        # Cast to float so the regression targets do not depend on X's dtype.
        values = X[tuple(coords.T)].astype(float)
        observed = values >= 0
        missing = values < 0  # NaN voxels fall in neither group and are left alone

        if not missing.any():
            return Y  # predicting on an empty set would make the regressor raise
        if not observed.any():
            raise ValueError('Cannot impute: no observed (non-negative) voxel inside the template.')

        self.model.fit(coords[observed], values[observed])
        missing_coords = coords[missing]
        Y[tuple(missing_coords.T)] = self.model.predict(missing_coords)
        return Y
        

In [62]:
#fit_model
# Distance-weighted 6-nearest-neighbour imputation on the first 40 volumes,
# restricted to the voxels selected by the first region-of-interest template.
model = kNN_imputation(weights='distance', n_neighbors=6)
knn_inputs = new_data[:40]
roi_template = region_of_interest[0]
imputed = model.fit_transform(knn_inputs, roi_template, inplace=False)

100%|██████████| 40/40 [00:45<00:00,  1.07s/it]


In [63]:
#load_model

In [64]:
#visualize evaluate the performance of model
# Score only the entries flagged by missing_mask: ground truth from `data`
# against the kNN-imputed values at the same positions.
held_out = missing_mask[:40]
y_true = data[:40][held_out].flatten()
y_pred = imputed[held_out].flatten()

# (message, metric) pairs, evaluated and printed in this order.
metric_reports = (
    ("The MSE is {}", mean_squared_error),
    ("The MAE is {}", mean_absolute_error),
    ("The explained_variance_score is {}", explained_variance_score),
    ("The R2 is {}", r2_score),
)
for message, metric in metric_reports:
    print(message.format(metric(y_true=y_true, y_pred=y_pred)))

The MSE is 2.4354538254556246e-05
The MAE is 0.0022555976174771786
The explained_variance_score is 0.9842177033424377
The R2 is 0.9842172056114726


## Section 2: NMF method

In [65]:
#define_model
class NMF_imputation:
    '''Impute a stack of 3d images with the project's nmf_with_missing_values model.

    The images are flattened to one row per image, restricted to the voxels
    selected by the template, and handed to the model together with the
    matching missing-value mask.
    '''

    def __init__(self, init = 'nndsvd',
                 n_components = 18,
                 n_outer_loops = 4,
                 random_state = None):
        '''
        Input:
          init, n_components, n_outer_loops, random_state : forwarded unchanged
            to nmf_with_missing_values.  random_state defaults to None, the
            value that used to be hard-coded.
        '''
        self.nmf = nmf_with_missing_values(n_outer_loops = n_outer_loops,
                                           n_components = n_components,
                                           init = init,
                                           save_space = False,
                                           random_state = random_state)

    def fit_transform(self, data, template, missing_mask):
        '''Fit the model on the template voxels and return its guess as a 4d array.

        Input:
          data : 4d array, image index on the first axis.
          template : 3d array matching one image; voxels with template > 0 are
                     of interest.  Both boolean and 0-1 numeric templates are
                     accepted.
          missing_mask : array with the same number of elements as data,
                         reshaped alongside it and passed to the model.
        Output:
          Array with the shape of data.  Voxels inside the template hold the
          model's X_guess, voxels outside the template are 0.
          NOTE(review): X_guess is presumably the model's completed matrix --
          confirm against nmf_with_missing_values.
        '''
        # reshape: one row per image, one column per voxel
        n_samples = data.shape[0]
        d = data.shape[1] * data.shape[2] * data.shape[3]
        reshaped_data = np.reshape(data, (n_samples, d))
        reshaped_missing_mask = np.reshape(missing_mask, (n_samples, d))
        # Force a boolean mask: a 0-1 integer template used directly as an index
        # would be read as fancy indices (columns 0 and 1), not as a selection.
        reshaped_template = np.reshape(template, (d,)) > 0
        assert data.shape[0] > self.nmf.n_components, 'The data should have more samples than n_components.'
        self.nmf.fit_transform(reshaped_data[:,reshaped_template], missing_mask=reshaped_missing_mask[:,reshaped_template])
        X_guess = self.nmf.X_guess
        # Scatter the guess back into the full voxel grid; the rest stays 0.
        imputed_X = np.zeros_like(reshaped_data)
        imputed_X[:, reshaped_template] = X_guess
        return np.reshape(imputed_X, data.shape)

In [66]:
#fit_model
# NOTE(review): the NMF model is fed the kNN-imputed volumes (`imputed`), not
# `new_data` -- presumably so the kNN result serves as the starting values for
# the masked entries; confirm this is intended.
model_nmf = NMF_imputation(n_components=18, n_outer_loops = 4)
nmf_inputs = imputed[:40]
nmf_missing_mask = missing_mask[:40]
imputed_nmf = model_nmf.fit_transform(nmf_inputs, region_of_interest[0], nmf_missing_mask)

In [70]:
#visualize evaluate the performance of model
# Same protocol as for the kNN model: compare ground truth and NMF output on
# the entries flagged by missing_mask only.
eval_mask = missing_mask[:40]
y_true = data[:40][eval_mask].flatten()
y_pred = imputed_nmf[eval_mask].flatten()

nmf_mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
print("The MSE is {}".format(nmf_mse))

nmf_mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
print("The MAE is {}".format(nmf_mae))

nmf_explained_variance = explained_variance_score(y_true=y_true, y_pred=y_pred)
print("The explained_variance_score is {}".format(nmf_explained_variance))

nmf_r2 = r2_score(y_true=y_true, y_pred=y_pred)
print("The R2 is {}".format(nmf_r2))

The MSE is 5.172146484255791e-05
The MAE is 0.003941208589822054
The explained_variance_score is 0.9664987325668335
The R2 is 0.9664822531055421
