**Link with Google Drive**

In [None]:
# Mount Google Drive at /content/gdrive so the dataset files stored on Drive can be read.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


**Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sklearn
import os
import gc
gc.enable()
from operator import itemgetter
from tqdm import tqdm
from scipy import optimize
from sklearn.metrics import mean_squared_error as MSE
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.gaussian_process import kernels
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

**GLOBAL LOG VARIABLE**

In [None]:
# Run-wide text log; GRNN.fit and the imputation loops append to it.
# Entries are separated by the literal two-character token '/n' (not a newline);
# the results-table cell at the end of the notebook splits on that same token.
log = ""

**Utility Functions**

In [None]:
#Establishing directories to read the original as well as the incomplete datasets
def read_subsets_and_original(BASE_PATH, ORIGINAL_BASE_PATH, dataset_name):
  """Load the complete dataset and every incomplete variant of it.

  Args:
    BASE_PATH: Directory that contains one sub-folder per dataset; each
      sub-folder holds the incomplete variants as Excel files.
    ORIGINAL_BASE_PATH: Directory that contains '<dataset_name>.xlsx', the
      complete dataset.
    dataset_name: Name of the dataset (sub-folder name and original file stem).

  Returns:
    (subsets, original): `subsets` maps each variant's file name without its
    extension to a DataFrame; `original` is the complete dataset. All files
    are read with header=None.
  """
  subset_dir = os.path.join(BASE_PATH, dataset_name)
  # os.listdir gives no ordering guarantee; sort so that subsets are processed
  # and logged in the same order on every run.
  subset_files = sorted(os.listdir(subset_dir))
  original = pd.read_excel(os.path.join(ORIGINAL_BASE_PATH, dataset_name + '.xlsx'), header=None)
  original = original.infer_objects()
  subsets = {}
  for file_name in tqdm(subset_files, total=len(subset_files)):
    # splitext removes only the extension, so names that contain dots
    # (e.g. 'X_0.5.xlsx') keep distinct keys instead of collapsing to 'X_0'.
    subset_key = os.path.splitext(file_name)[0]
    subsets[subset_key] = pd.read_excel(os.path.join(subset_dir, file_name), header=None)
  return subsets, original

#Calculate NRMS value (Formula according to the documentation)
def calculate_NRMS(y_true, y_pred):
  """Return the normalised root-mean-square error ||y_pred - y_true||_F / ||y_true||_F.

  Args:
    y_true: 2-D array-like holding the reference (complete) values.
    y_pred: 2-D array-like of the same shape holding the imputed values.

  Returns:
    The ratio of the two Frobenius norms as a NumPy float.
  """
  # Coerce to float arrays so that lists, boolean arrays and object-dtype
  # arrays (e.g. DataFrame.values with mixed column types) are handled too.
  y_true = np.asarray(y_true, dtype=float)
  y_pred = np.asarray(y_pred, dtype=float)

  #CHECK DOCUMENTATION ON https://numpy.org/doc/stable/reference/generated/numpy.linalg.norm.html
  #ord = 'fro' means frobenius norm (requires 2-D input)
  residual_norm = np.linalg.norm(y_pred - y_true, ord='fro')
  reference_norm = np.linalg.norm(y_true, ord='fro')
  return residual_norm / reference_norm

#Calculate AE value (Formula according to the documentation)
def calculate_AE_DICT(y_true, y_pred):
  """Return the fraction of entries where y_pred equals y_true exactly.

  Both inputs are NumPy arrays; they are flattened before the element-wise
  comparison. Example: (1 0 1 0 1) vs (0 1 0 0 1) agree in 2 of 5 places -> 0.4.
  """
  truth_flat = y_true.flatten()
  guess_flat = y_pred.flatten()
  n_matches = truth_flat[truth_flat == guess_flat].shape[0]
  n_total = float(truth_flat.shape[0])
  return n_matches / n_total


**GRNN Implementation**

In [None]:
#Build GRNN Model
class GRNN(BaseEstimator, RegressorMixin):
    """General Regression Neural Network (Nadaraya-Watson kernel regression).

    A prediction is the kernel-weighted average of the stored training
    targets. ``fit`` stores the training data and, depending on
    ``calibration``, tunes the kernel bandwidth ``sigma`` by minimising a
    hold-out mean squared error with ``scipy.optimize.minimize``.

    Parameters
    ----------
    kernel : str
        Name of a kernel class in ``sklearn.gaussian_process.kernels``; it is
        instantiated as ``kernel(length_scale=sigma)``.
    sigma : float or array-like
        Starting bandwidth. ``fit`` overwrites it with the calibrated value.
    n_splits : int
        Number of ``KFold`` splits used to build the validation fold.
    calibration : str
        ``'warm_start'`` runs one search from ``sigma`` and then a second
        search started from that result rounded to 3 decimals;
        ``'gradient_search'`` runs a single search from ``sigma``; any other
        value skips calibration and keeps ``sigma`` as given.
    method : str
        ``method`` argument forwarded to ``scipy.optimize.minimize``.
    bnds : tuple
        ``(low, high)`` bound for the bandwidth.
    n_restarts_optimizer : int
        Number of extra searches started from random bandwidths in [0, 1).
    seed : int
        Seed of the random generator that draws the restart points.
    """

    #Initializing all the elements
    def __init__(self, kernel='RBF', sigma=0.7, n_splits=5, calibration='warm_start', method='L-BFGS-B', bnds=(0, None), n_restarts_optimizer=0, seed = 42):
        self.kernel = kernel
        self.sigma = sigma
        self.n_splits = n_splits
        self.calibration = calibration
        self.method = method
        self.iterations = 0
        self.bnds = bnds
        self.n_restarts_optimizer = n_restarts_optimizer
        self.seed = seed

    def _kernel_regression(self, X_ref, y_ref, X_query, sigma):
        # Nadaraya-Watson estimate at X_query from the (X_ref, y_ref) samples.
        kernel_fn = getattr(kernels, self.kernel)(length_scale=sigma)
        # If the distances are very high/low, zero-densities must be prevented:
        K = np.nan_to_num(kernel_fn(X_ref, X_query))
        psum = np.nan_to_num(K.sum(axis=0).T) # Denominator of the Nadaraya-Watson estimator
        # A zero denominator yields nan/inf here, which nan_to_num maps to finite numbers.
        return np.nan_to_num(np.dot(y_ref.T, K) / psum)

    def fit(self, X, y):
        """Store the training data and calibrate ``sigma``.

        ``X`` and ``y`` are stored without validation and are indexed with
        integer index arrays, so they must be NumPy arrays. Progress messages
        are appended to the module-level ``log`` string. Note that ``fit``
        overwrites ``self.sigma`` and ``self.bnds``.

        Returns
        -------
        self
        """
        self.X_ = X
        self.y_ = y
        bounds = self.bnds

        # NOTE: this changes NumPy's floating-point error handling for the
        # whole process, not only for the duration of fit.
        np.seterr(divide='ignore', invalid='ignore')

        #Initializing and establishing the cost function
        def cost(sigma_):
            # shuffle is off, so the split is deterministic and random_state
            # must not be passed: recent scikit-learn versions raise a
            # ValueError for random_state without shuffle=True.
            kf = KFold(n_splits=self.n_splits)
            # Only the first fold is scored (a single hold-out split), which
            # keeps each cost evaluation cheap; the other folds are not used.
            train_index, validate_index = next(kf.split(self.X_))
            X_tr, X_val = self.X_[train_index], self.X_[validate_index]
            y_tr, y_val = self.y_[train_index], self.y_[validate_index]
            y_pred_ = self._kernel_regression(X_tr, y_tr, X_val, sigma_)
            return MSE(y_val, y_pred_.T)

        #Establishing the optimization function
        def optimization(x0_):
            # The search is collapsed to a single bandwidth: only the first
            # bound and the first component of the starting point are used.
            if len(self.bnds) > 1:
                self.bnds = (self.bnds[0], )
            try:
                if len(x0_) > 1:
                    x0_ = x0_[0]
            except TypeError:
                # x0_ is a scalar / 0-d array and can be used as it is.
                pass

            opt = optimize.minimize(cost, x0_, method=self.method, bounds=self.bnds)
            if opt['success']:
                return [opt['x'], opt['fun']]
            # A failed run gets an infinite cost so it is never preferred over
            # a successful one.
            return [np.full(len(self.X_[0]), np.nan), np.inf]

        #Regulating and calibrating sigma
        def calibrate_sigma():
            if self.n_restarts_optimizer < 0:
                raise ValueError('n_restarts_optimizer must be a positive int!')

            # Starting guess (either user-defined or measured with warm start)
            x0 = np.asarray(self.sigma)
            optima = [optimization(x0)]
            if self.n_restarts_optimizer > 0:
                # Additional runs start from uniformly drawn bandwidths in [0, 1)
                r_s = np.random.RandomState(self.seed)
                for _ in range(self.n_restarts_optimizer):
                    x0_iter = np.full(len(self.X_[0]), np.around(r_s.uniform(0,1), decimals=3))
                    optima.append(optimization(x0_iter))

            # Select sigma from the run minimizing cost
            cost_values = list(map(itemgetter(1), optima))
            self.sigma = optima[np.argmin(cost_values)][0]
            self.cv_error = np.min(cost_values)

        # NOTE: the '/n' separators are the literal two characters, not a
        # newline; the notebook later splits the log on that exact token.
        global log
        # Compare strings by value: identity ('is') only works by accident of
        # string interning and emits a SyntaxWarning on Python 3.8+.
        if self.calibration == 'warm_start':
            log = log + 'Executing warm start...' + '/n'
            self.bnds = (bounds,)
            self.sigma = optimization(np.asarray(self.sigma))[0]
            log = log + 'Warm start concluded. The optimum isotropic sigma is ' + str(self.sigma) + '/n'
            self.sigma = np.full(len(self.X_[0]), np.around(self.sigma, decimals=3))
            self.bnds = (bounds,)*len(self.X_[0])
            calibrate_sigma()
            log = log + 'Gradient search concluded. The optimum sigma is ' + str(self.sigma) + '/n'
        elif self.calibration == 'gradient_search':
            self.sigma = np.full(len(self.X_[0]), self.sigma)
            self.bnds = (bounds,)*len(self.X_[0])
            calibrate_sigma()

        self.is_fitted_ = True
        # Return the regressor
        return self

    #Gathering all the above and predicting the values
    def predict(self, X):
        """Return the kernel-weighted average of the training targets for each row of ``X``."""
        # Input validation
        X = check_array(X)
        return self._kernel_regression(self.X_, self.y_, X, self.sigma)


**SAGA Feature Selection**

In [None]:
############################################
############################################

def SAGA_FEATURE_SELECTION(X_train, y_train, random_state=None):
  """Select predictor columns with a ridge regression fitted by the SAGA solver.

  Args:
    X_train: DataFrame of candidate predictor columns.
    y_train: Series (or DataFrame) holding the target column.
    random_state: Optional seed forwarded to Ridge so that the stochastic
      SAGA solver gives repeatable selections. The default (None) leaves the
      solver unseeded, as before.

  Returns:
    Boolean mask over the columns of X_train, True for the columns kept by
    SelectFromModel with its default threshold.
  """
  ridge_model = Ridge(solver='saga', random_state=random_state)
  selector = SelectFromModel(estimator=ridge_model)
  # Only the support mask is needed, so fit without building the reduced matrix.
  selector.fit(X_train.values, y_train.values)
  return selector.get_support() #SAGA BASED FEATURE SELECTION

############################################
############################################


**Reading Metadata**

In [None]:
#Reading the data to run our model on
# NOTE(review): these paths are relative, while Drive is mounted at '/content/gdrive' above;
# presumably the notebook runs with /content as its working directory - confirm.
BASE = 'gdrive/My Drive/Course Project Datasets/Course Project Datasets/'
# Root of the incomplete variants (read_subsets_and_original lists BASE_PATH + <dataset name>)
BASE_PATH = BASE + 'Incomplete Datasets Without Labels/'
# Root of the complete datasets (read as ORIGINAL_BASE_PATH + <dataset name> + '.xlsx')
ORIGINAL_BASE_PATH = BASE + 'Original Datasets Without Labels/'
meta_data = pd.read_excel(BASE + 'List of Datasets.xlsx')
# Split the dataset list by feature type using the 'Numerical' and 'Categorical' columns:
# no numerical features -> categorical, no categorical features -> numerical, both non-zero -> combined
categorical = meta_data[meta_data['Numerical'] == 0].reset_index(drop=True)
numerical = meta_data[meta_data['Categorical'] == 0].reset_index(drop=True)
combined = meta_data[(meta_data['Numerical'] != 0) & (meta_data['Categorical'] != 0)].reset_index(drop=True)

**Processing Datasets with only Numerical Values**

In [None]:
# Datasets with only numerical features.
# Rename the 'BUPA' abbreviation to 'Bupa' (presumably to match the folder/file name on Drive - confirm).
# A single .loc assignment is used because chained indexing (frame[col][mask] = value)
# may write to a temporary copy and is a silent no-op under pandas Copy-on-Write.
numerical.loc[numerical['Abbreviation'] == 'BUPA', 'Abbreviation'] = 'Bupa'
numerical

Unnamed: 0,Dataset Name,Abbreviation,Instances,Features,Classes,Numerical,Categorical
0,Iris,Iris,150,4,3,4,0
1,Wine,Wine,178,13,3,13,0
2,Glass,Glass,214,9,7,9,0
3,Statlog Heart (CL),Sheart,270,13,2,13,0
4,BUPA liver disorders,Bupa,345,6,2,6,0
5,Ionosphere,Ionosphere,351,34,2,34,0
6,Sonar,Sonar,208,60,2,60,0
7,Four Gaussian,4-gauss,800,12,4,12,0
8,Breast Cancer Wisconsin,BCW,683,9,2,9,0
9,Pima Indians Diabetes,PID,768,8,2,8,0


In [None]:
# Restrict this run to the Iris dataset.
numerical = numerical[numerical['Abbreviation'] == 'Iris'].reset_index(drop=True)
# makedirs(..., exist_ok=True) keeps the cell re-runnable; os.mkdir raises
# FileExistsError when the output folder is already there.
os.makedirs("numericals", exist_ok=True)

In [None]:
# Per-subset metrics, keyed by subset name and filled by the imputation loops below.
NRMS_DICT = {}
AE_DICT = {}

In [None]:
import time
begin_time = time.time()

# Columns whose share of missing values is below this fraction are not imputed.
MIN_MISSING_FRACTION = 0.1

for index, row in tqdm(numerical.iterrows(), total=numerical.shape[0]): #For each numerical dataset
  subsets, original = read_subsets_and_original(BASE_PATH, ORIGINAL_BASE_PATH, row['Abbreviation']) # All incomplete subsets and the complete dataset
  subset_names = list(subsets.keys())
  #ITERATE OVER ALL SUBSETS OF A DATASET AND APPLY GRNN ON EACH ONE
  for each_subset_name in subset_names:

    #SELECTING A SUBSET
    selected_subset = subsets[each_subset_name]

    #FRAME THAT COLLECTS THE IMPUTED VALUES (SAME SHAPE AS THE COMPLETE DATASET)
    new_prediction = pd.DataFrame(data=np.zeros(shape=original.shape), columns=selected_subset.columns)

    #COLUMNS ARRAY TO ITERATE
    all_cols = np.array(original.columns)
    for each in tqdm(all_cols, total=len(all_cols)):

      #ONE COLUMN IS THE TARGET AND ALL OTHERS ARE PREDICTORS
      train_cols = all_cols[all_cols != each]
      test_col = each

      #ROWS WHERE THE TARGET IS MISSING (TO PREDICT) / OBSERVED (TO TRAIN ON)
      nulls = selected_subset[each].isnull()
      test_index = nulls[nulls].index
      train_index = nulls[~nulls].index

      #IF NOTHING (OR LESS THAN THE THRESHOLD) IS MISSING WE DO NOT APPLY GRNN
      # NOTE(review): in the below-threshold case the missing cells are filled
      # from `original`, i.e. from the ground truth, so they contribute zero
      # error to the NRMS computed below - confirm that this is intended.
      if test_index.shape[0] == 0 or test_index.shape[0] / float(nulls.shape[0]) < MIN_MISSING_FRACTION:
        new_prediction[each] = original[each].copy()

      else:
        #TRAIN GRNN ON ROWS WHERE THE TARGET IS OBSERVED AND PREDICT THE MISSING ONES
        # NOTE(review): predictor values are taken from `original` (complete
        # data), not from the incomplete subset - confirm that this is intended.
        custom_GRNN = GRNN()
        SAGA_BASED_FEATURES = SAGA_FEATURE_SELECTION(original[train_cols].loc[train_index], original[test_col].loc[train_index]) #SAGA
        selected_cols = train_cols[SAGA_BASED_FEATURES]

        train_X = original[selected_cols].loc[train_index].values
        train_Y = original[test_col].loc[train_index].values
        test_X = original[selected_cols].loc[test_index].values

        #Normalization (scaler statistics come from the training rows only)
        normalizer = StandardScaler()
        normalizer.fit(train_X, train_Y)
        normalizer_train_X = normalizer.transform(train_X)
        normalizer_test_X = normalizer.transform(test_X)

        custom_GRNN.fit(normalizer_train_X, train_Y)

        #PREDICT
        prediction_smothened = custom_GRNN.predict(normalizer_test_X)

        #FILL OUR SAVING FRAME: OBSERVED VALUES AS THEY ARE, PREDICTIONS ELSEWHERE
        # One .loc[rows, column] assignment writes into the frame itself; the
        # chained form frame[col].loc[rows] = ... may write to a temporary
        # copy and is a silent no-op under pandas Copy-on-Write.
        new_prediction.loc[train_index, each] = selected_subset.loc[train_index, each]
        new_prediction.loc[test_index, each] = prediction_smothened

    new_prediction.to_csv("numericals/imputed_" + each_subset_name + ".csv", index=False)
    NRMSE = calculate_NRMS(original.values, new_prediction.values)
    # '/n' is the literal separator that the results-table cell splits on; do not change it to a newline.
    log = log + "Done Smoothing of : " + each_subset_name + " with NRMS : " + str(NRMSE) + '/n/n/n'
    NRMS_DICT[each_subset_name] = NRMSE

end_time = time.time()
diff = end_time - begin_time
print(diff)

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/44 [00:00<?, ?it/s][A
 18%|█▊        | 8/44 [00:00<00:00, 79.31it/s][A
 34%|███▍      | 15/44 [00:00<00:00, 75.52it/s][A
 50%|█████     | 22/44 [00:00<00:00, 72.58it/s][A
 66%|██████▌   | 29/44 [00:00<00:00, 71.51it/s][A
100%|██████████| 44/44 [00:00<00:00, 71.96it/s]

100%|██████████| 4/4 [00:00<00:00, 70.78it/s]

100%|██████████| 4/4 [00:00<00:00, 329.09it/s]

100%|██████████| 4/4 [00:00<00:00, 68.07it/s]

100%|██████████| 4/4 [00:00<00:00, 317.01it/s]

100%|██████████| 4/4 [00:00<00:00, 95.03it/s]

100%|██████████| 4/4 [00:00<00:00, 59.53it/s]

100%|██████████| 4/4 [00:00<00:00, 114.61it/s]

100%|██████████| 4/4 [00:00<00:00, 45.11it/s]

100%|██████████| 4/4 [00:00<00:00, 304.97it/s]

100%|██████████| 4/4 [00:00<00:00, 57.95it/s]

100%|██████████| 4/4 [00:00<00:00, 88.19it/s]

100%|██████████| 4/4 [00:00<00:00, 70.49it/s]

100%|██████████| 4/4 [00:00<00:00, 259.75it/s]

100%|██████████| 4/4 [00:00<00:00, 71.20it/s]

100%|

2.8924708366394043





In [None]:
import zipfile

def zipdir(path, ziph):
    """Write every file found under `path` (recursively) into the open zip handle `ziph`."""
    member_paths = (
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(path)
        for name in names
    )
    for member_path in member_paths:
        ziph.write(member_path)


In [None]:

# The context manager closes (and thereby finalises) the archive even if zipdir raises.
with zipfile.ZipFile('numericals.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipdir('numericals/', zipf)

**Categorical Features**

In [None]:
# Datasets with only categorical features.
# Rename the 'TTTEG' abbreviation to 'TTTTEG' (presumably to match the folder/file name on Drive - confirm).
# A single .loc assignment is used because chained indexing (frame[col][mask] = value)
# may write to a temporary copy and is a silent no-op under pandas Copy-on-Write.
categorical.loc[categorical['Abbreviation'] == 'TTTEG', 'Abbreviation'] = 'TTTTEG'
categorical

Unnamed: 0,Dataset Name,Abbreviation,Instances,Features,Classes,Numerical,Categorical
0,Tic-Tac-Toe End game,TTTTEG,958,9,2,0,9
1,House Votes,HOV,234,16,2,0,16
2,Mushroom,MUSH,5644,22,2,0,22
3,Splice,Splice,3190,60,3,0,60
4,Connect-4,C4,67557,42,3,0,42


In [None]:
# Restrict this run to the HOV dataset.
categorical = categorical[categorical['Abbreviation'] == 'HOV'].reset_index(drop=True)
# makedirs(..., exist_ok=True) keeps the cell re-runnable; os.mkdir raises
# FileExistsError when the output folder is already there.
os.makedirs("categorical", exist_ok=True)

In [None]:
import time
begin_time = time.time()
for index, row in tqdm(categorical.iterrows(), total=categorical.shape[0]): #For each categorical dataset
  subsets, original = read_subsets_and_original(BASE_PATH, ORIGINAL_BASE_PATH, row['Abbreviation']) # Get All Subsets and Original Dataset
  # NOTE(review): this list is rebuilt inside the subset loop before it is ever used,
  # so the names computed here have no effect.
  new_columns = []
  for each in original.columns:
    x_each = str(each)
    new_columns.append(x_each.replace(" ", ""))

  # Untouched copy of the complete dataset; `original` is reassigned to the encoded frame below.
  xoriginal = original.copy()
  
  subset_names = list(subsets.keys())
  #ITERATE OVER ALL SUBSETS OF A DATASET AND APPLY GRNN ON EACH ONE
  for each_subset_name in subset_names:

    #SELECTING A SUBSET
    selected_subset = subsets[each_subset_name]
    # Stack subset and complete data so both receive the same encoded columns,
    # then split them apart again by row position.
    ss = pd.concat([selected_subset, xoriginal])
    ss = pd.get_dummies(ss)#APPLY ONE HOT ENCODING
    selected_subset = ss[0:selected_subset.shape[0]]
    original = ss[selected_subset.shape[0] :]
    
    # Column labels become strings with all spaces removed (both frames).
    new_columns = []
    for each in original.columns:
      x_each = str(each)
      new_columns.append(x_each.replace(" ", ""))
    original.columns = new_columns


    new_columns = []
    for each in selected_subset.columns:
      x_each = str(each)
      new_columns.append(x_each.replace(" ", ""))
    selected_subset.columns = new_columns



    new_prediction = np.zeros(shape=original.shape) #SAMPLE ARRAY TO SAVE PREDICTIONS
    new_prediction = pd.DataFrame(data = new_prediction, columns=selected_subset.columns) 


    #COLUMNS ARRAY TO ITERATE
    all_cols = np.array(original.columns) 
    for each in all_cols:



      #ONE COLUMN IN TEST AND OTHERS IN TRAINING
      train_cols = all_cols[all_cols != each] 
      test_col = each

      #CHECKING IF THERE ARE NULL VALUES IN OUR TEST COLUMNS
      nulls = selected_subset[each].isnull() 
      # If the label selects more than one column (duplicate names), keep only the second column's mask.
      if len(nulls.shape) > 1:
        if nulls.shape[1] > 1:
          nulls = pd.DataFrame(nulls.values[:, 1], columns=[each])
      test_index = nulls[nulls == True].index
      train_index = nulls[nulls == False].index


      #IF NO VALUE (OR LESS THAN 10% OF THE VALUES) IS NULL THEN WE DO NOT APPLY GRNN
      # NOTE(review): the column is copied from `original` (the complete data) in this case.
      if test_index.shape[0] == 0 or test_index.shape[0] / float(nulls.shape[0]) < 0.1:
        new_prediction[each] = original[each].copy()

      # NOTE(review): this condition compares test_index.shape[0] with itself, so it is
      # always True: every column that reaches this point is filled with 0 and the GRNN
      # branch below is never executed. Possibly `nulls.shape[0]` (all values missing)
      # was intended on the right-hand side - confirm before changing.
      elif test_index.shape[0] == test_index.shape[0]:
        new_prediction[each] = 0

      else:
        #TRAIN GRNN ON INDEX WHERE THERE IS NO NULL AND PREDICT ON NULL VALUES
        custom_GRNN = GRNN()
        custom_GRNN.fit(original[train_cols].loc[train_index].values, original[test_col].loc[train_index].values)

        #PREDICT
        prediction_smothened = custom_GRNN.predict(original[train_cols].loc[test_index].values)

        #FILL OUR SAVING ARRAY WITH PREDICTIONS
        # NOTE(review): frame[col].loc[rows] = ... is chained assignment; it may not write
        # through to new_prediction under pandas Copy-on-Write.
        new_prediction[each].loc[train_index] = selected_subset[each].loc[train_index]
        # 2-D prediction: keep its first row and collapse the duplicated target column to one column.
        if len(prediction_smothened.shape) > 1:
          if prediction_smothened.shape[0] > 1 and prediction_smothened.shape[1] > 1:
            prediction_smothened = prediction_smothened[0, :]
            cols = new_prediction.columns
            x = new_prediction.pop(each)
            new_prediction[each] = x.values[:, 0]
            new_prediction = new_prediction[cols]
        
        new_prediction[each].loc[test_index] = prediction_smothened

    new_prediction.to_csv("categorical/imputed_" + each_subset_name + ".csv", index=False)
    AE = calculate_AE_DICT(original.values, new_prediction.values)
    # '/n' is the literal separator that the results-table cell splits on (not a newline).
    log = log + "Done Smoothing of : " + each_subset_name + " with AE : " + str(AE) + '/n/n/n'
    AE_DICT[each_subset_name] = AE


end_time = time.time()
diff = end_time - begin_time
print(diff) 

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/44 [00:00<?, ?it/s][A
  2%|▏         | 1/44 [00:00<00:20,  2.13it/s][A
  5%|▍         | 2/44 [00:00<00:19,  2.10it/s][A
  7%|▋         | 3/44 [00:01<00:19,  2.10it/s][A
  9%|▉         | 4/44 [00:01<00:20,  1.99it/s][A
 11%|█▏        | 5/44 [00:02<00:19,  1.99it/s][A
 14%|█▎        | 6/44 [00:03<00:20,  1.90it/s][A
 16%|█▌        | 7/44 [00:03<00:19,  1.92it/s][A
 18%|█▊        | 8/44 [00:04<00:17,  2.02it/s][A
 20%|██        | 9/44 [00:04<00:16,  2.09it/s][A
 23%|██▎       | 10/44 [00:04<00:15,  2.14it/s][A
 25%|██▌       | 11/44 [00:05<00:15,  2.17it/s][A
 27%|██▋       | 12/44 [00:05<00:14,  2.17it/s][A
 30%|██▉       | 13/44 [00:06<00:14,  2.18it/s][A
 32%|███▏      | 14/44 [00:06<00:13,  2.17it/s][A
 34%|███▍      | 15/44 [00:07<00:13,  2.18it/s][A
 36%|███▋      | 16/44 [00:07<00:12,  2.23it/s][A
 39%|███▊      | 17/44 [00:08<00:12,  2.22it/s][A
 41%|████      | 18/44 [00:08<00:11,  2.24it/s][A
 43%|████▎ 

25.23512864112854





In [None]:
# k : k belongs to (1, n) #SAGA FEATURE SELECTION

In [None]:
# The context manager closes (and thereby finalises) the archive even if zipdir raises.
with zipfile.ZipFile('categorical.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipdir('categorical/', zipf)

**Combined Numerical and Categorical**

In [None]:
# Datasets with both numerical and categorical features.
# NOTE: this rename maps 'Credit' onto itself, so it currently changes nothing; it is kept
# in the same .loc form as the other rename cells in case the target name has to be adjusted.
combined.loc[combined['Abbreviation'] == 'Credit', 'Abbreviation'] = 'Credit'
combined

In [None]:
# Restrict this run to the Aheart dataset.
combined = combined[combined['Abbreviation'] == 'Aheart'].reset_index(drop=True)
# makedirs(..., exist_ok=True) keeps the cell re-runnable; os.mkdir raises
# FileExistsError when the output folder is already there.
os.makedirs("combined", exist_ok=True)

In [None]:
import time
begin_time = time.time()
for index, row in tqdm(combined.iterrows(), total=combined.shape[0]): #For each combined (numerical + categorical) dataset
  subsets, original = read_subsets_and_original(BASE_PATH, ORIGINAL_BASE_PATH, row['Abbreviation']) # Get All Subsets and Original Dataset
  # Split the columns by dtype (object dtype = categorical) and reorder: numerical first, then categorical.
  dts = original.dtypes
  numerical_columns = dts[dts != 'O'].index.values
  cat_columns = dts[dts == 'O'].index.values
  original = original[numerical_columns.tolist() + cat_columns.tolist()]
  # Untouched copy of the reordered complete dataset; `original` is reassigned to the encoded frame below.
  xoriginal = original.copy()
  
  subset_names = list(subsets.keys())
  #ITERATE OVER ALL SUBSETS OF A DATASET AND APPLY GRNN ON EACH ONE
  for each_subset_name in subset_names:

    #SELECTING A SUBSET
    selected_subset = subsets[each_subset_name]
    selected_subset = selected_subset[numerical_columns.tolist() + cat_columns.tolist()]

    # Stack subset and complete data so both receive the same encoded columns,
    # then split them apart again by row position.
    ss = pd.concat([selected_subset, xoriginal])
    ss = pd.get_dummies(ss)#APPLY ONE HOT ENCODING
    selected_subset = ss[0:selected_subset.shape[0]]
    original = ss[selected_subset.shape[0] :]
    
    # Column labels become strings with all spaces removed (both frames).
    new_columns = []
    for each in original.columns:
      x_each = str(each)
      new_columns.append(x_each.replace(" ", ""))
    original.columns = new_columns

    new_columns = []
    for each in selected_subset.columns:
      x_each = str(each)
      new_columns.append(x_each.replace(" ", ""))
    selected_subset.columns = new_columns




    new_prediction = np.zeros(shape=original.shape) #SAMPLE ARRAY TO SAVE PREDICTIONS
    new_prediction = pd.DataFrame(data = new_prediction, columns=selected_subset.columns) 


    #COLUMNS ARRAY TO ITERATE
    all_cols = np.array(original.columns) 
    for each in all_cols:



      #ONE COLUMN IN TEST AND OTHERS IN TRAINING
      train_cols = all_cols[all_cols != each] 
      test_col = each

      #CHECKING IF THERE ARE NULL VALUES IN OUR TEST COLUMNS
      nulls = selected_subset[each].isnull() 
      # If the label selects more than one column (duplicate names), keep only the second column's mask.
      if len(nulls.shape) > 1:
        if nulls.shape[1] > 1:
          nulls = pd.DataFrame(nulls.values[:, 1], columns=[each])
      test_index = nulls[nulls == True].index
      train_index = nulls[nulls == False].index


      #IF NO VALUE (OR LESS THAN 10% OF THE VALUES) IS NULL THEN WE DO NOT APPLY GRNN
      # NOTE(review): the column is copied from `original` (the complete data) in this case.
      if test_index.shape[0] == 0 or test_index.shape[0] / float(nulls.shape[0]) < 0.1:
        new_prediction[each] = original[each].copy()

      # NOTE(review): this condition compares test_index.shape[0] with itself, so it is
      # always True: every column that reaches this point is filled with 0 and the GRNN
      # branch below is never executed. Possibly `nulls.shape[0]` (all values missing)
      # was intended on the right-hand side - confirm before changing.
      elif test_index.shape[0] == test_index.shape[0]:
        new_prediction[each] = 0

      else:
        #TRAIN GRNN ON INDEX WHERE THERE IS NO NULL AND PREDICT ON NULL VALUES
        custom_GRNN = GRNN()
        custom_GRNN.fit(original[train_cols].loc[train_index].values, original[test_col].loc[train_index].values)

        #PREDICT
        prediction_smothened = custom_GRNN.predict(original[train_cols].loc[test_index].values)

        #FILL OUR SAVING ARRAY WITH PREDICTIONS
        # NOTE(review): frame[col].loc[rows] = ... is chained assignment; it may not write
        # through to new_prediction under pandas Copy-on-Write.
        new_prediction[each].loc[train_index] = selected_subset[each].loc[train_index]
        # 2-D prediction: keep its first row and collapse the duplicated target column to one column.
        if len(prediction_smothened.shape) > 1:
          if prediction_smothened.shape[0] > 1 and prediction_smothened.shape[1] > 1:
            prediction_smothened = prediction_smothened[0, :]
            cols = new_prediction.columns
            x = new_prediction.pop(each)
            new_prediction[each] = x.values[:, 0]
            new_prediction = new_prediction[cols]
        

        new_prediction[each].loc[test_index] = prediction_smothened



    # NRMS is computed over every encoded column.
    NRMSE = calculate_NRMS(original.values, new_prediction.values)
    # '/n' is the literal separator that the results-table cell splits on (not a newline).
    log = log + "Done Smoothing of : " + each_subset_name + " with NRMS : " + str(NRMSE) + '/n/n/n'
    NRMS_DICT[each_subset_name] = NRMSE

    # Collect the columns to score with AE: those not listed in numerical_columns.
    # NOTE(review): original.columns were converted to strings above, while numerical_columns
    # holds the labels of the frame as it was read (header=None, so presumably integers);
    # if so, no name ever matches and AE is computed over ALL columns, numerical ones
    # included - verify.
    ccat_cols = []
    ocols = original.columns
    for each_cc in ocols:
      if each_cc not in numerical_columns.tolist():
          ccat_cols.append(each_cc)
    AE = calculate_AE_DICT(original[ccat_cols].values, new_prediction[ccat_cols].values)
    log = log + "Done Smoothing of : " + each_subset_name + " with AE : " + str(AE) + '/n/n/n'
    AE_DICT[each_subset_name] = AE  
    new_prediction.to_csv("combined/imputed_" + each_subset_name + ".csv", index=False)

end_time = time.time()
diff = end_time - begin_time
print(diff)


In [None]:
# The context manager closes (and thereby finalises) the archive even if zipdir raises.
with zipfile.ZipFile('combined.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipdir('combined/', zipf)

In [None]:
# Dump the accumulated run log; entries are separated by the literal '/n' token, so it prints as one long line.
print (log)

Executing warm start.../nWarm start concluded. The optimum isotropic sigma is [0.50951578]/nGradient search concluded. The optimum sigma is [0.5095154]/nExecuting warm start.../nWarm start concluded. The optimum isotropic sigma is [0.24018485]/nGradient search concluded. The optimum sigma is [0.24]/nDone Smoothing of : Iris_AE_10 with NRMS : 0.023104781211727415/n/n/nDone Smoothing of : Iris_AE_1 with NRMS : 0.0/n/n/nExecuting warm start.../nWarm start concluded. The optimum isotropic sigma is [0.06935079]/nGradient search concluded. The optimum sigma is [0.069]/nExecuting warm start.../nWarm start concluded. The optimum isotropic sigma is [0.]/nGradient search concluded. The optimum sigma is [0.]/nDone Smoothing of : Iris_AE_20 with NRMS : 0.10312658936206778/n/n/nDone Smoothing of : Iris_AG_1 with NRMS : 0.0/n/n/nExecuting warm start.../nWarm start concluded. The optimum isotropic sigma is [0.15651098]/nGradient search concluded. The optimum sigma is [0.15650383]/nDone Smoothing of :

In [None]:
# Results table; the next cell fills its 'NRMS' and 'AE' columns for the rows whose 'Datasets' value matches a logged subset name.
df = pd.read_excel('gdrive/My Drive/Course Project Datasets/Course Project Datasets/Table.xlsx')

In [None]:
#Code to update and save
# A result entry has the form "Done Smoothing of : <subset> with <METRIC> : <value>".
# Split on single spaces it yields 9 tokens: subset name at [4], metric at [6], value at [8].
logs = log.split('/n')
for each in logs:
    x = each.split(' ')
    # Skip everything that is not a complete result entry (e.g. the optimiser
    # messages or an entry that was cut short) instead of failing on x[6] / x[8].
    if len(x) < 9 or x[0] != 'Done':
        continue
    metric = x[6]
    if metric in ('NRMS', 'AE'):
        # One .loc[rows, column] assignment writes into df itself; the chained
        # form df[col][mask] = ... may write to a temporary copy and is a
        # silent no-op under pandas Copy-on-Write.
        df.loc[df['Datasets'] == str(x[4]), metric] = float(x[8])
df.to_excel("Table.xlsx")

In [None]:
##Analysis of the Project##

The complete pipeline has an upper-bound complexity of O(n^2).

The model scales well, especially when the computation is parallelized or distributed.

HOV was the fastest dataset to process (13.78 seconds), whereas the Letter dataset took
2.75 hours to execute.

The Sonar dataset has the lowest percentage of missing data (20%), and the Letter dataset has the
highest percentage of missing data (80%).