### Imports

In [35]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

In [56]:
def weightedKnnImpute(X, k):
    #Input: X:= pandas dataframe with uknown values set to NaN
    #       k:= number of nearest neighbors sought
    
    numberOfFeatures = X.shape[1]
    
    for index1, row1 in X.iterrows():
        for i in range(numberOfFeatures):
            if np.isnan(row1[i]):

                #calculate distances
                distArray = []
                for index2, row2 in X.iterrows():
                    distance = 0
                    if(index1 != index2):
                        for j in range(numberOfFeatures):
                            if( not np.isnan(row1[j]) and not np.isnan(row2[j])):
                                #print(str(X.loc[index1, j]) + " " + str(row2[j]))

                                distance += np.square(X.loc[index1, j] - row2[j])

                                #print(distance)

                        #invert distance
                        distArray.append(1 / np.sqrt(distance))
                    else:
                        distArray.append(0)

                #find nearest neighbors and weight their values 
                #print(distArray)
                result = 0
                values = []
                totalWeight = 0
                weights = []
                knn = k
                
                for l in range(knn):
                    if not np.isnan(X.loc[np.argmax(distArray), i]):
                        values.append(X.loc[np.argmax(distArray), i])
                        totalWeight += distArray[np.argmax(distArray)]
                        weights.append(distArray[np.argmax(distArray)])
                        distArray[np.argmax(distArray)] = 0
                    else:
                        distArray[np.argmax(distArray)] = 0
                        knn += 1
                
                #print()
                
                for m in range(len(values)):
                    result += values[m] * (weights[m] / totalWeight)
                    
                    #print (str(values[m])+ " " + str(weights[m]) + "//" + str(totalWeight) + "=" + str(weights[m] / totalWeight))

                #print(result)
                    
                row1[i] = result
                
    return X
    

## Estimation Methods Comparison

In [65]:
# put data in pandas dataframe
data = 'datasets/MissingValueEstimation/MissingData1.txt'
X = pd.read_csv(data, sep='\s+', header=None)
X = X.transpose()

# header
print('\n''*** MissingData1 ***''')
print('Number of Samples: ' + str(X.shape[0]))
numberOfFeatures = X.shape[1]
print('Number of Features: ' + str(X.shape[1]))
print("\nOriginal Data: \n")

# set uknowns in dataset to NaN
X = X[X < 1.00000000000000e+99]
print(str(X))

# Remove Values for MSE Test
Xtest = X.copy()
for i in range(100):
    Xtest.loc[np.random.randint(0, 14), np.random.randint(0, 240)] = np.nan
    
    
# linear Interpolation
Xresult = Xtest.copy()
Xresult = Xresult.interpolate(limit_direction='both')
print("\nInterpolation w/ limit MSE: " + str(mean_squared_error(X.combine_first(Xresult), Xresult)))

# Slinear Interpolation
Xresult = Xtest.copy()
Xresult = Xresult.interpolate(method='slinear', limit_direction='both')
Xresult = Xresult.fillna(Xresult.mean())
print("\nslinear Interpolation MSE: " + str(mean_squared_error(X.combine_first(Xresult), Xresult)))

# Mean Imputation
Xresult = Xtest.copy()
Xresult = Xresult.fillna(Xresult.mean())
print("\nMean Imputation MSE: " + str(mean_squared_error(X.combine_first(Xresult), Xresult)))

# Weighted KNN Imputation
Xresult = Xtest.copy()
Xresult = weightedKnnImpute(Xresult, 4)
print("\nWeighted KNN Imputation MSE: " + str(mean_squared_error(X.combine_first(Xresult), Xresult)))



*** MissingData1 ***
Number of Samples: 14
Number of Features: 242

Original Data: 

     0     1     2     3     4     5     6     7     8     9    ...   232  \
0  -0.11 -0.30  0.50  0.00  0.40  0.39  0.50 -0.52 -0.87 -1.39  ...  0.19   
1   0.02 -0.37  0.18 -0.11 -0.16 -0.18 -0.24 -0.48 -0.17 -0.57  ... -0.27   
2  -0.36 -0.18  0.41   NaN  0.31 -0.09 -0.16 -0.73  0.10 -0.16  ... -0.43   
3  -0.11 -0.09   NaN  0.19 -0.34 -0.04   NaN -0.42 -0.12 -0.02  ... -0.12   
4   0.48 -0.16   NaN  0.00  0.02 -0.25  0.04 -0.40   NaN  0.30  ... -0.07   
5  -0.20 -0.16  0.15 -0.07  0.21 -0.69   NaN -0.01  0.52  0.41  ...  0.21   
6   0.27 -0.10 -0.25  0.18 -0.70 -0.64 -0.11 -0.34 -0.28  0.03  ... -0.01   
7   0.29 -0.09 -0.41  0.18  0.11 -0.38  0.10  0.55  0.50  0.50  ...  0.24   
8  -0.34  0.46 -0.07 -0.23  0.08  0.41 -0.04   NaN  0.09  0.48  ...  0.04   
9  -0.05  0.32 -0.13 -0.16  0.05  0.33 -0.01  0.37 -0.01  0.51  ...  0.01   
10  0.23  0.02 -0.15  0.03 -0.09 -0.62 -0.04 -0.29  0.08 -0.11  ...

## Test Interpolation

## Average KNN Imputation

In [3]:
#read in CSV
for i in range(1,2):
    # put data in pandas dataframe
    data = 'datasets/MissingValueEstimation/MissingData{}.txt'.format(i, i)
    X = pd.read_csv(data, sep='\s+', header=None)
    X = X.transpose()
    
    #header
    print('\n''*** MissingData{} ***'''.format(i))
    print('Number of Samples: ' + str(X.shape[0]))
    numberOfFeatures = X.shape[1]
    print('Number of Features: ' + str(X.shape[1]))
    print("\nbefore: \n")
    
    #set uknowns to NaN
    X = X[X < 1.00000000000000e+99]
    print(str(X))
    
    
    #fancyimpute
#    fancyimpute.kNN(k=3).complete(X)
    
    
    #fill missing values using interpolation
#    Xinterpolated = X.interpolate(method='linear')
#    print("\nafter: \n" + str(Xinterpolated))
    
    #weighted knn implimentation
    for index1, row1 in X.iterrows():
        for i in range(numberOfFeatures):
            if np.isnan(row1[i]):
                
                #calculate distances
                distArray = []
                for index2, row2 in X.iterrows():
                    distance = 0
                    if(index1 != index2):
                        for j in range(numberOfFeatures):
                            if( not np.isnan(row1[j]) and not np.isnan(row2[j])):
                                #print(str(X.loc[index1, j]) + " " + str(row2[j]))
                                
                                distance += np.square(X.loc[index1, j] - row2[j])
        
                                #print(distance)
                                
                        distArray.append(np.sqrt(distance))
                    else:
                        distArray.append(999999999)
                        
                #find k=3 nearest neighbors and average their values 
                print(distArray)
                value = 0
                k=3
                for l in range(k):
                    if not np.isnan(X.loc[np.argmin(distArray), i]):
                        value += X.loc[np.argmin(distArray), i] / k 
                        distArray[np.argmin(distArray)] = 999999999
                    else:
                        k += 1
                row1[i] = value
                
                print(distArray)
                
    print("\nafter: \n" + str(X))


*** MissingData1 ***
Number of Samples: 14
Number of Features: 242

before: 

     0     1     2     3     4     5     6     7     8     9    ...   232  \
0  -0.11 -0.30  0.50  0.00  0.40  0.39  0.50 -0.52 -0.87 -1.39  ...  0.19   
1    NaN -0.37  0.18 -0.11 -0.16 -0.18 -0.24 -0.48 -0.17 -0.57  ... -0.27   
2  -0.36   NaN  0.41   NaN  0.31 -0.09 -0.16 -0.73  0.10 -0.16  ... -0.43   
3  -0.11 -0.09   NaN  0.19 -0.34 -0.04   NaN -0.42 -0.12 -0.02  ... -0.12   
4   0.48 -0.16   NaN  0.00  0.02 -0.25  0.04 -0.40   NaN  0.30  ... -0.07   
5  -0.20 -0.16  0.15 -0.07  0.21 -0.69   NaN -0.01  0.52  0.41  ...  0.21   
6   0.27 -0.10 -0.25  0.18 -0.70 -0.64 -0.11 -0.34 -0.28  0.03  ... -0.01   
7   0.29 -0.09 -0.41  0.18  0.11 -0.38  0.10  0.55  0.50  0.50  ...  0.24   
8  -0.34  0.46 -0.07 -0.23  0.08  0.41 -0.04   NaN  0.09  0.48  ...  0.04   
9  -0.05  0.32 -0.13 -0.16  0.05  0.33 -0.01  0.37 -0.01  0.51  ...  0.01   
10  0.23  0.02 -0.15  0.03 -0.09 -0.62 -0.04 -0.29  0.08 -0.11  ... -0.16 

[9.176743552166108, 999999999, 5.775615407325294, 6.802225126138264, 8.38007557642929, 8.372570294320218, 7.554626985423494, 7.122174293477145, 7.064516968625667, 7.357488248942932, 6.416878784372768, 6.381264242556748, 7.15130058101322, 6.301335837212084]
[9.176743552166108, 999999999, 999999999, 6.802225126138264, 8.38007557642929, 8.372570294320218, 7.554626985423494, 7.122174293477145, 7.064516968625667, 7.357488248942932, 6.416878784372768, 999999999, 7.15130058101322, 999999999]
[9.17871027008818, 999999999, 5.776039588968667, 6.802585292862319, 8.383511595188898, 8.373054002771829, 7.560984650750778, 7.122174293477145, 7.071315294908012, 7.358038687947579, 6.4206489806976155, 6.381389608332447, 7.152670829836922, 6.301407250236515]
[9.17871027008818, 999999999, 999999999, 6.802585292862319, 8.383511595188898, 8.373054002771829, 7.560984650750778, 7.122174293477145, 7.071315294908012, 7.358038687947579, 6.4206489806976155, 999999999, 7.152670829836922, 999999999]
[9.1827791726325

[10.90233665066551, 6.872920291501524, 6.089716286038654, 999999999, 5.544314405065987, 6.320852614780521, 5.630238795014727, 6.238869199533589, 6.686350939704621, 6.515544660442611, 6.475167093511094, 7.469407978920116, 7.889973947288351, 6.2337192393911645]
[10.90233665066551, 6.872920291501524, 999999999, 999999999, 999999999, 6.320852614780521, 999999999, 6.238869199533589, 6.686350939704621, 6.515544660442611, 6.475167093511094, 7.469407978920116, 7.889973947288351, 6.2337192393911645]
[10.926972234897564, 6.876547906552471, 6.091646374795202, 999999999, 5.544940636892938, 6.3403356027544415, 5.63521073252811, 6.241562304423469, 6.701032258789187, 6.516908776406188, 6.478153028963323, 7.496210153582049, 7.907979092874068, 6.236414568216794]
[10.926972234897564, 6.876547906552471, 6.091646374795202, 999999999, 999999999, 6.3403356027544415, 5.63521073252811, 6.241562304423469, 6.701032258789187, 6.516908776406188, 6.478153028963323, 7.496210153582049, 7.907979092874068, 6.236414568

[13.047954714139003, 8.469813719583476, 7.064148136108762, 6.349920384452774, 3.9888260595485154, 999999999, 4.572628711510846, 5.602079970867963, 7.0436259601619025, 5.803128466611782, 4.961357341158432, 8.354549658718893, 9.222580983650943, 7.594026160959237]
[13.047954714139003, 8.469813719583476, 7.064148136108762, 6.349920384452774, 999999999, 999999999, 999999999, 5.602079970867963, 7.0436259601619025, 5.803128466611782, 999999999, 8.354549658718893, 9.222580983650943, 7.594026160959237]
[13.048092666065115, 8.490338299764295, 7.133182241390506, 6.372839939060831, 4.029346514428033, 999999999, 4.59988405650983, 5.6520615707899005, 7.068293051838377, 5.84400547569901, 5.0754277323853865, 8.382994691636156, 9.222580983650943, 7.621793314787098]
[13.048092666065115, 8.490338299764295, 7.133182241390506, 6.372839939060831, 999999999, 999999999, 999999999, 5.6520615707899005, 7.068293051838377, 5.84400547569901, 999999999, 8.382994691636156, 9.222580983650943, 7.621793314787098]
[13.0

[11.762368431192382, 7.223436393665644, 6.494937344664004, 6.3085946665376005, 6.102375675824039, 5.690092754565215, 5.223690691881022, 999999999, 4.035737575434975, 3.7100209643013553, 4.50773655939307, 4.977833085010206, 6.403693378737686, 4.602827633338253]
[11.762368431192382, 7.223436393665644, 6.494937344664004, 6.3085946665376005, 6.102375675824039, 5.690092754565215, 5.223690691881022, 999999999, 999999999, 999999999, 999999999, 4.977833085010206, 6.403693378737686, 4.602827633338253]
[11.766109051943308, 7.226023187833755, 6.497814162384832, 6.330264168614485, 6.102991069959058, 5.693844629656368, 5.224712644942515, 999999999, 4.035759600813154, 3.71614944084151, 4.513431805917384, 5.010741794717958, 6.416382677698288, 4.60326333521485]
[11.766109051943308, 7.226023187833755, 6.497814162384832, 6.330264168614485, 6.102991069959058, 5.693844629656368, 5.224712644942515, 999999999, 999999999, 999999999, 999999999, 5.010741794717958, 6.416382677698288, 4.60326333521485]
[11.80796

[10.898091066278024, 6.449646846490469, 6.105664946952496, 6.508948541132517, 5.769231606837545, 5.09503680065218, 4.798437245604031, 4.531560928029594, 5.560176300606587, 5.0036353450932705, 999999999, 6.076421644356157, 7.114119762837845, 5.452687410809461]
[10.898091066278024, 6.449646846490469, 6.105664946952496, 6.508948541132517, 5.769231606837545, 5.09503680065218, 999999999, 999999999, 5.560176300606587, 999999999, 999999999, 6.076421644356157, 7.114119762837845, 5.452687410809461]
[10.905926523378623, 6.450102497032293, 6.105709531978152, 6.509195205416891, 5.771935935580404, 5.111240336530633, 4.804773089797733, 4.533238969606119, 5.565944508491342, 5.005155120251295, 999999999, 6.092572892009124, 7.122888771028538, 5.455441727710454]
[10.905926523378623, 6.450102497032293, 6.105709531978152, 6.509195205416891, 5.771935935580404, 5.111240336530633, 999999999, 999999999, 5.565944508491342, 999999999, 999999999, 6.092572892009124, 7.122888771028538, 5.455441727710454]
[10.93168

[10.342249916402782, 6.52185045315617, 7.096315945615725, 7.662419113221792, 8.683585409009087, 8.52791820369361, 7.412778756234999, 5.075043513770761, 4.020069712558125, 5.095830539655817, 6.262942155604218, 999999999, 4.748342400833742, 4.050043895509722]
[10.342249916402782, 6.52185045315617, 7.096315945615725, 7.662419113221792, 8.683585409009087, 8.52791820369361, 7.412778756234999, 5.075043513770761, 999999999, 5.095830539655817, 6.262942155604218, 999999999, 4.748342400833742, 999999999]
[10.366364411456464, 6.522426269759165, 7.101324602573178, 7.662770024243479, 8.68432495937364, 8.52826281646307, 7.412790747530021, 5.07911518978563, 4.02007524037444, 5.108065517721297, 6.263850431022615, 999999999, 4.748342400833742, 4.0507077570880545]
[10.366364411456464, 6.522426269759165, 7.101324602573178, 7.662770024243479, 8.68432495937364, 8.52826281646307, 7.412790747530021, 5.07911518978563, 999999999, 5.108065517721297, 6.263850431022615, 999999999, 999999999, 999999999]
[10.361849

## Weighted KNN Imputation

In [3]:
#read in CSV
for i in range(1,2):
    # put data in pandas dataframe
    data = 'datasets/MissingValueEstimation/MissingData{}.txt'.format(i, i)
    X = pd.read_csv(data, sep='\s+', header=None)
    X = X.transpose()
    
    #header
    print('\n''*** MissingData{} ***'''.format(i))
    print('Number of Samples: ' + str(X.shape[0]))
    numberOfFeatures = X.shape[1]
    print('Number of Features: ' + str(X.shape[1]))
    print("\nbefore: \n")
    
    #set uknowns to NaN
    X = X[X < 1.00000000000000e+99]
    print(str(X))
    
    #mean square error
    testX = []
    testY = []
    
    for i in range(5):
        randint(0, 13)
        testX.append(randint(0, 240))
        testY.append(randint(0, 13))

    
    #weighted knn implimentation
    for index1, row1 in X.iterrows():
        for i in range(numberOfFeatures):
            if np.isnan(row1[i]):
                
                #calculate distances
                distArray = []
                for index2, row2 in X.iterrows():
                    distance = 0
                    if(index1 != index2):
                        for j in range(numberOfFeatures):
                            if( not np.isnan(row1[j]) and not np.isnan(row2[j])):
                                #print(str(X.loc[index1, j]) + " " + str(row2[j]))
                                
                                distance += np.square(X.loc[index1, j] - row2[j])
        
                                #print(distance)
                        
                        #invert distance
                        distArray.append(1 / np.sqrt(distance))
                    else:
                        distArray.append(0)
                        
                #find k=3 nearest neighbors and average their values 
                #print(distArray)
                result = 0
                values = []
                totalWeight = 0
                weights = []
                k=3
                for l in range(k):
                    if not np.isnan(X.loc[np.argmax(distArray), i]):
                        values.append(X.loc[np.argmax(distArray), i])
                        totalWeight += distArray[np.argmax(distArray)]
                        weights.append(distArray[np.argmax(distArray)])
                        distArray[np.argmax(distArray)] = 0
                    else:
                        distArray[np.argmax(distArray)] = 0
                        k += 1
                        
                for m in range(len(values)):
                    result += values[m] * (weights[m] / totalWeight)
                    
                row1[i] = result
                
                #print(distArray)
                
    print("\nafter: \n" + str(X))
    
    
    


*** MissingData1 ***
Number of Samples: 14
Number of Features: 242

before: 

     0     1     2     3     4     5     6     7     8     9    ...   232  \
0  -0.11 -0.30  0.50  0.00  0.40  0.39  0.50 -0.52 -0.87 -1.39  ...  0.19   
1    NaN -0.37  0.18 -0.11 -0.16 -0.18 -0.24 -0.48 -0.17 -0.57  ... -0.27   
2  -0.36   NaN  0.41   NaN  0.31 -0.09 -0.16 -0.73  0.10 -0.16  ... -0.43   
3  -0.11 -0.09   NaN  0.19 -0.34 -0.04   NaN -0.42 -0.12 -0.02  ... -0.12   
4   0.48 -0.16   NaN  0.00  0.02 -0.25  0.04 -0.40   NaN  0.30  ... -0.07   
5  -0.20 -0.16  0.15 -0.07  0.21 -0.69   NaN -0.01  0.52  0.41  ...  0.21   
6   0.27 -0.10 -0.25  0.18 -0.70 -0.64 -0.11 -0.34 -0.28  0.03  ... -0.01   
7   0.29 -0.09 -0.41  0.18  0.11 -0.38  0.10  0.55  0.50  0.50  ...  0.24   
8  -0.34  0.46 -0.07 -0.23  0.08  0.41 -0.04   NaN  0.09  0.48  ...  0.04   
9  -0.05  0.32 -0.13 -0.16  0.05  0.33 -0.01  0.37 -0.01  0.51  ...  0.01   
10  0.23  0.02 -0.15  0.03 -0.09 -0.62 -0.04 -0.29  0.08 -0.11  ... -0.16 

[0.10890268021882285, 0, 0.17320480725931295, 0.14707313815434614, 0.11929226642506238, 0.11946443539432092, 0.13225484603771842, 0.1402939735129456, 0.14142134490028782, 0.13588418766953206, 0.15574176738938347, 0.15662458330476461, 0.1396864690545022, 0.15862091844815446]
[0.10890268021882285, 0, 0, 0.14707313815434614, 0.11929226642506238, 0.11946443539432092, 0.13225484603771842, 0.1402939735129456, 0.14142134490028782, 0.13588418766953206, 0.15574176738938347, 0, 0.1396864690545022, 0]
[0.108899202801294, 0, 0.17257581167348632, 0.14702900014707596, 0.11921421816518235, 0.11937538523406542, 0.13218638424776416, 0.14026430352163924, 0.1413550326016863, 0.13580165013388973, 0.15556821060453005, 0.15638083649475962, 0.13918766385772188, 0.15855146911161375]
[0.108899202801294, 0, 0, 0.14702900014707596, 0.11921421816518235, 0.11937538523406542, 0.13218638424776416, 0.14026430352163924, 0.1413550326016863, 0.13580165013388973, 0.15556821060453005, 0, 0.13918766385772188, 0]
[0.1088725

[0.09139690562411883, 0.145516396833159, 0.16418586026474816, 0, 0.18027211737881124, 0.15775307541109587, 0.17735209722557804, 0.16009196691559893, 0.1487813022325587, 0.1528510306425907, 0.15432042298909737, 0.1333499265386323, 0.12640021361119447, 0.1597255616229373]
[0.09139690562411883, 0.145516396833159, 0, 0, 0, 0.15775307541109587, 0, 0.16009196691559893, 0.1487813022325587, 0.1528510306425907, 0.15432042298909737, 0.1333499265386323, 0.12640021361119447, 0.1597255616229373]
[0.0795712733734598, 0.1177428033192983, 0.14837318124110113, 0.1802721117984344, 0, 0.25378585458284636, 0.20990762214054917, 0.1673361887638281, 0.1424537558431588, 0.163353994473394, 0.17414310384481466, 0.11758035394533523, 0.10721735395505734, 0.1309644524740936]
[0.0795712733734598, 0.1177428033192983, 0.14837318124110113, 0, 0, 0, 0, 0.1673361887638281, 0.1424537558431588, 0.163353994473394, 0.17414310384481466, 0.11758035394533523, 0.10721735395505734, 0.1309644524740936]
[0.07950748019368516, 0.117

[0.07664496155841506, 0.11769300717914061, 0.14027625404275013, 0.15686480108198658, 0.2489476331349193, 0, 0.21763419428062034, 0.17717792433302187, 0.14158676223183891, 0.1712305714906991, 0.196485770957862, 0.11937387617325718, 0.10831283743015352, 0.1312631662706817]
[0.07664496155841506, 0.11769300717914061, 0.14027625404275013, 0.15686480108198658, 0, 0, 0, 0.17717792433302187, 0.14158676223183891, 0.1712305714906991, 0, 0.11937387617325718, 0.10831283743015352, 0.1312631662706817]
[0.07637573841107471, 0.11762143340365895, 0.14019249696632727, 0.15686114231196638, 0.24888990736253097, 0, 0.21755557289309854, 0.1770843143015741, 0.14158637779537483, 0.1712251536470156, 0.19625579182105074, 0.11930892250519887, 0.1082080252391806, 0.13091440289511988]
[0.07637573841107471, 0.11762143340365895, 0.14019249696632727, 0.15686114231196638, 0, 0, 0, 0.1770843143015741, 0.14158637779537483, 0.1712251536470156, 0, 0.11930892250519887, 0.1082080252391806, 0.13091440289511988]
[0.0762721082

[0.08467377341884558, 0.13800750816259352, 0.15333906834859862, 0.15763124349164295, 0.16413205628647753, 0.175535632569207, 0.19103882154127794, 0, 0.24770054175157288, 0.2688920579680623, 0.22075811525191513, 0.19948050376920173, 0.155668096851018, 0.2170932571645626]
[0.08467377341884558, 0.13800750816259352, 0.15333906834859862, 0.15763124349164295, 0.16413205628647753, 0.175535632569207, 0.19103882154127794, 0, 0, 0, 0, 0.19948050376920173, 0.155668096851018, 0.2170932571645626]
[0.09286672960668106, 0.14088884700773902, 0.15404785172244298, 0.1487799534522844, 0.1411452161157513, 0.14155198453384363, 0.15665801350994313, 0.24756769660918954, 0, 0.2733751033802321, 0.18269510911761816, 0.2536592705535259, 0.1876995368181785, 0.23786207494199293]
[0.09286672960668106, 0.14088884700773902, 0.15404785172244298, 0.1487799534522844, 0.1411452161157513, 0.14155198453384363, 0.15665801350994313, 0, 0, 0, 0.18269510911761816, 0, 0.1876995368181785, 0.23786207494199293]
[0.0924426067236754

[0.0914778858449605, 0.15484600467960777, 0.16349365226098847, 0.15335242550123712, 0.17274912805244827, 0.19452578275200388, 0.20812192723391457, 0.22035787688399255, 0.17971333585469787, 0.1995662048345472, 0, 0.16390003926590618, 0.14038915503084304, 0.18328596329205674]
[0.0914778858449605, 0.15484600467960777, 0.16349365226098847, 0.15335242550123712, 0.17274912805244827, 0.19452578275200388, 0, 0, 0.17971333585469787, 0, 0, 0.16390003926590618, 0.14038915503084304, 0.18328596329205674]
[0.09135880480826525, 0.15479949226355402, 0.16342421117649295, 0.15328621982648566, 0.17229261754075667, 0.19374602748018507, 0.2080552276479635, 0.22030617293691246, 0.17971236140963634, 0.19956487046348959, 0, 0.1638674566393679, 0.14038444909063896, 0.18236729715049746]
[0.09135880480826525, 0.15479949226355402, 0.16342421117649295, 0.15328621982648566, 0.17229261754075667, 0.19374602748018507, 0, 0, 0.17971236140963634, 0, 0, 0.1638674566393679, 0.14038444909063896, 0.18236729715049746]
[0.090

[0.09649030682963664, 0.13908095616678415, 0.1317334513013269, 0.12635512054745865, 0.10634226605902479, 0.10798201677059356, 0.11728933876616343, 0.15561845418072523, 0.18695251298084106, 0.15724246307121328, 0.1379258796526075, 0.21036238540856628, 0, 0.19685992935363397]
[0.09649030682963664, 0.13908095616678415, 0.1317334513013269, 0.12635512054745865, 0.10634226605902479, 0.10798201677059356, 0.11728933876616343, 0.15561845418072523, 0, 0.15724246307121328, 0.1379258796526075, 0, 0, 0]
[0.0960139418149729, 0.13887298116251967, 0.1317334051628184, 0.1261100359987967, 0.106341848462536, 0.10791086488601712, 0.11726425609896424, 0.15561740055071902, 0.18695068615643065, 0.157167569285387, 0.1378645112789418, 0.2103623235126164, 0, 0.19685728021385923]
[0.0960139418149729, 0.13887298116251967, 0.1317334051628184, 0.1261100359987967, 0.106341848462536, 0.10791086488601712, 0.11726425609896424, 0.15561740055071902, 0, 0.157167569285387, 0.1378645112789418, 0, 0, 0]
[0.09600075096354285,

In [None]:
def weightedKnnImpute(X, k):
    #Input: X:= pandas dataframe with uknown values set to NaN
    #       k:= number of nearest neighbors sought
    
    numberOfFeatures = X.shape[1]
    
    for index1, row1 in X.iterrows():
        for i in range(numberOfFeatures):
            if np.isnan(row1[i]):

                #calculate distances
                distArray = []
                for index2, row2 in X.iterrows():
                    distance = 0
                    if(index1 != index2):
                        for j in range(numberOfFeatures):
                            if( not np.isnan(row1[j]) and not np.isnan(row2[j])):
                                distance += np.square(X.loc[index1, j] - row2[j])
                                
                        #invert distance
                        distArray.append(1 / np.sqrt(distance))
                    else:
                        distArray.append(0)
                        
                #find nearest neighbors and weight their values 
                result = 0
                values = []
                totalWeight = 0
                weights = []
                knn = k
                
                for l in range(knn):
                    if not np.isnan(X.loc[np.argmax(distArray), i]):
                        values.append(X.loc[np.argmax(distArray), i])
                        totalWeight += distArray[np.argmax(distArray)]
                        weights.append(distArray[np.argmax(distArray)])
                        distArray[np.argmax(distArray)] = 0
                    else:
                        distArray[np.argmax(distArray)] = 0
                        knn += 1
                
                for m in range(len(values)):
                    result += values[m] * (weights[m] / totalWeight)
               
                row1[i] = result
                
    return X
    