In [None]:
import pandas as pd
import numpy as np
import random as rand
import math

In [None]:
# Load the reference dataset and report the number of missing values per column
df_rf = pd.read_csv(r'data.csv')
print(df_rf.isnull().sum())

In [None]:
# Result tables: a 'Method' label column followed by one column per feature of df_rf
columns = ['Method', *df_rf]

results_original_data, results_scaled_data = (
    pd.DataFrame(columns = columns) for _ in range(2)
)

#### Min Max Scaling

##### X' = (X - min(X)) / (max(X) - min(X))

In [None]:
# normalize the dataset using Min Max scaling method
def min_max_scaling(df):
    """Rescale every column of ``df`` with min-max scaling.

    Each column is transformed as X' = (X - min(X)) / (max(X) - min(X)).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to rescale. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same labels. A column whose values are all
        identical (zero range) is mapped to 0 instead of NaN.
    """
    col_min = df.min()
    col_range = df.max() - col_min

    # A constant column has max == min; dividing by that zero range would
    # yield 0/0 = NaN for the whole column. Dividing by 1 maps it to 0.
    safe_range = col_range.replace(0, 1)

    return (df - col_min) / safe_range

In [None]:
# creates artificial missingness in the dataset
def miss_values(df):
    """Return a copy of ``df`` with values randomly replaced by NaN.

    Half of the rows (rounded down) are drawn at random as candidate rows.
    For every column, half of those candidates (rounded down) are drawn
    again and the column's value in each drawn row is set to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the artificial missing values.
    """
    df_mv = df.copy()
    n_rows = len(df_mv)

    # candidate rows are drawn from ALL row positions, including the last one
    rand_miss_instances = rand.sample(range(n_rows), math.floor(n_rows / 2))

    # number of values to be made NaN for every feature
    rand_miss_values_count = math.floor(len(rand_miss_instances) / 2)

    for col_pos in range(df_mv.shape[1]):
        rand_miss_values = rand.sample(rand_miss_instances, rand_miss_values_count)
        if rand_miss_values:
            # positional assignment: the sampled numbers are row positions,
            # so label-based .loc would misbehave on a non-default index
            df_mv.iloc[rand_miss_values, col_pos] = np.nan

    return df_mv

In [None]:
def accuracy_mse(original_values, imputed_values):
    """Mean squared error between two equally long sequences of numbers.

    The squared differences are summed and divided by the total number of
    values, so positions where both sequences agree contribute zero.
    Values are compared by position, not by index label.

    Parameters
    ----------
    original_values, imputed_values : sequence of numbers
        Lists, arrays or pandas Series of the same length.

    Returns
    -------
    float
        The mean squared error. NaN when the lengths differ (both lengths
        are printed), when the input is empty, or when any compared value
        is NaN.
    """
    # distance can be calculated only if both the arrays are of same length
    if len(original_values) != len(imputed_values):
        print("Original values length: {}".format(len(original_values)))
        print("Imputed values length: {}".format(len(imputed_values)))
        return np.nan

    # nothing to average over; avoid dividing by zero
    if len(original_values) == 0:
        return np.nan

    original = np.asarray(original_values, dtype=float)
    imputed = np.asarray(imputed_values, dtype=float)

    squared_error = np.sum((original - imputed) ** 2)
    return float(squared_error / len(original))

In [None]:
def calculate_accuracy(df_rf, df_imputed):
    """Score every column of ``df_imputed`` against the same column of ``df_rf``.

    For each column of ``df_rf`` the error returned by ``accuracy_mse`` is
    printed together with both column lengths.

    Returns
    -------
    list
        One error value per column, in the column order of ``df_rf``.
    """
    scores = []
    for feature in df_rf:
        original_col = df_rf[feature]
        imputed_col = df_imputed[feature]

        score = accuracy_mse(original_col, imputed_col)

        print("Original values length: {}".format(len(original_col)))
        print("Imputed values length: {}".format(len(imputed_col)))
        print("{} is {}".format(feature, score))

        scores.append(score)

    return scores

In [None]:
def euclidean_dist(dp1, dp2):
    """Euclidean distance between two rows over the features known in ``dp1``.

    Parameters
    ----------
    dp1 : sequence of numbers
        Row with missing values; positions holding NaN are left out of the
        distance.
    dp2 : sequence of numbers
        Row to compare against ``dp1``; must be at least as long as ``dp1``.

    Returns
    -------
    float
        Square root of the summed squared differences over the positions
        where ``dp1`` is not NaN (0.0 when ``dp1`` has no such position).
    """
    # Work on plain arrays so values are paired by position, regardless of
    # the index labels a pandas Series may carry.
    known = np.asarray(dp1, dtype=float)
    other = np.asarray(dp2, dtype=float)[:known.size]

    # discarding nan values from dp1 and comparing only its available features
    observed = ~np.isnan(known)
    squared_sum = np.sum((known[observed] - other[observed]) ** 2)

    return math.sqrt(squared_sum)

In [None]:
def get_nearest_neighbours(missed_row, df, k, incomplete_instances):
    """Collect up to ``k`` complete rows of ``df`` closest to ``missed_row``.

    A row of ``df`` is a candidate only if it has no NaN and its index is
    not listed in ``incomplete_instances``. Distances come from
    ``euclidean_dist``. The chosen neighbours are printed before returning.

    Returns
    -------
    list of dict
        One dict per neighbour with keys 'index', 'nn' (the row) and
        'distance'.
    """
    slots_left = k
    # the farthest neighbour currently kept, i.e. the one to evict next
    farthest = {'index':np.nan, 'max':0}
    nearest_neighbours = []

    for index, current_row in df.iterrows():
        # skip rows with gaps and rows that were themselves incomplete
        if current_row.isna().sum() or index in incomplete_instances:
            continue

        distance = euclidean_dist(missed_row, current_row)
        candidate = {'index':index, 'nn':current_row, 'distance':distance}

        # the first k candidates are accepted unconditionally
        if slots_left > 0:
            nearest_neighbours.append(candidate)
            slots_left -= 1
            if distance >= farthest['max']:
                farthest['max'] = distance
                farthest['index'] = index
            continue

        # list is full: a closer candidate replaces the current farthest one
        if distance < farthest['max']:
            evicted = farthest['index']
            nearest_neighbours = [kept for kept in nearest_neighbours if kept['index'] != evicted]
            nearest_neighbours.append(candidate)

            new_farthest = max(nearest_neighbours, key=lambda entry: entry['distance'])
            farthest['max'] = new_farthest['distance']
            farthest['index'] = new_farthest['index']

    print("Nearest Neighbours for {}".format(missed_row))
    print(nearest_neighbours)

    return nearest_neighbours

In [None]:
def imputation_mean(df):
    """Replace every NaN in ``df`` with its column mean, modifying ``df`` in place.

    The mean of each column is taken over the rows that have no missing
    value in any column and is rounded to 3 decimals. The means are printed.
    Nothing is returned.
    """
    fully_observed = df.dropna()

    mean_dict = {
        feature: round(fully_observed[feature].mean(), 3)
        for feature in fully_observed
    }

    print("Mean of complete instances")
    print(mean_dict)

    # replace all the nan values with the mean value of the feature
    for feature in df:
        fill_value = mean_dict[feature]
        df[feature] = df[feature].replace(np.nan, fill_value)

In [None]:
def imputation_nn(df, k, weighted):
    """Impute the NaN cells of ``df`` in place from its k nearest complete rows.

    Rows are processed one at a time. For a row with missing cells the
    neighbours are obtained from ``get_nearest_neighbours`` and every missing
    cell is replaced by the (optionally distance-weighted) mean of the
    neighbours' values for that column. Progress is printed. Nothing is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to impute; modified in place. Index labels are assumed unique.
    k : int
        Number of nearest neighbours requested.
    weighted : bool
        If true each neighbour is weighted by 1/distance; neighbours at
        distance 0 take the whole weight.
    """
    incomplete_instances = []   # rows that had missing cells; never used as neighbours

    # Decide up front which rows need work, so the frame is not modified
    # while it is being iterated.
    rows_with_gaps = list(df.index[df.isna().any(axis=1)])

    for index in rows_with_gaps:
        row = df.loc[index].copy()
        incomplete_instances.append(index)
        print("\nRow {} has missing values".format(index))

        nn_df_dict = get_nearest_neighbours(row, df, k, incomplete_instances)

        # Without any complete neighbour there is nothing to average; leave
        # the cells as NaN instead of fabricating 0 or dividing by zero.
        if not nn_df_dict:
            print("No complete neighbours found for row {}; leaving it unimputed".format(index))
            continue

        if weighted:
            distances = [nn['distance'] for nn in nn_df_dict]
            if 0 in distances:
                # identical neighbours get the full weight, all others none
                print("One of the neighbour has distance 0")
                weights = [1 if distance == 0 else 0 for distance in distances]
            else:
                weights = [1 / distance for distance in distances]
        else:
            # plain mean over the neighbours actually found (may be fewer than k)
            weights = [1] * len(nn_df_dict)

        total_weight = sum(weights)

        for key in row.keys():
            if math.isnan(row[key]):
                weighted_sum = 0
                for nn, weight in zip(nn_df_dict, weights):
                    weighted_sum = weighted_sum + (nn['nn'][key] * weight)
                mean_nn = weighted_sum / total_weight

                print("Before imputation {}[{}] is {}".format(key, index, row[key]))
                # write to the frame itself: a row obtained from the frame
                # may be a copy, so assigning to it could be lost
                df.at[index, key] = mean_nn
                print("After imputation {}[{}] is {}".format(key, index, df.at[index, key]))
    

In [None]:
def imputation(df, method, k=1, weighted=False):
    """Dispatch ``df`` to the requested imputation routine.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe for operation; it is handed to the selected routine.
    method : str
        Possible values: 'mean' (calls ``imputation_mean``) or 'knn'
        (calls ``imputation_nn``).
    k : int, optional
        Number of nearest neighbours for KNN, default is 1. Unused for 'mean'.
    weighted : bool, optional
        Weighted KNN if true, default is False. Unused for 'mean'.

    Raises
    ------
    ValueError
        If ``method`` is neither 'mean' nor 'knn'.
    """
    if method == 'mean':
        imputation_mean(df)

    elif method == 'knn':
        imputation_nn(df, k, weighted)

    else:
        # an unrecognised method used to fall through silently, leaving the
        # data unimputed without any hint to the caller
        raise ValueError(
            "Unknown imputation method {!r}; expected 'mean' or 'knn'".format(method)
        )

In [None]:
# Min-max scale the reference data and keep a copy on disk
df_rf_scaled = min_max_scaling(df_rf)
df_rf_scaled.to_csv(path_or_buf='data_scaled.csv', index=False)

In [None]:
# Seed the generator so the artificial missingness (and every result derived
# from it) is the same on each Restart & Run All.
SEED = 42
rand.seed(SEED)

# missing values are injected independently into the raw and the scaled data
df_mv = miss_values(df_rf)

df_mv_s = miss_values(df_rf_scaled)

In [None]:
# Persist both datasets with artificial missing values
for _frame, _path in ((df_mv, 'dataset_mv.csv'), (df_mv_s, 'dataset_mv_s.csv')):
    _frame.to_csv(_path, index=False)

In [None]:
# One independently loaded copy of the unscaled dataset per imputation method,
# so that each method fills in its own frame.
(
    df_mv_mean,
    df_mv_1nn,
    df_mv_3nn,
    df_mv_5nn,
    df_mv_w_1nn,
    df_mv_w_3nn,
    df_mv_w_5nn,
) = (pd.read_csv('dataset_mv.csv') for _ in range(7))

In [None]:
# One independently loaded copy of the scaled dataset per imputation method
(
    df_mv_s_mean,
    df_mv_s_1nn,
    df_mv_s_3nn,
    df_mv_s_5nn,
    df_mv_w_s_1nn,
    df_mv_w_s_3nn,
    df_mv_w_s_5nn,
) = (pd.read_csv('dataset_mv_s.csv') for _ in range(7))

In [None]:
# Mean imputation on the unscaled data
imputation(df_mv_mean, 'mean')

In [None]:
# 1-nearest-neighbour imputation on the unscaled data
imputation(df_mv_1nn, 'knn', k=1)

In [None]:
# 3-nearest-neighbour imputation on the unscaled data
imputation(df_mv_3nn, 'knn', k=3)

In [None]:
# 5-nearest-neighbour imputation on the unscaled data
imputation(df_mv_5nn, 'knn', k=5)

In [None]:
# Distance-weighted 1-nearest-neighbour imputation on the unscaled data
imputation(df_mv_w_1nn, 'knn', k=1, weighted=True)

In [None]:
# Distance-weighted 3-nearest-neighbour imputation on the unscaled data
imputation(df_mv_w_3nn, 'knn', k=3, weighted=True)

In [None]:
# Distance-weighted 5-nearest-neighbour imputation on the unscaled data
imputation(df_mv_w_5nn, 'knn', k=5, weighted=True)

In [None]:
# Run every imputation method on its own copy of the scaled data:
# mean first, then plain KNN and weighted KNN for k = 1, 3, 5.
imputation(df_mv_s_mean, method='mean')

for _frame, _k, _weighted in (
    (df_mv_s_1nn, 1, False),
    (df_mv_s_3nn, 3, False),
    (df_mv_s_5nn, 5, False),
    (df_mv_w_s_1nn, 1, True),
    (df_mv_w_s_3nn, 3, True),
    (df_mv_w_s_5nn, 5, True),
):
    imputation(_frame, method='knn', k=_k, weighted=_weighted)

In [None]:
# One result row per method: the method name followed by the per-feature error
# against the original data.
# NOTE(review): row label 4 is skipped in the numbering — confirm whether intentional.
for _row_label, _method, _imputed in (
    (0, 'Mean', df_mv_mean),
    (1, '1NN', df_mv_1nn),
    (2, '3NN', df_mv_3nn),
    (3, '5NN', df_mv_5nn),
    (5, '1WNN', df_mv_w_1nn),
    (6, '3WNN', df_mv_w_3nn),
    (7, '5WNN', df_mv_w_5nn),
):
    results_original_data.loc[_row_label] = [_method] + calculate_accuracy(df_rf, _imputed)

In [None]:
# One result row per method: the method name followed by the per-feature error
# against the scaled data.
# NOTE(review): row label 4 is skipped in the numbering — confirm whether intentional.
for _row_label, _method, _imputed in (
    (0, 'Mean', df_mv_s_mean),
    (1, '1NN', df_mv_s_1nn),
    (2, '3NN', df_mv_s_3nn),
    (3, '5NN', df_mv_s_5nn),
    (5, '1WNN', df_mv_w_s_1nn),
    (6, '3WNN', df_mv_w_s_3nn),
    (7, '5WNN', df_mv_w_s_5nn),
):
    results_scaled_data.loc[_row_label] = [_method] + calculate_accuracy(df_rf_scaled, _imputed)

In [None]:
# Export the error table for the unscaled data (row labels are written too)
results_original_data.to_csv('results_original_data.csv', index=True)

In [None]:
# Export the error table for the scaled data (row labels are written too)
results_scaled_data.to_csv('results_scaled_data.csv', index=True)