# In [None]:
import matplotlib.pyplot as plt
import numpy as np 

from csv import reader
from math import sqrt


# Load and convert a dataset

def load_data(data_csv):
    """
    Read a CSV file and return its data rows as a list of lists.

    Blank rows are skipped.  The first non-blank row is treated as the header
    (column names) and is dropped.  Every remaining entry is a string, exactly
    as produced by csv.reader.

    data_csv: path of the CSV file to read.
    Returns a list of rows; an empty list if the file has no data rows
    (including an empty file or a file holding only a header).
    """
    # newline='' lets the csv module do its own newline handling,
    # as recommended by the csv documentation.
    with open(data_csv, 'r', newline='') as data:
        rows = [row for row in reader(data) if row]
    # Slicing (rather than pop(0)) drops the header without failing
    # when the file contained no rows at all.
    return rows[1:]




# Because the rows come from csv.reader, every entry is a string.
# The entries that are needed are converted to float or integer below.

def float_data_converter(dataset, column):
    """
    Convert, in place, the entry at index `column` of every row to float.

    dataset: list of rows (each row is a list).
    column: index of the entry to convert in each row.
    Returns None; the rows of `dataset` are modified directly.
    """
    for record in dataset:
        raw_value = record[column]
        record[column] = float(raw_value)
    return None
     

def numerical_data_converter(dataset, column):
    """
    Replace, in place, the values in one column with integer codes.

    This is used to turn the atom type column into numbers for knn.
    The distinct values found in the column are sorted and numbered
    0, 1, 2, ... in that order, so the same set of values always produces
    the same codes, regardless of row order or of the interpreter's hash seed.

    dataset: list of rows (each row is a list).
    column: index of the entry to encode in each row.
    Returns a dictionary whose keys are the original values (atom types)
    and whose values are the integer codes assigned to them.
    """
    # Sorting the distinct labels makes the numbering deterministic;
    # iterating a bare set would yield them in an arbitrary order.
    labels = sorted({row[column] for row in dataset})
    numerical_atom_types = {label: code for code, label in enumerate(labels)}
    for row in dataset:
        row[column] = numerical_atom_types[row[column]]
    return numerical_atom_types
 
    
    
# Process the train and test data given as CSV file paths
    
def data_processor(train_data_dir, test_data_dir):
    """
    Load the train and test CSV files and convert them to the lists used by knn.

    In every row, all entries except the last two are converted to float,
    the second-to-last entry is left as the string read from the file, and
    the last entry (the atom type) is replaced by an integer code.  The codes
    are assigned in a single pass over both datasets, so a given atom type
    has the same code in the train data and in the test data.

    train_data_dir: path of the train CSV file.
    test_data_dir: path of the test CSV file.
    Returns a tuple (train_data, test_data) of row lists.
    """
    train_data = load_data(train_data_dir)
    test_data = load_data(test_data_dir)
    for ds in (train_data, test_data):
        # An empty dataset has no first row to measure; nothing to convert.
        if not ds:
            continue
        for i in range(len(ds[0]) - 2):
            float_data_converter(ds, i)
    # The concatenated list holds the very same row objects, so encoding it
    # updates both datasets in place with one shared type -> code mapping.
    # Encoding each dataset separately could give the same atom type two
    # different codes, which makes predicted and real types incomparable.
    numerical_data_converter(train_data + test_data, -1)
    return (train_data, test_data)
    
    
            
# Find the euclidean distance of two data rows
    
def distance_calculator(row1, row2):
    """
    Return the Euclidean distance between two rows.

    Only the leading entries take part in the calculation: the last two
    entries of each row are ignored.  The number of entries compared is
    taken from the length of row1.
    """
    feature_count = len(row1) - 2
    total = 0.0
    for idx in range(feature_count):
        difference = row1[idx] - row2[idx]
        total += difference ** 2
    return sqrt(total)



# Find the values that are closest to the test value

def closest_neighbors_finder(train_data, test_row, k):
    """
    Find the k rows of the train dataset that are closest to a test row.

    train_data: list of train rows.
    test_row: the row whose neighbors are wanted.
    k: the number of neighbors to return.
    Returns a list of k train rows, ordered from the closest to the farthest.
    """
    # Rank the train rows from smallest to largest distance to the test row
    ranked_rows = sorted(
        train_data,
        key=lambda candidate: distance_calculator(test_row, candidate),
    )
    # Keep the first k rows of the ranking
    return [ranked_rows[position] for position in range(k)]





# Make a prediction with neighbors

def type_predictor(train_data, test_row, k):
    """
    Predict the type of a test row from its k closest train rows.

    The neighbors are obtained from closest_neighbors_finder, and the type
    (last entry of a row) that occurs most often among them is returned.
    When several types are tied for the most occurrences, the tied type
    whose neighbor is closest to the test row wins.

    train_data: list of train rows.
    test_row: the row to classify.
    k: the number of neighbors that vote.
    Returns the predicted type.
    """
    neighbors_list = closest_neighbors_finder(train_data, test_row, k)
    # Types of the k neighbors, in the same closest-first order
    type_list = [row[-1] for row in neighbors_list]
    # max() returns the first maximal item it meets.  Scanning the ordered
    # list (instead of a set, whose iteration order is arbitrary) therefore
    # breaks ties deterministically in favor of the nearest neighbor.
    predicted_type = max(type_list, key=type_list.count)
    return predicted_type
 
# knn
def k_nearest_neighbors(train_data, test_data, k):
    """
    Run knn over a whole test dataset.

    train_data: list of train rows.
    test_data: list of test rows.
    k: the number of neighbors used for each prediction.
    Returns a list holding one predicted type per test row, in order.
    Fails an assertion when k is greater than the number of train rows.
    """
    # k neighbors cannot be picked from fewer than k train rows
    assert k <= len(train_data), "[!] k can't be greater than the size of dataset."

    return [type_predictor(train_data, sample, k) for sample in test_data]

# Calculate knn accuracy
def knn_accuracy_test(train_data, test_data, k):
    """
    Measure how often knn predicts the right type for a given k.

    train_data: list of train rows.
    test_data: list of test rows; the last entry of each row is its real type.
    k: the number of neighbors used for each prediction.
    Returns the share of test rows predicted correctly, as a percentage.
    Fails an assertion when k is greater than the number of train rows.
    """
    # k neighbors cannot be picked from fewer than k train rows
    assert k <= len(train_data), "[!] k can't be greater than the size of dataset."
    # Types stored in the test dataset
    actual_types = [sample[-1] for sample in test_data]
    # Types produced by the knn algorithm
    predicted_types = k_nearest_neighbors(train_data, test_data, k)
    matches = sum(
        1 for actual, guess in zip(actual_types, predicted_types) if actual == guess
    )
    return (matches / float(len(predicted_types))) * 100



# Evaluate accuracy of all k values
def knn_k_evaluation(train_data_dir, test_data_dir):
    """
    Evaluate the accuracy of knn for every possible value of k.

    The two CSV files are loaded through data_processor, and the accuracy is
    computed for each k from 1 up to the number of train rows.  The result is
    shown as a bar chart (k against accuracy in percent) and printed.

    train_data_dir: path of the train CSV file.
    test_data_dir: path of the test CSV file.
    Returns a dictionary mapping each k to its accuracy percentage.
    """
    train_data, test_data = data_processor(train_data_dir, test_data_dir)
    # Every k from 1 to the train set size; larger values are rejected
    # by knn_accuracy_test.
    result = {}
    for k in range(1, len(train_data) + 1):
        result[k] = knn_accuracy_test(train_data, test_data, k)
    # Plot the accuracies as one bar per k
    plt.bar(list(result.keys()), list(result.values()), color='b')
    plt.xlabel('k')
    plt.ylabel('Accuracy (%)')
    plt.show()
    print(result)
    return result

