In [1]:
# OUR LIBRARIES:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# OUR DATA:
# Reading our Test and Training Sets from their respective CSV files
test_set = pd.read_csv('test.csv')
train_set = pd.read_csv('train.csv')

# Separating our Attribute and Class Values for our Test Sets
test_x = test_set.drop('actual-class', axis = 1)
test_y = test_set['actual-class']

# Had to drop the ID column due to issues with the KNN method
test_x = test_x.drop('ID', axis = 1)

# Separating our Attribute and Class Values for our Training Sets
train_x = train_set.drop('class', axis = 1)
train_y = train_set['class']

# Converting our dataframes to a numpy arrays
test_x = np.array(test_x)
test_y = np.array(test_y)
train_x = np.array(train_x)
train_y = np.array(train_y)

In [3]:
# K-NEAREST NEIGHBORS PROBLEM #1: 3-NEAREST NEIGHBORS:
# Initializing our KNN Object. For Problem #1, we will use 3-Nearest Neighbors.
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(train_x, train_y)
predictions = knn.predict(test_x)

# Printing a visual of what our predicted values and test values are.
print(predictions)
print(test_y)

[1 0 0 1 0 1 0 0 0 0 1 0 0 0 1 1 1 0 0 0]
[1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 0]


In [4]:
# METRICS FOR PROBLEM #1:
print("Probability Estimates:", '\n', (knn.predict_proba(test_x)), '\n')
print("Confusion Matrix:", '\n', confusion_matrix(predictions, test_y), '\n')
print("Accuracy Score for Problem #1:", accuracy_score(predictions, test_y))
print("Precision Score for Problem #1:", precision_score(predictions, test_y))
print("Recall Score for Problem #1:", recall_score(predictions, test_y))
print("F_Measure for Problem #1:", f1_score(predictions, test_y))

Probability Estimates: 
 [[0.         1.        ]
 [0.66666667 0.33333333]
 [1.         0.        ]
 [0.         1.        ]
 [1.         0.        ]
 [0.         1.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [0.         1.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [0.33333333 0.66666667]
 [0.         1.        ]
 [0.         1.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]] 

Confusion Matrix: 
 [[13  0]
 [ 1  6]] 

Accuracy Score for Problem #1: 0.95
Precision Score for Problem #1: 1.0
Recall Score for Problem #1: 0.8571428571428571
F_Measure for Problem #1: 0.923076923076923


In [5]:
# K-NEAREST NEIGHBORS PROBLEM #2: EUCLIDEAN DISTANCE 3-NEAREST NEIGHBORS:
# Creating our weight function (1/d^2), since our weighted KNN accepts a user-defined function as a weight, so long as the function accepts an array
# 1. Compute the Euclidean Distance for Each Input
#    a. Square each component input in our array
#    b. Add each of the squared component inputs
#    c. Take the Square Root of the Sum (We can exclude this step since will be squaring the distance anyway)
# 2. Divide 1 by the value calculated in step #1, giving us (1/d^2)
# 3. Multiply each input value by our weight, and return the array once the loop is finished
def weighted_distances(arr):
    length, width = arr.shape
    temp = 0
    for i in range(length):
        temp = np.square(arr[i])
        temp = np.sum(temp)
        temp = 1/temp
        arr[i] = temp*arr[i]
    
    return arr
    
# Initializing our KNN Object. For Problem #2, we use 3-Nearest Neighbors with Euclidean Distance Weighted 1/d^2
euclid_knn = KNeighborsClassifier(n_neighbors = 3, weights = weighted_distances, metric = "euclidean")
euclid_knn.fit(train_x, train_y)
euclid_predictions = euclid_knn.predict(test_x)


print(euclid_predictions)
print(test_y)

[1 0 0 1 0 1 0 0 0 0 1 0 0 0 1 1 1 0 0 0]
[1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 0]


In [6]:
# METRICS FOR PROBLEM #2:
print("Probability Estimates:", '\n', (euclid_knn.predict_proba(test_x)), '\n')
print("Confusion Matrix:", '\n', confusion_matrix(euclid_predictions, test_y), '\n')
print("Accuracy Score for Problem #1:", accuracy_score(euclid_predictions, test_y))
print("Precision Score for Problem #1:", precision_score(euclid_predictions, test_y))
print("Recall Score for Problem #1:", recall_score(euclid_predictions, test_y))
print("F_Measure for Problem #1:", f1_score(euclid_predictions, test_y))

Probability Estimates: 
 [[0.         1.        ]
 [0.66165411 0.33834589]
 [1.         0.        ]
 [0.         1.        ]
 [1.         0.        ]
 [0.         1.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [0.         1.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [0.3229923  0.6770077 ]
 [0.         1.        ]
 [0.         1.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]] 

Confusion Matrix: 
 [[13  0]
 [ 1  6]] 

Accuracy Score for Problem #1: 0.95
Precision Score for Problem #1: 1.0
Recall Score for Problem #1: 0.8571428571428571
F_Measure for Problem #1: 0.923076923076923
