In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [2]:
# Mount Google Drive inside the Colab runtime; the dataset and model paths used
# later in this notebook all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# --- Load -------------------------------------------------------------------
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, so only load this file from a trusted location. TODO confirm whether
# the array is really stored with an object dtype and needs pickling at all.
DATASET_PATH = "/content/drive/My Drive/Thesis_data/Master_integer.npy"
SPLIT_SEED = 42  # shared by both splits so the partition is reproducible

data = np.load(DATASET_PATH, allow_pickle=True)

# Every column except the last is a feature; the last column is the target.
X, y = data[:, :-1], data[:, -1]

# --- Split ------------------------------------------------------------------
# Stage 1: hold out 20% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

# Stage 2: take 25% of the remaining 80% as the validation set
# (0.25 x 0.8 = 0.2 of the full data), which leaves 60% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=SPLIT_SEED
)

In [4]:
# Show the (rows, columns) shape of the loaded array; the column count includes
# the target column that was split off as y.
print(np.shape(data))

(120224, 170)


In [5]:
# Build a k-nearest-neighbours classifier with scikit-learn's default
# hyperparameters and train it. fit() returns the estimator itself, so the
# construction and training steps can be chained.
model = KNeighborsClassifier().fit(X_train, y_train)

# Predict on both held-out splits first ...
val_predictions = model.predict(X_val)
test_predictions = model.predict(X_test)

# ... then score each set of predictions against its ground-truth labels.
val_accuracy = accuracy_score(y_val, val_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

# Report validation before test, matching the order the splits are used.
print("Validation Accuracy:", val_accuracy)
print("Test Accuracy:", test_accuracy)

Validation Accuracy: 0.7353711790393013
Test Accuracy: 0.7409024745269287


In [6]:
# Print per-class precision / recall / F1 for the test split.
#
# Several classes that occur in y_test are never predicted by the model, so
# their precision is 0/0 (undefined). With the default zero_division="warn"
# scikit-learn reports 0.0 for them but emits an UndefinedMetricWarning for
# every affected average. Passing zero_division=0 keeps the same 0.0 values
# while making the choice explicit and keeping the cell output free of
# warnings.
print(classification_report(y_test, test_predictions, zero_division=0))

              precision    recall  f1-score   support

           0       0.56      0.39      0.46      2043
           8       0.67      0.06      0.11        33
          16       0.78      0.93      0.85     15810
          24       0.47      0.26      0.34      1330
          32       0.50      0.29      0.37      1512
          40       0.44      0.17      0.25       155
          48       0.00      0.00      0.00        11
          56       0.00      0.00      0.00         1
          64       0.90      0.88      0.89       538
          80       0.00      0.00      0.00        24
         144       0.00      0.00      0.00         3
        2048       0.00      0.00      0.00        10
        2064       0.68      0.44      0.54      2343
        2080       0.68      0.06      0.11       228
        2096       0.00      0.00      0.00         4

    accuracy                           0.74     24045
   macro avg       0.38      0.23      0.26     24045
weighted avg       0.71   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# joblib is already imported in the first cell; the import is repeated here so
# this cell can also be run on its own.
import joblib

# Persist the fitted classifier to Google Drive. joblib.dump returns the list of
# files it wrote, which is displayed as the cell's output.
# NOTE: the resulting file is a pickle; only load it back from a trusted source.
MODEL_OUTPUT_PATH = '/content/drive/My Drive/Thesis_data/KNeighborsClassifier.pkl'
joblib.dump(model, MODEL_OUTPUT_PATH)

['/content/drive/My Drive/Thesis_data/KNeighborsClassifier.pkl']

In [8]:
# Sanity check: the test feature matrix and the test prediction vector should
# have the same number of rows (one prediction per test sample).
for array in (X_test, test_predictions):
    print(array.shape)

(24045, 169)
(24045,)


In [9]:
# Distinct label values found in the full target vector and in the test
# predictions (np.unique returns them sorted in ascending order).
original_labels, predicted_unique = np.unique(y), np.unique(test_predictions)

# Map every original integer label to its position in the sorted label array;
# these positions are meant to serve as class indices for later classification.
label_to_index = dict(zip(original_labels, range(len(original_labels))))

# Show all labels, the subset the model actually predicted, and the mapping.
for item in (original_labels, predicted_unique, label_to_index):
    print(item)

[   0    8   16   24   32   40   48   56   64   80  144 2048 2064 2072
 2080 2096]
[   0    8   16   24   32   40   64   80 2048 2064 2080]
{0: 0, 8: 1, 16: 2, 24: 3, 32: 4, 40: 5, 48: 6, 56: 7, 64: 8, 80: 9, 144: 10, 2048: 11, 2064: 12, 2072: 13, 2080: 14, 2096: 15}
