In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [2]:
# Mount Google Drive at /content/drive so that the
# '/content/drive/My Drive/Thesis_data/...' paths used by the load and save
# cells of this notebook resolve.
# NOTE(review): google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Read the full table from Drive.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects;
# keep it only if this file is trusted and really stores an object array.
data = np.load("/content/drive/My Drive/Thesis_data/Master_integer.npy", allow_pickle=True)

# The last column is the target; every column before it is a feature.
X, y = data[:, :-1], data[:, -1]

# Two-stage split giving 60% train / 20% validation / 20% test:
#   1) hold out 20% of all rows as the test set,
#   2) take 25% of the remaining 80% (= 20% of all rows) as the validation set.
# Both stages use the same fixed seed so the partition is reproducible.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42
)

In [12]:
# Sanity check: (rows, columns) of the loaded table, target column included.
print(np.shape(data))

(120224, 170)


In [6]:
# Initialize the DecisionTreeClassifier.
# random_state is pinned (same seed as the train/val/test split) because the
# tree randomly permutes the features at every split; without a seed, ties
# between equally good splits can resolve differently and the reported
# accuracies change from run to run.
model = DecisionTreeClassifier(random_state=42)

# Train the model on the training partition only.
model.fit(X_train, y_train)

# Make predictions on validation data
val_predictions = model.predict(X_val)

# Evaluate the model on validation data
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Make predictions on test data (kept in test_predictions for the
# classification report and label inspection cells further down).
test_predictions = model.predict(X_test)

# Evaluate the model on test data
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)

Validation Accuracy: 0.7573715949261801
Test Accuracy: 0.7664795175712207


In [8]:
# Print the per-class classification report for the test data.
# Some rare classes in y_test are never predicted by the model, so their
# precision is undefined. zero_division=0 reports those scores as 0.0 (the same
# numbers the default produces) without emitting an UndefinedMetricWarning for
# each affected metric.
print(classification_report(y_test, test_predictions, zero_division=0))

              precision    recall  f1-score   support

           0       0.67      0.44      0.53      2043
           8       0.44      0.36      0.40        33
          16       0.78      0.96      0.86     15810
          24       0.63      0.26      0.37      1330
          32       0.70      0.32      0.43      1512
          40       0.63      0.33      0.43       155
          48       0.25      0.09      0.13        11
          56       0.00      0.00      0.00         1
          64       0.91      0.92      0.92       538
          80       0.00      0.00      0.00        24
         144       0.00      0.00      0.00         3
        2048       0.00      0.00      0.00        10
        2064       0.77      0.43      0.55      2343
        2080       0.74      0.13      0.22       228
        2096       0.00      0.00      0.00         4

    accuracy                           0.77     24045
   macro avg       0.43      0.28      0.32     24045
weighted avg       0.75   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
import joblib

# Persist the fitted decision tree to Drive. joblib.dump returns the list of
# file names it wrote, which the notebook echoes as this cell's output.
joblib.dump(
    value=model,
    filename='/content/drive/My Drive/Thesis_data/DecisionTree_model.pkl',
)

['/content/drive/My Drive/Thesis_data/DecisionTree_model.pkl']

In [10]:
# Sanity check: the test feature matrix and the prediction vector must have the
# same number of rows. Each shape is printed on its own line.
print(X_test.shape, test_predictions.shape, sep="\n")

(24045, 169)
(24045,)


In [11]:
# Extract the unique integer labels: every label that occurs in the full target
# column, and the subset of labels the model actually predicted on the test set.
original_labels = np.unique(y)
predicted_unique = np.unique(test_predictions)

# Create a mapping between integer labels and class indices which will be later
# used in classification as classes. The index is the label's position in the
# sorted unique-label array.
# .tolist() converts the NumPy scalars to native Python values so the keys repr
# as plain numbers under every NumPy version and the dict can be serialised
# (e.g. with json); lookups with NumPy integer labels still hit the same keys.
label_to_index = {label: index for index, label in enumerate(original_labels.tolist())}

print(original_labels)
print(predicted_unique)
print(label_to_index)

[   0    8   16   24   32   40   48   56   64   80  144 2048 2064 2072
 2080 2096]
[   0    8   16   24   32   40   48   64   80 2048 2064 2080]
{0: 0, 8: 1, 16: 2, 24: 3, 32: 4, 40: 5, 48: 6, 56: 7, 64: 8, 80: 9, 144: 10, 2048: 11, 2064: 12, 2072: 13, 2080: 14, 2096: 15}
