In [4]:
import pandas as pd
import numpy as np
import re
import random

# Open the train / validation / test .npz archives.
# NOTE(review): allow_pickle=True permits unpickling of object arrays, which can
# execute arbitrary code on load -- only point these paths at trusted files.
trainset, valset, testset = (
    np.load('data/train_data.npz', allow_pickle=True),
    np.load('data/val_data.npz', allow_pickle=True),
    np.load('data/test_data.npz', allow_pickle=True),
)

# Pull the 'data' and 'labels' arrays out of each archive.
train_data, train_labels = trainset['data'], trainset['labels']
val_data, val_labels = valset['data'], valset['labels']
test_data, test_labels = testset['data'], testset['labels']

# Model

In [None]:
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score


def _score_split(true_labels, predicted_labels):
    """Return ``(accuracy, weighted F1)`` for one split's predictions."""
    accuracy = accuracy_score(true_labels, predicted_labels)
    weighted_f1 = f1_score(true_labels, predicted_labels, average='weighted')
    return accuracy, weighted_f1


# Build a 100-tree forest with a fixed seed and train it on the training split;
# fit() hands back the fitted estimator, so construction and training chain.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    train_data, train_labels
)

# Validation split: predict, score, report.
val_predictions = rf_classifier.predict(val_data)
val_accuracy, val_f1 = _score_split(val_labels, val_predictions)
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation F1 Score: {val_f1:.4f}")

# Test split: predict, score, report.
test_predictions = rf_classifier.predict(test_data)
test_accuracy, test_f1 = _score_split(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

Validation Accuracy: 0.9500
Validation F1 Score: 0.9256
Test Accuracy: 1.0000
Test F1 Score: 1.0000


# Feature Importance Analysis

In [6]:
# Feature importance analysis: list the highest-scoring features of the fitted forest.
# Per-feature importance scores exposed by the fitted classifier.
rf_importances = rf_classifier.feature_importances_
# Feature indices ordered from highest to lowest importance
# (argsort is ascending, so the result is reversed).
sorted_indices = np.argsort(rf_importances)[::-1]
# Keep at most top_n features; slicing clamps automatically when the model
# was trained on fewer than top_n features.
top_n = 10
top_features = sorted_indices[:top_n]
# Report the count actually shown so the header stays truthful when clamped.
print(f"Top {len(top_features)} most important features:")
# Iterate over the selected indices directly rather than range(top_n), which
# would raise IndexError if fewer than top_n features exist.
for feature_index in top_features:
    print(f"Feature {feature_index}: {rf_importances[feature_index]:.4f}")

Top 10 most important features:
Feature 1614: 0.0339
Feature 2031: 0.0339
Feature 34909: 0.0169
Feature 3788: 0.0169
Feature 11677: 0.0169
Feature 16983: 0.0169
Feature 15180: 0.0169
Feature 37712: 0.0169
Feature 6934: 0.0169
Feature 1696: 0.0169
