In [None]:
import torch
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import sys
import os

# Import the dataset class from the project's `common` package (nothing here modifies sys.path, so the src directory must already be importable)
from common.datasets import CubeDataset

# Seed both RNG sources (numpy and torch) so repeated runs are repeatable
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Build the synthetic Cube dataset
print("Generating Cube dataset...")
dataset = CubeDataset(
    n_samples=1000,  # kept small so the cell runs quickly
    n_features=20,
    seed=SEED,
    informative_feature_variance=0.2,
    non_informative_feature_mean=0.5,
    non_informative_feature_variance=0.3,
)
dataset.generate_data()

# Pull out the full feature/label tensors and summarise them
features, labels = dataset.get_all_data()
print(f"Dataset shape: {features.shape}")
n_classes = len(torch.unique(labels))
print(f"Number of classes: {n_classes}")
class_counts = torch.bincount(labels.long(), minlength=8)
print(f"Class distribution: {class_counts}")

# scikit-learn consumes numpy arrays rather than tensors
X, y = features.numpy(), labels.numpy()

# Stratified split: 80% for training, 20% held out for testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

print(f"Train set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Fit a random forest on the training split (fit() returns the estimator itself)
print("\nTraining Random Forest classifier...")
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=SEED,
    n_jobs=-1,  # parallelise across every available core
).fit(X_train, y_train)

# Score the fitted model on the held-out split
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest accuracy: {accuracy:.4f}")



In [9]:
import torch
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import sys
import os

# Import the dataset class from the project's `common` package (nothing here modifies sys.path, so the src directory must already be importable)
from common.datasets import AFAContextDataset

# Seed both RNG sources (numpy and torch) so repeated runs are repeatable
SEED = 49
np.random.seed(SEED)
torch.manual_seed(SEED)

# Build the synthetic AFAContext dataset
print("Generating AFAContext dataset...")
dataset = AFAContextDataset(
    seed=SEED,
    n_samples=1000,  # kept small so the cell runs quickly
    n_dummy_features=10,
    bin_feature_cost=5.0,
    std_bin=0.1,
    std_cube=0.3,
    non_informative_feature_mean=0.5,
    non_informative_feature_std=0.3,
)
dataset.generate_data()

# Pull out the full feature/label tensors
features, labels = dataset.get_all_data()

# numpy views for scikit-learn. The argmax along axis 1 reduces every label
# row to a single integer class index (labels are presumably one-hot or
# per-class scores -- confirm against AFAContextDataset).
X = features.numpy()
y = np.argmax(labels.numpy(), axis=1)

print(f"Dataset shape: {features.shape}")
print("class distr ", np.unique(y, return_counts=True))


# Stratified split: 80% for training, 20% held out for testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

print(f"Train set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Function to train and evaluate a random forest on a subset of features
def train_and_evaluate_rf(
    X_train,
    X_test,
    y_train,
    y_test,
    feature_indices,
    description,
    n_estimators=100,
    max_depth=10,
    random_state=None,
):
    """Fit a random forest on selected feature columns and report test accuracy.

    Parameters
    ----------
    X_train, X_test
        Train and test feature matrices. They must support
        ``[:, feature_indices]`` column indexing (numpy arrays in this notebook).
    y_train, y_test
        Labels for the train and test rows respectively.
    feature_indices
        Column selector applied to both matrices (a list of column indices
        at every call site in this notebook).
    description : str
        Human-readable name of the subset; only used in the progress message.
    n_estimators : int, optional
        Number of trees in the forest. Defaults to 100, the value that used
        to be hard-coded.
    max_depth : int, optional
        Maximum tree depth. Defaults to 10, the value that used to be
        hard-coded.
    random_state : int or None, optional
        Seed handed to the classifier. ``None`` (the default) falls back to
        the notebook-level ``SEED`` at call time, which is the seed the
        function always used before this parameter existed.

    Returns
    -------
    The value returned by ``accuracy_score(y_test, y_pred)`` for the
    predictions on the selected test columns.
    """
    print(f"\nTraining Random Forest classifier on {description}...")

    # Resolve the seed lazily so that the notebook-level SEED in effect when
    # the function is *called* is used (it differs between cells).
    if random_state is None:
        random_state = SEED

    # Select features
    X_train_subset = X_train[:, feature_indices]
    X_test_subset = X_test[:, feature_indices]

    # Train a random forest classifier
    rf_classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        n_jobs=-1,  # Use all available cores
    )
    rf_classifier.fit(X_train_subset, y_train)

    # Make predictions
    y_pred = rf_classifier.predict(X_test_subset)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test accuracy: {accuracy:.4f}")

    return accuracy

# Column-index subsets to compare. They are built from X.shape[1]: the first
# 10 columns, the following 10, every column, and the two "leave one group
# of 10 out" variants.
n_columns = X.shape[1]
first_10_features = list(range(10))
next_10_features = list(range(10, 20))
all_features = list(range(n_columns))
all_except_first_10 = list(range(10, n_columns))
all_except_second_10 = [*range(0, 10), *range(20, n_columns)]

# Each entry pairs the label shown in the training log with its column subset;
# the list order is the order in which the forests are trained.
experiments = [
    ("first 10 features", first_10_features),
    ("next 10 features", next_10_features),
    ("all features", all_features),
    ("all features except first 10", all_except_first_10),
    ("all features except second 10", all_except_second_10),
]

# Train one forest per subset and unpack the accuracies into named results
(
    first_10_accuracy,
    next_10_accuracy,
    all_features_accuracy,
    all_except_first_10_accuracy,
    all_except_second_10_accuracy,
) = [
    train_and_evaluate_rf(X_train, X_test, y_train, y_test, columns, label)
    for label, columns in experiments
]

# Side-by-side summary, one line per experiment
print(
    "\nAccuracy Comparison:",
    f"First 10 features: {first_10_accuracy:.4f}",
    f"Next 10 features: {next_10_accuracy:.4f}",
    f"All features: {all_features_accuracy:.4f}",
    f"All features except first 10: {all_except_first_10_accuracy:.4f}",
    f"All features except second 10: {all_except_second_10_accuracy:.4f}",
    sep="\n",
)


Generating AFAContext dataset...
Dataset shape: torch.Size([1000, 30])
class distr  (array([0, 1, 2, 3, 4, 5, 6, 7]), array([111, 110, 156, 129, 129, 118, 109, 138]))
Train set shape: (800, 30)
Test set shape: (200, 30)

Training Random Forest classifier on first 10 features...
Test accuracy: 1.0000

Training Random Forest classifier on next 10 features...
Test accuracy: 0.7950

Training Random Forest classifier on all features...
Test accuracy: 0.9300

Training Random Forest classifier on all features except first 10...
Test accuracy: 0.8000

Training Random Forest classifier on all features except second 10...
Test accuracy: 0.9900

Accuracy Comparison:
First 10 features: 1.0000
Next 10 features: 0.7950
All features: 0.9300
All features except first 10: 0.8000
All features except second 10: 0.9900
