# OIKAN Classification Benchmark Tests

This notebook evaluates the OIKANClassifier on various classification tasks to assess:
1. Classification metrics (Accuracy, F1-score, ROC-AUC)
2. Decision boundary interpretability
3. Symbolic formula extraction quality
4. Comparison with traditional classifiers

## Setup and Imports

In [None]:
import warnings
warnings.filterwarnings('ignore')

!pip install -qU oikan

In [None]:
!pip freeze | grep oikan

In [None]:
import numpy as np
import pandas as pd
from time import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    roc_auc_score,
    f1_score
)
from sklearn.datasets import (
    make_classification,
    make_moons,
    make_circles,
    load_iris,
    load_breast_cancer
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from oikan.model import OIKANClassifier

np.random.seed(42)

## 1. Synthetic Classification Tests

First, let's evaluate OIKAN on synthetic datasets with known decision boundaries.

In [None]:
def generate_synthetic_datasets():
    """Build the three synthetic 2-class benchmark problems.

    Returns
    -------
    dict
        Maps 'Linear', 'Moons' and 'Circles' (in that order) to the value
        returned by the matching scikit-learn generator. Every generator is
        called with 1000 samples and ``random_state=42``.
    """
    sample_count = 1000
    seed = 42

    linear = make_classification(
        n_samples=sample_count,
        n_features=2,
        n_redundant=0,
        n_informative=2,
        random_state=seed,
        n_clusters_per_class=1,
    )
    moons = make_moons(n_samples=sample_count, noise=0.1, random_state=seed)
    circles = make_circles(
        n_samples=sample_count, noise=0.1, factor=0.3, random_state=seed
    )

    return {'Linear': linear, 'Moons': moons, 'Circles': circles}

# Built once as a notebook-level global; run_synthetic_benchmarks iterates over it.
synthetic_datasets = generate_synthetic_datasets()

In [None]:
def _roc_auc_or_none(model, X_test, y_test):
    """Return the ROC AUC of ``model`` on the test split, or ``None``.

    ``None`` is returned when the model has no ``predict_proba`` or when the
    score cannot be computed from its output. ROC AUC is an optional metric
    (callers check for the key), so an unusable probability output must not
    abort the whole benchmark.
    """
    if not hasattr(model, 'predict_proba'):
        return None
    try:
        proba = np.asarray(model.predict_proba(X_test))
        if proba.ndim == 2 and proba.shape[1] == 2:
            # Binary case: roc_auc_score expects the score of the positive class.
            return roc_auc_score(y_test, proba[:, 1])
        # Multiclass case: one-vs-rest over the full probability matrix.
        return roc_auc_score(y_test, proba, multi_class='ovr')
    except (ValueError, TypeError, RuntimeError):
        # e.g. a single class in y_test, or probabilities in an unexpected format.
        return None


def benchmark_classifier(model, X, y, model_name="OIKAN"):
    """Fit ``model`` on an 80/20 split of ``(X, y)`` and collect metrics.

    The data is split first and the ``StandardScaler`` is fitted on the
    training portion only, so no test-set statistics leak into preprocessing.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X : array-like
        Feature matrix (unscaled).
    y : array-like
        Class labels.
    model_name : str, default "OIKAN"
        Label stored under 'Model'. When it equals 'OIKAN', the model's
        ``symbolic_predict`` and ``get_symbolic_formula`` methods are called too.

    Returns
    -------
    dict
        'Model', 'Accuracy', 'F1' (weighted), 'Train Time' and 'Predict Time'
        (seconds), 'Report' (text); 'ROC AUC' when it could be computed; plus
        'Symbolic Accuracy' and 'Symbolic Formula' when ``model_name == 'OIKAN'``.
    """
    # Split the raw data first; the partition depends only on the number of
    # samples and random_state, so it is the same as splitting scaled data.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training rows only, then apply it to the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Training time
    start_time = time()
    model.fit(X_train, y_train)
    train_time = time() - start_time

    # Prediction time
    start_time = time()
    y_pred = model.predict(X_test)
    predict_time = time() - start_time

    # Metrics
    results = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred, average='weighted'),
        'Train Time': train_time,
        'Predict Time': predict_time,
        'Report': classification_report(y_test, y_pred)
    }

    # Computed outside the timed sections so it does not distort 'Predict Time'.
    roc_auc = _roc_auc_or_none(model, X_test, y_test)
    if roc_auc is not None:
        results['ROC AUC'] = roc_auc

    if model_name == 'OIKAN':
        symbolic_pred = model.symbolic_predict(X_test)
        results['Symbolic Accuracy'] = accuracy_score(y_test, symbolic_pred)
        results['Symbolic Formula'] = model.get_symbolic_formula()

    return results

In [None]:
def run_synthetic_benchmarks():
    """Benchmark OIKAN and the scikit-learn baselines on each synthetic dataset.

    Reads the notebook-level ``synthetic_datasets`` dict, runs
    ``benchmark_classifier`` for every (dataset, model) pair, prints a short
    progress report and returns all results.

    Returns
    -------
    pandas.DataFrame
        One row per (dataset, model) pair: the ``benchmark_classifier`` result
        dict plus a 'Dataset' column.
    """
    # Factories instead of shared instances: every dataset gets a freshly
    # constructed, unfitted estimator, so nothing learned on one dataset can
    # carry over to the next. Stochastic baselines are seeded explicitly so
    # their results do not depend on how much of the global RNG stream has
    # already been consumed.
    model_factories = {
        'OIKAN': lambda: OIKANClassifier(hidden_dims=[32, 16]),
        'MLPClassifier': lambda: MLPClassifier(
            hidden_layer_sizes=(32, 16), max_iter=500, random_state=42
        ),
        'RandomForest': lambda: RandomForestClassifier(
            n_estimators=100, random_state=42
        ),
        'LogisticRegression': lambda: LogisticRegression(),
        'DecisionTree': lambda: DecisionTreeClassifier(random_state=42),
        'SVM': lambda: SVC(probability=True, random_state=42)
    }

    results = []
    for dataset_name, (X, y) in synthetic_datasets.items():
        print(f"\nBenchmarking {dataset_name} dataset...")

        for model_name, make_model in model_factories.items():
            res = benchmark_classifier(make_model(), X, y, model_name)
            res['Dataset'] = dataset_name
            results.append(res)
            print(f"{model_name}:")
            print(f"Accuracy = {res['Accuracy']:.4f}")
            # 'ROC AUC' is optional in the result dict.
            if 'ROC AUC' in res:
                print(f"ROC AUC = {res['ROC AUC']:.4f}")

            if model_name == 'OIKAN':
                print(f"Symbolic Accuracy = {res['Symbolic Accuracy']:.4f}")
                print("Example decision boundary terms:")
                # Only the first 100 characters of each entry are shown.
                for i, formula in enumerate(res['Symbolic Formula'][0]):
                    print(f"Class {i}: {formula[:100]}...")

    return pd.DataFrame(results)

# Runs every model on every synthetic dataset; the resulting DataFrame is
# passed to display_benchmark_results in the Results Analysis section.
benchmark_results = run_synthetic_benchmarks()

## 2. Real Dataset Tests

Now let's evaluate OIKAN on real-world classification problems.

In [None]:
def load_real_datasets():
    """Load the bundled scikit-learn datasets used for the real-data benchmarks.

    Returns
    -------
    dict
        Maps 'Iris' and 'Breast Cancer' (in that order) to the value returned
        by the matching loader called with ``return_X_y=True``.
    """
    loaders = (
        ('Iris', load_iris),
        ('Breast Cancer', load_breast_cancer),
    )
    return {label: loader(return_X_y=True) for label, loader in loaders}

real_datasets = load_real_datasets()
real_results = []  # one benchmark_classifier result dict per real dataset

for dataset_name, (X, y) in real_datasets.items():
    print(f"\nBenchmarking {dataset_name} dataset...")

    # Pass the raw features: benchmark_classifier standardizes them itself,
    # so scaling here as well would be redundant and would fit a scaler on
    # rows that later end up in the test split.
    oikan = OIKANClassifier(hidden_dims=[64, 32])
    res = benchmark_classifier(oikan, X, y, 'OIKAN')
    res['Dataset'] = dataset_name
    real_results.append(res)

    print(f"OIKAN Accuracy = {res['Accuracy']:.4f}")
    print(f"Symbolic Accuracy = {res['Symbolic Accuracy']:.4f}")
    print("\nExample decision boundary formula:")
    print(res['Symbolic Formula'][0][0])

## 3. Results Analysis

Let's analyze the classification performance metrics in a clear tabular format.

In [None]:
def display_benchmark_results(synthetic_results, real_results):
    """Print a per-dataset metric table and the OIKAN symbolic-accuracy summary.

    Parameters
    ----------
    synthetic_results, real_results
        Anything ``pd.DataFrame`` accepts (for example a list of result dicts
        or an existing DataFrame). Rows need the columns 'Dataset', 'Model',
        'Accuracy', 'F1', 'Train Time' and 'Predict Time'; rows whose 'Model'
        is 'OIKAN' are also read for 'Symbolic Accuracy'.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    frames = [pd.DataFrame(part) for part in (synthetic_results, real_results)]
    combined = pd.concat(frames)

    # Mean of each metric per (Dataset, Model), rounded to 4 decimals and
    # ordered by the first index level (the dataset name).
    metric_columns = ['Accuracy', 'F1', 'Train Time', 'Predict Time']
    table = (
        combined
        .pivot_table(
            index=['Dataset', 'Model'],
            values=metric_columns,
            aggfunc='mean',
        )
        .round(4)
        .sort_index(level=0)
    )

    print("\nClassification Benchmark Results:")
    print("==============================\n")
    print(table)

    # Neural vs. symbolic accuracy, one entry per OIKAN row.
    is_oikan = combined['Model'].eq('OIKAN')
    oikan_records = combined.loc[is_oikan].to_dict('records')
    print("\nOIKAN Symbolic Formula Performance:")
    print("================================\n")
    for record in oikan_records:
        print(f"Dataset: {record['Dataset']}")
        print(f"Neural Accuracy = {record['Accuracy']:.4f}")
        print(f"Symbolic Accuracy = {record['Symbolic Accuracy']:.4f}\n")

# real_results is a list of result dicts; display_benchmark_results wraps both
# arguments in pd.DataFrame itself, so the explicit conversion here is optional.
display_benchmark_results(benchmark_results, pd.DataFrame(real_results))