# OIKAN Regression Benchmark Tests

This notebook evaluates the OIKANRegressor on various regression tasks to assess:
1. Prediction Accuracy (MSE, R²)
2. Training and Prediction Time
3. Symbolic formula extraction and prediction quality

## Dataset Types:
1. Synthetic Dataset (make_regression)
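2. Additional Datasets:
   - Diabetes Dataset (real data, loaded with `load_diabetes`)
   - Weather Temperature Dataset (simulated: random features with a linear target plus noise)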

In [None]:
import warnings
warnings.filterwarnings('ignore')

!pip install -qU oikan

In [None]:
!pip freeze | grep oikan

In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression, load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from oikan.model import OIKANRegressor

# Seed NumPy's global random state so code that draws from np.random is
# repeatable from run to run.
np.random.seed(42)
print('Libraries imported.')

In [None]:
# Generate synthetic regression dataset
def generate_regression_data(n_samples=1000, n_features=10, noise=10.0, random_state=42):
    """Generate a synthetic regression problem with ``make_regression``.

    Args:
        n_samples: Number of rows to generate.
        n_features: Number of feature columns.
        noise: Value forwarded as ``make_regression``'s ``noise`` argument.
        random_state: Seed forwarded to ``make_regression``. The default of 42
            reproduces the dataset this function produced when the seed was
            hard-coded.

    Returns:
        The ``(X, y)`` pair returned by ``make_regression``.
    """
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        noise=noise,
        random_state=random_state,
    )
    return X, y

# Build the default synthetic dataset and report its shape.
# NOTE: X and y are module-level names; later loops over `datasets` rebind them.
X, y = generate_regression_data()
print(f'Dataset shape: X={X.shape}, y={y.shape}')

In [None]:
# Load datasets with consistent feature dimensions
def load_weather_data(n_samples=1000, n_features=10, noise=0.1, random_state=42):
    """Build a simulated "weather" regression dataset.

    Features are standard-normal draws; the target is a fixed linear
    combination of the first four feature columns plus scaled Gaussian noise.
    Any remaining columns carry no signal. Called with no arguments, the
    function returns exactly the data it produced when these values were
    hard-coded.

    Args:
        n_samples: Number of rows to generate.
        n_features: Number of feature columns; must be at least 4 because the
            target uses columns 0-3.
        noise: Multiplier applied to the standard-normal noise term.
        random_state: Seed for the private ``np.random.RandomState``.

    Returns:
        Tuple ``(X, y)`` with ``X`` of shape ``(n_samples, n_features)`` and
        ``y`` of shape ``(n_samples,)``.

    Raises:
        ValueError: If ``n_features`` is smaller than 4.
    """
    if n_features < 4:
        raise ValueError(
            f'n_features must be at least 4 to build the target, got {n_features}'
        )

    rng = np.random.RandomState(random_state)
    # Draw the features first and the noise second: this order fixes the
    # random stream, so the defaults reproduce the original dataset.
    X = rng.randn(n_samples, n_features)
    # Use first 4 features for the actual relationship
    y = 3.0 * X[:, 0] + 2.0 * X[:, 1] - 1.5 * X[:, 2] + 0.5 * X[:, 3] + noise * rng.randn(n_samples)
    return X, y

def load_datasets():
    """Assemble the benchmark datasets, keyed by display name.

    Returns:
        dict mapping ``'Synthetic'``, ``'Diabetes'`` and ``'Weather'`` to an
        ``(X, y)`` pair. If any loader raises, the error is printed and an
        empty dict is returned instead, so callers can test the result for
        truthiness.
    """
    loaded = {}
    try:
        loaded['Synthetic'] = make_regression(
            n_samples=1000, n_features=10, noise=10.0, random_state=42
        )
        # Already has 10 features
        loaded['Diabetes'] = load_diabetes(return_X_y=True)
        loaded['Weather'] = load_weather_data()
    except Exception as err:
        # Best-effort: report the problem and signal failure with an empty dict.
        print(f'Error loading datasets: {str(err)}')
        return {}
    return loaded

# Load every dataset once and print the shape of each (X, y) pair.
print('Loading datasets...')
datasets = load_datasets()
if not datasets:
    # load_datasets() returns an empty dict when loading failed.
    print('Failed to load datasets!')
else:
    for name, (X, y) in datasets.items():
        print(f'{name} dataset shape: X={X.shape}, y={y.shape}')

In [None]:
def benchmark_regressor(model, X, y, model_name='OIKANRegressor'):
    """Fit ``model`` on 80% of ``(X, y)`` and score it on the remaining 20%.

    The data is split first (``test_size=0.2``, ``random_state=42``) and the
    ``StandardScaler`` is fitted on the training rows only, so no statistics
    from the held-out rows leak into training.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        X: Feature matrix accepted by ``train_test_split`` / ``StandardScaler``.
        y: Target values aligned with the rows of ``X``.
        model_name: Label stored under ``'Model'``. When it equals
            ``'OIKANRegressor'`` the model must also provide
            ``get_symbolic_formula()`` and ``symbolic_predict(X)``.

    Returns:
        dict with keys ``'Model'``, ``'MSE'``, ``'R2'``, ``'Train Time'`` and
        ``'Predict Time'`` (times in seconds); for ``'OIKANRegressor'`` also
        ``'Symbolic MSE'``, ``'Symbolic R2'`` and ``'Symbolic Formula'``.
    """
    # Split before scaling: fitting the scaler on all rows would let the
    # test rows influence the mean/variance used to transform training data.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Training (perf_counter is the clock intended for measuring intervals)
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    # Prediction with the model's regular predict() path
    start_time = time.perf_counter()
    y_pred = model.predict(X_test)
    predict_time = time.perf_counter() - start_time

    results = {
        'Model': model_name,
        'MSE': mean_squared_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred),
        'Train Time': train_time,
        'Predict Time': predict_time
    }

    # For OIKAN, add symbolic prediction results
    if model_name == 'OIKANRegressor':
        # Get symbolic formula and predictions
        symbolic_formula = model.get_symbolic_formula()
        y_sym = model.symbolic_predict(X_test)

        # Score the symbolic predictions against the same held-out targets
        results.update({
            'Symbolic MSE': mean_squared_error(y_test, y_sym),
            'Symbolic R2': r2_score(y_test, y_sym),
            'Symbolic Formula': symbolic_formula
        })

    return results

print('Benchmark function defined.')

In [None]:
# Initialize models with consistent architectures
# Keys double as the `model_name` handed to benchmark_regressor. Each instance
# is reused, i.e. fitted again, for every dataset in the benchmark loop.
models = {
    'OIKANRegressor': OIKANRegressor(hidden_dims=[10, 5]),  # Match input dimension (first width 10 == feature count of the datasets)
    'LinearRegression': LinearRegression(),
    'RandomForestRegressor': RandomForestRegressor(n_estimators=100, random_state=42),
    # Same (10, 5) layer widths as the OIKAN hidden_dims above
    'MLPRegressor': MLPRegressor(hidden_layer_sizes=(10, 5), max_iter=500, random_state=42)
}

# One result dict per (dataset, model) pair, consumed by the summary table.
results_list = []

for dataset_name, (X, y) in datasets.items():
    print(f'\nTesting on {dataset_name} dataset:')
    for model_name, model in models.items():
        print(f'Benchmarking {model_name}...')
        res = benchmark_regressor(model, X, y, model_name=model_name)
        res['Dataset'] = dataset_name
        results_list.append(res)

        # Print results. The keys are indexed directly: a string fallback such
        # as 'N/A' cannot be rendered with the '.4f' format spec, so a missing
        # key should fail as a clear KeyError rather than a formatting error.
        print(f"{model_name}:")
        print(f"MSE={res['MSE']:.4f}, R2={res['R2']:.4f}")
        if model_name == 'OIKANRegressor':
            print(f"Symbolic MSE={res['Symbolic MSE']:.4f}")
            print(f"Symbolic R2={res['Symbolic R2']:.4f}")

# Collect all benchmark rows into a frame and print the mean of each metric
# per (Dataset, Model) pair, rounded to four decimals.
df_results = pd.DataFrame(results_list)
summary = pd.pivot_table(
    df_results,
    values=['MSE', 'R2', 'Symbolic MSE', 'Symbolic R2'],
    index=['Dataset', 'Model'],
    aggfunc='mean',
).round(4)
print('\nRegression Benchmark Performance Table:', summary, sep='\n')

In [None]:
# Visualize results: side-by-side bar charts of R² and MSE per dataset and model
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(15, 6))

# (subplot position, metric column, panel title, y-axis scale or None)
panels = (
    (121, 'R2', 'R² Score by Model and Dataset', None),
    (122, 'MSE', 'MSE by Model and Dataset', 'log'),  # Use log scale for MSE
)
for position, metric, title, y_scale in panels:
    plt.subplot(position)
    sns.barplot(data=df_results, x='Dataset', y=metric, hue='Model')
    plt.title(title)
    plt.xticks(rotation=45)
    if y_scale is not None:
        plt.yscale(y_scale)
    plt.tight_layout()

plt.show()

In [None]:
# Feature importance analysis for OIKAN
# Reuses the OIKANRegressor instance stored in `models`; every fit() below
# refits that same object.
oikan_model = models['OIKANRegressor']

for dataset_name, (X, y) in datasets.items():
    print(f'\nFeature Importance Analysis for {dataset_name}:')
    # Standardize and fit on the full dataset (no train/test split in this cell).
    X_scaled = StandardScaler().fit_transform(X)
    oikan_model.fit(X_scaled, y)

    # NOTE(review): get_feature_scores() is an OIKAN API; presumably it returns
    # one score per input feature — confirm against the OIKAN documentation.
    feature_scores = oikan_model.get_feature_scores()

    # One bar per entry of feature_scores, positioned by its index.
    plt.figure(figsize=(10, 4))
    plt.bar(range(len(feature_scores)), feature_scores)
    plt.title(f'OIKAN Feature Importance - {dataset_name}')
    plt.xlabel('Feature Index')
    plt.ylabel('Importance Score')
    plt.show()