In [1]:
import os
import warnings
warnings.filterwarnings("ignore")


import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor, ExtraTreesClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [24]:
DATASETS = [
'auto-mpg',
'breastCanDT',
'concrete_data',
'HousingData',
'ozone',
'parkinsons_clean',
'sonar',
'winequality-red',
'winequality-white',
]
TARGET_COLS = {
'auto-mpg': 'mpg', 
'breastCanDT': 'diagnosis', # e.g. 'diagnosis' or 'Class' or 'target'
'concrete_data': 'concrete_compressive_strength',
'Housing': 'MEDV', # Boston: 'MEDV' or 'target'
'ozone': 'obs',
'parkinsons_clean': 'status', # 1 or 0
'sonar': 'R', 
'winequality-red': 'quality',
'winequality-white': 'quality',
}
TASKS = {
    'auto-mpg': 'regression',
    'breastCancer': 'classification',
    'concrete_data': 'regression',
    'Housing': 'regression',
    'ozone': 'regression',
    'parkinsons_clean': 'classification',
    'sonar': 'classification',
    'winequality-red': 'regression',
    'winequality-white': 'regression',
}

In [3]:
def load_dataset(name):
    df = pd.read_csv(f"{name}.csv")
    return df

In [20]:
load_dataset(DATASETS[7])

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


### Detect feature columns and prepare pipline :

In [21]:
def build_preprocessor(X):
    # Basic heuristic: numeric columns vs categorical (object or low-cardinality)
    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()


    # Also treat integer columns with low cardinality as categorical
    for col in numeric_cols[:]:
        if X[col].nunique() <= 10 and X[col].dtype in ['int64']:
            numeric_cols.remove(col)
            categorical_cols.append(col)


    numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))])
    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_cols),('cat', categorical_transformer, categorical_cols)], remainder='drop')
    return preprocessor


### Model runners :

In [22]:
def evaluate_regression(y_true, y_pred):
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)
    return {'rmse': rmse, 'r2': r2}


def evaluate_classification(y_true, y_pred, y_score=None):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='binary' if len(np.unique(y_true))==2 else 'macro')
    auc = None
    try:
        if y_score is not None:
            auc = roc_auc_score(y_true, y_score[:,1] if y_score.ndim==2 else y_score)
    except Exception:
        auc = None
    return {'accuracy': acc, 'f1': f1, 'auc': auc}

In [25]:
def run_for_dataset(name):
    print(f"\n=== Dataset: {name} ===")
    df = load_dataset(name)
    target_col = TARGET_COLS.get(name)
    if target_col is None or target_col not in df.columns:
        raise ValueError(f"Target column for {name} not found in dataframe. Please update TARGET_COLS.")

    task = TASKS.get(name)
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # For classification, encode non-numeric labels
    if task=='classification':
        if y.dtype == 'object' or y.dtype.name == 'category':
            y = pd.factorize(y)[0]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    preprocessor = build_preprocessor(X_train)

    results = []

    # --- Random Forest ---
    if task=='regression':
        rf = Pipeline([('pre', preprocessor), ('rf', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))])
    else:
        rf = Pipeline([('pre', preprocessor), ('rf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))])
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    if task=='regression':
        res = evaluate_regression(y_test, y_pred)
    else:
        y_score = rf.predict_proba(X_test) if hasattr(rf.named_steps['rf'], 'predict_proba') else None
        res = evaluate_classification(y_test, y_pred, y_score)
    res.update({'dataset': name, 'model': 'RandomForest'})
    results.append(res)
    print('RandomForest ->', res)

    # --- Extra Trees ---
    if task=='regression':
        et = Pipeline([('pre', preprocessor), ('et', ExtraTreesRegressor(n_estimators=200, random_state=42, n_jobs=-1))])
    else:
        et = Pipeline([('pre', preprocessor), ('et', ExtraTreesClassifier(n_estimators=200, random_state=42, n_jobs=-1))])
    et.fit(X_train, y_train)
    y_pred = et.predict(X_test)
    if task=='regression':
        res = evaluate_regression(y_test, y_pred)
    else:
        y_score = et.predict_proba(X_test) if hasattr(et.named_steps['et'], 'predict_proba') else None
        res = evaluate_classification(y_test, y_pred, y_score)
    res.update({'dataset': name, 'model': 'ExtraTrees'})
    results.append(res)
    print('ExtraTrees ->', res)

    # --- Gradient Boosting ---
    if task=='regression':
        gb = Pipeline([('pre', preprocessor), ('gb', GradientBoostingRegressor(n_estimators=200, random_state=42))])
    else:
        gb = Pipeline([('pre', preprocessor), ('gb', GradientBoostingClassifier(n_estimators=200, random_state=42))])
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    if task=='regression':
        res = evaluate_regression(y_test, y_pred)
    else:
        y_score = gb.predict_proba(X_test) if hasattr(gb.named_steps['gb'], 'predict_proba') else None
        res = evaluate_classification(y_test, y_pred, y_score)
    res.update({'dataset': name, 'model': 'GradientBoosting'})
    results.append(res)
    print('GradientBoosting ->', res)

    # --- RF-log(p) : implemented for regression tasks as RF on log1p(target) ---
    if task=='regression':
        y_train_log = np.log1p(y_train)
        rf_log = Pipeline([('pre', preprocessor), ('rf', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))])
        rf_log.fit(X_train, y_train_log)
        y_pred_log = rf_log.predict(X_test)
        y_pred = np.expm1(y_pred_log)
        res = evaluate_regression(y_test, y_pred)
        res.update({'dataset': name, 'model': 'RF-log1p'})
        results.append(res)
        print('RF-log1p ->', res)
    else:
        print('RF-log(p) skipped for classification (not applicable)')

    # Save results
    df_res = pd.DataFrame(results)


    return df_res


In [30]:
all_results = []
for name in DATASETS:
    try:
        df_res = run_for_dataset(name)
        all_results.append(df_res)
    except Exception as e:
        print(f"Failed on {name}: {e}")


if all_results:
    concat = pd.concat(all_results, ignore_index=True)
    res_concat = pd.DataFrame(concat)
    print(res_concat)
else:
    print('No results to summarize.')


=== Dataset: auto-mpg ===
RandomForest -> {'rmse': 1.997907639882282, 'r2': 0.9257597367596032, 'dataset': 'auto-mpg', 'model': 'RandomForest'}
ExtraTrees -> {'rmse': 2.1469159253741155, 'r2': 0.9142727755613371, 'dataset': 'auto-mpg', 'model': 'ExtraTrees'}
GradientBoosting -> {'rmse': 2.2314598469655293, 'r2': 0.9073880900181459, 'dataset': 'auto-mpg', 'model': 'GradientBoosting'}
RF-log1p -> {'rmse': 1.9917091117837653, 'r2': 0.9262196844483978, 'dataset': 'auto-mpg', 'model': 'RF-log1p'}

=== Dataset: breastCanDT ===
Failed on breastCanDT: pos_label=1 is not a valid label. It should be one of ['B', 'M']

=== Dataset: concrete_data ===
RandomForest -> {'rmse': 5.528438940061759, 'r2': 0.8813877465051169, 'dataset': 'concrete_data', 'model': 'RandomForest'}
ExtraTrees -> {'rmse': 5.256598386223905, 'r2': 0.8927656021998773, 'dataset': 'concrete_data', 'model': 'ExtraTrees'}
GradientBoosting -> {'rmse': 4.934091245184691, 'r2': 0.9055202195137996, 'dataset': 'concrete_data', 'model':