In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE, SMOTENC
from SupportFunctions.load_datasets import load_datasets
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

In [2]:
all_datasets = load_datasets()
selected = ["sonar", "crx", "titanic", "ionosphere"]

Loaded: artificial_tree.csv (5000 rows, 41 columns)
Loaded: audiology.csv (190 rows, 68 columns)
Loaded: balance_scale.csv (625 rows, 5 columns)
Loaded: breast_cancer.csv (699 rows, 10 columns)
Loaded: car.csv (1728 rows, 7 columns)
Loaded: chess.csv (3196 rows, 37 columns)
Loaded: crx.csv (664 rows, 15 columns)
Loaded: diabetes.csv (768 rows, 9 columns)
Loaded: ecoli_5.csv (327 rows, 8 columns)
Loaded: flare1.csv (323 rows, 11 columns)
Loaded: glass.csv (214 rows, 10 columns)
Loaded: heart_disease.csv (303 rows, 14 columns)
Loaded: heart_failure.csv (299 rows, 11 columns)
Loaded: hepatitis.csv (138 rows, 16 columns)
Loaded: hill_valley.csv (606 rows, 101 columns)
Loaded: ionosphere.csv (351 rows, 35 columns)
Loaded: iris.csv (150 rows, 5 columns)
Loaded: lymphography.csv (148 rows, 19 columns)
Loaded: mnist_test.csv (10000 rows, 785 columns)
Loaded: optdigits.csv (3823 rows, 65 columns)
Loaded: parkinsons.csv (195 rows, 23 columns)
Loaded: seeds.csv (199 rows, 8 columns)
Loaded: segme

In [13]:
# Preview dataset
dataset_name = "tic-tac-toe"
df = all_datasets[dataset_name]

print(f"Previewing dataset: {dataset_name}\n")
print(df.head())

print("\nColumn Data Types:")
print(df.dtypes)

cat_cols = list(df.select_dtypes(include=['object', 'category']).columns)
print("\nCategorical columns detected based on dtype:")

df

Previewing dataset: tic-tac-toe

        win TL TM TR ML MM MR BL BM BR
0  positive  x  x  x  x  o  o  x  o  o
1  positive  x  x  x  x  o  o  o  x  o
2  positive  x  x  x  x  o  o  o  o  x
3  positive  x  x  x  x  o  o  o  b  b
4  positive  x  x  x  x  o  o  b  o  b

Column Data Types:
win    object
TL     object
TM     object
TR     object
ML     object
MM     object
MR     object
BL     object
BM     object
BR     object
dtype: object

Categorical columns detected based on dtype:


Unnamed: 0,win,TL,TM,TR,ML,MM,MR,BL,BM,BR
0,positive,x,x,x,x,o,o,x,o,o
1,positive,x,x,x,x,o,o,o,x,o
2,positive,x,x,x,x,o,o,o,o,x
3,positive,x,x,x,x,o,o,o,b,b
4,positive,x,x,x,x,o,o,b,o,b
...,...,...,...,...,...,...,...,...,...,...
953,negative,o,x,x,x,o,o,o,x,x
954,negative,o,x,o,x,x,o,x,o,x
955,negative,o,x,o,x,o,x,x,o,x
956,negative,o,x,o,o,x,x,x,o,x


In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from collections import Counter

def evaluate_models_on_split(X_train, X_test, y_train, y_test, encoding_technique='onehot', random_state=42):
    min_count = min(Counter(y_train).values())
    if min_count < 2:
        print("Skipping iteration: Not enough minority samples for SMOTE.")
        return None, None

    k_neighbors = min(5, min_count - 1)

    # Identify categorical columns
    cat_cols = list(X_train.select_dtypes(include=['object', 'category']).columns)
    
    # Encoding Techniques
    if encoding_technique == 'onehot':
        X_train_encoded = pd.get_dummies(X_train, drop_first=True)
        X_test_encoded = pd.get_dummies(X_test, drop_first=True)
        X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
        
        smote = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
        X_train_res, y_train_res = smote.fit_resample(X_train_encoded, y_train)

    elif encoding_technique == 'ordinal':
        X_train_encoded = X_train.copy()
        X_test_encoded = X_test.copy()
        ordinal_mappings = {}

        for col in cat_cols:
            categories = X_train_encoded[col].astype('category').cat.categories.tolist()
            ordinal_mappings[col] = {val: idx for idx, val in enumerate(categories)}
            X_train_encoded[col] = X_train_encoded[col].map(ordinal_mappings[col])
            X_test_encoded[col] = X_test_encoded[col].map(ordinal_mappings[col])

        smote = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
        X_train_res, y_train_res = smote.fit_resample(X_train_encoded, y_train)

    elif encoding_technique == 'frequency':
        freq_mappings = {col: X_train[col].value_counts(normalize=True).to_dict() for col in cat_cols}
        X_train_encoded = X_train.copy()
        X_test_encoded = X_test.copy()

        for col in cat_cols:
            X_train_encoded[col] = X_train_encoded[col].map(freq_mappings[col])
            X_test_encoded[col] = X_test_encoded[col].map(freq_mappings[col])

        smote = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
        X_train_res, y_train_res = smote.fit_resample(X_train_encoded, y_train)

    else:
        raise ValueError("Invalid encoding technique. Choose 'onehot', 'ordinal', or 'frequency'.")

    # Convert categorical columns to numeric (Ensures no string values)
    X_train_res = pd.DataFrame(X_train_res).apply(pd.to_numeric, errors='coerce')
    X_test_encoded = pd.DataFrame(X_test_encoded).apply(pd.to_numeric, errors='coerce')

    # Fit RandomForest Classifier
    clf_smote = RandomForestClassifier(random_state=random_state)
    clf_smote.fit(X_train_res, y_train_res)
    y_pred_smote = clf_smote.predict(X_test_encoded)
    f1_smote = f1_score(y_test, y_pred_smote, average='weighted')

    # --- SMOTENC ---
    X_train_smotenc = X_train.copy()
    X_test_smotenc = X_test.copy()
    for col in cat_cols:
        X_train_smotenc[col] = X_train_smotenc[col].astype('category').cat.codes
        X_test_smotenc[col] = X_test_smotenc[col].astype('category').cat.codes

    categorical_indices = [X_train_smotenc.columns.get_loc(col) for col in cat_cols]

    smotenc = SMOTENC(categorical_features=categorical_indices, random_state=random_state, k_neighbors=k_neighbors)
    X_train_res_smotenc, y_train_res_smotenc = smotenc.fit_resample(X_train_smotenc, y_train)

    # Convert categorical columns to numeric
    X_train_res_smotenc = pd.DataFrame(X_train_res_smotenc).apply(pd.to_numeric, errors='coerce')
    X_test_smotenc = pd.DataFrame(X_test_smotenc).apply(pd.to_numeric, errors='coerce')

    # Fit RandomForest Classifier
    clf_smotenc = RandomForestClassifier(random_state=random_state)
    clf_smotenc.fit(X_train_res_smotenc, y_train_res_smotenc)
    y_pred_smotenc = clf_smotenc.predict(X_test_smotenc)
    f1_smotenc = f1_score(y_test, y_pred_smotenc, average='weighted')

    return f1_smote, f1_smotenc

# Selected datasets
selected = ["flare1", "crx", "titanic"]
selected_datasets = {name: df for name, df in all_datasets.items() if name in selected}

n_iter = 10
results_f1 = {name: {"smote": [], "smotenc": []} for name in selected_datasets.keys()}

encoding_techniques = ["onehot", "ordinal", "frequency"]

# Loop over datasets and iterations
for encoding_technique in encoding_techniques:
    print(f"\nEvaluating with encoding technique: {encoding_technique}")
    for name, df in selected_datasets.items():
        target = df.iloc[:, 0]
        X = df.iloc[:, 1:]
        print("=" * 50)
        print(f"Processing dataset: {name}")
        print("=" * 50)

        for i in range(n_iter):
            rs = 42 + i
            X_train, X_test, y_train, y_test = train_test_split(
                X, target, test_size=0.3, random_state=rs
            )
            scores = evaluate_models_on_split(X_train, X_test, y_train, y_test, encoding_technique, random_state=rs)
            if scores[0] is not None and scores[1] is not None:
                results_f1[name]["smote"].append(scores[0])
                results_f1[name]["smotenc"].append(scores[1])

        avg_smote = np.mean(results_f1[name]["smote"]) if results_f1[name]["smote"] else np.nan
        avg_smotenc = np.mean(results_f1[name]["smotenc"]) if results_f1[name]["smotenc"] else np.nan
        print(f"Dataset: {name} - Avg Weighted F1 (SMOTE {encoding_technique}): {avg_smote:.4f}")
        print(f"Dataset: {name} - Avg Weighted F1 (SMOTENC): {avg_smotenc:.4f}")

# Plot Results
plt.figure(figsize=(10, 6))
for encoding in encoding_techniques:
    smote_scores = [np.mean(results_f1[d]["smote"]) for d in selected]
    smotenc_scores = [np.mean(results_f1[d]["smotenc"]) for d in selected]
    plt.bar(selected, smote_scores, label=f'SMOTE {encoding}', alpha=0.7)
plt.legend()
plt.ylabel('Average Weighted F1 Score')
plt.title('Comparison of SMOTE Encoding Techniques vs. SMOTENC')
plt.show()



Evaluating with encoding technique: onehot
Processing dataset: crx
Dataset: crx - Avg Weighted F1 (SMOTE onehot): 0.8753
Dataset: crx - Avg Weighted F1 (SMOTENC): 0.8797
Processing dataset: flare1
Dataset: flare1 - Avg Weighted F1 (SMOTE onehot): 0.7897
Dataset: flare1 - Avg Weighted F1 (SMOTENC): 0.7460
Processing dataset: titanic
Dataset: titanic - Avg Weighted F1 (SMOTE onehot): 0.7859
Dataset: titanic - Avg Weighted F1 (SMOTENC): 0.7906

Evaluating with encoding technique: ordinal
Processing dataset: crx


ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values