**Análise da performance do RF nos diferentes datasets**

In [2]:
import os
import pandas as pd
import numpy as np
from mla.ensemble.random_forest import RandomForestClassifier  # Usando a tua implementação

# Folder containing the preprocessed datasets.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
pasta = r"C:\Users\Utilizador\Desktop\AC1_Trabalho\clean_class_imbalance"

# Funções para métricas sem sklearn
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that match the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_true``.

    Returns
    -------
    float
        Number of matching positions divided by ``len(y_true)``.
    """
    # Coerce to arrays so the comparison is element-wise; comparing two plain
    # lists with ``==`` yields a single bool and a silently wrong score.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.sum(y_true == y_pred) / len(y_true)

def precision(y_true, y_pred):
    """Return the macro-averaged precision over the classes present in ``y_true``.

    For each class the precision is TP / (TP + FP); a class that is never
    predicted contributes 0. The per-class values are averaged with equal
    weight.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_true``.

    Returns
    -------
    float
        Unweighted mean of the per-class precisions.
    """
    # Coerce to arrays so ``== cls`` is element-wise; on plain lists it is a
    # single ``False`` and every class would score 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    per_class = []
    for cls in np.unique(y_true):
        predicted_as_cls = y_pred == cls
        n_predicted = np.sum(predicted_as_cls)  # equals TP + FP
        if n_predicted == 0:
            per_class.append(0)
        else:
            tp = np.sum(predicted_as_cls & (y_true == cls))
            per_class.append(tp / n_predicted)
    return np.mean(per_class)

def recall(y_true, y_pred):
    """Return the macro-averaged recall over the classes present in ``y_true``.

    For each class the recall is TP / (TP + FN); the per-class values are
    averaged with equal weight.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_true``.

    Returns
    -------
    float
        Unweighted mean of the per-class recalls.
    """
    # Coerce to arrays so ``== cls`` is element-wise; on plain lists it is a
    # single ``False`` and every class would score 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    per_class = []
    for cls in np.unique(y_true):
        is_cls = y_true == cls
        tp = np.sum(is_cls & (y_pred == cls))
        # TP + FN is the class support; it is at least 1 because ``cls`` is
        # drawn from ``y_true``, so no zero-division guard is needed.
        per_class.append(tp / np.sum(is_cls))
    return np.mean(per_class)

def f1_score(y_true, y_pred):
    """Return the harmonic mean of ``precision`` and ``recall``.

    Both inputs are forwarded unchanged to ``precision(y_true, y_pred)`` and
    ``recall(y_true, y_pred)``. The result is 0 when their sum is 0.
    """
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    total = p + r
    return 2 * (p * r) / total if total != 0 else 0

def roc_auc(y_true, y_score):
    """Return the area under the ROC curve for a binary problem.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; 1 is the positive class and 0 the negative class.
    y_score : array-like, 2-D
        Per-class scores; column 1 is used as the positive-class score.

    Returns
    -------
    float
        The AUC, or ``nan`` when ``y_true`` does not contain exactly two
        distinct values or has no sample labelled 1 or no sample labelled 0.
    """
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)

    # Binary problems only.
    if len(np.unique(y_true)) != 2:
        return np.nan

    is_pos = y_true == 1
    is_neg = y_true == 0
    n_pos = np.sum(is_pos)
    n_neg = np.sum(is_neg)
    # Two classes that are not coded as {0, 1} would divide by zero below.
    if n_pos == 0 or n_neg == 0:
        return np.nan

    # Rank samples by decreasing positive-class score.
    scores = y_score[:, 1]
    order = np.argsort(-scores, kind="mergesort")
    ranked_scores = scores[order]

    # Emit one ROC point per distinct score (the last index of each run of
    # equal scores). Samples with tied scores cannot be ordered, so stepping
    # through them one by one would make the AUC depend on input order.
    cut = np.r_[np.flatnonzero(np.diff(ranked_scores)), len(ranked_scores) - 1]
    tpr = np.r_[0.0, np.cumsum(is_pos[order])[cut] / n_pos]
    fpr = np.r_[0.0, np.cumsum(is_neg[order])[cut] / n_neg]

    # Trapezoidal rule, written out so it works on every NumPy version
    # (np.trapz is deprecated in NumPy 2.0).
    return float(np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0))

# Função para dividir treino e teste sem sklearn
def split_train_test(X, y, test_size=0.3, random_state=None):
    """Randomly split features and labels into train and test subsets.

    Parameters
    ----------
    X : array-like
        Feature matrix; rows are samples.
    y : array-like
        Labels, one per row of ``X``.
    test_size : float
        Fraction of the samples assigned to the test subset; the count is
        truncated to an integer.
    random_state : int or None
        Seed for a reproducible shuffle. When ``None`` the global NumPy
        random state is used.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` have a different number of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X and y have inconsistent lengths: {n_samples} != {y.shape[0]}"
        )

    # A private RandomState produces the same permutation as seeding the
    # global generator, without reseeding np.random for the rest of the program.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    indices = np.arange(n_samples)
    rng.shuffle(indices)

    n_test = int(test_size * n_samples)
    test_idx = indices[:n_test]
    train_idx = indices[n_test:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]


# Evaluate the random forest on every CSV dataset found in `pasta`.
resultados = []

# Sort the listing so the processing order (and the row order of the results
# table) is reproducible; os.listdir returns entries in arbitrary order.
arquivos_csv = sorted(nome for nome in os.listdir(pasta) if nome.endswith(".csv"))
total = len(arquivos_csv)

for count, arquivo in enumerate(arquivos_csv, start=1):
    caminho_arquivo = os.path.join(pasta, arquivo)
    print(f"\nTreinando no dataset: {arquivo}")
    # Report progress against the real number of CSV files, not a fixed total.
    print(count, "de", total)

    # Read the dataset.
    df = pd.read_csv(caminho_arquivo)

    # The last column is the target; all other columns are features.
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values

    # Train/test split with a fixed seed so every run uses the same partition.
    X_train, X_test, y_train, y_test = split_train_test(X, y, test_size=0.3, random_state=42)

    # Train the model, letting it consider every feature (max_features is set
    # to the number of columns).
    modelo = RandomForestClassifier(
        n_estimators=10,
        max_depth=10,
        max_features=X.shape[1]
    )
    modelo.fit(X_train, y_train)

    # Predict. The output is treated as a per-class score matrix: argmax over
    # axis 1 gives the predicted class index and column 1 feeds roc_auc.
    # NOTE(review): `_predict` is a private method — confirm whether the public
    # `predict` returns the same matrix and prefer it if so.
    # NOTE(review): comparing argmax indices with y_test assumes the labels are
    # integer-coded as 0..k-1 — confirm for every dataset.
    y_pred_prob = modelo._predict(X_test)
    y_pred = np.argmax(y_pred_prob, axis=1)

    # Evaluate.
    acc = accuracy(y_test, y_pred)
    rec = recall(y_test, y_pred)
    prec = precision(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc(y_test, y_pred_prob)

    resultados.append({
        "dataset": arquivo,
        "accuracy": acc,
        "recall": rec,
        "precision": prec,
        "f1_score": f1,
        "roc_auc": auc
    })

# Show the results.
resultados_df = pd.DataFrame(resultados)
print("\nResultados:")
print(resultados_df)


Treinando no dataset: dataset_1000_hypothyroid.csv
1 de 50

Treinando no dataset: dataset_1002_ipums_la_98-small.csv
2 de 50

Treinando no dataset: dataset_1004_synthetic_control.csv
3 de 50

Treinando no dataset: dataset_1013_analcatdata_challenger.csv
4 de 50

Treinando no dataset: dataset_1014_analcatdata_dmft.csv
5 de 50

Treinando no dataset: dataset_1016_vowel.csv
6 de 50

Treinando no dataset: dataset_1018_ipums_la_99-small.csv
7 de 50

Treinando no dataset: dataset_1020_mfeat-karhunen.csv
8 de 50

Treinando no dataset: dataset_1021_page-blocks.csv
9 de 50

Treinando no dataset: dataset_1022_mfeat-pixel.csv
10 de 50

Treinando no dataset: dataset_1023_soybean.csv
11 de 50

Treinando no dataset: dataset_1039_hiva_agnostic.csv
12 de 50

Treinando no dataset: dataset_1045_kc1-top5.csv
13 de 50

Treinando no dataset: dataset_1049_pc4.csv
14 de 50

Treinando no dataset: dataset_1050_pc3.csv
15 de 50

Treinando no dataset: dataset_1056_mc1.csv
16 de 50

Treinando no dataset: dataset_

  left_mask = X < value
  right_mask = X >= value
  left_mask = X[:, column] < value
  right_mask = X[:, column] >= value
  self.outcome = np.bincount(targets["y"], minlength=self.n_classes) / targets["y"].shape[0]



Treinando no dataset: dataset_764_analcatdata_apnea3.csv
28 de 50

Treinando no dataset: dataset_765_analcatdata_apnea2.csv
29 de 50

Treinando no dataset: dataset_767_analcatdata_apnea1.csv
30 de 50

Treinando no dataset: dataset_865_analcatdata_neavote.csv
31 de 50

Treinando no dataset: dataset_867_visualizing_livestock.csv
32 de 50

Treinando no dataset: dataset_875_analcatdata_chlamydia.csv
33 de 50

Treinando no dataset: dataset_940_water-treatment.csv
34 de 50


  left_mask = X < value
  right_mask = X >= value
  left_mask = X[:, column] < value
  right_mask = X[:, column] >= value
  self.outcome = np.bincount(targets["y"], minlength=self.n_classes) / targets["y"].shape[0]



Treinando no dataset: dataset_947_arsenic-male-bladder.csv
35 de 50

Treinando no dataset: dataset_949_arsenic-female-bladder.csv
36 de 50

Treinando no dataset: dataset_950_arsenic-female-lung.csv
37 de 50

Treinando no dataset: dataset_951_arsenic-male-lung.csv
38 de 50

Treinando no dataset: dataset_954_spectrometer.csv
39 de 50

Treinando no dataset: dataset_958_segment.csv
40 de 50

Treinando no dataset: dataset_962_mfeat-morphological.csv
41 de 50

Treinando no dataset: dataset_966_analcatdata_halloffame.csv
42 de 50


  left_mask = X < value
  right_mask = X >= value
  left_mask = X[:, column] < value
  right_mask = X[:, column] >= value
  self.outcome = np.bincount(targets["y"], minlength=self.n_classes) / targets["y"].shape[0]



Treinando no dataset: dataset_968_analcatdata_birthday.csv
43 de 50

Treinando no dataset: dataset_971_mfeat-fourier.csv
44 de 50

Treinando no dataset: dataset_976_JapaneseVowels.csv
45 de 50

Treinando no dataset: dataset_978_mfeat-factors.csv
46 de 50

Treinando no dataset: dataset_980_optdigits.csv
47 de 50

Treinando no dataset: dataset_984_analcatdata_draft.csv
48 de 50


  self.outcome = np.bincount(targets["y"], minlength=self.n_classes) / targets["y"].shape[0]



Treinando no dataset: dataset_987_collins.csv
49 de 50

Treinando no dataset: dataset_995_mfeat-zernike.csv
50 de 50

Resultados:
                                    dataset  accuracy    recall  precision  \
0              dataset_1000_hypothyroid.csv  0.998232  0.987654   0.999049   
1        dataset_1002_ipums_la_98-small.csv  0.892205  0.500000   0.446102   
2        dataset_1004_synthetic_control.csv  0.977778  0.973804   0.951144   
3   dataset_1013_analcatdata_challenger.csv  1.000000  1.000000   1.000000   
4         dataset_1014_analcatdata_dmft.csv  0.778243  0.500000   0.389121   
5                    dataset_1016_vowel.csv  0.973064  0.943691   0.920767   
6        dataset_1018_ipums_la_99-small.csv  0.929891  0.540955   0.625011   
7           dataset_1020_mfeat-karhunen.csv  0.973333  0.915778   0.915778   
8              dataset_1021_page-blocks.csv  0.974406  0.945418   0.924235   
9              dataset_1022_mfeat-pixel.csv  0.995000  0.979857   0.988375   
10         

**Análise da percentagem de outliers nos datasets Grupo 1**

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

def detect_outliers(df, contamination=0.05):
    """Return the percentage of rows in ``df`` that an Isolation Forest flags.

    An ``IsolationForest`` (fixed ``random_state=42``) is fitted on ``df`` and
    rows whose predicted label is -1 are counted as outliers.

    NOTE(review): per the scikit-learn documentation, ``contamination`` sets
    the decision threshold, so the returned percentage is expected to stay
    close to ``contamination * 100`` — confirm this is the intended measure.

    Parameters
    ----------
    df : DataFrame
        Feature table passed to ``fit_predict`` as-is.
    contamination : float
        Forwarded to ``IsolationForest``.

    Returns
    -------
    float
        ``100 * (number of rows labelled -1) / len(df)``.
    """
    forest = IsolationForest(contamination=contamination, random_state=42)
    labels = forest.fit_predict(df)
    is_outlier = labels == -1
    fraction = is_outlier.sum() / len(df)
    return fraction * 100

def main(datasets_folder='noise', contamination=0.05):
    """Print the average Isolation Forest outlier percentage over a folder of CSVs.

    Each ``.csv`` file in ``datasets_folder`` is loaded, known label columns
    are dropped, and only numeric columns are kept. Datasets with no numeric
    features or with missing values are skipped, as are datasets that fail to
    load or to be scored. Every skip is reported on stderr so the number of
    analysed datasets can be accounted for.

    NOTE(review): only columns named 'label', 'class', 'target' or 'y' are
    removed; a numeric target stored under another name stays in the features.

    Parameters
    ----------
    datasets_folder : str
        Directory that holds the CSV files.
    contamination : float
        Forwarded to ``detect_outliers``.
    """
    import sys  # local import: used only for the skip diagnostics below

    label_columns = ['label', 'class', 'target', 'y']
    outlier_percentages = []

    for filename in sorted(os.listdir(datasets_folder)):
        if not filename.endswith('.csv'):
            continue
        filepath = os.path.join(datasets_folder, filename)
        try:
            df = pd.read_csv(filepath)

            # Drop label columns if they exist, then keep only numeric columns.
            present = [col for col in label_columns if col in df.columns]
            df_features = df.drop(columns=present).select_dtypes(include=[np.number])

            if df_features.empty:
                print(f"Skipping {filename}: no numeric features", file=sys.stderr)
                continue

            if df_features.isnull().any().any():
                print(f"Skipping {filename}: missing values", file=sys.stderr)
                continue

            outlier_percentages.append(
                detect_outliers(df_features, contamination=contamination)
            )
        except Exception as exc:
            # Best effort: one bad dataset must not abort the whole run, but
            # the failure is reported instead of being hidden.
            print(f"Skipping {filename}: {type(exc).__name__}: {exc}", file=sys.stderr)
            continue

    if outlier_percentages:
        avg_outliers = np.mean(outlier_percentages)
        print(f"Number of datasets analyzed: {len(outlier_percentages)}")
        print(f"Average outlier percentage across datasets: {avg_outliers:.2f}%")
    else:
        print("No valid datasets processed.")

if __name__ == "__main__":
    main()


Number of datasets analyzed: 29
Average outlier percentage across datasets: 4.04%
