In [4]:
# ========== Import required libraries ==========
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# ========== Constants ==========
# Columns dropped from the loaded CSV before feature selection.
COLUMNS_TO_REMOVE = [
    'Pacientes',
    'Nome',
    'T-Score L1-L4',
    'T-Score Fêmur Total',
    'EspessuraCorticalPan',
    'EspessuraCorticalTrans',
    'IMC',
    'T-Score Colo Femoral',
    'Idade',
    'Altura',
    'Peso',
    'BMD L1-L4',
    'BMD Colo Femoral',
    'BMD Fêmur Total',
]

# Label column predicted by the classifier.
TARGET_NAME = 'Diagnosticounificado'

# Feature-subgroup selectors: 'all' keeps every column, the others are
# matched case-insensitively as substrings of the column names.
SUBGRUPOS = [
    'all',
    '_CD',
    '_CE',
    '_MD',
    '_ME',
    '_SA',
]

# Semicolon-separated input file.
FILE_PATH = 'train_database_matheus.csv'

# ========== Function to load CSV and remove unnecessary columns ==========
def read_csv_and_split(path, columns_to_remove):
    """Load a semicolon-delimited CSV and discard the unwanted columns.

    Args:
        path: Location of the CSV file, handed to ``pandas.read_csv``.
        columns_to_remove: Column labels to drop from the loaded frame.

    Returns:
        A ``(frame, column_names)`` tuple: the DataFrame without the
        dropped columns, and a list of its remaining column labels.

    Raises:
        KeyError: Propagated from ``DataFrame.drop`` when one of the
            labels is not present in the file.
    """
    raw_frame = pd.read_csv(path, sep=';')
    trimmed_frame = raw_frame.drop(columns_to_remove, axis=1)
    remaining_names = list(trimmed_frame.columns)
    return trimmed_frame, remaining_names

# ========== Function to filter features based on subgroup name ==========
def filtrar_palavras(lista, parametro):
    """Select the column names that belong to a feature subgroup.

    Args:
        lista: Column names to filter.
        parametro: Subgroup selector. ``"all"`` (in any casing) keeps every
            name; otherwise a name is kept when it contains the selector
            (compared case-insensitively) or when it is one of the
            always-included measurement columns listed below.

    Returns:
        A new list with the selected names in their original order. The
        input list itself is never returned, so callers may mutate the
        result (e.g. remove the target column) without altering ``lista``.
    """
    # Columns kept for every subgroup other than "all", regardless of the
    # selector. The double space in two of the names is intentional: it
    # must match the column labels exactly.
    palavras_especificas = {
        'Altura da cortical (D)', 'Altura forame-base (D)',
        'Altura da medular forame-superficie  da cortical (D)',
        'Altura da cortical (E)', 'Altura forame-base (E)',
        'Altura da medular forame-superficie  da cortical (E)',
    }

    # Lower-case the selector once instead of once per column name.
    seletor = parametro.lower()

    if seletor == "all":
        # Return a copy: handing back the caller's own list would let a
        # later in-place edit of the result silently change the input.
        return list(lista)

    return [
        palavra for palavra in lista
        if seletor in palavra.lower() or palavra in palavras_especificas
    ]

# ========== Main logic ==========
if __name__ == '__main__':
    # Create a writable output directory (e.g., in the home folder)
    output_dir = os.path.expanduser('~/classification_reports')
    os.makedirs(output_dir, exist_ok=True)

    # One seed for both the fold assignment and the forest, so that
    # re-running the script reproduces the same reports.
    random_state = 42

    # The CSV and the label vector are identical for every subgroup, so
    # load them once instead of once per loop iteration.
    train_df, all_columns = read_csv_and_split(FILE_PATH, COLUMNS_TO_REMOVE)
    y = train_df[TARGET_NAME].to_numpy()

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    for subgrupo in SUBGRUPOS:
        # Build a fresh list without the target instead of calling
        # .remove() on the filter result: for 'all' the filter may hand
        # back all_columns itself, and mutating it would leak into the
        # following iterations.
        features = [
            coluna for coluna in filtrar_palavras(all_columns, subgrupo)
            if coluna != TARGET_NAME
        ]

        # A forest cannot be fitted on zero features; report and move on
        # rather than aborting the remaining subgroups.
        if not features:
            print(f'Skipping subgroup {subgrupo!r}: no matching feature columns.')
            continue

        X = train_df[features].to_numpy()

        # A new, unfitted model per subgroup; cross_val_predict fits a
        # clone on each training split and predicts the held-out split.
        learning_model = RandomForestClassifier(
            verbose=True, random_state=random_state
        )
        y_pred_cv = cross_val_predict(learning_model, X, y, cv=cv)

        report = classification_report(y, y_pred_cv, digits=4)

        print('\n\n')
        print(report)
        print('Subgrupo de Features da Cross-Validation:', subgrupo, '\n\n\n')

        # Write report to a text file
        file_path = os.path.join(
            output_dir, f'classification_report_{subgrupo}.txt'
        )
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(report)
            f.write(f'\nSubgrupo de Features da Cross-Validation: {subgrupo}\n\n\n')


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s





              precision    recall  f1-score   support

           0     0.7308    0.7451    0.7379        51
           1     0.7292    0.7143    0.7216        49

    accuracy                         0.7300       100
   macro avg     0.7300    0.7297    0.7298       100
weighted avg     0.7300    0.7300    0.7299       100

Subgrupo de Features da Cross-Validation: all 





[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s





              precision    recall  f1-score   support

           0     0.5769    0.5882    0.5825        51
           1     0.5625    0.5510    0.5567        49

    accuracy                         0.5700       100
   macro avg     0.5697    0.5696    0.5696       100
weighted avg     0.5699    0.5700    0.5699       100

Subgrupo de Features da Cross-Validation: _CD 





[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s





              precision    recall  f1-score   support

           0     0.6400    0.6275    0.6337        51
           1     0.6200    0.6327    0.6263        49

    accuracy                         0.6300       100
   macro avg     0.6300    0.6301    0.6300       100
weighted avg     0.6302    0.6300    0.6300       100

Subgrupo de Features da Cross-Validation: _CE 





[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s





              precision    recall  f1-score   support

           0     0.6000    0.5882    0.5941        51
           1     0.5800    0.5918    0.5859        49

    accuracy                         0.5900       100
   macro avg     0.5900    0.5900    0.5900       100
weighted avg     0.5902    0.5900    0.5900       100

Subgrupo de Features da Cross-Validation: _MD 





[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s





              precision    recall  f1-score   support

           0     0.6071    0.6667    0.6355        51
           1     0.6136    0.5510    0.5806        49

    accuracy                         0.6100       100
   macro avg     0.6104    0.6088    0.6081       100
weighted avg     0.6103    0.6100    0.6086       100

Subgrupo de Features da Cross-Validation: _ME 





[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s





              precision    recall  f1-score   support

           0     0.7347    0.7059    0.7200        51
           1     0.7059    0.7347    0.7200        49

    accuracy                         0.7200       100
   macro avg     0.7203    0.7203    0.7200       100
weighted avg     0.7206    0.7200    0.7200       100

Subgrupo de Features da Cross-Validation: _SA 





[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


This Python script uses a Random Forest classifier on a medical dataset to predict the target variable `Diagnosticounificado`. It begins by importing the necessary libraries and defining constants, including the list of columns to remove and the feature subgroups to analyze. For each subgroup, the script reads the dataset, cleans it by removing the unnecessary columns, and selects the features that match the subgroup currently being tested. It then converts the features and labels into NumPy arrays and runs stratified 5-fold cross-validation, which keeps the class distribution similar in every fold. Within each fold, a Random Forest model is trained on the training portion and used to predict the held-out portion; the script then prints and saves a classification report showing key metrics such as precision, recall, and F1-score, computed from these out-of-fold predictions. Repeating this process for each feature subgroup allows a systematic evaluation of how different sets of features affect model performance.