In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from scipy.stats import ttest_ind
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Custom transformer for Welch's t-test
class WelchsTTestSelector(BaseEstimator, TransformerMixin):
    def __init__(self, p_value_threshold=0.05):
        self.p_value_threshold = p_value_threshold
        self.selected_features_ = None

    def fit(self, X, y):
        p_values = []
        for i in range(X.shape[1]):
            class_0 = X[y == 0, i]
            class_1 = X[y == 1, i]
            _, p = ttest_ind(class_0, class_1, equal_var=False)
            p_values.append(p)
        self.selected_features_ = np.where(np.array(p_values) < self.p_value_threshold)[0]
        return self

    def transform(self, X):
        return X[:, self.selected_features_]

# Constants
COLUMNS_TO_REMOVE = [
    'Pacientes', 'Nome', 'T-Score L1-L4', 'T-Score Fêmur Total',
    'EspessuraCorticalPan', 'EspessuraCorticalTrans', 'IMC',
    'T-Score Colo Femoral', 'Idade', 'Altura', 'Peso',
    'BMD L1-L4', 'BMD Colo Femoral', 'BMD Fêmur Total'
]
TARGET_NAME = 'Diagnosticounificado'
SUBGRUPOS = ['all','_CD', '_CE', '_MD', '_ME', '_SA']
FILE_PATH = 'train_database_matheus.csv'

# Function to load CSV and remove unnecessary columns
def read_csv_and_split(path, columns_to_remove):
    df = pd.read_csv(path, sep=';')
    df_train = df.drop(columns=columns_to_remove)
    return df_train, list(df_train.columns)

# Function to filter features based on subgroup name
def filtrar_palavras(lista, parametro):
    palavras_especificas = [
        'Altura da cortical (D)', 'Altura forame-base (D)',
        'Altura da medular forame-superficie  da cortical (D)',
        'Altura da cortical (E)', 'Altura forame-base (E)',
        'Altura da medular forame-superficie  da cortical (E)',
    ]
    if parametro.lower() == "all":
        return lista
    else:
        return [
            palavra for palavra in lista
            if parametro.lower() in palavra.lower() or palavra in palavras_especificas
        ]

# Main logic
if __name__ == '__main__':
    # Create a writable output directory (e.g., in the home folder)
    output_dir = os.path.expanduser('~/classification_reports')
    os.makedirs(output_dir, exist_ok=True)

    for subgrupo in SUBGRUPOS:
        train_df, all_columns = read_csv_and_split(FILE_PATH, COLUMNS_TO_REMOVE)
        features = filtrar_palavras(all_columns, subgrupo)

        # Make sure target is not in features
        if TARGET_NAME in features:
            features.remove(TARGET_NAME)

        X = train_df[features].to_numpy()
        y = train_df[TARGET_NAME].to_numpy()

        # Define the feature selection pipeline
        feature_selection_pipeline = Pipeline([
            ('variance_threshold', VarianceThreshold(threshold=0.01)),
            ('welchs_t_test', WelchsTTestSelector(p_value_threshold=0.05)),
            ('scaler', StandardScaler())
        ])

        # Fit and transform the data
        X_transformed = feature_selection_pipeline.fit_transform(X, y)

        # Train the classifier
        classifier = RandomForestClassifier(verbose=True)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        y_pred_cv = cross_val_predict(classifier, X_transformed, y, cv=cv)

        # Generate the classification report
        report = classification_report(y, y_pred_cv, digits=4)
        print('\n\n')
        print(report)
        print('Subgrupo de Features da Cross-Validation:', subgrupo, '\n\n\n')

        # Write report to a text file
        file_path = f'{output_dir}/classification_report_{subgrupo}.txt'
        with open(file_path, 'w') as f:
            f.write(report)
            f.write(f'\nSubgrupo de Features da Cross-Validation: {subgrupo}\n\n\n')


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s





              precision    recall  f1-score   support

           0     0.8077    0.8235    0.8155        51
           1     0.8125    0.7959    0.8041        49

    accuracy                         0.8100       100
   macro avg     0.8101    0.8097    0.8098       100
weighted avg     0.8100    0.8100    0.8099       100

Subgrupo de Features da Cross-Validation: all 





[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s





              precision    recall  f1-score   support

           0     0.7347    0.7059    0.7200        51
           1     0.7059    0.7347    0.7200        49

    accuracy                         0.7200       100
   macro avg     0.7203    0.7203    0.7200       100
weighted avg     0.7206    0.7200    0.7200       100

Subgrupo de Features da Cross-Validation: _CD 





[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s





              precision    recall  f1-score   support

           0     0.6735    0.6471    0.6600        51
           1     0.6471    0.6735    0.6600        49

    accuracy                         0.6600       100
   macro avg     0.6603    0.6603    0.6600       100
weighted avg     0.6605    0.6600    0.6600       100

Subgrupo de Features da Cross-Validation: _CE 





[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s





              precision    recall  f1-score   support

           0     0.6957    0.6275    0.6598        51
           1     0.6481    0.7143    0.6796        49

    accuracy                         0.6700       100
   macro avg     0.6719    0.6709    0.6697       100
weighted avg     0.6724    0.6700    0.6695       100

Subgrupo de Features da Cross-Validation: _MD 





[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s





              precision    recall  f1-score   support

           0     0.5577    0.5686    0.5631        51
           1     0.5417    0.5306    0.5361        49

    accuracy                         0.5500       100
   macro avg     0.5497    0.5496    0.5496       100
weighted avg     0.5498    0.5500    0.5499       100

Subgrupo de Features da Cross-Validation: _ME 





[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s





              precision    recall  f1-score   support

           0     0.7679    0.8431    0.8037        51
           1     0.8182    0.7347    0.7742        49

    accuracy                         0.7900       100
   macro avg     0.7930    0.7889    0.7890       100
weighted avg     0.7925    0.7900    0.7893       100

Subgrupo de Features da Cross-Validation: _SA 





[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
