## V tem koraku bi rad preveril katera kombinacija fingerprinta in klasifikacijskega modela je najboljša preden začnem manipulirati s podatki. Ta korak je pomemben zato da v nadaljnih korakih vidim kaj izboljšujem

In [None]:
%run __A_knjiznice.py

# Import specific elements from the script
from __A_knjiznice import *
from __B_funkcije import *
%matplotlib inline

In [1]:

input_directory = '/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/dir1/molekulski_prstni_odtisi'

generated_fingerprints = ['df_circular.csv']

In [2]:
import pandas as pd

def remove_collinear_features_simple(x, threshold):
    '''
    Simplified function to remove collinear features in a DataFrame based on a specified correlation threshold.

    Parameters:
    - x: DataFrame with features.
    - threshold: Correlation coefficient threshold; features with correlations above this threshold will be removed.

    Returns:
    - DataFrame with collinear features removed.
    '''

    # Calculate the correlation matrix
    corr_matrix = x.corr().abs()

    # Identify pairs of columns that exceed the correlation threshold
    high_corr_pairs = [(i, j) for i in range(len(corr_matrix.columns)) for j in range(i+1, len(corr_matrix.columns)) if corr_matrix.iloc[i, j] > threshold]

    # Create a set to hold all columns that need to be removed
    cols_to_remove = set()

    # Add one column from each pair to the set of columns to be removed
    for i, j in high_corr_pairs:
        cols_to_remove.add(corr_matrix.columns[j])

    # Remove the columns from the DataFrame
    x_reduced = x.drop(columns=list(cols_to_remove))

    return x_reduced


In [3]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import SelectKBest, chi2, VarianceThreshold
from sklearn.cluster import FeatureAgglomeration
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np

# Define classifiers
classifiers = {
    "RandomForestClassifier": {
        "classifier": RandomForestClassifier(n_jobs=-1, random_state=42),
        "params": {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4]
        }
    }
}

# Dimensionality Reduction Methods
dim_reduction_methods = {
    "None": {
        "dim_reduction": None,
        "params": {}
    },
    "SelectKBest": {
        "dim_reduction": SelectKBest(score_func=chi2, k=150),
        "params": {
            'dim_reduction__k': [10, 20, 50, 75, 150, 250]
        }
    },
    "LDA": {
        "dim_reduction": LinearDiscriminantAnalysis(),
        "params": {}
    },
    "FeatureAgglomeration": {
        "dim_reduction": FeatureAgglomeration(),
        "params": {
            'dim_reduction__n_clusters': [20, 50, 75, 150, 250]
        }
    }
}

# Methods for Handling Imbalanced Data
sampling_techniques = {
    "None": {
        "feature_selection": None,
        "params": {}
    },
    "SMOTENC": {
        "feature_selection": SMOTENC(categorical_features=[0, 1],random_state=42),
        "params": {
            'feature_selection__sampling_strategy': [0.5, 0.75, 1.0]
        }
    },
    "RandomUnderSampler": {
        "feature_selection": RandomUnderSampler(random_state=42),
        "params": {
            'feature_selection__sampling_strategy': [0.1, 0.5, 0.75, 1.0]
        }
    }
}

# Store results
results_list = []

# Define Stratified k-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# List of specific filenames to process
for filename in generated_fingerprints:  # Assuming generated_fingerprints is defined
    file_path = os.path.join(input_directory, filename)
    
    if os.path.exists(file_path):  # Check if the file exists
        print(f'Processing fingerprint DataFrame: {filename}')
        
        df = pd.read_csv(file_path)
        y = df[['Activity']].values.ravel()  # Assuming 'Activity' is the target
        X = df.iloc[:, 3:]  # Assuming features start from the 4th column

        # Remove constant features
        selector = VarianceThreshold()
        X = pd.DataFrame(selector.fit_transform(X), columns=selector.get_feature_names_out())
        
        # Remove collinear features
        X = remove_collinear_features_simple(X, threshold=0.95)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, shuffle=True, stratify=y)
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=15/85, random_state=42, shuffle=True, stratify=y_train)

        # Train and evaluate each classifier
        for clf_name, clf in classifiers.items():
            for dr_name, dr_method in dim_reduction_methods.items():
                for fs_name, fs_method in sampling_techniques.items():
                    steps = []
                    if fs_method["feature_selection"] is not None:
                        steps.append(('feature_selection', fs_method["feature_selection"]))
                    if dr_method["dim_reduction"] is not None:
                        steps.append(('dim_reduction', dr_method["dim_reduction"]))
                    steps.append(('classifier', clf["classifier"]))
                    
                    # Create the pipeline
                    pipeline = ImbPipeline(steps)

                    # Perform cross-validation with RandomizedSearchCV
                    search_params = {**fs_method["params"], **dr_method["params"], **clf["params"]}
                    random_search = RandomizedSearchCV(
                        pipeline, 
                        param_distributions=search_params, 
                        n_iter=10, 
                        scoring=['accuracy', 'f1', 'precision', 'recall', 'roc_auc'],  # Include all desired metrics
                        refit='accuracy',  # Refitting based on accuracy
                        cv=cv, 
                        random_state=42, 
                        n_jobs=-1
                    )
                    
                    # Fit the model
                    random_search.fit(X_train, y_train)

                    # Train the model on the best hyperparameters
                    best_pipeline = random_search.best_estimator_
                    best_pipeline.fit(X_train, y_train)

                    # Evaluate on the validation set
                    y_val_pred = best_pipeline.predict(X_val)
                    val_accuracy = accuracy_score(y_val, y_val_pred)
                    val_f1 = f1_score(y_val, y_val_pred)
                    val_precision = precision_score(y_val, y_val_pred)
                    val_recall = recall_score(y_val, y_val_pred)
                    val_roc_auc = roc_auc_score(y_val, y_val_pred)

                    # Append results to the list
                    results_temp = {
                        'Fingerprint': filename,  # Use the filename for identification
                        'Feature_Selection': fs_name,
                        'Dim_Reduction': dr_name,
                        'Classifier': clf_name,
                        'Best_Params': random_search.best_params_,
                        'CV_Mean_Accuracy': random_search.best_score_,
                        'CV_Mean_F1': np.mean(random_search.cv_results_['mean_test_f1']),
                        'CV_Mean_Precision': np.mean(random_search.cv_results_['mean_test_precision']),
                        'CV_Mean_Recall': np.mean(random_search.cv_results_['mean_test_recall']),
                        'CV_Mean_ROC_AUC': np.mean(random_search.cv_results_['mean_test_roc_auc']),
                        'Val_Accuracy': val_accuracy,
                        'Val_F1': val_f1,
                        'Val_Precision': val_precision,
                        'Val_Recall': val_recall,
                        'Val_ROC_AUC': val_roc_auc,
                    }
                    results_list.append(results_temp)
                    print("\nResults:")
                    print(results_temp)

# Create DataFrame from the results list
results_df = pd.DataFrame(results_list)
print("\nFinal Results:")

Processing fingerprint DataFrame: df_circular.csv

Results:
{'Fingerprint': 'df_circular.csv', 'Feature_Selection': 'None', 'Dim_Reduction': 'None', 'Classifier': 'RandomForestClassifier', 'Best_Params': {'classifier__n_estimators': 100, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 2, 'classifier__max_depth': None}, 'CV_Mean_Accuracy': 0.8809499136442142, 'CV_Mean_F1': 0.8769698834291404, 'CV_Mean_Precision': 0.8780408441536915, 'CV_Mean_Recall': 0.876199239017641, 'CV_Mean_ROC_AUC': 0.934861336630038, 'Val_Accuracy': 0.8762057877813505, 'Val_F1': 0.8783570300157978, 'Val_Precision': 0.8769716088328076, 'Val_Recall': 0.879746835443038, 'Val_ROC_AUC': 0.8761479275254406}


35 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/imblearn/pipeline.py", line 329, in fit
    Xt, yt = self._fit


Results:
{'Fingerprint': 'df_circular.csv', 'Feature_Selection': 'SMOTENC', 'Dim_Reduction': 'None', 'Classifier': 'RandomForestClassifier', 'Best_Params': {'feature_selection__sampling_strategy': 1.0, 'classifier__n_estimators': 200, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 2, 'classifier__max_depth': 20}, 'CV_Mean_Accuracy': 0.8764635816806623, 'CV_Mean_F1': nan, 'CV_Mean_Precision': nan, 'CV_Mean_Recall': nan, 'CV_Mean_ROC_AUC': nan, 'Val_Accuracy': 0.864951768488746, 'Val_F1': 0.8675078864353313, 'Val_Precision': 0.8647798742138365, 'Val_Recall': 0.870253164556962, 'Val_ROC_AUC': 0.8648651443699844}


45 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/imblearn/pipeline.py", line 329, in fit
    Xt, yt = self._fit


Results:
{'Fingerprint': 'df_circular.csv', 'Feature_Selection': 'RandomUnderSampler', 'Dim_Reduction': 'None', 'Classifier': 'RandomForestClassifier', 'Best_Params': {'feature_selection__sampling_strategy': 1.0, 'classifier__n_estimators': 50, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': 20}, 'CV_Mean_Accuracy': 0.8764641772378059, 'CV_Mean_F1': nan, 'CV_Mean_Precision': nan, 'CV_Mean_Recall': nan, 'CV_Mean_ROC_AUC': nan, 'Val_Accuracy': 0.8745980707395499, 'Val_F1': 0.8773584905660378, 'Val_Precision': 0.871875, 'Val_Recall': 0.8829113924050633, 'Val_ROC_AUC': 0.8744622321502441}

Results:
{'Fingerprint': 'df_circular.csv', 'Feature_Selection': 'None', 'Dim_Reduction': 'SelectKBest', 'Classifier': 'RandomForestClassifier', 'Best_Params': {'dim_reduction__k': 150, 'classifier__n_estimators': 50, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': 20}, 'CV_Mean_Accuracy': 0.8702537073432197

35 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/imblearn/pipeline.py", line 329, in fit
    Xt, yt = self._fit


Results:
{'Fingerprint': 'df_circular.csv', 'Feature_Selection': 'SMOTENC', 'Dim_Reduction': 'SelectKBest', 'Classifier': 'RandomForestClassifier', 'Best_Params': {'feature_selection__sampling_strategy': 1.0, 'dim_reduction__k': 150, 'classifier__n_estimators': 200, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 2, 'classifier__max_depth': 20}, 'CV_Mean_Accuracy': 0.872322672860461, 'CV_Mean_F1': nan, 'CV_Mean_Precision': nan, 'CV_Mean_Recall': nan, 'CV_Mean_ROC_AUC': nan, 'Val_Accuracy': 0.860128617363344, 'Val_F1': 0.8616852146263911, 'Val_Precision': 0.865814696485623, 'Val_Recall': 0.8575949367088608, 'Val_ROC_AUC': 0.8601700173740383}


40 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/imblearn/pipeline.py", line 329, in fit
    Xt, yt = self._fit


Results:
{'Fingerprint': 'df_circular.csv', 'Feature_Selection': 'RandomUnderSampler', 'Dim_Reduction': 'SelectKBest', 'Classifier': 'RandomForestClassifier', 'Best_Params': {'feature_selection__sampling_strategy': 1.0, 'dim_reduction__k': 150, 'classifier__n_estimators': 50, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': 20}, 'CV_Mean_Accuracy': 0.8702519206717885, 'CV_Mean_F1': nan, 'CV_Mean_Precision': nan, 'CV_Mean_Recall': nan, 'CV_Mean_ROC_AUC': nan, 'Val_Accuracy': 0.860128617363344, 'Val_F1': 0.8608, 'Val_Precision': 0.8705501618122977, 'Val_Recall': 0.8512658227848101, 'Val_ROC_AUC': 0.8602734342682221}





Results:
{'Fingerprint': 'df_circular.csv', 'Feature_Selection': 'None', 'Dim_Reduction': 'LDA', 'Classifier': 'RandomForestClassifier', 'Best_Params': {'classifier__n_estimators': 50, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 4, 'classifier__max_depth': None}, 'CV_Mean_Accuracy': 0.8253880054791256, 'CV_Mean_F1': 0.8243700535858641, 'CV_Mean_Precision': 0.8240452653083684, 'CV_Mean_Recall': 0.8249369307044854, 'CV_Mean_ROC_AUC': 0.8683729051529868, 'Val_Accuracy': 0.8408360128617364, 'Val_F1': 0.8445839874411303, 'Val_Precision': 0.838006230529595, 'Val_Recall': 0.8512658227848101, 'Val_ROC_AUC': 0.8406655911309671}


35 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/imblearn/pipeline.py", line 329, in fit
    Xt, yt = self._fit


Results:
{'Fingerprint': 'df_circular.csv', 'Feature_Selection': 'SMOTENC', 'Dim_Reduction': 'LDA', 'Classifier': 'RandomForestClassifier', 'Best_Params': {'feature_selection__sampling_strategy': 1.0, 'classifier__n_estimators': 200, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 4, 'classifier__max_depth': 10}, 'CV_Mean_Accuracy': 0.8215913286879877, 'CV_Mean_F1': nan, 'CV_Mean_Precision': nan, 'CV_Mean_Recall': nan, 'CV_Mean_ROC_AUC': nan, 'Val_Accuracy': 0.8392282958199357, 'Val_F1': 0.845679012345679, 'Val_Precision': 0.8253012048192772, 'Val_Recall': 0.8670886075949367, 'Val_ROC_AUC': 0.838773061967403}


45 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/imblearn/pipeline.py", line 329, in fit
    Xt, yt = self._fit


Results:
{'Fingerprint': 'df_circular.csv', 'Feature_Selection': 'RandomUnderSampler', 'Dim_Reduction': 'LDA', 'Classifier': 'RandomForestClassifier', 'Best_Params': {'feature_selection__sampling_strategy': 1.0, 'classifier__n_estimators': 50, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': 20}, 'CV_Mean_Accuracy': 0.8119284140313263, 'CV_Mean_F1': nan, 'CV_Mean_Precision': nan, 'CV_Mean_Recall': nan, 'CV_Mean_ROC_AUC': nan, 'Val_Accuracy': 0.8311897106109325, 'Val_F1': 0.8335974643423137, 'Val_Precision': 0.834920634920635, 'Val_Recall': 0.8322784810126582, 'Val_ROC_AUC': 0.8311719202448911}





Results:
{'Fingerprint': 'df_circular.csv', 'Feature_Selection': 'None', 'Dim_Reduction': 'FeatureAgglomeration', 'Classifier': 'RandomForestClassifier', 'Best_Params': {'dim_reduction__n_clusters': 150, 'classifier__n_estimators': 50, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': None}, 'CV_Mean_Accuracy': 0.8768084092668691, 'CV_Mean_F1': 0.8717577992220418, 'CV_Mean_Precision': 0.869946405498012, 'CV_Mean_Recall': 0.8738152888273953, 'CV_Mean_ROC_AUC': 0.9308802012368215, 'Val_Accuracy': 0.8681672025723473, 'Val_F1': 0.8714733542319749, 'Val_Precision': 0.8633540372670807, 'Val_Recall': 0.879746835443038, 'Val_ROC_AUC': 0.8679779928849177}


40 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/imblearn/pipeline.py", line 329, in fit
    Xt, yt = self._fit


Results:
{'Fingerprint': 'df_circular.csv', 'Feature_Selection': 'SMOTENC', 'Dim_Reduction': 'FeatureAgglomeration', 'Classifier': 'RandomForestClassifier', 'Best_Params': {'feature_selection__sampling_strategy': 1.0, 'dim_reduction__n_clusters': 50, 'classifier__n_estimators': 50, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': 20}, 'CV_Mean_Accuracy': 0.8726663093323804, 'CV_Mean_F1': nan, 'CV_Mean_Precision': nan, 'CV_Mean_Recall': nan, 'CV_Mean_ROC_AUC': nan, 'Val_Accuracy': 0.8569131832797428, 'Val_F1': 0.8589540412044374, 'Val_Precision': 0.8603174603174604, 'Val_Recall': 0.8575949367088608, 'Val_ROC_AUC': 0.8569020435178292}


40 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/samonose/PycharmProjects/IDO_inhibitors_ML_predictions/.venv/lib/python3.10/site-packages/imblearn/pipeline.py", line 329, in fit
    Xt, yt = self._fit


Results:
{'Fingerprint': 'df_circular.csv', 'Feature_Selection': 'RandomUnderSampler', 'Dim_Reduction': 'FeatureAgglomeration', 'Classifier': 'RandomForestClassifier', 'Best_Params': {'feature_selection__sampling_strategy': 1.0, 'dim_reduction__n_clusters': 150, 'classifier__n_estimators': 50, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': 20}, 'CV_Mean_Accuracy': 0.8792245845988923, 'CV_Mean_F1': nan, 'CV_Mean_Precision': nan, 'CV_Mean_Recall': nan, 'CV_Mean_ROC_AUC': nan, 'Val_Accuracy': 0.860128617363344, 'Val_F1': 0.8603531300160514, 'Val_Precision': 0.8729641693811075, 'Val_Recall': 0.8481012658227848, 'Val_ROC_AUC': 0.860325142715314}

Final Results:


In [4]:
results_df

Unnamed: 0,Fingerprint,Feature_Selection,Dim_Reduction,Classifier,Best_Params,CV_Mean_Accuracy,CV_Mean_F1,CV_Mean_Precision,CV_Mean_Recall,CV_Mean_ROC_AUC,Val_Accuracy,Val_F1,Val_Precision,Val_Recall,Val_ROC_AUC
0,df_circular.csv,,,RandomForestClassifier,"{'classifier__n_estimators': 100, 'classifier_...",0.88095,0.87697,0.878041,0.876199,0.934861,0.876206,0.878357,0.876972,0.879747,0.876148
1,df_circular.csv,SMOTENC,,RandomForestClassifier,"{'feature_selection__sampling_strategy': 1.0, ...",0.876464,,,,,0.864952,0.867508,0.86478,0.870253,0.864865
2,df_circular.csv,RandomUnderSampler,,RandomForestClassifier,"{'feature_selection__sampling_strategy': 1.0, ...",0.876464,,,,,0.874598,0.877358,0.871875,0.882911,0.874462
3,df_circular.csv,,SelectKBest,RandomForestClassifier,"{'dim_reduction__k': 150, 'classifier__n_estim...",0.870254,0.825374,0.839463,0.812841,0.895253,0.863344,0.865719,0.864353,0.867089,0.863283
4,df_circular.csv,SMOTENC,SelectKBest,RandomForestClassifier,"{'feature_selection__sampling_strategy': 1.0, ...",0.872323,,,,,0.860129,0.861685,0.865815,0.857595,0.86017
5,df_circular.csv,RandomUnderSampler,SelectKBest,RandomForestClassifier,"{'feature_selection__sampling_strategy': 1.0, ...",0.870252,,,,,0.860129,0.8608,0.87055,0.851266,0.860273
6,df_circular.csv,,LDA,RandomForestClassifier,"{'classifier__n_estimators': 50, 'classifier__...",0.825388,0.82437,0.824045,0.824937,0.868373,0.840836,0.844584,0.838006,0.851266,0.840666
7,df_circular.csv,SMOTENC,LDA,RandomForestClassifier,"{'feature_selection__sampling_strategy': 1.0, ...",0.821591,,,,,0.839228,0.845679,0.825301,0.867089,0.838773
8,df_circular.csv,RandomUnderSampler,LDA,RandomForestClassifier,"{'feature_selection__sampling_strategy': 1.0, ...",0.811928,,,,,0.83119,0.833597,0.834921,0.832278,0.831172
9,df_circular.csv,,FeatureAgglomeration,RandomForestClassifier,"{'dim_reduction__n_clusters': 150, 'classifier...",0.876808,0.871758,0.869946,0.873815,0.93088,0.868167,0.871473,0.863354,0.879747,0.867978


In [5]:
results_df.sort_values(by=['Val_Accuracy'], ascending=False, inplace = True)
results_df

Unnamed: 0,Fingerprint,Feature_Selection,Dim_Reduction,Classifier,Best_Params,CV_Mean_Accuracy,CV_Mean_F1,CV_Mean_Precision,CV_Mean_Recall,CV_Mean_ROC_AUC,Val_Accuracy,Val_F1,Val_Precision,Val_Recall,Val_ROC_AUC
0,df_circular.csv,,,RandomForestClassifier,"{'classifier__n_estimators': 100, 'classifier_...",0.88095,0.87697,0.878041,0.876199,0.934861,0.876206,0.878357,0.876972,0.879747,0.876148
2,df_circular.csv,RandomUnderSampler,,RandomForestClassifier,"{'feature_selection__sampling_strategy': 1.0, ...",0.876464,,,,,0.874598,0.877358,0.871875,0.882911,0.874462
9,df_circular.csv,,FeatureAgglomeration,RandomForestClassifier,"{'dim_reduction__n_clusters': 150, 'classifier...",0.876808,0.871758,0.869946,0.873815,0.93088,0.868167,0.871473,0.863354,0.879747,0.867978
1,df_circular.csv,SMOTENC,,RandomForestClassifier,"{'feature_selection__sampling_strategy': 1.0, ...",0.876464,,,,,0.864952,0.867508,0.86478,0.870253,0.864865
3,df_circular.csv,,SelectKBest,RandomForestClassifier,"{'dim_reduction__k': 150, 'classifier__n_estim...",0.870254,0.825374,0.839463,0.812841,0.895253,0.863344,0.865719,0.864353,0.867089,0.863283
4,df_circular.csv,SMOTENC,SelectKBest,RandomForestClassifier,"{'feature_selection__sampling_strategy': 1.0, ...",0.872323,,,,,0.860129,0.861685,0.865815,0.857595,0.86017
5,df_circular.csv,RandomUnderSampler,SelectKBest,RandomForestClassifier,"{'feature_selection__sampling_strategy': 1.0, ...",0.870252,,,,,0.860129,0.8608,0.87055,0.851266,0.860273
11,df_circular.csv,RandomUnderSampler,FeatureAgglomeration,RandomForestClassifier,"{'feature_selection__sampling_strategy': 1.0, ...",0.879225,,,,,0.860129,0.860353,0.872964,0.848101,0.860325
10,df_circular.csv,SMOTENC,FeatureAgglomeration,RandomForestClassifier,"{'feature_selection__sampling_strategy': 1.0, ...",0.872666,,,,,0.856913,0.858954,0.860317,0.857595,0.856902
6,df_circular.csv,,LDA,RandomForestClassifier,"{'classifier__n_estimators': 50, 'classifier__...",0.825388,0.82437,0.824045,0.824937,0.868373,0.840836,0.844584,0.838006,0.851266,0.840666


## Prikažemo najboljsi model (pipeline)

In [6]:
best_model_config = results_df.iloc[0]
best_model_config

Fingerprint                                            df_circular.csv
Feature_Selection                                                 None
Dim_Reduction                                                     None
Classifier                                      RandomForestClassifier
Best_Params          {'classifier__n_estimators': 100, 'classifier_...
CV_Mean_Accuracy                                               0.88095
CV_Mean_F1                                                     0.87697
CV_Mean_Precision                                             0.878041
CV_Mean_Recall                                                0.876199
CV_Mean_ROC_AUC                                               0.934861
Val_Accuracy                                                  0.876206
Val_F1                                                        0.878357
Val_Precision                                                 0.876972
Val_Recall                                                    0.879747
Val_RO

In [7]:
# Example usage of best_model_config values
best_fingerprint = best_model_config['Fingerprint']
best_clf_name = best_model_config['Classifier']
best_dr_name = best_model_config['Dim_Reduction']
best_fs_name = best_model_config['Feature_Selection']

print(f"Combination scoring highest accuracy: \n\
Best fingerprint: {best_fingerprint}\n\
Best Classification model: {best_clf_name} \n\
Best dimensionality reduction method: {best_dr_name}\n\
Best Feature Selection model: {best_fs_name}\n\
\n\
Validation accuracy: {best_model_config['Val_Accuracy']}\n\
Training set cross validation accuracy: {best_model_config['CV_Mean_Accuracy']}")

Combination scoring highest accuracy: 
Best fingerprint: df_circular.csv
Best Classification model: RandomForestClassifier 
Best dimensionality reduction method: None
Best Feature Selection model: None

Validation accuracy: 0.8762057877813505
Training set cross validation accuracy: 0.8809499136442142


In [8]:
best_model_save= results_df.iloc[0:1]

In [9]:
dir_path = dir1/model_SU
filename = 'top_pipeline.csv'
dir_path = os.path.join(PATH, filename)
best_model_save.to_csv(dir_path,index=False)

NameError: name 'PATH' is not defined

## Reproduciramo najboljši pipeline

In [None]:
df = df.copy()

In [None]:
df

In [None]:
#Best parameters found:  {'classifier__max_depth': None, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, make_scorer
from imblearn.pipeline import Pipeline as ImbPipeline
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, make_scorer
from imblearn.pipeline import Pipeline as ImbPipeline
import pandas as pd
import numpy as np

# Assuming 'df' is your initial DataFrame
# Calculate fingerprints to get X and y for the best fingerprint
df_or, X = calc_fingerprints(df, best_fingerprint)
y = df_or['Activity'].values.ravel()  # Ensure y is 1D

# Remove constant features
selector = VarianceThreshold()
X = pd.DataFrame(selector.fit_transform(X), columns=selector.get_feature_names_out())

# Remove collinear features
X = remove_collinear_features_simple(X, threshold=0.95)

# Split the data into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0, shuffle=True, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=15/85, random_state=0, shuffle=True, stratify=y_train)

# Rebuild the best pipeline for the selected fingerprint
steps = []
if best_fs_name != 'None':
    steps.append(('feature_selection', sampling_techniques[best_fs_name]))
if best_dr_name != 'None':
    steps.append(('dim_reduction', dim_reduction_methods[best_dr_name]))
if best_clf_name != 'None':
    steps.append(('Classifier', classifiers[best_clf_name]))

best_pipeline = ImbPipeline(steps)



# Define Stratified 5-Fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Define scorers
scorers = {
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score, average='macro'),
    'precision': make_scorer(precision_score, average='macro'),
    'recall': make_scorer(recall_score, average='macro'),
}

# Calculate cross-validated metrics for the training set using parallel processing
cv_results = {}
for metric_name, scorer in scorers.items():
    cv_score = cross_val_score(best_pipeline, X_train, y_train, cv=cv, scoring=scorer, n_jobs=-1).mean()
    cv_results[f'CV_{metric_name}'] = cv_score

# Since cross_val_score fits the model internally, there's no need to fit it beforehand for CV metrics.
# Fit the model once for validation and test set predictions.
best_pipeline.fit(X_train, y_train)

# Predict on the validation set and calculate metrics
y_val_pred = best_pipeline.predict(X_val)
val_metrics = {
    'Validation_accuracy': accuracy_score(y_val, y_val_pred),
    'Validation_f1': f1_score(y_val, y_val_pred, average='macro'),
    'Validation_precision': precision_score(y_val, y_val_pred, average='macro'),
    'Validation_recall': recall_score(y_val, y_val_pred, average='macro'),
}

# Predict on the test set and calculate metrics
y_test_pred = best_pipeline.predict(X_test)
test_metrics = {
    'Test_accuracy': accuracy_score(y_test, y_test_pred),
    'Test_f1': f1_score(y_test, y_test_pred, average='macro'),
    'Test_precision': precision_score(y_test, y_test_pred, average='macro'),
    'Test_recall': recall_score(y_test, y_test_pred, average='macro'),
}

# Merge all results
all_metrics = {**cv_results, **val_metrics, **test_metrics}

# Convert to DataFrame for display
results_df = pd.DataFrame(all_metrics, index=[0])

results_df

In [None]:
# Calculate and plot confusion matrix
cm = confusion_matrix(y_test, y_test_pred)

# Plot using seaborn for a nicer visualization
plt.figure(figsize=(7, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
if hasattr(best_pipeline, "predict_proba"):
    # Calculate probabilities for the positive class
    y_test_prob = best_pipeline.predict_proba(X_test)[:, 1]
    
    # Calculate ROC curve and AUC
    fpr, tpr, thresholds = roc_curve(y_test, y_test_prob)
    roc_auc = auc(fpr, tpr)
    
    # Plot ROC curve
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()