In [1]:
import ast
import json
import os
from collections import Counter

import pandas as pd
import numpy as np
import joblib
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib import rcParams
from imblearn.over_sampling import SMOTE, SVMSMOTE, ADASYN, RandomOverSampler,BorderlineSMOTE, KMeansSMOTE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, ComplementNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, StackingClassifier, VotingClassifier, AdaBoostClassifier

from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeClassifier, RidgeClassifierCV, SGDClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.dummy import DummyClassifier
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import roc_curve, auc, matthews_corrcoef

from sklearn.datasets import make_classification
from sklearn.preprocessing import label_binarize
from lightgbm import LGBMClassifier

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Use a serif typeface (Times New Roman) for all text drawn by matplotlib.
rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Times New Roman',
})

In [3]:
# Folder prefix for the input CSVs. The trailing slash matters: the file name
# is appended to it by plain string concatenation.
path = "smote/oversampled_datasets/"

# Target name -> CSV file name (relative to `path`).
dfs = dict(
    Printability='Printability_resampled_df.csv',
    Cell_Response='Cell_Response_resampled_df.csv',
    Scaffold_Quality='Scaffold_Quality_(PxC)_resampled_df.csv',
)

In [None]:
def _build_search_space():
    """Return the classifier search space used for every dataset.

    Returns:
        dict: display name -> {'model': unfitted estimator,
        'params': grid handed to GridSearchCV}.

    Notes:
        * Some grids contain combinations the estimator rejects at fit time
          (e.g. an 'l1' penalty with the 'lbfgs' solver). GridSearchCV's
          default ``error_score=np.nan`` scores those fits as NaN instead of
          aborting the search, so they are kept as in the original grid.
        * The estimators are only templates: GridSearchCV clones them, so one
          search space can be reused across datasets.
    """
    return {
        'BernoulliNB': {
            'model': BernoulliNB(),
            'params': {'alpha': [0.01, 0.1, 1]}
        },
        'DecisionTreeClassifier': {
            'model': DecisionTreeClassifier(random_state=42),
            'params': {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10],
                       'criterion': ['gini', 'entropy', 'log_loss'], 'splitter': ['best', 'random'],
                       'max_features': ['sqrt', 'log2']}
        },
        'ExtraTreeClassifier': {
            'model': ExtraTreeClassifier(random_state=42),
            'params': {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10],
                       'criterion': ['gini', 'entropy', 'log_loss'], 'splitter': ['best', 'random'],
                       'max_features': ['sqrt', 'log2']}
        },
        'ExtraTreesClassifier': {
            'model': ExtraTreesClassifier(random_state=42),
            'params': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20],
                       'criterion': ['gini', 'entropy', 'log_loss'], 'max_features': ['sqrt', 'log2'],
                       'class_weight': ['balanced', 'balanced_subsample']}
        },
        'GaussianNB': {
            'model': GaussianNB(),
            'params': {}
        },
        'KNeighborsClassifier': {
            'model': KNeighborsClassifier(),
            'params': {'n_neighbors': [3, 5, 7, 10], 'weights': ['uniform', 'distance'],
                       'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
        },
        'LinearDiscriminantAnalysis': {
            'model': LinearDiscriminantAnalysis(),
            'params': {'solver': ['svd', 'lsqr', 'eigen'], 'shrinkage': ['auto', None]}
        },
        'LogisticRegression': {
            'model': LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=42),
            'params': {'C': [0.01, 0.1, 1], 'penalty': ['l1', 'l2', 'elasticnet', None], 'class_weight': ['balanced', None]}
        },
        'MLPClassifier': {
            'model': MLPClassifier(random_state=42),
            'params': {'hidden_layer_sizes': [(50,), (100,)], 'activation': ['identity', 'logistic', 'tanh', 'relu'],
                       'solver': ['lbfgs', 'sgd', 'adam'], 'learning_rate': ['constant', 'invscaling', 'adaptive'],
                       'max_iter': [1000, 10000]}
        },
        'RandomForestClassifier': {
            'model': RandomForestClassifier(random_state=42),
            'params': {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 50, 80],
                       'criterion': ['gini', 'entropy', 'log_loss'], 'max_features': ['sqrt', 'log2', None],
                       'class_weight': ['balanced', 'balanced_subsample']}
        },
        'SVC': {
            'model': SVC(random_state=42, probability=True),
            'params': {'C': [0.01, 0.1, 1], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
                       'gamma': ['scale', 'auto']}
        },
        'GaussianProcessClassifier': {
            'model': GaussianProcessClassifier(random_state=42),
            'params': {'multi_class': ['one_vs_rest']}
        },
        'GradientBoostingClassifier': {
            'model': GradientBoostingClassifier(random_state=42),
            'params': {'n_estimators': [100, 200], 'learning_rate': [0.001, 0.01, 0.1], 'max_depth': [3, 5, 10],
                       'loss': ['log_loss', 'exponential'], 'criterion': ['friedman_mse', 'squared_error'],
                       'max_features': ['sqrt', 'log2', None]}
        },
        'AdaBoostClassifier': {
            'model': AdaBoostClassifier(random_state=42),
            'params': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1], 'algorithm': ['SAMME', 'SAMME.R']}
        },
        'BaggingClassifier': {
            'model': BaggingClassifier(random_state=42),
            'params': {'n_estimators': [10, 50, 100], 'max_samples': [0.5, 1.0], 'max_features': [0.5, 1.0]}
        },
        'ComplementNB': {
            'model': ComplementNB(),
            'params': {'alpha': [0.01, 0.1, 1]}
        },
        'DummyClassifier': {
            'model': DummyClassifier(),
            'params': {'strategy': ['most_frequent', 'prior', 'stratified', 'uniform', 'constant']}
        },
        'HistGradientBoostingClassifier': {
            'model': HistGradientBoostingClassifier(random_state=42),
            'params': {'max_iter': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [None, 10, 20],
                       'interaction_cst': ['pairwise', 'no_interactions']}
        },
        'LabelPropagation': {
            'model': LabelPropagation(),
            'params': {'gamma': [0.01, 0.1, 1], 'kernel': ['knn', 'rbf']}
        },
        'LabelSpreading': {
            'model': LabelSpreading(),
            'params': {'gamma': [0.01, 0.1, 1], 'kernel': ['knn', 'rbf'], 'n_neighbors': [7, 9, 15]}
        },
        'LogisticRegressionCV': {
            'model': LogisticRegressionCV(multi_class='multinomial', random_state=42),
            'params': {'Cs': [10, 20], 'cv': [3, 5], 'max_iter': [10000],
                       'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']}
        },
        'MultinomialNB': {
            'model': MultinomialNB(),
            'params': {'alpha': [0.01, 0.1, 1]}
        },
        'NuSVC': {
            'model': NuSVC(probability=True, random_state=42),
            # 'gamma' and 'class_weight' used to sit next to 'params' instead of
            # inside it, so they were silently never searched.
            'params': {'nu': [0.1, 0.5], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                       'gamma': ['scale', 'auto'], 'class_weight': [None, 'balanced']}
        },
        'QuadraticDiscriminantAnalysis': {
            'model': QuadraticDiscriminantAnalysis(),
            'params': {'reg_param': [0.0, 0.1, 0.5]}
        },
        'RadiusNeighborsClassifier': {
            'model': RadiusNeighborsClassifier(),
            'params': {
                'radius': [5, 10, 15],
                'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'outlier_label': [None, 'most_frequent']
            }
        },
        'XGBoost': {
            'model': XGBClassifier(random_state=42),
            'params': {
                'n_estimators': [100, 200], 'booster': ['gbtree', 'gblinear', 'dart'],
                'learning_rate': [0.001, 0.01, 0.1], 'importance_type': ['gain', 'weight', 'cover', 'total_gain', 'total_cover'],
                'max_depth': [3, 5, 9]
            }
        },
        'SGDClassifier': {
            'model': SGDClassifier(random_state=42, loss='log_loss'),
            # 'hinge' was already excluded by the original author. Several of
            # the remaining losses also expose no predict_proba; scoring copes
            # with that through _class_scores().
            'params': {'penalty': ['l2', 'l1', 'elasticnet', None], 'alpha': [0.0001, 0.001, 0.01], 'class_weight': ['balanced', None],
                       'loss': ['log_loss', 'modified_huber', 'squared_hinge', 'perceptron',
                                'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
                       'max_iter': [10000]
                       }
        },
        'CatBoost': {
            'model': CatBoostClassifier(verbose=0, random_state=42),
            'params': {
                'iterations': [100, 200],
                'learning_rate': [0.01, 0.1],
                'depth': [3, 5]
            }
        },
        'LightGBM': {
            'model': LGBMClassifier(random_state=42, verbose=-1),
            'params': {
                'n_estimators': [100, 200], 'boosting_type': ['gbdt', 'dart', 'rf'], 'class_weight': ['balanced', None],
                'learning_rate': [0.001, 0.01, 0.1],
                'max_depth': [3, 5, 9],
                'num_leaves': [31, 62]
            }
        },
    }


def _class_scores(estimator, X):
    """Return an (n_samples, n_classes) score matrix whose rows sum to 1.

    Uses ``predict_proba`` when the fitted estimator exposes it. Otherwise the
    ``decision_function`` margins are squashed with a sigmoid (binary) or a
    softmax (multiclass). The fallback values are NOT calibrated probabilities;
    they only preserve the ranking so that ROC/AUC can still be computed.

    Args:
        estimator: fitted classifier.
        X: feature matrix to score.

    Raises:
        AttributeError: if the estimator has neither ``predict_proba`` nor
            ``decision_function``.
    """
    if hasattr(estimator, 'predict_proba'):
        return estimator.predict_proba(X)

    margins = np.asarray(estimator.decision_function(X), dtype=float)
    if margins.ndim == 1:
        positive = 1.0 / (1.0 + np.exp(-margins))
        return np.column_stack([1.0 - positive, positive])
    # Subtract the row maximum before exponentiating for numerical stability.
    exps = np.exp(margins - margins.max(axis=1, keepdims=True))
    return exps / exps.sum(axis=1, keepdims=True)


RESULT_COLUMNS = ['Classifier', 'Best Params', 'Precision', 'Recall',
                  'Accuracy', 'F1 Score', 'AUC', 'Kappa', 'MCC']

# The search space does not depend on the dataset, so build it once.
param_grid = _build_search_space()

# For every dataset: encode labels, split, scale, grid-search each classifier,
# evaluate on the held-out split, plot ROC curves and persist the results.
# NOTE(review): the input files are named "*_resampled_df.csv"; if oversampling
# was applied before this split, synthetic samples derived from training rows
# may end up in the test split - confirm against the upstream notebook.
for item in dfs.keys():
    out_dir = f'models/{item}'
    os.makedirs(out_dir, exist_ok=True)

    df_path = path + dfs[item]
    df = pd.read_csv(df_path)

    # Last column is the target, everything before it is a feature.
    X = df.iloc[:, :-1]

    # Encode the labels as 0..n_classes-1 (sorted order of the original values).
    unique_values, y = np.unique(df.iloc[:, -1].to_numpy(), return_inverse=True)
    value_to_continuous = {original_value: new_value for new_value, original_value in enumerate(unique_values)}
    class_labels = np.arange(len(unique_values))
    n_classes = len(class_labels)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

    # Scaling: fit on the training split only, then apply to the test split.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Save the scaler
    joblib.dump(scaler, f'{out_dir}/scaler.pkl')

    # Cross-validation setup (shared by every classifier's grid search)
    cv = StratifiedKFold(n_splits=10)

    # Results table, also published as the global `<item>_result_df`. It is
    # filled in place, so partial results survive an interrupted run.
    result_df = pd.DataFrame(columns=RESULT_COLUMNS)
    globals()[f'{item}_result_df'] = result_df
    roc_data_list = []

    # One-hot truth for the micro-averaged ROC curve. Binarising against the
    # full label set keeps the columns aligned with the classifier's score
    # columns even if the test split happens to lack a class.
    y_test_binarized = label_binarize(y_test, classes=class_labels)

    # Figure for combined ROC Curves
    fig, ax = plt.subplots(figsize=(20, 16))

    for name, config in param_grid.items():
        print(f"*** Processing {name} ***")
        grid_search = GridSearchCV(config['model'], config['params'], cv=cv, scoring='accuracy', verbose=1)
        grid_search.fit(X_train_scaled, y_train)

        # Predictions and evaluations
        y_pred = grid_search.predict(X_test_scaled)
        y_proba = _class_scores(grid_search.best_estimator_, X_test_scaled)

        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')
        kappa = cohen_kappa_score(y_test, y_pred)
        mcc = matthews_corrcoef(y_test, y_pred)

        if n_classes == 2:
            # Binary targets: the metrics expect the positive-class score only.
            positive_scores = y_proba[:, 1]
            auc_score = roc_auc_score(y_test, positive_scores)
            fpr, tpr, _ = roc_curve(y_test, positive_scores)
        else:
            auc_score = roc_auc_score(y_test, y_proba, multi_class='ovr', labels=class_labels)
            # Micro-averaged ROC: every (sample, class) pair is one decision.
            fpr, tpr, _ = roc_curve(y_test_binarized.ravel(), y_proba.ravel())
        roc_auc = auc(fpr, tpr)

        # Update results DataFrame
        result_df.loc[len(result_df.index)] = [name, grid_search.best_params_, precision, recall,
                                               accuracy, f1, auc_score, kappa, mcc]

        ax.plot(fpr, tpr, lw=2, label=f'{name} (area = {roc_auc:.4f})')

        roc_data = {'Classifier': name, 'FPR': fpr.tolist(), 'TPR': tpr.tolist(), 'AUC': float(roc_auc)}
        roc_data_list.append(roc_data)

    # Finalizing the combined ROC curve plot
    ax.plot([0, 1], [0, 1], 'k--', lw=2)
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate', fontsize=16)
    ax.set_ylabel('True Positive Rate', fontsize=16)
    ax.set_title(f'{item} ROC Curves', fontsize=20, fontweight='bold')
    ax.tick_params(axis='both', labelsize=14)
    ax.legend(loc="lower right", fontsize=13)
    plt.show()
    plt.close(fig)  # one figure per dataset; release it once displayed

    # Convert ROC data list to DataFrame for future use
    roc_df = pd.DataFrame(roc_data_list)
    globals()[f'{item}_roc_df'] = roc_df
    roc_df.to_csv(f'{out_dir}/{item}_roc.csv')

    # Save the results DataFrame
    result_df.to_csv(f'{out_dir}/{item}_results.csv')

    # Save ROC data to JSON
    with open(f'{out_dir}/{item}_roc.json', 'w') as f:
        json.dump(roc_data_list, f)