In [1]:
from sklearn.datasets import load_iris, make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score
from itertools import product
from src.utils import load_sparse_adj_mat, load_design_matrix
import pandas as pd
import numpy as np
import pickle
import json

# Settings -- everything a reader might want to tune lives here.
train_data_name: str = 'swissprot'   # dataset key handed to load_sparse_adj_mat
embed_type: str = 'esm'              # not referenced in the visible cells
hpo_metric: str = 'f1_weighted'      # matches one of the scoring_metrics keys
do_save: bool = True                 # not referenced in the visible cells

# Presumably the fold counts for nested cross-validation -- TODO confirm.
# Note: the scratch CV cells further down define their own inner_k / outer_k.
inner_kfold: int = 5
outer_kfold: int = 5

# Candidate models as (name, estimator) pairs. Keeping each name next to its
# estimator prevents the two derived lists below from drifting out of sync.
# Entries appear to be positionally aligned with `parameter_grids`, so
# enable/disable the matching grid entry together with a pair here.
classifier_registry = [
    # ("svm", MultiOutputClassifier(SVC())),
    ("random_forest", RandomForestClassifier()),
    # ("naive_bayes", MultiOutputClassifier(GaussianNB())),
    # ("logistic_regression", MultiOutputClassifier(LogisticRegression())),
]

# Classifier names
names = [clf_name for clf_name, _estimator in classifier_registry]

# Classifier objects
classifiers = [_estimator for clf_name, _estimator in classifier_registry]

# Pre-build one SVC per hyper-parameter combination so a grid search can swap
# the whole `estimator` inside a MultiOutputClassifier wrapper.
svm_params = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1],
}

# Cartesian product of the value lists, in the dict's key order.
svm_param_combos = list(product(*svm_params.values()))

# Pair each combination back up with the parameter names and instantiate.
svm_estimators = [
    SVC(**dict(zip(svm_params, combo)))
    for combo in svm_param_combos
]

# Params for grid search (one grid per classifier, same order as `classifiers`)
parameter_grids = [
    # {'estimator' : svm_estimators},
    dict(n_estimators=[10, 100]),
    # {'estimator' : [GaussianNB()]},
    # {'estimator' : [LogisticRegression()]}
]

def _averaged(metric_fn, average, **extra_kwargs):
    """Return a two-argument scorer that calls `metric_fn` with a fixed averaging mode.

    Parameters
    ----------
    metric_fn : callable
        A metric accepting ``(y, y_pred, average=..., **extra_kwargs)``.
    average : str
        Value forwarded as the ``average`` keyword.
    **extra_kwargs
        Additional keywords forwarded unchanged (e.g. ``zero_division=0``).

    Returns
    -------
    callable
        ``scorer(y, y_pred)`` returning whatever ``metric_fn`` returns.
    """
    def scorer(y, y_pred):
        return metric_fn(y, y_pred, average=average, **extra_kwargs)
    return scorer


# Metrics to evaluate generalization error. Every value is callable as
# metric(y, y_pred).
# NOTE(review): the roc_auc entries receive `y_pred` as their second argument;
# confirm callers pass scores/probabilities there if that is what is intended.
scoring_metrics = {
    'accuracy': accuracy_score,
    'f1_weighted': _averaged(f1_score, 'weighted'),
    'recall_weighted': _averaged(recall_score, 'weighted'),
    'precision_weighted': _averaged(precision_score, 'weighted'),
    'roc_auc_weighted': _averaged(roc_auc_score, 'weighted'),
    'roc_auc_macro': _averaged(roc_auc_score, 'macro'),
    'f1_samples': _averaged(f1_score, 'samples', zero_division=0),
    'recall_samples': _averaged(recall_score, 'samples', zero_division=0),
    'precision_samples': _averaged(precision_score, 'samples', zero_division=0),
}

# Load dataset: the loader hands back the label matrix (sparse, since it
# exposes .toarray()) plus two lookup tables.
y, idx_sample, idx_feature = load_sparse_adj_mat(train_data_name)

# Reverse lookup of idx_sample (value -> key).
sample_idx = dict((sample, idx) for idx, sample in idx_sample.items())

# Densify so the cells below can work with a plain numpy array.
y = y.toarray()

Constructing swissprot sparse adjacency matrix
227361

In [5]:

# Count low sample number classes
# Column sums of the label matrix = number of positive samples per class.
sample_cts = np.array(y.sum(axis=0))

# For each support size 1..8, report how many classes have exactly that many samples.
for n_samples in range(1, 9):
    n_classes_with_support = np.count_nonzero(sample_cts == n_samples)
    print(n_samples, n_classes_with_support)

# Smallest per-class support, then the number of classes with fewer than 9 samples.
print(min(sample_cts))
print(np.count_nonzero(sample_cts < 9))

1 1360
2 636
3 419
4 291
5 228
6 169
7 120
8 113
1
3336


In [14]:
# Toy multilabel problem in which every label column is constant
# (column 0 is all ones, column 1 is all zeros), to see whether a
# random forest accepts single-class label columns.
toy_features = np.array([[1, 3], [3, 1]])
toy_labels = np.array([[1, 0], [1, 0]])

clf = RandomForestClassifier()
clf.fit(toy_features, toy_labels)

In [18]:
# Same toy problem with an SVC per label: each label column contains a single
# class, which SVC rejects. The failure is the point of this cell, so it is
# caught and displayed instead of aborting a Restart & Run All.
clf = MultiOutputClassifier(SVC())
try:
    clf.fit(np.array([[1,3], [3,1]]), np.array([[1,0], [1, 0]]))
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: The number of classes has to be greater than one; got 1 class

In [6]:
# Perform nested cross-validation w/ normal kfold and report, for every inner
# split, whether any class ends up with zero positive samples.
from sklearn.model_selection import KFold, StratifiedKFold

inner_k = 3
outer_k = 3
# Placeholder features: only the number of rows matters for generating indices.
X = np.zeros(shape=(y.shape[0], 1))

inner_cv = KFold(n_splits=inner_k, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=outer_k, shuffle=True, random_state=42)

for train_index, test_index in outer_cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    for inner_train_index, inner_test_index in inner_cv.split(X_train):
        y_inner_train, y_inner_test = y_train[inner_train_index], y_train[inner_test_index]
        # np.asarray(...) handles both a dense ndarray (1-D column sums) and a
        # sparse matrix (1 x n_classes np.matrix). The previous
        # `.tolist()[0]` only inspected the first class when y was dense.
        train_class_no_samples = np.any(np.asarray(y_inner_train.sum(axis=0)) == 0)
        test_class_no_samples = np.any(np.asarray(y_inner_test.sum(axis=0)) == 0)
        print(f"Train split missing class samples {train_class_no_samples}, Test split missing class samples {test_class_no_samples}")

: 

: 

: 

In [32]:
# Try w/ stratified kfold
#
# StratifiedKFold only accepts binary / multiclass targets, so passing the
# multilabel matrix `y` fails inside split(). That failure is the finding of
# this cell; it is caught and displayed so Restart & Run All can continue.

from sklearn.model_selection import KFold, StratifiedKFold

inner_k = 3
outer_k = 3
# Placeholder features: only the number of rows matters for generating indices.
X = np.zeros(shape=(y.shape[0], 1))

inner_cv = StratifiedKFold(n_splits=inner_k, shuffle=True, random_state=42)
outer_cv = StratifiedKFold(n_splits=outer_k, shuffle=True, random_state=42)

try:
    for train_index, test_index in outer_cv.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for inner_train_index, inner_test_index in inner_cv.split(X_train, y_train):
            X_inner_train, X_inner_test = X_train[inner_train_index], X_train[inner_test_index]
            y_inner_train, y_inner_test = y_train[inner_train_index], y_train[inner_test_index]
            # np.asarray(...) works for dense and sparse column sums alike; the
            # previous `.tolist()[0]` only inspected the first class when y was dense.
            train_class_no_samples = np.any(np.asarray(y_inner_train.sum(axis=0)) == 0)
            test_class_no_samples = np.any(np.asarray(y_inner_test.sum(axis=0)) == 0)
            print(f"Train split missing class samples {train_class_no_samples}, Test split missing class samples {test_class_no_samples}")
except (TypeError, ValueError) as err:
    # TypeError: y is still sparse; ValueError: y is a dense multilabel indicator.
    print(f"{type(err).__name__}: {err}")

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [1]:
# Inspect the container type of the label matrix `y`. Requires a cell that
# defines `y` to have run first; on a fresh kernel this raises NameError.
type(y)

NameError: name 'y' is not defined

In [4]:
# How dts and rfs predict?
#
# Fit a random forest and three stand-alone decision trees on a small synthetic
# multilabel problem, then try to combine the pre-fitted trees with a soft
# VotingClassifier. VotingClassifier does not recognise pre-fitted estimators,
# so predict() fails; that failure is displayed rather than left to abort the run.
#
# NOTE: this cell rebinds X and y to the synthetic data, replacing any dataset
# loaded earlier under the same names.

from sklearn.datasets import make_multilabel_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.exceptions import NotFittedError
from sklearn.tree import DecisionTreeClassifier


# Seeded so the synthetic data and the fitted models are reproducible.
X, y = make_multilabel_classification(n_samples=100, n_features=10, n_classes=5, random_state=42)

rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)
rf.predict(X)

# Distinct seeds per tree so the three trees are not forced to be identical.
dts = [(f"dt_{i}", DecisionTreeClassifier(random_state=i)) for i in range(3)]
for dt_name, dt in dts:
    dt.fit(X, y)

my_rf = VotingClassifier(estimators=dts, voting='soft')
try:
    my_rf.predict(X)
except NotFittedError as err:
    print(f"{type(err).__name__}: {err}")

NotFittedError: This VotingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [10]:
# Check whether the VotingClassifier instance exposes the
# `__sklearn_is_fitted__` hook (the recorded result was False).
hasattr(my_rf, "__sklearn_is_fitted__")

False

In [11]:
# Fitting the VotingClassifier on multilabel targets is not supported; the
# error is the finding here, so display it instead of aborting the run.
try:
    my_rf.fit(X, y)
except NotImplementedError as err:
    print(f"{type(err).__name__}: {err}")

NotImplementedError: Multilabel and multi-output classification is not supported.

In [16]:
# List every attribute name on the random forest `rf` (produces a long output).
dir(rf)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_compute_oob_predictions',
 '_estimator_type',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_oob_predictions',
 '_get_param_names',
 '_get_tags',
 '_make_estimator',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_set_oob_score_and_attributes',
 '_validate_X_predict',
 '_validate_data',
 '_validate