In [68]:
# Mount Google Drive at /content/drive so the project directory entered by the
# later `cd` cell is reachable. `google.colab` is only importable on Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [69]:
!pip install torch_geometric



In [70]:
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.1.0+cu121.html

Looking in links: https://data.pyg.org/whl/torch-2.1.0+cu121.html


In [71]:
cd /content/drive/MyDrive/Academic/Topics/AI/Machine\ Learning\ Dr.\ Montazeri/Project/ml_mda

/content/drive/MyDrive/Academic/Topics/AI/Machine Learning Dr. Montazeri/Project/ml_mda


# Requirements

In [72]:
import logging
import sys

# Configure the root logger for the notebook: records at INFO and above are
# written to stdout with a timestamp and level prefix. force=True replaces any
# handlers already attached to the root logger.
logging.basicConfig(
    force=True,
    handlers=[logging.StreamHandler(stream=sys.stdout)],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

In [73]:
# Notebook-level logger; it propagates to the root logger configured above.
logger = logging.getLogger(__name__)

In [74]:
import torch

# Pick the compute device name: 'cuda' when a CUDA device is usable, else 'cpu'.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [75]:
from src.optimization import MatrixFeatureBasedSklearnClassifierTrainer, MatrixFeatureBasedSklearnClassifierTester
from src.config import SklearnClassifierConfig
from src.models import JaccardSimilarityFeatureBasedSklearnClassifier, JaccardSimilarityFeatureBasedSklearnClassifierFactory
from src.data import MicrobeDiseaseAssociationData, MicrobeDiseaseAssociationTrainTestSpliter
from src.features import get_associations, get_entities
from src.utils import train_test_sampler
from base import cross_validation, OptimizerConfig


# Classification

## Data

In [76]:
# Load the association table that every later cell works from.
associations = get_associations()

# Draw row positions for the hold-out split; 0.7 is forwarded to
# train_test_sampler (628 training rows in the logged run).
train_indices, test_indices = train_test_sampler(len(associations), 0.7)

# Whole-dataset wrapper, used by the cross-validation section.
data = MicrobeDiseaseAssociationData(associations)

# Train / test wrappers built from the sampled row positions (train first).
train_data, test_data = (
    MicrobeDiseaseAssociationData(associations.iloc[row_positions])
    for row_positions in (train_indices, test_indices)
)

## Classifier

In [77]:
# Fetch the entity table once and reuse it; the original cell called
# get_entities() four times to build these two lists.
entities = get_entities()

# Select the 'id' column for each entity type with a single .loc[mask, column]
# lookup instead of chained indexing.
microbe_ids = entities.loc[entities['type'] == 'Microbe', 'id'].tolist()
disease_ids = entities.loc[entities['type'] == 'Disease', 'id'].tolist()

In [78]:
# Hyperparameters for the sklearn classifier, applied to a fresh config object
# in the order listed.
# NOTE(review): 'penalty' and 'C' are set although classifier is 'RF';
# presumably they are only read for other classifier types. Confirm.
# NOTE(review): with n_estimators=50000 a single fit takes roughly 8-10 minutes
# in the logged runs.
classifier_settings = {
    'classifier': 'RF',
    'random_state': 0,
    'penalty': 'l2',
    'C': 0.5,
    'n_estimators': 50000,
    'criterion': 'gini',
    'max_depth': None,
}

classifier_config = SklearnClassifierConfig()
for setting_name, setting_value in classifier_settings.items():
    setattr(classifier_config, setting_name, setting_value)

In [79]:
# Build the Jaccard-similarity feature classifier from the config above and the
# microbe / disease id lists.
mda_classifier = JaccardSimilarityFeatureBasedSklearnClassifier(
    classifier_config,
    microbe_ids=microbe_ids,
    disease_ids=disease_ids,
)

2024-02-12 07:32:26,314 [INFO] Initializing MatrixFeatureExtractor
2024-02-12 07:32:26,317 [INFO] Initializing SimilarityFeatureExtractor
2024-02-12 07:32:26,319 [INFO] Initializing MatrixFeatureBasedSklearnClassifier with model : None


## Optimizer

In [80]:
# Run configuration shared by the trainer, tester and cross-validation calls.
classifier_optimizer_config = OptimizerConfig()
# exp_name is echoed in the log lines ("Call Training with ...").
# NOTE(review): the label says "adam optimizer" but the configured classifier is
# 'RF'; consider a label that describes this experiment.
classifier_optimizer_config.exp_name = "adam optimizer"
# NOTE(review): presumably the probability cut-off used for ACC / F1. Confirm in base.
classifier_optimizer_config.threshold = 0.5

## Train Test Approach

### Train

In [81]:
# Fit the classifier on the training split; metrics on the training data are
# logged and returned as train_result.
train_result = MatrixFeatureBasedSklearnClassifierTrainer().train(
    model=mda_classifier,
    data=train_data,
    config=classifier_optimizer_config,
)

2024-02-12 07:32:26,346 [INFO] Call Training with adam optimizer
2024-02-12 07:32:26,358 [INFO] Calling build with associations :      disease  microbe  increased
206    63129    29241          1
427    43621    33867          1
889      200    32631          0
153    43621    23544          1
263    50863    41608          1
..       ...      ...        ...
517    50436    59432          0
19     13213    58532          1
364      654    54932          1
441    43621    31268          1
129    13213     7093          1

[628 rows x 3 columns]
2024-02-12 07:32:26,515 [INFO] interaction matrix with shape (5179, 5645) has built
2024-02-12 07:32:27,108 [INFO] mask matrix with shape (5179, 5645) has built. This matrix shows not non elements.
2024-02-12 07:32:28,575 [INFO] interaction has been imputed to delete nans
2024-02-12 07:32:28,577 [INFO] Building Jaccard similarity for diseases
2024-02-12 07:32:35,466 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0

  similarity_matrix = dot_product / union


2024-02-12 07:32:37,051 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 07:32:37,064 [INFO] Building Jaccard similarity for microbes
2024-02-12 07:32:43,776 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 07:32:43,802 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-12 07:32:43,913 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 07:32:44,276 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 07:32:44,279 [INFO] building data for classifier
2024-02-12 07:32:44,486 [INFO] shape of y : (628,), shape of X : (628, 10824)
2024-02-12 07:32:44,488 [INFO] fitting classifier : RF
2024-02-12 07:40:39,625 [INFO] Result on Train Data : {'AUC': 1.0, 'ACC': 1.0, 'F1 Score': 1.0, 'AUPR': 0, 'Loss':

### Test

In [82]:
# Evaluate the fitted classifier on the held-out split; metrics are logged and
# returned as test_result.
test_result = MatrixFeatureBasedSklearnClassifierTester().test(
    model=mda_classifier,
    data=test_data,
    config=classifier_optimizer_config,
)

2024-02-12 07:40:39,637 [INFO] Call Testing with adam optimizer
2024-02-12 07:40:46,204 [INFO] Result on Test Data : {'AUC': 0.9724768124691291, 'ACC': 0.9, 'F1 Score': 0.8993913799528008, 'AUPR': 0, 'Loss': 0}


In [83]:
# Display the held-out metrics as a dict (AUC, ACC, F1 Score, AUPR, Loss).
test_result.get_result()

{'AUC': 0.9724768124691291,
 'ACC': 0.9,
 'F1 Score': 0.8993913799528008,
 'AUPR': 0,
 'Loss': 0}

In [84]:
# Sanity check: sum the fitted microbe similarity matrix along axis 0 and show
# the first 100 entries, to see which entries are non-zero.
mda_classifier.fe.microbe_similarity_matrix.sum(axis=0)[:100]

array([  0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
        13.21785714,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
        35.35559163,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.  

## Cross Validation

In [85]:
# 5-fold cross validation over the full association set. The factory supplies
# the model, and the spliter, trainer and tester are handed to cross_validation.
# The logged run took about 47 minutes.
trainer = MatrixFeatureBasedSklearnClassifierTrainer()
tester = MatrixFeatureBasedSklearnClassifierTester()
factory = JaccardSimilarityFeatureBasedSklearnClassifierFactory(
    model_config=classifier_config,
    microbe_ids=microbe_ids,
    disease_ids=disease_ids,
)
spliter = MicrobeDiseaseAssociationTrainTestSpliter(data.associations)

# Bind the result to a name so it survives beyond Out[N]; previously the return
# value of this long run was only shown as a bare object repr and then dropped.
cv_result = cross_validation(
    k=5,
    data_size=data.associations.shape[0],
    train_test_spliter=spliter,
    model_factory=factory,
    trainer=trainer,
    tester=tester,
    config=classifier_optimizer_config,
)
# NOTE(review): the object is a base.evaluation.Result; if it exposes
# get_result() like test_result does, display that instead for readable metrics.
cv_result

2024-02-12 07:40:46,304 [INFO] Initializing SimilarityFeatureBasedSklearnClassifierFactory
2024-02-12 07:40:46,313 [INFO] Initializing MicrobeDiseaseAssociationTrainTestSpliter
2024-02-12 07:40:46,319 [INFO] Start 5-fold Cross Validation with config : adam optimizer
2024-02-12 07:40:46,322 [INFO] ---- Fold 1 ----
2024-02-12 07:40:46,331 [INFO] Initializing MatrixFeatureExtractor
2024-02-12 07:40:46,334 [INFO] Initializing SimilarityFeatureExtractor
2024-02-12 07:40:46,341 [INFO] Initializing MatrixFeatureBasedSklearnClassifier with model : None
2024-02-12 07:40:46,351 [INFO] Call Training with adam optimizer
2024-02-12 07:40:46,361 [INFO] Calling build with associations :      disease  microbe  increased
0      50863    33211          1
1      43621    40832          1
2      33293    47880          1
3      13213    53186          1
5      33293    35937          1
..       ...      ...        ...
893    64642    53920          0
894    25026    60601          0
895    25026    44316 

  similarity_matrix = dot_product / union


2024-02-12 07:40:58,794 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 07:40:58,816 [INFO] Building Jaccard similarity for microbes
2024-02-12 07:41:05,381 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 07:41:05,407 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-12 07:41:05,628 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 07:41:06,168 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 07:41:06,171 [INFO] building data for classifier
2024-02-12 07:41:06,473 [INFO] shape of y : (719,), shape of X : (719, 10824)
2024-02-12 07:41:06,475 [INFO] fitting classifier : RF
2024-02-12 07:50:18,594 [INFO] Result on Train Data : {'AUC': 1.0, 'ACC': 1.0, 'F1 Score': 1.0, 'AUPR': 0, 'Loss':

  similarity_matrix = dot_product / union


2024-02-12 07:50:31,934 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 07:50:31,937 [INFO] Building Jaccard similarity for microbes
2024-02-12 07:50:36,336 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 07:50:36,362 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-12 07:50:36,472 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 07:50:36,862 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 07:50:36,866 [INFO] building data for classifier
2024-02-12 07:50:37,098 [INFO] shape of y : (719,), shape of X : (719, 10824)
2024-02-12 07:50:37,100 [INFO] fitting classifier : RF
2024-02-12 08:00:42,799 [INFO] Result on Train Data : {'AUC': 0.9999999999999999, 'ACC': 1.0, 'F1 Score': 1.0, 'AU

  similarity_matrix = dot_product / union


2024-02-12 08:00:56,344 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:00:56,348 [INFO] Building Jaccard similarity for microbes
2024-02-12 08:01:02,238 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:01:02,263 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-12 08:01:02,374 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:01:02,736 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:01:02,739 [INFO] building data for classifier
2024-02-12 08:01:02,978 [INFO] shape of y : (719,), shape of X : (719, 10824)
2024-02-12 08:01:02,980 [INFO] fitting classifier : RF
2024-02-12 08:09:48,770 [INFO] Result on Train Data : {'AUC': 1.0, 'ACC': 1.0, 'F1 Score': 1.0, 'AUPR': 0, 'Loss':

  similarity_matrix = dot_product / union


2024-02-12 08:10:02,064 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:10:02,069 [INFO] Building Jaccard similarity for microbes
2024-02-12 08:10:06,671 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:10:06,697 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-12 08:10:06,806 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:10:07,181 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:10:07,185 [INFO] building data for classifier
2024-02-12 08:10:07,422 [INFO] shape of y : (719,), shape of X : (719, 10824)
2024-02-12 08:10:07,424 [INFO] fitting classifier : RF
2024-02-12 08:18:41,517 [INFO] Result on Train Data : {'AUC': 1.0, 'ACC': 1.0, 'F1 Score': 1.0, 'AUPR': 0, 'Loss':

  similarity_matrix = dot_product / union


2024-02-12 08:18:54,748 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:18:54,751 [INFO] Building Jaccard similarity for microbes
2024-02-12 08:18:59,051 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:18:59,078 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-12 08:18:59,188 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:18:59,574 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:18:59,579 [INFO] building data for classifier
2024-02-12 08:18:59,825 [INFO] shape of y : (716,), shape of X : (716, 10824)
2024-02-12 08:18:59,826 [INFO] fitting classifier : RF
2024-02-12 08:27:21,307 [INFO] Result on Train Data : {'AUC': 1.0, 'ACC': 1.0, 'F1 Score': 1.0, 'AUPR': 0, 'Loss':

<base.evaluation.Result at 0x78a1ad403be0>