In [70]:
from google.colab import drive

# Attach Google Drive inside the Colab VM so the project files stored there become reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [71]:
!pip install torch_geometric



In [72]:
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.1.0+cu121.html

Looking in links: https://data.pyg.org/whl/torch-2.1.0+cu121.html


In [73]:
cd /content/drive/MyDrive/Me\ --\ Education/Topics/AI/Machine Learning Dr. Montazeri/Project/ml_mda

/content/drive/MyDrive/Me -- Education/Topics/AI/Machine Learning Dr. Montazeri/Project/ml_mda


# Requirements

In [74]:
import logging
import sys

# Send INFO-and-above records to stdout so they show up in the cell output.
# force=True replaces any handlers that were already attached to the root logger.
LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
stdout_handler = logging.StreamHandler(stream=sys.stdout)

logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, handlers=[stdout_handler], force=True)

In [75]:
# Logger named after this module (`__name__`); it is not referenced again in the cells shown here.
logger = logging.getLogger(__name__)

In [76]:
import torch

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [77]:
from src.optimization import MatrixFeatureBasedSklearnClassifierTrainer, MatrixFeatureBasedSklearnClassifierTester
from src.config import SklearnClassifierConfig
from src.models import JaccardSimilarityFeatureBasedSklearnClassifier, JaccardSimilarityFeatureBasedSklearnClassifierFactory
from src.data import MicrobeDiseaseAssociationData, MicrobeDiseaseAssociationTrainTestSpliter
from src.features import get_associations, get_entities
from src.utils import train_test_sampler
from base import cross_validation, OptimizerConfig


# Classification

## Data

In [78]:
# Load the association table and split its row positions into a train part and a test part.
associations = get_associations()

# 0.7 is the ratio handed to train_test_sampler; in the recorded run it put 628 rows into the
# training split.
n_associations = associations.shape[0]
train_indices, test_indices = train_test_sampler(n_associations, 0.7)

# `data` wraps the full table (used by the cross-validation section); `train_data` and `test_data`
# wrap only the rows selected for each split.
data, train_data, test_data = (
    MicrobeDiseaseAssociationData(frame)
    for frame in (
        associations,
        associations.iloc[train_indices],
        associations.iloc[test_indices],
    )
)

## Classifier

In [79]:
# Load the entity table once and reuse it for both lookups instead of re-fetching it per expression.
entities = get_entities()

# Select the `id` column of the matching rows in a single .loc call (row mask, column label) rather
# than chaining a second indexing step onto the filtered frame.
microbe_ids = entities.loc[entities['type'] == 'Microbe', 'id'].tolist()
disease_ids = entities.loc[entities['type'] == 'Disease', 'id'].tolist()

In [80]:
# Hyper-parameters for the sklearn-backed classifier, applied to the config object in the order
# listed.
# NOTE(review): n_estimators / criterion / max_depth look like tree-ensemble settings and are
# presumably ignored when classifier == 'Logistic' — confirm against SklearnClassifierConfig's consumer.
classifier_settings = {
    'classifier': 'Logistic',
    'random_state': 0,
    'penalty': 'l1',
    'C': 0.1,
    'n_estimators': 1000,
    'criterion': 'gini',
    'max_depth': None,
    'solver': 'saga',
}

classifier_config = SklearnClassifierConfig()
for setting_name, setting_value in classifier_settings.items():
    setattr(classifier_config, setting_name, setting_value)

In [81]:
# Build the classifier that is trained and evaluated in the "Train Test Approach" section; it is
# given the configuration plus the full microbe and disease id lists.
mda_classifier = JaccardSimilarityFeatureBasedSklearnClassifier(
    classifier_config,
    microbe_ids=microbe_ids,
    disease_ids=disease_ids,
)

2024-02-16 09:18:50,101 [INFO] Initializing MatrixFeatureExtractor
2024-02-16 09:18:50,107 [INFO] Initializing SimilarityFeatureExtractor
2024-02-16 09:18:50,110 [INFO] Initializing MatrixFeatureBasedSklearnClassifier with model : None


## Optimizer

In [82]:
# Run-level settings shared by the trainer, the tester and cross-validation.
# NOTE(review): exp_name appears verbatim in the log lines ("Call Training with adam optimizer");
# the label looks unrelated to the sklearn 'saga' solver configured for the classifier — confirm
# whether it should be renamed.
# NOTE(review): threshold is presumably the probability cut-off used for ACC / F1 — confirm.
classifier_optimizer_config = OptimizerConfig()
for option_name, option_value in (("exp_name", "adam optimizer"), ("threshold", 0.5)):
    setattr(classifier_optimizer_config, option_name, option_value)

## Train Test Approach

### Train

In [83]:
# Fit the classifier on the training split; the trainer logs its progress and the train-set metrics.
sklearn_trainer = MatrixFeatureBasedSklearnClassifierTrainer()
train_result = sklearn_trainer.train(
    model=mda_classifier,
    data=train_data,
    config=classifier_optimizer_config,
)

2024-02-16 09:18:50,154 [INFO] Call Training with adam optimizer
2024-02-16 09:18:50,160 [INFO] Calling build with associations :      disease  microbe  increased
328    61336    54894          1
244    33293    54894          1
105    12403    15620          1
629    25026     5846          0
148    13213    51508          1
..       ...      ...        ...
421    43621    20153          1
498    54370    55403          0
738    64642    54235          0
9      43621    14120          1
479    63129    46117          0

[628 rows x 3 columns]
2024-02-16 09:18:50,411 [INFO] interaction matrix with shape (5179, 5645) has built
2024-02-16 09:18:51,357 [INFO] mask matrix with shape (5179, 5645) has built. This matrix shows not non elements.
2024-02-16 09:18:53,845 [INFO] interaction has been imputed to delete nans
2024-02-16 09:18:53,847 [INFO] Building Jaccard similarity for diseases
2024-02-16 09:19:01,617 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0

  similarity_matrix = dot_product / union


2024-02-16 09:19:02,685 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:19:02,689 [INFO] Building Jaccard similarity for microbes
2024-02-16 09:19:10,399 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:19:10,431 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-16 09:19:10,561 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:19:11,155 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:19:11,159 [INFO] building data for classifier
2024-02-16 09:19:11,448 [INFO] shape of y : (628,), shape of X : (628, 10824)
2024-02-16 09:19:11,449 [INFO] fitting classifier : Logistic




2024-02-16 09:19:24,474 [INFO] Result on Train Data : {'AUC': 0.9047626301091647, 'ACC': 0.7770700636942676, 'F1 Score': 0.7705325357303183, 'AUPR': 0, 'Loss': 0}


### Test

In [84]:
# Score the already-fitted classifier on the held-out split; the tester logs the test-set metrics.
sklearn_tester = MatrixFeatureBasedSklearnClassifierTester()
test_result = sklearn_tester.test(
    model=mda_classifier,
    data=test_data,
    config=classifier_optimizer_config,
)

2024-02-16 09:19:24,490 [INFO] Call Testing with adam optimizer
2024-02-16 09:19:24,669 [INFO] Result on Test Data : {'AUC': 0.9081655172413793, 'ACC': 0.7925925925925926, 'F1 Score': 0.7757475083056479, 'AUPR': 0, 'Loss': 0}


In [85]:
# Display the held-out metrics as a plain dict (rendered as this cell's output).
# NOTE(review): 'AUPR' and 'Loss' are reported as 0 in the recorded output — presumably they are
# placeholders that this tester does not compute; confirm before quoting them.
test_result.get_result()

{'AUC': 0.9081655172413793,
 'ACC': 0.7925925925925926,
 'F1 Score': 0.7757475083056479,
 'AUPR': 0,
 'Loss': 0}

In [86]:
# Sanity check on the fitted feature extractor: per-column totals of the microbe similarity matrix,
# showing only the first 100 entries (most of them are 0 in the recorded run).
microbe_similarity_column_sums = mda_classifier.fe.microbe_similarity_matrix.sum(axis=0)
microbe_similarity_column_sums[:100]

array([  0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
       112.80952381,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
        31.91720779,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.  

## Cross Validation

In [87]:
# 5-fold cross-validation over the full association set. A factory is passed instead of a model
# instance; the recorded log shows a fresh classifier being initialised at the start of a fold.
trainer = MatrixFeatureBasedSklearnClassifierTrainer()
tester = MatrixFeatureBasedSklearnClassifierTester()
factory = JaccardSimilarityFeatureBasedSklearnClassifierFactory(model_config=classifier_config, microbe_ids=microbe_ids, disease_ids=disease_ids)
spliter = MicrobeDiseaseAssociationTrainTestSpliter(data.associations)

# Bind the returned Result to a name so it stays available to later cells instead of only being
# echoed once as an object repr.
# NOTE(review): the Result presumably exposes get_result() like the tester's return value — confirm,
# then display that dict here instead of the bare object.
cv_result = cross_validation(k=5, data_size=data.associations.shape[0], train_test_spliter=spliter, model_factory=factory,
                             trainer=trainer, tester=tester, config=classifier_optimizer_config)
cv_result

2024-02-16 09:19:24,770 [INFO] Initializing SimilarityFeatureBasedSklearnClassifierFactory
2024-02-16 09:19:24,773 [INFO] Initializing MicrobeDiseaseAssociationTrainTestSpliter
2024-02-16 09:19:24,777 [INFO] Start 5-fold Cross Validation with config : adam optimizer
2024-02-16 09:19:24,781 [INFO] ---- Fold 1 ----
2024-02-16 09:19:24,786 [INFO] Initializing MatrixFeatureExtractor
2024-02-16 09:19:24,788 [INFO] Initializing SimilarityFeatureExtractor
2024-02-16 09:19:24,790 [INFO] Initializing MatrixFeatureBasedSklearnClassifier with model : None
2024-02-16 09:19:24,792 [INFO] Call Training with adam optimizer
2024-02-16 09:19:24,799 [INFO] Calling build with associations :      disease  microbe  increased
0      50863    33211          1
1      43621    40832          1
3      13213    53186          1
4      33293    14909          1
5      33293    35937          1
..       ...      ...        ...
890      654    61681          0
892    22068    20153          0
893    64642    53920 

  similarity_matrix = dot_product / union


2024-02-16 09:19:36,633 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:19:36,642 [INFO] Building Jaccard similarity for microbes
2024-02-16 09:19:44,031 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:19:44,057 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-16 09:19:44,183 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:19:44,651 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:19:44,654 [INFO] building data for classifier
2024-02-16 09:19:45,001 [INFO] shape of y : (719,), shape of X : (719, 10824)
2024-02-16 09:19:45,002 [INFO] fitting classifier : Logistic
2024-02-16 09:19:54,908 [INFO] Result on Train Data : {'AUC': 0.8823372382735728, 'ACC': 0.7830319888734353,

  similarity_matrix = dot_product / union


2024-02-16 09:20:04,366 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:20:04,373 [INFO] Building Jaccard similarity for microbes
2024-02-16 09:20:09,565 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:20:09,592 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-16 09:20:09,721 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:20:10,116 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:20:10,122 [INFO] building data for classifier
2024-02-16 09:20:10,411 [INFO] shape of y : (719,), shape of X : (719, 10824)
2024-02-16 09:20:10,413 [INFO] fitting classifier : Logistic
2024-02-16 09:20:23,391 [INFO] Result on Train Data : {'AUC': 0.9044243953016914, 'ACC': 0.7885952712100139,

  similarity_matrix = dot_product / union


2024-02-16 09:20:32,540 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:20:32,549 [INFO] Building Jaccard similarity for microbes
2024-02-16 09:20:37,653 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:20:37,681 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-16 09:20:37,807 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:20:38,191 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:20:38,196 [INFO] building data for classifier
2024-02-16 09:20:38,474 [INFO] shape of y : (719,), shape of X : (719, 10824)
2024-02-16 09:20:38,477 [INFO] fitting classifier : Logistic
2024-02-16 09:20:47,152 [INFO] Result on Train Data : {'AUC': 0.8987959824813903, 'ACC': 0.7913769123783032,

  similarity_matrix = dot_product / union


2024-02-16 09:20:55,778 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:20:55,781 [INFO] Building Jaccard similarity for microbes
2024-02-16 09:21:02,584 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:21:02,613 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-16 09:21:02,738 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:21:03,113 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:21:03,118 [INFO] building data for classifier
2024-02-16 09:21:03,438 [INFO] shape of y : (719,), shape of X : (719, 10824)
2024-02-16 09:21:03,440 [INFO] fitting classifier : Logistic
2024-02-16 09:21:11,261 [INFO] Result on Train Data : {'AUC': 0.8882118539151965, 'ACC': 0.7941585535465925,

  similarity_matrix = dot_product / union


2024-02-16 09:21:22,636 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:21:22,639 [INFO] Building Jaccard similarity for microbes
2024-02-16 09:21:29,087 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:21:29,123 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-16 09:21:29,270 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:21:29,745 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-16 09:21:29,750 [INFO] building data for classifier
2024-02-16 09:21:30,111 [INFO] shape of y : (716,), shape of X : (716, 10824)
2024-02-16 09:21:30,113 [INFO] fitting classifier : Logistic
2024-02-16 09:21:43,189 [INFO] Result on Train Data : {'AUC': 0.9086847957551403, 'ACC': 0.7960893854748603,

<base.evaluation.Result at 0x795b9ce440a0>

## Coefficients

In [88]:
# Count how many fitted coefficients are non-zero (107 in the recorded run, against the 10824
# feature columns reported in the training log).
nonzero_coefficient_count = (mda_classifier.classifier.coef_ != 0.0).sum()
nonzero_coefficient_count

107