In [32]:
from google.colab import drive
# Mount the user's Google Drive inside the Colab VM so the project sources
# and data stored on Drive become reachable through the local filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
!pip install torch_geometric



In [34]:
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.1.0+cu121.html

Looking in links: https://data.pyg.org/whl/torch-2.1.0+cu121.html


In [35]:
cd /content/drive/MyDrive/Academic/Topics/AI/Machine\ Learning\ Dr.\ Montazeri/Project/ml_mda

/content/drive/MyDrive/Academic/Topics/AI/Machine Learning Dr. Montazeri/Project/ml_mda


# Requirements

In [36]:
import logging
import sys

# Route log records of level INFO and above to stdout so they show up in the
# cell output. `force=True` drops any handlers already attached to the root
# logger, which keeps re-running this cell from stacking duplicate handlers.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=[stdout_handler],
    force=True,
)

In [37]:
# Logger named after the current module (`__name__`); it propagates to the
# root logger configured in the logging setup cell.
# NOTE(review): no later cell visible here uses `logger` — confirm it is needed.
logger = logging.getLogger(__name__)

In [38]:
import torch

# Pick the compute device name: the GPU when CUDA is usable, otherwise the CPU.
# NOTE(review): no later cell visible here reads `device` — confirm it is needed.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [39]:
from src.optimization import MatrixFeatureBasedSklearnClassifierTrainer, MatrixFeatureBasedSklearnClassifierTester
from src.config import SklearnClassifierConfig
from src.models import JaccardSimilarityFeatureBasedSklearnClassifier, JaccardSimilarityFeatureBasedSklearnClassifierFactory
from src.data import MicrobeDiseaseAssociationData, MicrobeDiseaseAssociationTrainTestSpliter
from src.features import get_associations, get_entities
from src.utils import train_test_sampler
from base import cross_validation, OptimizerConfig


# Classification

## Data

In [40]:
# Load every known association once; the full table feeds cross validation,
# while the sampled row positions feed the hold-out train/test experiment.
associations = get_associations()
n_associations = associations.shape[0]

# Draw row positions for the two partitions.
# NOTE(review): 0.7 is presumably the training fraction, and no seed is passed
# here, so the split may differ between runs — confirm against
# `train_test_sampler`.
train_indices, test_indices = train_test_sampler(n_associations, 0.7)

# Dataset over all associations.
data = MicrobeDiseaseAssociationData(associations)

# One dataset wrapper per partition, selected by integer position.
train_data, test_data = (
    MicrobeDiseaseAssociationData(associations.iloc[positions])
    for positions in (train_indices, test_indices)
)

## Classifier

In [41]:
# Load the entity table a single time and reuse it: the original called
# `get_entities()` four times to build these two lists.
entities = get_entities()

# Ids of all entities of each type, selected with one `.loc[rows, column]`
# lookup instead of chained indexing.
microbe_ids = entities.loc[entities['type'] == 'Microbe', 'id'].tolist()
disease_ids = entities.loc[entities['type'] == 'Disease', 'id'].tolist()

In [42]:
# Hyper-parameters for the scikit-learn estimator wrapped by the MDA classifier.
classifier_config = SklearnClassifierConfig()
# Estimator selected by name; the training log reports "fitting classifier : DecisionTree".
classifier_config.classifier = 'DecisionTree'
# Fixed seed for the estimator.
classifier_config.random_state = 0
# NOTE(review): `penalty`, `C` and `n_estimators` look like settings for other
# estimator families (linear models / ensembles); presumably they are ignored
# when `classifier` is 'DecisionTree' — confirm in SklearnClassifierConfig.
classifier_config.penalty = 'l2'
classifier_config.C = 0.5
classifier_config.n_estimators = 1000
# Split criterion and depth limit, presumably forwarded to the tree — confirm.
classifier_config.criterion = 'gini'
classifier_config.max_depth = 5

In [43]:
# Build the classifier used by the hold-out experiment, giving it the
# estimator configuration and the full microbe and disease id lists.
mda_classifier = JaccardSimilarityFeatureBasedSklearnClassifier(
    classifier_config,
    microbe_ids=microbe_ids,
    disease_ids=disease_ids,
)

2024-02-12 08:41:55,680 [INFO] Initializing MatrixFeatureExtractor
2024-02-12 08:41:55,682 [INFO] Initializing SimilarityFeatureExtractor
2024-02-12 08:41:55,684 [INFO] Initializing MatrixFeatureBasedSklearnClassifier with model : None


## Optimizer

In [44]:
# Run-level settings shared by the trainer, the tester and cross validation.
classifier_optimizer_config = OptimizerConfig()
# The experiment name is echoed verbatim in the trainer/tester log lines
# ("Call Training with adam optimizer").
# NOTE(review): the model fitted in this notebook is a scikit-learn
# DecisionTree, so "adam optimizer" looks like a leftover label — consider
# renaming it to describe the actual experiment.
classifier_optimizer_config.exp_name = "adam optimizer"
# NOTE(review): presumably the score cut-off used to turn predictions into
# class labels for ACC / F1 — confirm in OptimizerConfig.
classifier_optimizer_config.threshold = 0.5

## Train Test Approach

### Train

In [45]:
# Fit the classifier on the training partition; the returned object holds
# the metrics that the trainer logs as "Result on Train Data".
train_result = MatrixFeatureBasedSklearnClassifierTrainer().train(
    model=mda_classifier,
    data=train_data,
    config=classifier_optimizer_config,
)

2024-02-12 08:41:55,739 [INFO] Call Training with adam optimizer
2024-02-12 08:41:55,753 [INFO] Calling build with associations :      disease  microbe  increased
235    43621     1049          1
343    43621    65679          1
414    12403     7093          1
399    50436    57582          1
755    40873    64132          0
..       ...      ...        ...
147    43621    19630          1
602    64642    10559          0
122    50436    59776          1
26     44112    61711          1
618    37496    27509          0

[628 rows x 3 columns]
2024-02-12 08:41:55,796 [INFO] interaction matrix with shape (5179, 5645) has built
2024-02-12 08:41:56,208 [INFO] mask matrix with shape (5179, 5645) has built. This matrix shows not non elements.
2024-02-12 08:41:57,062 [INFO] interaction has been imputed to delete nans
2024-02-12 08:41:57,063 [INFO] Building Jaccard similarity for diseases
2024-02-12 08:42:02,183 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0

  similarity_matrix = dot_product / union


2024-02-12 08:42:02,593 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:02,597 [INFO] Building Jaccard similarity for microbes
2024-02-12 08:42:07,115 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:07,138 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-12 08:42:07,205 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:07,389 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:07,392 [INFO] building data for classifier
2024-02-12 08:42:07,539 [INFO] shape of y : (628,), shape of X : (628, 10824)
2024-02-12 08:42:07,540 [INFO] fitting classifier : DecisionTree
2024-02-12 08:42:07,742 [INFO] Result on Train Data : {'AUC': 0.9837512171372931, 'ACC': 0.9474522292993

### Test

In [46]:
# Evaluate the fitted classifier on the held-out partition; the returned
# object holds the metrics that the tester logs as "Result on Test Data".
test_result = MatrixFeatureBasedSklearnClassifierTester().test(
    model=mda_classifier,
    data=test_data,
    config=classifier_optimizer_config,
)

2024-02-12 08:42:07,750 [INFO] Call Testing with adam optimizer
2024-02-12 08:42:07,827 [INFO] Result on Test Data : {'AUC': 0.833049777728994, 'ACC': 0.8407407407407408, 'F1 Score': 0.8376927485985097, 'AUPR': 0, 'Loss': 0}


In [47]:
# Show the held-out metrics as the cell output; the recorded output is a dict
# with the keys AUC, ACC, F1 Score, AUPR and Loss.
# NOTE(review): AUPR and Loss are 0 in the recorded output — presumably not
# computed for this model; confirm in the tester.
test_result.get_result()

{'AUC': 0.833049777728994,
 'ACC': 0.8407407407407408,
 'F1 Score': 0.8376927485985097,
 'AUPR': 0,
 'Loss': 0}

In [48]:
# Sanity check on the fitted feature extractor: sums of the microbe similarity
# matrix along axis 0, truncated to the first 100 entries.
# In the recorded output most of these sums are exactly 0.
# NOTE(review): presumably most microbes have no non-zero similarity —
# confirm this sparsity is expected.
mda_classifier.fe.microbe_similarity_matrix.sum(axis=0)[:100]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        , 99.25277778,  0.        ,
        0.        ,  0.        , 16.47777778,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        , 37.23816739,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

## Cross Validation

In [49]:
# Collaborators for cross validation: a model is requested from the factory
# and handed to the trainer and tester on data produced by the splitter.
trainer = MatrixFeatureBasedSklearnClassifierTrainer()
tester = MatrixFeatureBasedSklearnClassifierTester()
factory = JaccardSimilarityFeatureBasedSklearnClassifierFactory(
    model_config=classifier_config,
    microbe_ids=microbe_ids,
    disease_ids=disease_ids,
)
spliter = MicrobeDiseaseAssociationTrainTestSpliter(data.associations)

# Keep the result in a variable: the original left the call as a bare
# expression, so the result was only echoed as an object repr and could not
# be inspected in later cells.
# NOTE(review): the recorded per-fold test AUCs (about 0.60, 0.55 and 0.40)
# are far below the hold-out test AUC (0.83) — worth investigating.
cv_result = cross_validation(
    k=5,
    data_size=data.associations.shape[0],
    train_test_spliter=spliter,
    model_factory=factory,
    trainer=trainer,
    tester=tester,
    config=classifier_optimizer_config,
)
# Displayed as the cell output, exactly as before.
# NOTE(review): presumably `cv_result.get_result()` yields the metrics dict,
# as it does for `test_result` — confirm before relying on it.
cv_result

2024-02-12 08:42:07,866 [INFO] Initializing SimilarityFeatureBasedSklearnClassifierFactory
2024-02-12 08:42:07,868 [INFO] Initializing MicrobeDiseaseAssociationTrainTestSpliter
2024-02-12 08:42:07,869 [INFO] Start 5-fold Cross Validation with config : adam optimizer
2024-02-12 08:42:07,871 [INFO] ---- Fold 1 ----
2024-02-12 08:42:07,873 [INFO] Initializing MatrixFeatureExtractor
2024-02-12 08:42:07,875 [INFO] Initializing SimilarityFeatureExtractor
2024-02-12 08:42:07,876 [INFO] Initializing MatrixFeatureBasedSklearnClassifier with model : None
2024-02-12 08:42:07,878 [INFO] Call Training with adam optimizer
2024-02-12 08:42:07,881 [INFO] Calling build with associations :      disease  microbe  increased
0      50863    33211          1
1      43621    40832          1
2      33293    47880          1
3      13213    53186          1
6      12403    26565          1
..       ...      ...        ...
889      200    32631          0
893    64642    53920          0
894    25026    60601 

  similarity_matrix = dot_product / union


2024-02-12 08:42:14,163 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:14,169 [INFO] Building Jaccard similarity for microbes
2024-02-12 08:42:19,214 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:19,240 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-12 08:42:19,315 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:19,574 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:19,578 [INFO] building data for classifier
2024-02-12 08:42:19,911 [INFO] shape of y : (719,), shape of X : (719, 10824)
2024-02-12 08:42:19,912 [INFO] fitting classifier : DecisionTree
2024-02-12 08:42:20,218 [INFO] Result on Train Data : {'AUC': 0.9837813578470062, 'ACC': 0.9694019471488

  similarity_matrix = dot_product / union


2024-02-12 08:42:30,343 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:30,361 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-12 08:42:30,440 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:30,640 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:30,642 [INFO] building data for classifier
2024-02-12 08:42:30,856 [INFO] shape of y : (719,), shape of X : (719, 10824)
2024-02-12 08:42:30,858 [INFO] fitting classifier : DecisionTree
2024-02-12 08:42:31,131 [INFO] Result on Train Data : {'AUC': 0.9779810193212781, 'ACC': 0.9318497913769124, 'F1 Score': 0.931678512212386, 'AUPR': 0, 'Loss': 0}
2024-02-12 08:42:31,133 [INFO] Call Testing with adam optimizer
2024-02-12 08:42:31,183 [INFO] Result on Test Data : {'AUC': 0.5977495599698265, 'ACC': 0.57

  similarity_matrix = dot_product / union


2024-02-12 08:42:36,745 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:36,749 [INFO] Building Jaccard similarity for microbes
2024-02-12 08:42:40,147 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:40,168 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-12 08:42:40,247 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:40,439 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:40,443 [INFO] building data for classifier
2024-02-12 08:42:40,616 [INFO] shape of y : (719,), shape of X : (719, 10824)
2024-02-12 08:42:40,617 [INFO] fitting classifier : DecisionTree
2024-02-12 08:42:40,882 [INFO] Result on Train Data : {'AUC': 0.9794178182887386, 'ACC': 0.9415855354659

  similarity_matrix = dot_product / union


2024-02-12 08:42:49,909 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:49,929 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-12 08:42:49,998 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:50,187 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:50,189 [INFO] building data for classifier
2024-02-12 08:42:50,371 [INFO] shape of y : (719,), shape of X : (719, 10824)
2024-02-12 08:42:50,372 [INFO] fitting classifier : DecisionTree
2024-02-12 08:42:50,640 [INFO] Result on Train Data : {'AUC': 0.9714596494487798, 'ACC': 0.9290681502086231, 'F1 Score': 0.9290593677270327, 'AUPR': 0, 'Loss': 0}
2024-02-12 08:42:50,641 [INFO] Call Testing with adam optimizer
2024-02-12 08:42:50,690 [INFO] Result on Test Data : {'AUC': 0.5537288989669942, 'ACC': 0.5

  similarity_matrix = dot_product / union


2024-02-12 08:42:59,088 [INFO] calculating dot product :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:59,110 [INFO] sum rows :
[0. 0. 0. 0. 0.]
2024-02-12 08:42:59,177 [INFO] union :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:59,412 [INFO] similarity matrix :
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
2024-02-12 08:42:59,415 [INFO] building data for classifier
2024-02-12 08:42:59,676 [INFO] shape of y : (716,), shape of X : (716, 10824)
2024-02-12 08:42:59,678 [INFO] fitting classifier : DecisionTree
2024-02-12 08:43:00,085 [INFO] Result on Train Data : {'AUC': 0.970941711734913, 'ACC': 0.9511173184357542, 'F1 Score': 0.9511095937179924, 'AUPR': 0, 'Loss': 0}
2024-02-12 08:43:00,087 [INFO] Call Testing with adam optimizer
2024-02-12 08:43:00,155 [INFO] Result on Test Data : {'AUC': 0.4038275193798449, 'ACC': 0.52

<base.evaluation.Result at 0x7f6dec0d7c40>