In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import pygraphviz
from pandas import read_csv
import seaborn as sns
import lightgbm as lgb

from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier

from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
import os
import re



# refactor ignore

In [2]:
from data.utils.HGTDB.ml_data_loader import ml_load_species

In [3]:
# Load the E. coli features (X) and labels (y) for every HGTDB data type, A through E.
X_ecoli_A, y_ecoli_A = ml_load_species('ecoli', 'A')
X_ecoli_B, y_ecoli_B = ml_load_species('ecoli', 'B')
X_ecoli_C, y_ecoli_C = ml_load_species('ecoli', 'C')
X_ecoli_D, y_ecoli_D = ml_load_species('ecoli', 'D')
X_ecoli_E, y_ecoli_E = ml_load_species('ecoli', 'E')

# Report the shape of each feature matrix, one per line, in A-E order.
for ecoli_features in (X_ecoli_A, X_ecoli_B, X_ecoli_C, X_ecoli_D, X_ecoli_E):
    print(ecoli_features.shape)

(4276, 13)
(4276, 12)
(4276, 10)
(4276, 4)
(4276, 8)


In [25]:
# Inspect the first row of the type-A E. coli feature matrix.
X_ecoli_A[0]

array([0, 63, nan, 22.7, -5.9, 68.2, 6.3, 63.6, 1.0, 51.5, 0.0, 422.6, 1],
      dtype=object)

In [35]:
# Display the type-A feature matrix; it is an object-dtype array that mixes
# numeric columns with a string-coded column containing NaN for missing values.
X_ecoli_A

array([[0, 63, nan, ..., 0.0, 422.6, 1],
       [0, 2460, 'E', ..., 0.4, 4.7, 1],
       [0, 930, 'E', ..., 1.0, 9.0, 1],
       ...,
       [1, 714, 'T', ..., -0.1, 11.5, 1],
       [0, 138, nan, ..., -1.8, 92.5, 1],
       [0, 684, 'J', ..., -0.3, 17.9, 1]], dtype=object)

In [4]:
from trainer.ml_trainer import ml_trainer

In [5]:
# Baseline model: LightGBM classifier with default hyperparameters.
ecoli_B_lgbm = LGBMClassifier()
# evaluate the model
# Stratified 10-fold cross-validation repeated 3 times; seeded so the folds are reproducible.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Train/evaluate on the type-B E. coli data, scoring with average precision.
# NOTE(review): the meaning of the trailing `False` flag is defined in trainer.ml_trainer — confirm before changing it.
ml_trainer(ecoli_B_lgbm,'ecoli','B',cv,'average_precision',False)

**** Training on ecoli with type B ****

average_precision: 0.611 (0.065)
[[776  29]
 [ 23  28]]
              precision    recall  f1-score   support

         0.0       0.97      0.96      0.97       805
         1.0       0.49      0.55      0.52        51

    accuracy                           0.94       856
   macro avg       0.73      0.76      0.74       856
weighted avg       0.94      0.94      0.94       856



In [6]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [7]:
from data.utils.HGTDB.dl_data_loader import SingleSpecies

In [8]:
# Build the dataset for one species (bsub) using data type B.
test_single = SingleSpecies(species="bsub",data_type="B")

# Spot-check one sample: a dict with a 'gc_signature' feature vector and an 'hgt' label.
test_single[500]

{'gc_signature': array([ 0.000e+00,  1.173e+03,  3.980e+01, -2.200e+00,  2.450e+01,
        -2.200e+00,  2.810e+01, -2.100e+00,  3.080e+01, -3.000e+00,
         1.490e+01,  1.000e+00]),
 'hgt': 1.0}

In [9]:
from data.utils.HGTDB.dl_data_loader import AllSpecies

In [10]:
# Build the AllSpecies dataset for data type B.
test_all = AllSpecies(data_type="B")

# Spot-check one sample: here 'gc_signature' and 'hgt' come back as float64 tensors.
test_all[500]

{'gc_signature': tensor([  0.0000, 387.0000,  56.2000,   0.8000,  38.5000,   0.9000,  26.2000,
          -1.2000,  40.3000,   0.0000,  70.8000,   1.0000], dtype=torch.float64),
 'hgt': tensor(0., dtype=torch.float64)}

In [11]:
class Feedforward(torch.nn.Module):
    """Small fully connected binary classifier working in float64.

    Layer stack: Linear(input_size, 100) -> ReLU -> Linear(100, 1) -> Sigmoid.
    Both linear layers are cast to double precision, so inputs must be
    float64 tensors.
    """

    def __init__(self, input_size):
        """Create the layers.

        Args:
            input_size: number of features in each input row.
        """
        super().__init__()
        self.input_size = input_size
        hidden_units = 100
        self.fc1 = torch.nn.Linear(input_size, hidden_units).double()
        self.relu1 = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_units, 1).double()
        self.sigmoid1 = torch.nn.Sigmoid()

    def forward(self, x):
        """Run the network and return the sigmoid activation for each row.

        Args:
            x: float64 tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of 1.
        """
        hidden = self.relu1(self.fc1(x))
        return self.sigmoid1(self.fc2(hidden))

In [14]:
from trainer.dl_trainer import DLClassifier

In [15]:
# Hold-out train and test partitions of the type-B AllSpecies dataset, without normalization.
b_holdout_train_set = AllSpecies('B',normalize=False, partition_type='holdout', partition='train')
b_holdout_test_set = AllSpecies('B',normalize=False, partition_type='holdout', partition='test')
# Feedforward(12): 12 inputs, matching the 12-element type-B 'gc_signature' vectors.
# BCELoss pairs with the network's sigmoid output. Adam is passed as a class along with 0.001
# (presumably the learning rate — confirm in trainer.dl_trainer).
bacteria_classifier_B = DLClassifier(Feedforward(12), torch.nn.BCELoss(), torch.optim.Adam, 0.001)

# NOTE(review): 16 and 10 look like batch size and epoch count (the log shows 10 epochs) — confirm.
# The test partition created above is not used in this cell.
bacteria_classifier_B.train(b_holdout_train_set, 16, 10)

EPOCH 1:
LOSS train 0.39401663493158723
EPOCH 2:
LOSS train 0.3002877629044
EPOCH 3:
LOSS train 0.2577981494156747
EPOCH 4:
LOSS train 0.2271811320152321
EPOCH 5:
LOSS train 0.2179799544590502
EPOCH 6:
LOSS train 0.2117221989823492
EPOCH 7:
LOSS train 0.20662721559992359
EPOCH 8:
LOSS train 0.19963526992741173
EPOCH 9:
LOSS train 0.19359634094590297
EPOCH 10:
LOSS train 0.1906631274738119


In [1]:
from data.utils.NCBI.data_loader import NCBIDataLoader

In [2]:
# Load genome data for NCBI assembly 'ASM82905v1'; the loader prints the FTP URL it resolved.
yakult = NCBIDataLoader('ASM82905v1')

found 1 ids
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/829/055/GCF_000829055.1_ASM82905v1/GCF_000829055.1_ASM82905v1_cds_from_genomic.fna.gz


In [4]:
# Print the loader's `complete_sequence` attribute (a very long nucleotide string).
print(yakult.complete_sequence)

ATGCCCAATTTAGAGGAGCTTTGGGCTTACCTGAATGATAAATTCCGTGAAGAGTTGACCCCAGTCGGCTACAGCACATGGATTCAAACAGCCAAACCCGTTAAATTGACCAAAGATAAACTCGAAATCGAAGTCCCGGCATCGTTGCATAAGGCTTACTGGGAGAAAAATCTGGTCACCAAAGTCGTGGAAGGGGTCTATGAATTTGCCCAGCTGGAAGTCGATCCGGTGATCATGACCAAAGACGAGTTACAGCCGGTCACGACGCACCAGCAACCAGCGACTGCCGATGATGATGATCAACAACTAACTTTTAAGGCGAAAACGCATCTCAATCCGAAATACACGTTTGACCGGTTCGTGATCGGCAAAGGCAACCAAATGGCGCATGCCGCGACGTTAGCGGTTGCCGAAGCTCCCGGCACGACGTATAATCCGCTGTTTATTTATGGTGGCGTCGGTTTGGGCAAGACGCACTTGATGCAGGCTATCGGTAACCTGGTTTTGGAAAATAATCCAGCCGCTAACATTAAATATGTCACCAGCGAGAATTTTGCCAACGACTTCATTAACTCGATTCAAACCAAGCAGCAGGAGCAATTTCGTCAGGAGTATCGCAATGTTGACCTGCTGTTGGTTGATGATATCCAGTTTTTTGGTGACAAAGAAGCCACGCAGGAAGAATTCTTCCATACGTTTAACACGCTGTACGAAAATATGAAGCAGATCGTACTCACAAGCGATCGCCTGCCAAACGAAATTCCTAAGCTGCAGGAGCGGCTGGTGTCGCGGTTTAACAAAGGCTTGTCCGTTGACGTGACGCCGCCTGATCTCGAAACCCGCATTGCCATCTTGCGCAATAAAGCCGATGCCGAAGATCTCAGCATTCCTGATGACACGCTTTCTTACATTGCCGGCCAAATTGAAAGTAACGTGCGTGATTTGGAAGGGGCTTTGGTGCGTGTCCAGGCTTTTTCTACTATGAAAAATGAAGATATCA

In [3]:
# Genome-level GC statistics: the four means first, then the four standard
# deviations, each in the order GCT, GC1, GC2, GC3.
for stat in ("mean", "std"):
    for gc_kind in ("GCT", "GC1", "GC2", "GC3"):
        print(getattr(yakult, f"{stat}_{gc_kind}"))

47.99780371380697
54.43022578518216
38.120079456762745
51.4432288224944
3.9333303002550197
5.59620581567793
5.1420556965271995
7.1901061819491865


In [4]:
# Print the loader's composition features: relative nucleotide frequencies,
# per-nucleotide and per-dinucleotide triples, and `cub`.
# The attribute names are spelled `nucleutide` on the loader object, so the
# spelling must be kept here.
print(yakult.rel_freq)
print(yakult.nucleutide_identity)
print(yakult.dinucleutide_identity)
print(yakult.cub)

{'G': 0.25390022015214475, 'A': 0.26340302506088364, 'C': 0.2329938908275224, 'T': 0.2497028639594492}
{'A': [0.2553701468935958, 0.2571262119761974, 0.2777127163128578], 'T': [0.26331821397361743, 0.2391815333765315, 0.2466088445281987], 'G': [0.24067365367352883, 0.27363940624312644, 0.24738760053977898], 'C': [0.240637985459258, 0.23005284840414464, 0.22829083861916452]}
{'AA': [0.08006800739520976, 0.08457528073856982, 0.08832173718677326], 'AG': [0.055446239084040255, 0.04828525059892876, 0.05131829876682947], 'AT': [0.06612173561530642, 0.06895022500698503, 0.07916569569745709], 'AC': [0.053734164799039336, 0.0553154556317138, 0.05890612590419031], 'TA': [0.04349738730330466, 0.04250224412514787, 0.037454047395979476], 'TG': [0.08495455275031656, 0.06909051998311705, 0.06969934037503983], 'TT': [0.07885172128857369, 0.07794337076514264, 0.0850497691074851], 'TC': [0.05601455263142251, 0.04964539850312394, 0.054405980853279814], 'GA': [0.060298305165351895, 0.07731204337254856, 0.

# Cross-reference bsub genes between HGTDB and NCBI

In [80]:
from data.utils.NCBI.data_loader import NCBIDataLoader
from data.utils.HGTDB.ml_data_loader import ml_load_species

In [81]:
# Load the type-A bsub data as a DataFrame (return_df=True) rather than as X/y arrays.
bsub_HGTDB = ml_load_species('bsub', 'A', return_df=True)

In [82]:
# Preview the table: rows are indexed by HGTDB gene ID and include the HGT label column.
bsub_HGTDB

Unnamed: 0_level_0,Strand,Length,FunctionCode,GC1,SD1,GC2,SD2,GC3,SD3,GCT,SDT,Mah,AADev,HGT
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Bsu0001,0,1338,L,50.6,-0.2,32.7,-0.6,37.1,-0.9,40.1,-0.8,6.1,1,0
Bsu0002,0,1134,L,50.9,-0.2,34.6,-0.2,34.3,-1.3,39.9,-0.9,12.0,1,0
Bsu0003,0,213,S,55.6,0.7,30.6,-1.0,44.4,0.1,43.5,-0.1,39.7,1,0
Bsu0004,0,1110,L,57.7,1.1,31.0,-0.9,48.0,0.5,45.6,0.4,9.7,1,0
Bsu0005,0,156,,50.9,-0.2,26.4,-1.9,39.6,-0.6,39.0,-1.1,79.1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Bsu4099,1,1377,R,59.3,1.3,35.4,0.0,42.6,-0.2,45.8,0.5,8.9,1,0
Bsu4100,1,624,R,51.7,0.0,34.0,-0.3,40.2,-0.5,41.9,-0.4,21.5,1,0
Bsu4101,1,783,N,45.0,-1.2,32.1,-0.7,44.7,0.1,40.6,-0.7,20.5,1,0
Bsu4102,1,348,J,45.3,-1.2,29.9,-1.1,42.7,-0.2,39.3,-1.0,27.7,1,0


In [83]:
# Look up the HGT label of a single gene by its HGTDB ID.
bsub_HGTDB['HGT']['Bsu0001']

0

In [84]:
# Load the B. subtilis data for NCBI assembly 'ASM904v1'.
bsub_NCBI = NCBIDataLoader('ASM904v1')

found 1 ids
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/009/045/GCF_000009045.1_ASM904v1/GCF_000009045.1_ASM904v1_cds_from_genomic.fna.gz


In [85]:
# Records are addressed by NCBI locus tag; each one is a dict of per-gene
# annotations and composition features (GC statistics, 'rel_freq', '12_symbols', '48_symbols', ...).
bsub_NCBI["BSU_00010"]

{'gene': 'dnaA',
 'protein': 'chromosomal replication initiator informational ATPase',
 'location': '410..1750',
 'g_count': 268,
 'a_count': 455,
 'c_count': 270,
 't_count': 348,
 'GC1': 50.5592841163311,
 'SD1': -0.09326804548721919,
 'GC2': 32.66219239373602,
 'SD2': -0.4094106168070079,
 'GC3': 37.13646532438479,
 'SD3': -0.7813120097588252,
 'GCT': 40.1193139448173,
 'SDT': -0.6333862030937689,
 'rel_freq': {'G': 0.19985085756897839,
  'A': 0.33929903057419836,
  'C': 0.20134228187919462,
  'T': 0.2595078299776286},
 '12_symbols': {'A': [0.3243847874720358,
   0.378076062639821,
   0.31543624161073824],
  'T': [0.17002237136465326, 0.2953020134228188, 0.3131991051454139],
  'G': [0.31096196868008946, 0.10514541387024609, 0.18344519015659955],
  'C': [0.19463087248322147, 0.2214765100671141, 0.18791946308724833]},
 '48_symbols': {'AA': [0.1319910514541387,
   0.1610738255033557,
   0.10762331838565023],
  'AG': [0.03131991051454139, 0.05145413870246085, 0.10762331838565023],
  'AT

In [86]:
# Number of records exposed by the NCBI loader.
len(bsub_NCBI)

4237

In [87]:
# Sequence length of the first gene: 1341 here versus Length 1338 for Bsu0001 in HGTDB.
# NOTE(review): the 3-nt difference is presumably the stop codon — confirm.
len(bsub_NCBI["BSU_00010"]["sequence"])

1341

I guess this is how it is supposed to be implemented.

In [92]:
# Pair every HGTDB gene with its NCBI record and collect features and labels.
X_rel_freq = []   # per-gene 'rel_freq' dicts
X_nuc_id = []     # per-gene '12_symbols' dicts
y_ncbi = []       # HGT labels, aligned row-for-row with the two X lists

not_in_ncbi = []  # derived NCBI locus tags that could not be resolved
hgt_labels = bsub_HGTDB['HGT']
for hgtdb_id in bsub_HGTDB.index.values:
    # Map an HGTDB ID to an NCBI locus tag, e.g. 'Bsu0001' -> 'BSU_00010':
    # upper-cased letters, an underscore, then the digits with a trailing '0'.
    word = "".join(re.findall("[a-zA-Z]+", hgtdb_id))
    number = "".join(re.findall("[0-9]+", hgtdb_id))
    id_in_ncbi = word.upper()+"_"+number+"0"
    try:
        # Resolve everything BEFORE appending, so a failed lookup can never
        # leave X_rel_freq / X_nuc_id / y_ncbi with different lengths.
        record = bsub_NCBI[id_in_ncbi]
        rel_freq = record['rel_freq']
        nuc_id = record['12_symbols']
        label = hgt_labels[hgtdb_id]
    except LookupError:
        # Only missing-key/index failures mean "gene not found"; anything else
        # (including KeyboardInterrupt) now propagates instead of being hidden.
        # NOTE(review): assumes the loader signals a missing locus tag with a
        # KeyError/IndexError — confirm against NCBIDataLoader.__getitem__.
        not_in_ncbi.append(id_in_ncbi)
        continue
    X_rel_freq.append(rel_freq)
    X_nuc_id.append(nuc_id)
    y_ncbi.append(label)
    

In [89]:
# Number of genes matched in both databases, then how many of them are labelled HGT (1).
print(len(y_ncbi))
print(y_ncbi.count(1))

4002
528


Convert the per-gene feature dicts to NumPy arrays.

In [94]:
# rel_freq: one row per gene, columns in each dict's own key order
X_rel_freq_array = np.array([list(freqs.values()) for freqs in X_rel_freq])

In [95]:
X_rel_freq_array

array([[0.19985086, 0.33929903, 0.20134228, 0.25950783],
       [0.20228672, 0.33333333, 0.19700967, 0.26737027],
       [0.28240741, 0.28703704, 0.15277778, 0.27777778],
       ...,
       [0.24642289, 0.30789613, 0.21515633, 0.23052464],
       [0.2673913 , 0.28768116, 0.19057971, 0.25434783],
       [0.23763955, 0.3476874 , 0.18181818, 0.23285486]])

In [127]:
# nuc_id (12 symbols): flatten each gene's {nucleotide: [three values]} dict
# into a single row, keeping the dict's key order
X_nuc_id_array = np.array([
    [value for per_symbol in symbols.values() for value in per_symbol]
    for symbols in X_nuc_id
])

In [131]:
# First gene's flattened 12-value feature row.
X_nuc_id_array[0]

array([0.32438479, 0.37807606, 0.31543624, 0.17002237, 0.29530201,
       0.31319911, 0.31096197, 0.10514541, 0.18344519, 0.19463087,
       0.22147651, 0.18791946])

In [130]:
# Row count of the flattened feature matrix (one row per matched gene).
len(X_nuc_id_array)

4002

# try training

In [96]:
from trainer.ml_trainer import MLClassifier
from sklearn.model_selection import train_test_split

## Rel_freq

In [97]:
# Hold out 50% of the samples for testing; seeded so the split is reproducible.
# No `stratify=` is passed, so the class balance can differ between the two halves.
X_train, X_test, y_train, y_test = train_test_split( X_rel_freq_array, y_ncbi, test_size=0.5, random_state=42)

In [98]:
# LightGBM with default hyperparameters, wrapped in the project's MLClassifier helper.
nucleotide_rel_freq_lgbm = MLClassifier(LGBMClassifier())

# Fit on the training split, then evaluate on the held-out split
# (the output is a confusion matrix followed by a classification report).
nucleotide_rel_freq_lgbm.train(X_train, y_train)
nucleotide_rel_freq_lgbm.eval(X_test, y_test)

[[1656   73]
 [ 160  112]]
              precision    recall  f1-score   support

           0       0.91      0.96      0.93      1729
           1       0.61      0.41      0.49       272

    accuracy                           0.88      2001
   macro avg       0.76      0.68      0.71      2001
weighted avg       0.87      0.88      0.87      2001



In [99]:
# Same experiment with scikit-learn's HistGradientBoostingClassifier (default hyperparameters).
nucleotide_rel_freq_hgb = MLClassifier(HistGradientBoostingClassifier())

# Reuses the X_train/X_test split created for the relative-frequency features.
nucleotide_rel_freq_hgb.train(X_train, y_train)
nucleotide_rel_freq_hgb.eval(X_test, y_test)

[[1652   77]
 [ 166  106]]
              precision    recall  f1-score   support

           0       0.91      0.96      0.93      1729
           1       0.58      0.39      0.47       272

    accuracy                           0.88      2001
   macro avg       0.74      0.67      0.70      2001
weighted avg       0.86      0.88      0.87      2001



## nuc id

In [135]:
# Hold out 20% of the 12-symbol features for testing; seeded so the split is reproducible.
# This rebinds X_train/X_test/y_train/y_test, replacing the earlier rel_freq split.
X_train, X_test, y_train, y_test = train_test_split( X_nuc_id_array, y_ncbi, test_size=0.20, random_state=42)

In [136]:
# LightGBM with default hyperparameters on the 12-symbol features.
nucleotide_id_lgbm = MLClassifier(LGBMClassifier())

# Fit on the training split, then evaluate on the held-out split.
nucleotide_id_lgbm.train(X_train, y_train)
nucleotide_id_lgbm.eval(X_test, y_test)

[[683  16]
 [ 62  40]]
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       699
           1       0.71      0.39      0.51       102

    accuracy                           0.90       801
   macro avg       0.82      0.68      0.73       801
weighted avg       0.89      0.90      0.89       801



In [137]:
# HistGradientBoostingClassifier (default hyperparameters) on the same 12-symbol split.
nucleotide_id_hgb = MLClassifier(HistGradientBoostingClassifier())

# Fit on the training split, then evaluate on the held-out split.
nucleotide_id_hgb.train(X_train, y_train)
nucleotide_id_hgb.eval(X_test, y_test)

[[683  16]
 [ 63  39]]
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       699
           1       0.71      0.38      0.50       102

    accuracy                           0.90       801
   macro avg       0.81      0.68      0.72       801
weighted avg       0.89      0.90      0.89       801



# Bacillus/Clostridium class: order classifier


orders:

- Bacilli
- Clostridia
- Mollicutes

In [138]:
# Species codes for each group, as passed to ml_load_species.
# These are lists rather than sets on purpose: the iteration order of a set of
# strings can change between interpreter sessions (string hash randomization),
# which would change the row order of the aggregated data and, with it, the
# result of the seeded train/test split.
list_bacilli = [
    'bsub',
    'bhal',
    'linno',
    'lmono',
    'sau2',
    'sau1',
    'sau3',
    'llac',
    'spyo',
    'spyo2',
    'spneu1',
    'spneu2',
]

list_clostridia = [
    'caceto',
    'cperf',
    'tteng',
]

list_mollicutes = [
    'mgen',
    'mpneu',
    'mpul',
    'uure',
]

In [139]:
def aggregate_sequences(list_of_bacteria, data_type):
    """Stack the features and labels of several species into one dataset.

    Args:
        list_of_bacteria: iterable of species codes accepted by
            ``ml_load_species``. Pass an ordered collection (list/tuple) if
            the row order of the result must be reproducible.
        data_type: HGTDB feature-set identifier, one of 'A', 'B', 'C', 'D'
            or 'E'. Type 'A' rows include a non-numeric column, so the
            aggregated array will have object dtype for that type.

    Returns:
        Tuple ``(X_agg, y_agg)``. ``X_agg`` has one row per gene and the
        column count of ``data_type``; ``y_agg`` holds the matching labels.
        Species appear in reverse iteration order (the last species loaded
        comes first). An empty ``list_of_bacteria`` yields arrays of shape
        ``(0, columns)`` and ``(0,)``.

    Raises:
        ValueError: if ``data_type`` is not a known feature-set identifier.
    """
    columns_by_type = {'A': 13, 'B': 12, 'C': 10, 'D': 4, 'E': 8}
    try:
        columns = columns_by_type[data_type]
    except (KeyError, TypeError):
        # TypeError covers unhashable values, which are equally "unknown".
        raise ValueError(f'Unknown data type {data_type}') from None

    # Seed with empty float arrays so an empty species list still returns
    # correctly shaped arrays.
    X_parts = [np.empty((0, columns))]
    y_parts = [np.empty((0,))]
    for species in list_of_bacteria:
        X, y = ml_load_species(species, data_type)
        X_parts.append(X)
        y_parts.append(y)

    # Concatenate once instead of re-copying the growing array on every
    # iteration. Reversing keeps newest-species-first row order.
    X_agg = np.concatenate(X_parts[::-1], axis=0)
    y_agg = np.concatenate(y_parts[::-1], axis=0)
    return X_agg, y_agg

## Bacilli

In [141]:
# Pool the type-E features of every Bacilli species, then hold out 20% for testing (seeded split).
X_bacilli, y_bacilli = aggregate_sequences(list_bacilli, 'E')
X_train, X_test, y_train, y_test = train_test_split( X_bacilli, y_bacilli, test_size=0.2, random_state=42)

In [142]:
# init model and init classifier
# (LightGBM with default hyperparameters inside the project's MLClassifier wrapper)
HGT_bacilli_E_lgbm = MLClassifier(LGBMClassifier())

# train and eval
# eval, get_precision and get_roc_auc are all given the same held-out split.
HGT_bacilli_E_lgbm.train(X_train, y_train)
HGT_bacilli_E_lgbm.eval(X_test, y_test)
HGT_bacilli_E_lgbm.get_precision(X_test, y_test)
HGT_bacilli_E_lgbm.get_roc_auc(X_test, y_test)

[[5801   87]
 [ 290  198]]
              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97      5888
         1.0       0.69      0.41      0.51       488

    accuracy                           0.94      6376
   macro avg       0.82      0.70      0.74      6376
weighted avg       0.93      0.94      0.93      6376

prec: 0.6947368421052632
roc_auc: 0.9430470211711511


## Clostridia

In [143]:
# Pool the type-E features of every Clostridia species, then hold out 20% for testing (seeded split).
X_clostridia, y_clostridia = aggregate_sequences(list_clostridia, 'E')
X_train, X_test, y_train, y_test = train_test_split( X_clostridia, y_clostridia, test_size=0.2, random_state=42)

In [144]:
# init model and init classifier
# (LightGBM with default hyperparameters inside the project's MLClassifier wrapper)
HGT_clostridia_E_lgbm = MLClassifier(LGBMClassifier())

# train and eval
# eval, get_precision and get_roc_auc are all given the same held-out split.
HGT_clostridia_E_lgbm.train(X_train, y_train)
HGT_clostridia_E_lgbm.eval(X_test, y_test)
HGT_clostridia_E_lgbm.get_precision(X_test, y_test)
HGT_clostridia_E_lgbm.get_roc_auc(X_test, y_test)

[[1631   27]
 [  98   28]]
              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96      1658
         1.0       0.51      0.22      0.31       126

    accuracy                           0.93      1784
   macro avg       0.73      0.60      0.64      1784
weighted avg       0.91      0.93      0.92      1784

prec: 0.509090909090909
roc_auc: 0.9208622934497481


## Mollicutes

In [145]:
# Pool the type-E features of every Mollicutes species, then hold out 20% for testing (seeded split).
X_mollicutes, y_mollicutes = aggregate_sequences(list_mollicutes, 'E')
X_train, X_test, y_train, y_test = train_test_split( X_mollicutes, y_mollicutes, test_size=0.2, random_state=42)

In [146]:
# init model and init classifier
# (LightGBM with default hyperparameters inside the project's MLClassifier wrapper)
HGT_mollicutesa_E_lgbm = MLClassifier(LGBMClassifier())

# train and eval
# eval, get_precision and get_roc_auc are all given the same held-out split.
HGT_mollicutesa_E_lgbm.train(X_train, y_train)
HGT_mollicutesa_E_lgbm.eval(X_test, y_test)
HGT_mollicutesa_E_lgbm.get_precision(X_test, y_test)
HGT_mollicutesa_E_lgbm.get_roc_auc(X_test, y_test)

[[484   6]
 [ 17   7]]
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98       490
         1.0       0.54      0.29      0.38        24

    accuracy                           0.96       514
   macro avg       0.75      0.64      0.68       514
weighted avg       0.95      0.96      0.95       514

prec: 0.5384615384615384
roc_auc: 0.8715986394557823
