In [1]:
%%capture
!pip install mljar-supervised;
!pip install gdown
!pip install imblearn



In [3]:

# Download both interpro and rast annotations for genomes for this to work. Stored in data/ Both files are in .gitignore
import os



# Check and download allgenomes.RAST.txt.ps.tsv file
if not os.path.exists("data/rast_features.tsv"):
    print("Downloading File data/rast_features.tsv")
    !gdown --no-check-certificate 1dDLqkjN0YGKa7mNDKhTmGSeZZWt1T3cl -O data/rast_features.tsv
else:
    print("File data/rast_features.tsv already exists")

# Check and download allgenomes.interpro.txt.ps.tsv file
if not os.path.exists("data/interpro_features.tsv"):
    print("Downloading File data/interpro_features.tsv")
    !gdown --no-check-certificate 13I7lBj02KEsA4OowW89h0zW8tLsmQrrb -O data/interpro_features.tsv
else:
    print("File data/interpro_features.tsv already exists")
    
    

File data/rast_features.tsv already exists
File data/interpro_features.tsv already exists


In [3]:
import uuid
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.metrics import classification_report, accuracy_score
import shap
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

from sklearn.model_selection import train_test_split
from supervised.automl import AutoML
from IPython.display import display, HTML
import joblib



def get_X_y(feature_df, pheno_full_df, phenotype_name, variance_threshold):

   
    # Read phenotype data
    phenotype_df = pheno_full_df[phenotype].dropna()

    # Merge genotype and phenotype df 
    merged_df = feature_df.merge(phenotype_df, left_index=True, right_index=True, how='inner')

    # Remove all features small variance as they don't have much discriminatory power
    constant_filter = VarianceThreshold(threshold=variance_threshold)
    constant_filter.fit(merged_df)
    selected_columns_indices = constant_filter.get_support()
    selected_column_names = merged_df.columns[selected_columns_indices]
    df_filtered = merged_df[selected_column_names]


    # Get X and Y
    X = df_filtered.drop(columns=[phenotype])
    y = df_filtered[phenotype]
    
    return [X,y]


def generate_html_table(df: pd.DataFrame):
        """Display a pandas.DataFrame as jQuery DataTables"""

        # Generate random container name
        id_container = uuid.uuid1()
        output = """
    <div id="datatable-container-{id_container}">
      <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.0/jquery.min.js"></script>
      <script type="text/javascript" src="https://cdn.datatables.net/1.13.5/js/jquery.dataTables.min.js"></script>
      <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.13.5/css/jquery.dataTables.min.css"/>
      <script type="text/javascript">
        $(document).ready( function () {{
            $('#BGCtable').DataTable();
        }});
      </script>
      <!-- Insert table below -->
      {table}
    </div>
        """.format(
            id_container=id_container,
            table=df.to_html(
                index=False,
                table_id="classification",
                classes="display"
            ),
        )
        return output




def get_shap_plot(model, X_test, X, title, result_dir):
    # Initialize the SHAP explainer
    explainer = shap.TreeExplainer(model)

        # Calculate SHAP values for the test set
    shap_values = explainer.shap_values(X_test)

        # Get feature importance based on SHAP values
    mean_abs_shap = np.mean(np.abs(shap_values[1]), axis=0)
    feature_importance = pd.DataFrame({'feature': X.columns, 'importance': mean_abs_shap})
    feature_importance = feature_importance.sort_values(by='importance', ascending=False)

    # Select the top 100 features 
    top_features = feature_importance.head(20)[['feature', 'importance']]
    
    tsv_file =  result_dir + "/" + title + ".tsv"
    # Save the top_features DataFrame to a TSV file
    top_features.to_csv(tsv_file , sep='\t', index=False)

    # Concatenate the feature and importance columns into a single string separated by '#'
    top_features['combined'] = top_features.apply(lambda row: f"{row['feature']} // {row['importance']}", axis=1)

     # Extract the 'combined' column as a list of strings
    concatenated_features = top_features['combined'].tolist()

     # If you want a single string with all features separated by a space or newline:
    formatted_top_features = ' # '.join(concatenated_features)
  


    plt.switch_backend('Agg')
    
    # Plot SHAP summary plot for the top 20 features
    plt.figure(figsize=(20, 10))  # Adjust width (20) and height (10) as desired
    plt.title(title)
    shap.summary_plot(shap_values[1], X_test, max_display=20, plot_size=None)
    img_path = result_dir + "/" + title + ".png"
    plt.savefig(img_path)
    plt.close()

    return [formatted_top_features, img_path]



def get_shap_plot_catboost(model, X_test, X, title, result_dir):
    # Create explainer object
    explainer = shap.Explainer(model)
    
    # Calculate SHAP values for all instances in your dataset
    shap_values = explainer.shap_values(X)

    # Get feature importance based on SHAP values
    mean_abs_shap = np.mean(np.abs(shap_values), axis=0)
    feature_importance = pd.DataFrame({'feature': X.columns, 'importance': mean_abs_shap})
    feature_importance = feature_importance.sort_values(by='importance', ascending=False)

    # Select the top 100 features 
    top_features = feature_importance.head(20)[['feature', 'importance']]
    
    tsv_file =  result_dir + "/" + title + ".tsv"
    # Save the top_features DataFrame to a TSV file
    top_features.to_csv(tsv_file , sep='\t', index=False)
    
    # Concatenate the feature and importance columns into a single string separated by '#'
    top_features['combined'] = top_features.apply(lambda row: f"{row['feature']} // {row['importance'] }", axis=1)

     # Extract the 'combined' column as a list of strings
    concatenated_features = top_features['combined'].tolist()

     # If you want a single string with all features separated by a space or newline:
    formatted_top_features = ' # '.join(concatenated_features)
  


    plt.switch_backend('Agg')
    plt.figure(figsize=(20, 10))  # Adjust width (20) and height (10) as desired
    plt.title(title)
    shap.summary_plot(shap_values, X, max_display=20, plot_size=None)
    img_path = result_dir + "/" + title + ".png"
    plt.savefig(img_path)
    plt.close()

    return formatted_top_features, img_path

    
    
def get_classifier_report(X, y, phenotype_name, result_dir):

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    import xgboost as xgb
    import lightgbm as lgb
    import catboost as cb
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, confusion_matrix
    from sklearn.feature_selection import SelectKBest, chi2
    import pandas as pd



    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1176)
    
    # Compute the class ratio
    minority_class_count = sum(y_train == 1)  # Assuming '1' denotes the minority class
    majority_class_count = len(y_train) - minority_class_count
    class_ratio = minority_class_count / majority_class_count

    # If the class ratio is too low (e.g., below 0.75), perform over-sampling
    if class_ratio < 0.75:
        ros = RandomOverSampler(sampling_strategy=0.75, random_state=1176)
        X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
    else:
       # If not, simply keep the original samples
        X_train_resampled, y_train_resampled = X_train, y_train

    # If the class ratio after over-sampling exceeds a threshold (e.g., greater than 0.9), perform under-sampling
    if class_ratio > 0.9:
        rus = RandomUnderSampler(sampling_strategy=1.0, random_state=1176)
        X_train_resampled, y_train_resampled = rus.fit_resample(X_train_resampled, y_train_resampled)


    # Dictionary to store the classifiers and their parameters
    classifiers = {
        'Decision Tree': {
            'model': DecisionTreeClassifier(),
            'params': {}
        },
        'Random Forest': {
            'model': RandomForestClassifier(),
            'params': {'n_estimators': 1000, 'max_depth': None, 'random_state': 100}
        }, 

        'CatBoost': {
            'model': cb.CatBoostClassifier(),
            'params': {'iterations': 1000, 'depth': 6, 'learning_rate': 0.1, 'random_state': 42, 'verbose': False}
        }

    }

    html_table_rows = []

    img_paths = []

    # Train and evaluate each classifier
    for clf_name, clf_data in classifiers.items():
        print ("Running " + clf_name + " for phenotype " + phenotype_name)
        model = clf_data['model']
        params = clf_data['params']

        # Train the classifier
        model.set_params(**params)
        model.fit(X_train_resampled, y_train_resampled)

        # Make predictions on the test set
        y_pred = model.predict(X_test)



        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        
        confusion_matrix_info = confusion_matrix(y_test, y_pred)
        
        # Extracting values
        TP = confusion_matrix_info[0, 0]
        FP = confusion_matrix_info[0, 1]
        FN = confusion_matrix_info[1, 0]
        TN = confusion_matrix_info[1, 1]
        
        confusion_matrix_data = "TP=" + str(TP) + " TN=" + str(TN) + " FP=" + str(FP) + "FN=" + str(FN)
   
        title = clf_name + "___" + phenotype_name
        
        # Print shap
        if clf_name == "CatBoost":
            top_features, img_path = get_shap_plot_catboost(model, X_test, X, title, result_dir)
            img_paths.append(img_path)
        else:
            top_features, img_path = get_shap_plot (model, X_test, X, title, result_dir)
            img_paths.append(img_path)

        # Append row to the HTML table
        html_table_rows.append([clf_name, accuracy, precision, recall, f1, balanced_accuracy, confusion_matrix_data, top_features])
        
        

            
            

    # Create a DataFrame to display the results
    headers = ["Model", "Accuracy", "Precision", "Recall", "F1-score", "Balanced Accuracy", "Confusion Matrix", "Top features"]
    report_df = pd.DataFrame(html_table_rows, columns=headers)
    fname = result_dir + "/" + phenotype_name + "_complete.tsv"
    report_df.to_csv(fname , sep='\t', index=False)
    report_html_content = generate_html_table (report_df)
    print ("######### Combined report for " + phenotype_name + "#####################")
    display(HTML(report_html_content))
    html_img = ""
    for img_path in img_paths:
        html_img += "<img src='" + img_path + "'</></br>"
        
    display(HTML(html_img))

    



In [4]:

result_dir = "ml_results/non_metabolic_phenotypes"

VARIANCE_THRESHOLD = 0.0001

# Read rast features
rast_annotated_df = pd.read_csv("data/rast_features.tsv", sep = "\t", index_col=0)
# Print some stats about data
# Cleaning function
#def clean_column_name(col_name):
#    return ''.join(filter(str.isalnum, col_name.replace("[", "").replace("]", "").replace("<", "")))

# Cleaning function
def clean_column_name(col_name):
    for char in [",", "[", "]", "<"]:
        col_name = col_name.replace(char, "")
    return col_name


# Apply the cleaning function to each column name
rast_annotated_df.columns = [clean_column_name(col) for col in rast_annotated_df.columns]




metabolic_phenotypes_df = pd.read_csv("data/non_metabolic_phenotypes_bacdive.tsv", sep = "\t", index_col=0)  


for phenotype in metabolic_phenotypes_df:
    # All phenotype names have -- in them
    if "strain" in phenotype:
        continue
    

    try:
        X, y = get_X_y(rast_annotated_df, metabolic_phenotypes_df, phenotype,VARIANCE_THRESHOLD )

        num_genome, num_features = X.shape
        display(HTML("<H2> Phenotype: " + phenotype + "</h2>"))
        print ("Numer of genomes:" + str(num_genome) + "\n" + "Number of genomic features :" + str(num_features)) 
        print ("Shape of y:" + str(y.shape))
        count_1 = sum(y==1)
        count_0 = sum(y==0)
        print ("Count of 1: " + str(count_1) + "\n" + "Count of 0: " + str(count_0))
    
        get_classifier_report(X,y, phenotype, result_dir)
        

    except Exception as e:
        print (e)


Numer of genomes:3488
Number of genomic features :5745
Shape of y:(3488,)
Count of 1: 1259
Count of 0: 2229
Running Decision Tree for phenotype gram_positive




Running Random Forest for phenotype gram_positive




Running CatBoost for phenotype gram_positive




######### Combined report for gram_positive#####################


Model,Accuracy,Precision,Recall,F1-score,Balanced Accuracy,Confusion Matrix,Top features
Decision Tree,0.964183,0.964578,0.964183,0.964278,0.964402,TP=423 TN=250 FP=16FN=9,SSO:000036573__LPS-assembly protein LptD // 0.34979481931651524 # SSO:000024182__Ribosome maturation factor rimM // 0.09152243913391771 # SSO:000002230__DegV family protein // 0.020738837684618656 # SSO:000012016__Cd(II)/Pb(II)-responsive transcriptional regulator // 0.017657299670408723 # SSO:000020884__Outer membrane lipoprotein-sorting protein // 0.01006426067560261 # SSO:000000116__2-C-methyl-D-erythritol 4-phosphate cytidylyltransferase (EC 2.7.7.60) // 0.00906185286642901 # SSO:000010833__Arylesterase // 0.007001536892138345 # SSO:000031056__metalloenzyme domain protein // 0.005753350162300797 # SSO:000012210__Chorismate mutase // 0.00552087930300321 # SSO:000002975__GDP-mannose pyrophosphatase NudK // 0.004437231896723305 # SSO:000008885__Uroporphyrinogen III decarboxylase (EC 4.1.1.37) // 0.004383590125632627 # SSO:000002109__DNA repair protein RecN // 0.004236251394579737 # SSO:000007701__Stage III sporulation protein AE // 0.003989698156648993 # SSO:000000325__3-oxoacid CoA-transferase (EC 2.8.3.5) // 0.0031681144617546927 # SSO:000025220__Sucrose phosphorylase (EC 2.4.1.7) // 0.0029998275362918777 # SSO:000007864__Superoxide dismutase Mn (EC 1.15.1.1) // 0.0028419093200254633 # SSO:000033516__thrombospondin type 3 repeat-containing protein // 0.0026879462003157597 # SSO:000002315__Dihydropyrimidinase (EC 3.5.2.2) // 0.0023440485539583805 # SSO:000007156__Ribulokinase (EC 2.7.1.16) // 0.0020978749509766373 # SSO:000012775__DNA integrity scanning protein DisA // 0.0019723515026714242
Random Forest,0.975645,0.976584,0.975645,0.97576,0.979055,TP=424 TN=257 FP=15FN=2,SSO:000007937__Tetraacyldisaccharide 4'-kinase (EC 2.7.1.130) // 0.014633609701367893 # SSO:000036573__LPS-assembly protein LptD // 0.01458385153790173 # SSO:000008604__UDP-3-O-3-hydroxymyristoyl glucosamine N-acyltransferase (EC 2.3.1.-) // 0.013232399168802079 # SSO:000006816__Pyridoxine 5'-phosphate synthase (EC 2.6.99.2) // 0.012327954130347157 # SSO:000004460__Lipid-A-disaccharide synthase (EC 2.4.1.182) // 0.011347989885386099 # SSO:000008104__TonB-dependent receptor // 0.010842533316228059 # SSO:000043994__3-deoxy-8-phosphooctulonate synthase (EC 2.5.1.55) // 0.01053073983835863 # SSO:000000281__3-deoxy-manno-octulosonate cytidylyltransferase (EC 2.7.7.38) // 0.009890819949860885 # SSO:000020865__Outer membrane beta-barrel protein // 0.009063766089221126 # SSO:000008598__UDP-23-diacylglucosamine diphosphatase (EC 3.6.1.54) // 0.00723727383951851 # SSO:000004958__MotA/TolQ/ExbB proton channel family protein // 0.007078046998995059 # SSO:000035439__DUF402 domain-containing protein // 0.006882835027475438 # SSO:000000280__3-deoxy-D-manno-octulosonic acid transferase (EC 2.4.99.13) (EC 2.4.99.12) // 0.0064925846477843245 # SSO:000019343__Mannose-6-phosphate isomerase class I (EC 5.3.1.8) // 0.006326758968222951 # SSO:000023973__TonB family protein // 0.0062976924297360815 # SSO:000004949__Monofunctional biosynthetic peptidoglycan transglycosylase (EC 2.4.2.-) // 0.006197319117931633 # SSO:000013395__Endonuclease NucS // 0.006087605866210845 # SSO:000023921__Redox-sensing transcriptional repressor Rex // 0.006064350759241957 # SSO:000002241__Demethylmenaquinone methyltransferase (EC 2.1.1.163) // 0.005703261010304857 # SSO:000002230__DegV family protein // 0.005555864434911057
CatBoost,0.974212,0.975043,0.974212,0.974325,0.977124,TP=424 TN=256 FP=15FN=3,SSO:000036573__LPS-assembly protein LptD // 0.8796578444948923 # SSO:000008104__TonB-dependent receptor // 0.720688302019267 # SSO:000008604__UDP-3-O-3-hydroxymyristoyl glucosamine N-acyltransferase (EC 2.3.1.-) // 0.650741113766911 # SSO:000006816__Pyridoxine 5'-phosphate synthase (EC 2.6.99.2) // 0.35136703805538233 # SSO:000004460__Lipid-A-disaccharide synthase (EC 2.4.1.182) // 0.3423223681927135 # SSO:000033044__pyrimidine 5'-nucleotidase // 0.2457206819833774 # SSO:000024227__Rod shape-determining protein // 0.1984993162120978 # SSO:000007862__Superoxide dismutase Fe (EC 1.15.1.1) // 0.1884489517247105 # SSO:000006040__Phosphoribosylanthranilate isomerase (EC 5.3.1.24) // 0.18177532261810073 # SSO:000020789__OmpA family protein // 0.17498752395738687 # SSO:000013635__FHA domain-containing protein // 0.17166205507822635 # SSO:000043994__3-deoxy-8-phosphooctulonate synthase (EC 2.5.1.55) // 0.16497529806756162 # SSO:000002793__Flagellar basal-body P-ring formation protein FlgA // 0.15965561843097067 # SSO:000024182__Ribosome maturation factor rimM // 0.1500176849595837 # SSO:000004958__MotA/TolQ/ExbB proton channel family protein // 0.12263642807186254 # SSO:000002230__DegV family protein // 0.11391326128189248 # SSO:000002241__Demethylmenaquinone methyltransferase (EC 2.1.1.163) // 0.11190425941785152 # SSO:000002076__DNA mismatch repair protein MutS // 0.11142286617711364 # SSO:000020884__Outer membrane lipoprotein-sorting protein // 0.11081315558726698 # SSO:000001295__CCA tRNA nucleotidyltransferase (EC 2.7.7.72) // 0.10974547133456834


Numer of genomes:3175
Number of genomic features :5659
Shape of y:(3175,)
Count of 1: 1478
Count of 0: 1697
Running Decision Tree for phenotype motility




Running Random Forest for phenotype motility




Running CatBoost for phenotype motility




######### Combined report for motility#####################


Model,Accuracy,Precision,Recall,F1-score,Balanced Accuracy,Confusion Matrix,Top features
Decision Tree,0.770079,0.771441,0.770079,0.769653,0.769634,TP=260 TN=229 FP=61FN=85,SSO:000002795__Flagellar basal-body rod protein FlgB // 0.18308957152064434 # SSO:000002839__Flagellin // 0.06378305343015095 # SSO:000016805__Flagellar motor protein MotB // 0.0476786249227176 # SSO:000002805__Flagellar biosynthesis protein FliQ // 0.034872771691269996 # SSO:000043244__geranylgeranylglycerol-phosphate geranylgeranyltransferase (EC 2.5.1.42) // 0.03164046809479167 # SSO:000002802__Flagellar biosynthesis protein FlhF // 0.024169765062585485 # SSO:000043066__hypoxanthine phosphoribosyltransferase (EC 2.4.2.8) // 0.01928242651824288 # SSO:000042174__acetylornithine transaminase // 0.019045253121735294 # SSO:000022945__Protein-L-isoaspartate(D-aspartate) O-methyltransferase( EC:2.1.1.77 ) // 0.01867129364497628 # SSO:000013192__DnaD domain protein // 0.017664423346657875 # SSO:000009334__tRNA (cytidine(34)-2'-O)-methyltransferase (EC 2.1.1.207) // 0.016958192977045466 # SSO:000021274__Patatin-like phospholipase family protein // 0.014809230154945069 # SSO:000020985__Oxygen-independent coproporphyrinogen III oxidase // 0.013487579585737321 # SSO:000028910__VanZ family protein // 0.012903306357483869 # SSO:000005809__Peptidyl-prolyl cis-trans isomerase (EC 5.2.1.8) // 0.011428697128600005 # SSO:000029525__cation diffusion facilitator family transporter // 0.010641646490403691 # SSO:000002819__Flagellar motor switch protein FliM // 0.01060050656538925 # SSO:000021761__Phytase // 0.009665996699978108 # SSO:000044159__glycine C-acetyltransferase (EC 2.3.1.29) // 0.009312644571066597 # SSO:000013035__DedA family protein // 0.008373130602939056
Random Forest,0.837795,0.838016,0.837795,0.837737,0.837619,TP=274 TN=258 FP=47FN=56,SSO:000002814__Flagellar hook-basal body complex protein FliE // 0.012406388827332093 # SSO:000002818__Flagellar motor switch protein FliG // 0.012066662233034306 # SSO:000002796__Flagellar basal-body rod protein FlgC // 0.011412345315577069 # SSO:000002819__Flagellar motor switch protein FliM // 0.011328995464604256 # SSO:000002795__Flagellar basal-body rod protein FlgB // 0.010439807111079452 # SSO:000002812__Flagellar hook-associated protein FlgK // 0.01032426472148563 # SSO:000002800__Flagellar biosynthesis protein FlhA // 0.01019008008933114 # SSO:000002805__Flagellar biosynthesis protein FliQ // 0.009673200274663106 # SSO:000002788__Flagellar M-ring protein FliF // 0.0094487024784861 # SSO:000002802__Flagellar biosynthesis protein FlhF // 0.008928477087048185 # SSO:000002798__Flagellar basal-body rod protein FlgG // 0.008643649235409065 # SSO:000002815__Flagellar hook-length control protein FliK // 0.007749854349932877 # SSO:000002801__Flagellar biosynthesis protein FlhB // 0.007686077102640336 # SSO:000012160__Chemotaxis response regulator protein-glutamate methylesterase (EC 3.1.1.61) // 0.007606418300871107 # SSO:000016805__Flagellar motor protein MotB // 0.007003138224563767 # SSO:000019203__METHYL-ACCEPTING CHEMOTAXIS PROTEIN // 0.00671689261061894 # SSO:000012147__Chemotaxis protein CheA (EC 2.7.3.-) // 0.006469669861163628 # SSO:000002820__Flagellar motor switch protein FliN // 0.006342501206001337 # SSO:000035923__Flagellar export protein FliJ // 0.005998529394469872 # SSO:000002797__Flagellar basal-body rod protein FlgF // 0.005997343295268956
CatBoost,0.829921,0.830323,0.829921,0.829825,0.829692,TP=273 TN=254 FP=48FN=60,SSO:000002814__Flagellar hook-basal body complex protein FliE // 0.2635650736132662 # SSO:000002796__Flagellar basal-body rod protein FlgC // 0.2562835857959187 # SSO:000002819__Flagellar motor switch protein FliM // 0.2378859033075014 # SSO:000019203__METHYL-ACCEPTING CHEMOTAXIS PROTEIN // 0.23483731699152224 # SSO:000002802__Flagellar biosynthesis protein FlhF // 0.22456413920958343 # SSO:000016805__Flagellar motor protein MotB // 0.18713300063076582 # SSO:000002795__Flagellar basal-body rod protein FlgB // 0.17298914956132377 # SSO:000002805__Flagellar biosynthesis protein FliQ // 0.16007203114697613 # SSO:000012160__Chemotaxis response regulator protein-glutamate methylesterase (EC 3.1.1.61) // 0.14959273047873897 # SSO:000035923__Flagellar export protein FliJ // 0.13135027673146168 # SSO:000002815__Flagellar hook-length control protein FliK // 0.11792540243877599 # SSO:000002812__Flagellar hook-associated protein FlgK // 0.11335283801457029 # SSO:000021790__PilZ domain-containing protein // 0.10287621153675014 # SSO:000020822__Organic hydroperoxide resistance protein // 0.09526600000540532 # SSO:000002801__Flagellar biosynthesis protein FlhB // 0.09474102814046646 # SSO:000002813__Flagellar hook-associated protein FlgL // 0.08561383631556872 # SSO:000017928__Hydantoinase/oxoprolinase family protein // 0.08367556187806083 # SSO:000018456__Inositol monophosphatase family protein // 0.08287625427913904 # SSO:000029108__YaiI/YqxD family protein // 0.08068765048765165 # SSO:000005188__NAD-dependent malic enzyme (EC 1.1.1.38) // 0.07901777142206096


Numer of genomes:2485
Number of genomic features :5714
Shape of y:(2485,)
Count of 1: 2239
Count of 0: 246
Running Decision Tree for phenotype oxygen_tolerance_aerobe




Running Random Forest for phenotype oxygen_tolerance_aerobe




Running CatBoost for phenotype oxygen_tolerance_aerobe




######### Combined report for oxygen_tolerance_aerobe#####################


Model,Accuracy,Precision,Recall,F1-score,Balanced Accuracy,Confusion Matrix,Top features
Decision Tree,0.837022,0.908924,0.837022,0.860051,0.828774,TP=45 TN=371 FP=10FN=71,SSO:000004504__Lipoyl synthase (EC 2.8.1.8) // 0.23201676791569695 # SSO:000035652__Peptide-methionine (S)-S-oxide reductase MsrA (EC 1.8.4.11) // 0.06833099202636361 # SSO:000017976__Hydroxylamine reductase (EC 1.7.99.1) // 0.059406447112222656 # SSO:000005342__Nitrite reductase (EC 1.7.2.1) // 0.03776254613891879 # SSO:000000381__4-hydroxyproline epimerase (EC 5.1.1.8) // 0.03132472330635041 # SSO:000002270__Diacylglycerol kinase (EC 2.7.1.107) // 0.02788946598441933 # SSO:000000315__3-methyl-2-oxobutanoate hydroxymethyltransferase (EC 2.1.2.11) // 0.02622462705608851 # SSO:000029372__anaerobic ribonucleoside-triphosphate reductase activating protein // 0.025980780064002725 # SSO:000028839__Universal stress protein // 0.023198252671002027 # SSO:000013137__Dihydroorotate dehydrogenase-like protein // 0.021343672174097262 # SSO:000000159__2-hydroxy-3-oxopropionate reductase (EC 1.1.1.60) // 0.0201611047236325 # SSO:000007158__Ribulose bisphosphate carboxylase (EC 4.1.1.39) // 0.018392284292701552 # SSO:000034727__Autonomous glycyl radical cofactor GrcA // 0.017949164884592115 # SSO:000003153__Glutamate racemase (EC 5.1.1.3) // 0.016380148834323704 # SSO:000024791__Site-specific DNA-methyltransferase // 0.015363456259094826 # SSO:000002702__Ferredoxin (EC 1.18.1.3 ) // 0.012935135057322146 # SSO:000020865__Outer membrane beta-barrel protein // 0.010470324246832462 # SSO:000034571__Aldose 1-epimerase family protein // 0.00903568479732897 # SSO:000016725__Fibrinogen-binding protein // 0.007451829704649573 # SSO:000042475__5-methyltetrahydropteroyltriglutamate---homocysteine S-methyltransferase (EC 2.1.1.14) // 0.005230704908981781
Random Forest,0.945674,0.951635,0.945674,0.947801,0.90578,TP=47 TN=423 FP=8FN=19,SSO:000028881__Uroporphyrinogen decarboxylase // 0.008848877898143124 # SSO:000029372__anaerobic ribonucleoside-triphosphate reductase activating protein // 0.00851102163182572 # SSO:000043905__4a-hydroxytetrahydrobiopterin dehydratase (EC 4.2.1.96) // 0.008399274397856921 # SSO:000013622__FAD-dependent monooxygenase // 0.007772177740948793 # SSO:000002250__Deoxyribodipyrimidine photolyase (EC 4.1.99.3) // 0.007697807600845828 # SSO:000000701__Acetyl-CoA C-acyltransferase (EC 2.3.1.16) // 0.007345645805637392 # SSO:000030895__iron-sulfur cluster assembly accessory protein // 0.007318692924588826 # SSO:000000074__2345-tetrahydropyridine-26-dicarboxylate N-succinyltransferase (EC 2.3.1.117) // 0.006968423483887416 # SSO:000009584__2OG-Fe(II) oxygenase // 0.006555065464498716 # SSO:000004504__Lipoyl synthase (EC 2.8.1.8) // 0.006399126375742099 # SSO:000029974__gamma-glutamyltransferase( EC:2.3.2.2 ) // 0.006348669473808555 # SSO:000000380__4-hydroxyphenylpyruvate dioxygenase (EC 1.13.11.27) // 0.006242483360984596 # SSO:000013134__Dihydroorotate dehydrogenase // 0.006179695727300499 # SSO:000000206__2-oxoglutarate dehydrogenase E1 component (EC 1.2.4.2) // 0.005619289155338894 # SSO:000029482__c-type cytochrome // 0.0055814417359000175 # SSO:000012589__Cytochrome P-450 // 0.005564773895784095 # SSO:000012628__Cytochrome c oxidase subunit I // 0.005446919601918514 # SSO:000002683__Fatty acid desaturase (EC 1.14.19.1) // 0.005179270714447092 # SSO:000024194__Rieske (2Fe-2S) domain-containing protein // 0.00511748493507482 # SSO:000002897__Formate--tetrahydrofolate ligase (EC 6.3.4.3) // 0.004958326996258002
CatBoost,0.925553,0.940873,0.925553,0.930615,0.894467,TP=47 TN=413 FP=8FN=29,SSO:000043905__4a-hydroxytetrahydrobiopterin dehydratase (EC 4.2.1.96) // 0.567129727712252 # SSO:000013144__DinB family protein // 0.4650426429027175 # SSO:000000701__Acetyl-CoA C-acyltransferase (EC 2.3.1.16) // 0.36399282846493247 # SSO:000030895__iron-sulfur cluster assembly accessory protein // 0.3341200891876979 # SSO:000013134__Dihydroorotate dehydrogenase // 0.3222402503459715 # SSO:000029372__anaerobic ribonucleoside-triphosphate reductase activating protein // 0.30855304383545995 # SSO:000029974__gamma-glutamyltransferase( EC:2.3.2.2 ) // 0.27746014945402764 # SSO:000002683__Fatty acid desaturase (EC 1.14.19.1) // 0.25839768839336985 # SSO:000013622__FAD-dependent monooxygenase // 0.21413097548903637 # SSO:000003277__Glycine oxidase ThiO (EC 1.4.3.19) // 0.207864509827748 # SSO:000017976__Hydroxylamine reductase (EC 1.7.99.1) // 0.2016684529060649 # SSO:000002250__Deoxyribodipyrimidine photolyase (EC 4.1.99.3) // 0.19853231896762047 # SSO:000000206__2-oxoglutarate dehydrogenase E1 component (EC 1.2.4.2) // 0.19345557779501366 # SSO:000004504__Lipoyl synthase (EC 2.8.1.8) // 0.1752720683359583 # SSO:000012589__Cytochrome P-450 // 0.165482565820354 # SSO:000017597__HTH domain-containing protein // 0.14916712407854824 # SSO:000000406__5-(carboxyamino)imidazole ribonucleotide synthase (EC 6.3.4.18) // 0.14790391904626843 # SSO:000010682__Anti-sigma factor // 0.1457848093614768 # SSO:000000286__3-hydroxyacyl-CoA dehydrogenase (EC 1.1.1.35) // 0.13777486993595128 # SSO:000019509__Metal-dependent hydrolase // 0.13215363829717933


Numer of genomes:2560
Number of genomic features :5375
Shape of y:(2560,)
Count of 1: 735
Count of 0: 1825
Running Decision Tree for phenotype spore_formation




Running Random Forest for phenotype spore_formation




Running CatBoost for phenotype spore_formation




######### Combined report for spore_formation#####################


Model,Accuracy,Precision,Recall,F1-score,Balanced Accuracy,Confusion Matrix,Top features
Decision Tree,0.919922,0.92011,0.919922,0.920012,0.899907,TP=352 TN=119 FP=21FN=20,SSO:000025068__Sporulation protein // 0.16975429726264474 # SSO:000007697__Stage III sporulation protein AA // 0.08118373209128858 # SSO:000023921__Redox-sensing transcriptional repressor Rex // 0.0756220169745765 # SSO:000020544__Nitric oxide synthase oxygenase (EC 1.-.-.-) // 0.03979136913022205 # SSO:000002702__Ferredoxin (EC 1.18.1.3 ) // 0.031001852484789743 # SSO:000012811__DNA polymerase III subunit delta' // 0.02418650496698533 # SSO:000010833__Arylesterase // 0.022943764554313087 # SSO:000001864__Cysteine synthase (EC 2.5.1.47) // 0.019338336181067277 # SSO:000025027__Spore cortex-lytic enzyme // 0.011946749416486479 # SSO:000008961__Xanthine phosphoribosyltransferase (EC 2.4.2.22) // 0.009201137457320333 # SSO:000042567__mercury(II) reductase (EC 1.16.1.1) // 0.00790315893766282 # SSO:000017637__Haloalkane dehalogenase (EC 3.8.1.5) // 0.007233455939333527 # SSO:000009661__4-alpha-glucanotransferase // 0.0068755152676560245 # SSO:000005097__N-acetyl-gamma-glutamyl-phosphate reductase (EC 1.2.1.38) // 0.00675979253332924 # SSO:000010697__Anti-sigma-V factor RsiV // 0.005624400509020343 # SSO:000009313__stage V sporulation protein B // 0.005311482575733133 # SSO:000016824__Flavin reductase (EC 1.5.1.30) // 0.004341838029500944 # SSO:000013628__FAD:protein FMN transferase (EC 2.7.1.180) // 0.0038540995852703415 # SSO:000001044__Arginine decarboxylase (EC 4.1.1.19) // 0.0037963745395961108 # SSO:000033054__radical SAM protein // 0.003391363557865968
Random Forest,0.958984,0.959337,0.958984,0.959121,0.95154,TP=361 TN=130 FP=12FN=9,SSO:000025068__Sporulation protein // 0.008942308715967854 # SSO:000023921__Redox-sensing transcriptional repressor Rex // 0.007896822036279293 # SSO:000002241__Demethylmenaquinone methyltransferase (EC 2.1.1.163) // 0.0073033454177757785 # SSO:000002702__Ferredoxin (EC 1.18.1.3 ) // 0.0070768265743266145 # SSO:000035439__DUF402 domain-containing protein // 0.006453343981082653 # SSO:000007937__Tetraacyldisaccharide 4'-kinase (EC 2.7.1.130) // 0.005980371709978124 # SSO:000022969__Protoporphyrinogen oxidase // 0.005928286444193033 # SSO:000007697__Stage III sporulation protein AA // 0.005872044620440602 # SSO:000006816__Pyridoxine 5'-phosphate synthase (EC 2.6.99.2) // 0.005827720485923993 # SSO:000004460__Lipid-A-disaccharide synthase (EC 2.4.1.182) // 0.005551366542448786 # SSO:000036573__LPS-assembly protein LptD // 0.005548932024581187 # SSO:000001005__Anti-sigma F factor (EC 2.7.11.1) // 0.005540486454511106 # SSO:000036507__L-glutamate gamma-semialdehyde dehydrogenase (EC 1.2.1.88) // 0.005517931787517067 # SSO:000007707__Stage IV sporulation protein A // 0.005305227105699486 # SSO:000025017__Spore coat protein // 0.005173540460607951 # SSO:000008104__TonB-dependent receptor // 0.005157838578703942 # SSO:000022976__PspA/IM30 family protein // 0.004722049204454313 # SSO:000025054__Sporulation integral membrane protein YtvI // 0.004688770634178325 # SSO:000008604__UDP-3-O-3-hydroxymyristoyl glucosamine N-acyltransferase (EC 2.3.1.-) // 0.004676343764029396 # SSO:000007702__Stage III sporulation protein AF // 0.004628008545276964
CatBoost,0.964844,0.964844,0.964844,0.964844,0.955562,TP=364 TN=130 FP=9FN=9,SSO:000025068__Sporulation protein // 0.4023731807869705 # SSO:000034380__ABC-F type ribosomal protection protein // 0.3570154517434232 # SSO:000007097__Ribonuclease Z (EC 3.1.26.11) // 0.35137754329920745 # SSO:000001091__Aspartate 1-decarboxylase (EC 4.1.1.11) // 0.24247620366035555 # SSO:000025053__Sporulation integral membrane protein YlbJ // 0.23666413294493455 # SSO:000035439__DUF402 domain-containing protein // 0.22684245218040083 # SSO:000023921__Redox-sensing transcriptional repressor Rex // 0.2242525353048122 # SSO:000042897__acetoacetate---CoA ligase (EC 6.2.1.16) // 0.22331510938567947 # SSO:000012992__DUF350 domain-containing protein // 0.22164243660381797 # SSO:000002241__Demethylmenaquinone methyltransferase (EC 2.1.1.163) // 0.2212657873277283 # SSO:000002702__Ferredoxin (EC 1.18.1.3 ) // 0.19215805990217005 # SSO:000007697__Stage III sporulation protein AA // 0.18681491459660915 # SSO:000017300__Glutaredoxin-like protein nrdH // 0.18530064754711228 # SSO:000042174__acetylornithine transaminase // 0.17836284351864393 # SSO:000025595__Thioesterase // 0.16925453909294033 # SSO:000017050__Gas vesicle protein // 0.15152432711844285 # SSO:000025017__Spore coat protein // 0.1481665567309411 # SSO:000003012__Galactose-1-phosphate uridylyltransferase (EC 2.7.7.10) // 0.12750112402744176 # SSO:000021054__PASTA domain containing protein // 0.12640810900620314 # SSO:000024867__Small acid-soluble spore protein alpha/beta type // 0.119273161577042
