In [15]:
# Download dependencies
!pip install gdown
!pip install lightgbm
!pip install xgboost
!pip install catboost==1.1.1




In [14]:
# Download all files needed for this to work
import os



# Check and download allgenomes.RAST.txt.ps.tsv file
if not os.path.exists("data/allgenomes.RAST.txt.ps.tsv"):
    print("Downloading File data/allgenomes.RAST.txt.ps.tsv")
    !gdown --no-check-certificate 15py4qVFmLiHF-a341e_YX1w3HgEDwbDz -O data/allgenomes.RAST.txt.ps.tsv
else:
    print("File data/allgenomes.RAST.txt.ps.tsv already exists")

# Check and download allgenomes.interpro.txt.ps.tsv file
if not os.path.exists("data/allgenomes.interpro.txt.ps.tsv"):
    print("Downloading File data/allgenomes.interpro.txt.ps.tsv")
    !gdown --no-check-certificate 1PuDgDnWd5qumI0hkIKNYrTqICnh-_WlS -O data/allgenomes.interpro.txt.ps.tsv
else:
    print("File data/allgenomes.interpro.txt.ps.tsv already exists")

File data/allgenomes.RAST.txt.ps.tsv already exists
File data/allgenomes.interpro.txt.ps.tsv already exists


In [18]:
# Read the phenotype table and the two genomic feature tables

import pandas as pd


# All files are tab-separated with the row labels in the first column;
# pd.read_table uses "\t" as its default separator.

# Phenotype table covering all metabolic phenotypes (several phenotypes in one file)
phenotype_metabolic_df = pd.read_table(
    "data/metabolic_phenotype_data_bacdive.tsv",
    index_col=0,
)

# RAST feature table
rast_df = pd.read_table(
    "data/allgenomes.RAST.txt.ps.tsv",
    index_col=0,
)

# InterPro feature table
interpro_df = pd.read_table(
    "data/allgenomes.interpro.txt.ps.tsv",
    index_col=0,
)



In [None]:
def generate_html_table(df: pd.DataFrame):
    """Render a pandas.DataFrame as an HTML snippet driven by jQuery DataTables.

    Args:
        df: Table to render. The index is not included in the output.

    Returns:
        str: HTML made of a wrapper ``<div>``, the jQuery and DataTables
        assets referenced from their CDNs, a script that initializes
        DataTables on the rendered table, and the table markup itself.
    """
    # Imported locally so the function does not rely on a module-level import.
    import uuid

    # A fresh suffix per call keeps the container id and the table id unique,
    # so several tables can be shown on the same page without id collisions.
    unique_suffix = uuid.uuid4().hex
    id_container = f"datatable-container-{unique_suffix}"
    table_id = f"classification-{unique_suffix}"

    # Doubled braces are literal JavaScript braces escaped for str.format.
    template = """
    <div id="{id_container}">
      <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.0/jquery.min.js"></script>
      <script type="text/javascript" src="https://cdn.datatables.net/1.13.5/js/jquery.dataTables.min.js"></script>
      <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.13.5/css/jquery.dataTables.min.css"/>
      <script type="text/javascript">
        $(document).ready( function () {{
            $('#{table_id}').DataTable();
        }});
      </script>
      <!-- Insert table below -->
      {table}
    </div>
    """

    # The selector in the script above must match the id given to the table.
    table_html = df.to_html(
        index=False,
        table_id=table_id,
        classes="display",
    )
    return template.format(
        id_container=id_container,
        table_id=table_id,
        table=table_html,
    )
    

def get_classifier_report(X, y, top_n=20):
    """Train several tree-based classifiers and tabulate their test metrics.

    The data is split 80/20 into train and test parts with a fixed seed. Each
    classifier is fitted on the training part and scored on the test part with
    accuracy, weighted precision / recall / F1 and balanced accuracy.

    Args:
        X: pandas.DataFrame of features; its column names are used to label
            the feature importances.
        y: Class labels aligned with the rows of ``X``.
        top_n: Number of highest-ranked features listed per model
            (defaults to 20).

    Returns:
        pandas.DataFrame with one row per classifier and the columns
        "Model", "Accuracy", "Precision", "Recall", "F1-score",
        "Balanced Accuracy" and "Top Features".
    """
    import numpy as np
    import pandas as pd
    import catboost as cb
    import xgboost as xgb
    from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
    from sklearn.metrics import (
        accuracy_score,
        balanced_accuracy_score,
        f1_score,
        precision_score,
        recall_score,
    )
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder
    from sklearn.tree import DecisionTreeClassifier

    # Encode labels as integers 0..n_classes-1. Recent XGBoost releases reject
    # labels in any other form; the metrics below do not depend on label names.
    y_encoded = LabelEncoder().fit_transform(y)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42
    )

    # Classifiers to compare, each with the parameters applied before fitting
    classifiers = {
        'Decision Tree': {
            'model': DecisionTreeClassifier(),
            # Seeded like the other models so repeated runs give the same tree.
            'params': {'random_state': 100}
        },
        'Random Forest': {
            'model': RandomForestClassifier(),
            'params': {'n_estimators': 10000, 'max_depth': None, 'random_state': 100}
        },
        'XGBoost': {
            'model': xgb.XGBClassifier(),
            'params': {'n_estimators': 10000, 'max_depth': 3, 'learning_rate': 0.1, 'random_state': 100}
        },
        'CatBoost': {
            'model': cb.CatBoostClassifier(),
            'params': {'iterations': 10000, 'depth': 6, 'learning_rate': 0.1, 'random_state': 42, 'verbose': False}
        },
        'AdaBoost': {
            'model': AdaBoostClassifier(),
            'params': {'n_estimators': 10000, 'random_state': 100}
        }
    }

    report_rows = []

    # Train and evaluate each classifier
    for clf_name, clf_data in classifiers.items():
        model = clf_data['model']
        model.set_params(**clf_data['params'])
        model.fit(X_train, y_train)

        # Flatten the predictions: some estimators may return a column vector
        # (NOTE(review): believed to be the case for CatBoost multiclass).
        y_pred = np.ravel(model.predict(X_test))

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

        # Per-feature scores: tree importances when available, otherwise the
        # absolute coefficients of a linear model, otherwise nothing. Only a
        # missing attribute is tolerated; any other error is allowed to surface.
        try:
            importances = model.feature_importances_
        except AttributeError:
            try:
                importances = abs(model.coef_[0])
            except AttributeError:
                importances = []
        feature_importances = dict(zip(X.columns, importances))

        # Sort by score (descending) and keep the top_n features
        ranked_features = sorted(
            feature_importances.items(), key=lambda item: item[1], reverse=True
        )
        top_features = ranked_features[:top_n]

        # Format the top features for prettier display
        formatted_top_features = ', '.join(
            f"{feat} ({importance:.2f})" for feat, importance in top_features
        )

        report_rows.append(
            [clf_name, accuracy, precision, recall, f1, balanced_accuracy, formatted_top_features]
        )

    # Create a DataFrame to display the results
    headers = ["Model", "Accuracy", "Precision", "Recall", "F1-score", "Balanced Accuracy", "Top Features"]
    report_df = pd.DataFrame(report_rows, columns=headers)
    return report_df


    



In [None]:
# Build the feature matrix X and the label vector y, then report on them
from IPython.display import HTML, display

# Phenotype table holding the gram stain data (the only phenotype in this file)
phenotype_gram_stain_df = pd.read_csv(
    "data/gram_stain_bacdive.tsv",
    sep="\t",
    index_col=0,
)

phenotype = "gram_stain"

# Keep only rows with a non-missing label, then inner-join them to the RAST
# features on the index.
phenotype_df = phenotype_gram_stain_df[phenotype].dropna()
merged_df = rast_df.merge(
    phenotype_df,
    how='inner',
    left_index=True,
    right_index=True,
)
y = merged_df[phenotype]
X = merged_df.drop(columns=[phenotype])

# Train and evaluate the classifiers
report_df = get_classifier_report(X, y)

# Render the report as an HTML table and show it as the cell output
report_html_content = generate_html_table(report_df)
HTML(report_html_content)
