In [123]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

sklearn.set_config(transform_output="pandas")
from tensorflow import keras

# tf.debugging.set_log_device_placement(True)

In [None]:
def load_data(path, metadata, reads_list):
    all_metadata = pd.read_csv(
        os.path.join(path, f"{metadata}.csv"),
        index_col="index",
        encoding="utf8",
        sep=";",
    )

    metadata = list(
        all_metadata[
            ((all_metadata["include_in_model"] == True))
            & (all_metadata["exclude_from_model"] == False)
        ].index
    )

    raw_reads_df = pd.DataFrame()
    for filename in reads_list:
        temp_df = pd.read_csv(
            os.path.join(path, filename), encoding="utf8", index_col="gene"
        )
        raw_reads_df = raw_reads_df.join(temp_df, how="right")

    raw_reads_df.reset_index(inplace=True, drop=False)

    # Makes sample names uniform
    new_columns = ("&").join(list(raw_reads_df.columns))
    new_columns = new_columns.replace("control", "IMDM")
    new_columns = new_columns.replace("CL307", "CL-307")
    new_columns = new_columns.replace("Albu", "AH1252")
    new_columns = new_columns.replace("cAlb", "AH1397")
    new_columns = new_columns.replace("Alb2", "AH1405")
    new_columns = new_columns.replace("LPS_PGN", "LPS-PGN")
    new_columns = new_columns.replace("LPS_R848", "LPS-R848")
    new_columns = new_columns.replace("Pam3_PGN", "Pam3-PGN")
    new_columns = new_columns.replace("Pam3_R848", "Pam3-R848")
    new_columns = new_columns.replace("PAM3", "Pam3")
    new_columns = new_columns.replace("Fla-ST", "Fla-St")
    new_columns = new_columns.replace("TL8", "TL8-506")
    new_columns = new_columns.replace("CL8", "TL8-506")

    new_columns = new_columns.split("&")
    raw_reads_df.columns = new_columns

    included_raw_reads = raw_reads_df[metadata].copy()
    included_raw_reads["gene"] = raw_reads_df["gene"]
    sample_name_list = get_pyrogen_name(set(included_raw_reads.columns[:-1]))
    print("current samples:", list(set(sample_name_list)))

    return raw_reads_df[metadata].copy()


def get_pyrogen_name(sample_name_list):
    output = []
    for sample in sample_name_list:
        output.append(sample.split("_")[1])
    return output


def raw_to_rpm(DataFrame):
    outputDF = pd.DataFrame()
    for sample in DataFrame.columns:
        total_reads = DataFrame[sample].sum()
        outputDF[sample] = DataFrame[sample].apply(
            lambda x: (x / total_reads) * 1000000
        )
    return outputDF


def evaluate_model(name, real_labels, predicted_labels):
    accuracy, precision, recall, specificity, f1_score, confusion_matrix = (
        metrics.accuracy_score(real_labels, predicted_labels),
        metrics.precision_score(
            real_labels, predicted_labels, average="macro", zero_division=0
        ),
        metrics.recall_score(real_labels, predicted_labels, average="macro"),
        metrics.recall_score(real_labels, predicted_labels, average="macro"),
        metrics.f1_score(real_labels, predicted_labels, average="macro"),
        metrics.confusion_matrix(real_labels, predicted_labels),
    )
    statistics_dict[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "Specificity": specificity,
        "F1 Score": f1_score,
        "Confusion Matrix": confusion_matrix,
    }
    return statistics_dict

In [None]:
file_name_list = [
    "BA034_raw_reads.csv",
    "BA044_raw_reads.csv",
    "BA051_raw_reads.csv",
    "BA061_raw_reads.csv",
    "BA064_raw_reads.csv",
]

# loads all seq datasets
path = "~/MATseq/notebooks/support_files/"


# loads metadata
train_set = load_data(path=path, metadata="all_metadata", reads_list=file_name_list)
test_set = load_data(
    path=path, metadata="double_pyr_metadata", reads_list=file_name_list
)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class LibraryLengthNormalizer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Normalise the gene counts to the library size
        X = X.apply(lambda x: (x / (x.sum() if x.sum() != 0 else 1)) * 1000000)
        return X

In [89]:
# loads metadata
all_metadata = pd.read_csv(
    os.path.join(path, "double_pyr_metadata.csv"),
    encoding="utf8",
    sep=";",
    index_col="index",
)

metadata = list(
    all_metadata[
        ((all_metadata["include_in_model"] == True))
        & (all_metadata["exclude_from_model"] == False)
    ].index
)

raw_reads_df = pd.DataFrame()
for filename in file_name_list:
    temp_df = pd.read_csv(
        os.path.join(path, filename), encoding="utf8", index_col="gene"
    )
    raw_reads_df = raw_reads_df.join(temp_df, how="right")

raw_reads_df.reset_index(inplace=True, drop=False)

In [90]:
raw_reads_df

Unnamed: 0,gene,BA028_Fla-St_21,BA028_Fla-St_22,BA028_IMDM_01,BA028_IMDM_02,BA028_IMDM_03,BA028_LPS_05,BA028_LPS_06,BA028_LPS_07,BA028_PAM3_09,...,BA063_MPLA_15,BA063_Mur_19,BA063_Mur_20,BA063_Mur_21,BA063_Pam3_16,BA063_Pam3_17,BA063_Pam3_18,BA063_TL8_10,BA063_TL8_11,BA063_TL8_12
0,HUNK,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0
1,KLHL8,47,44,45,37,8,55,114,91,55,...,276.0,217.0,215.0,266.0,115.0,277.0,274.0,223.0,309.0,250.0
2,ZNF576,23,21,22,24,4,20,49,41,28,...,90.0,110.0,87.0,112.0,32.0,95.0,100.0,70.0,91.0,105.0
3,UGT1A5,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,FCF1,148,173,205,110,15,131,248,221,152,...,570.0,515.0,471.0,553.0,235.0,532.0,552.0,529.0,527.0,548.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19987,GALNT17,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
19988,DEPDC4,0,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0
19989,REL,1277,1440,1183,1137,149,1000,2778,1573,1350,...,4540.0,5679.0,4797.0,4826.0,3349.0,5576.0,6483.0,5673.0,7063.0,6091.0
19990,RPL23,1819,1989,2006,1989,179,1577,4170,2971,2014,...,7257.0,6249.0,6200.0,7662.0,2760.0,6945.0,7529.0,6468.0,6251.0,7952.0


In [101]:
# make sample names uniform
new_columns = ("&").join(list(raw_reads_df.columns))

new_columns = new_columns.replace("control", "IMDM")
new_columns = new_columns.replace("CL307", "CL-307")
new_columns = new_columns.replace("Albu", "AH1252")
new_columns = new_columns.replace("cAlb", "AH1397")
new_columns = new_columns.replace("Alb2", "AH1405")
new_columns = new_columns.replace("LPS_PGN", "LPS-PGN")
new_columns = new_columns.replace("LPS_R848", "LPS-R848")
new_columns = new_columns.replace("Pam3_PGN", "Pam3-PGN")
new_columns = new_columns.replace("Pam3_R848", "Pam3-R848")
new_columns = new_columns.replace("PAM3", "Pam3")
new_columns = new_columns.replace("Fla-ST", "Fla-St")
new_columns = new_columns.replace("TL8", "TL8-506")
new_columns = new_columns.replace("CL8", "TL8-506")

new_columns = new_columns.split("&")
raw_reads_df.columns = new_columns

included_raw_reads = raw_reads_df[metadata].copy()

target = [("_").join(i.split("_")[1:2]) for i in included_raw_reads]
included_raw_reads = included_raw_reads.T
included_raw_reads.columns = raw_reads_df["gene"]

In [None]:
file_name = "gene_counts_NN_55_training.csv"

data = pd.read_csv(file_name)
data_features_train = data.drop(columns=["sample"])
target_train = data["sample"]

In [115]:
from sklearn.pipeline import Pipeline
from feature_engine.selection import (
    DropDuplicateFeatures,
    DropConstantFeatures,
    DropCorrelatedFeatures,
)
from sklearn.preprocessing import StandardScaler

feature_transformer = Pipeline(
    [
        ("drop_duplicates", DropDuplicateFeatures()),
        ("constant_feature_drop", DropConstantFeatures(tol=0.50)),
        ("normalise_for_library_size", LibraryLengthNormalizer()),
        (
            "drop correlating features",
            DropCorrelatedFeatures(variables=None, method="pearson", threshold=0.8),
        ),
        ("standard scaler", StandardScaler()),
    ]
)
print("Number of features before: ", included_raw_reads.shape[1])

X_train = feature_transformer.fit_transform(data_features_train)
X_test = feature_transformer.transform(included_raw_reads)

print("Number of features after: ", X_test.shape[1])

Number of features before:  19992
Number of features after:  2106


In [133]:
from sklearn.preprocessing import LabelEncoder

y_train = target_train
y_test = target

le = LabelEncoder()

y_train = le.fit_transform(y_train)

In [104]:
from joblib import dump, load

bag_multi = load("BaggingClassifier.joblib")

In [111]:
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier

bag_multi = BaggingClassifier(
    estimator=XGBClassifier(
        n_estimators=2, max_depth=2, learning_rate=1, objective="binary:logistic"
    ),
    n_estimators=500,
    max_features=100,
    max_samples=0.5,
    bootstrap=True,
    bootstrap_features=True,
    warm_start=False,
)

In [117]:
from sklearn.metrics import accuracy_score, f1_score

model_names = []
accuracy_scores = []
f1_scores = []


bag_multi.fit(X_train, y_train)

# model scoring
train_pred = bag_multi.predict(X_train)

# Evaluate model performance
train_accuracy = accuracy_score(y_train, train_pred)
train_f1 = f1_score(y_train, train_pred, average="macro")

# Print model performance
print("Training score:")
print(f"Accuracy: {round(train_accuracy, 4)} | F1 score: {round(train_f1, 4)}")

Training score:
Accuracy: 1.0 | F1 score: 1.0


In [124]:
test_pred = bag_multi.predict_proba(X_test)

In [134]:
test_pred_df = pd.DataFrame(test_pred, columns=le.classes_)

In [135]:
test_pred_df

Unnamed: 0,CL-307,CRX-527,Fla-PA,IMDM,LPS,LTA,PGN,Pam3,R848
0,0.067914,0.101096,0.026648,0.271973,0.17806,0.028973,0.122046,0.1228,0.08049
1,0.071984,0.107853,0.026451,0.267166,0.176445,0.026888,0.12181,0.120873,0.08053
2,0.076209,0.098531,0.024237,0.265336,0.18045,0.025811,0.123535,0.122297,0.083594
3,0.081336,0.129767,0.058832,0.197792,0.208211,0.051106,0.096256,0.104594,0.072106
4,0.086044,0.112161,0.026163,0.238015,0.159785,0.028656,0.120907,0.123454,0.104814
5,0.087063,0.113404,0.025598,0.229578,0.153946,0.028607,0.117705,0.141547,0.102552
6,0.080037,0.119554,0.026368,0.215438,0.170334,0.028809,0.120682,0.135384,0.103394
7,0.074046,0.102778,0.026019,0.265245,0.193491,0.026963,0.11684,0.113669,0.080949
8,0.081573,0.108172,0.041974,0.274879,0.173851,0.04154,0.092377,0.104725,0.080909
9,0.077367,0.09588,0.025521,0.273359,0.186447,0.027848,0.119759,0.1157,0.078118


In [136]:
y_test

['LPS-PGN',
 'LPS-PGN',
 'LPS-PGN',
 'LPS-PGN',
 'LPS-R848',
 'LPS-R848',
 'LPS-R848',
 'Pam3-PGN',
 'Pam3-PGN',
 'Pam3-PGN',
 'Pam3-R848',
 'Pam3-R848',
 'Pam3-R848']

In [138]:
predicted_labels = test_pred_df.idxmax(axis="columns")

In [139]:
predicted_labels

0     IMDM
1     IMDM
2     IMDM
3      LPS
4     IMDM
5     IMDM
6     IMDM
7     IMDM
8     IMDM
9     IMDM
10    IMDM
11    IMDM
12    IMDM
dtype: object

In [None]:
true_labels = y_test.idxmax(axis="columns")