In [None]:
# Import modules
import pandas as pd
import scanpy as sc
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import scipy.sparse as sp
from tqdm.auto import tqdm
from scipy.stats.contingency import crosstab
import seaborn as sns
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the datasets
path_file = '8_Classifiers/Data/Input/pred_adata_myeloid_.h5ad'

adata = sc.read_h5ad(path_file)
adata

AnnData object with n_obs × n_vars = 182934 × 10
    obs: 'UMAP1', 'UMAP2', 'leiden_0.4', 'leiden_0.6', 'leiden_1.0', 'leiden_1.4', 'cell_type', 'louvain_None', 'louvain_0.4', 'louvain_0.6', 'louvain_1.0', 'louvain_1.4', 'myeloid_cell_type', 'myeloid_louvain_None', 'myeloid_louvain_0.4', 'myeloid_louvain_0.6', 'myeloid_louvain_1.0', 'myeloid_louvain_1.4', 'myeloid_louvain_1.6', 'myeloid_louvain_1.8', 'myeloid_louvain_2.0', 'myeloid_louvain_1.2', 'predicted_type_1.0', 'predicted_type_1.2', 'predicted_type_1.4', 'predicted_type_1.6', 'predicted_type_1.8', 'predicted_type_2.0', 'xgb_predicted_cell_type', 'rf_predicted_cell_type', 'lr_predicted_cell_type', 'svm_predicted_cell_type'
    var: 'gene_id', 'gene_name'
    uns: 'cell_type_colors', 'leiden_0.4', 'leiden_0.4_colors', 'leiden_0.6', 'leiden_0.6_colors', 'leiden_1.0', 'leiden_1.0_colors', 'leiden_1.4', 'leiden_1.4_colors', 'log1p', 'louvain_0.4', 'louvain_0.6', 'louvain_1.0', 'louvain_1.4', 'louvain_None', 'myeloid_cell_type_colors',

In [3]:
# Set X and Y
X = adata.X
Y = adata.obs['myeloid_cell_type']

In [None]:
# Create an array of indices for all cells
all_indices = np.arange(X.shape[0])

# Split indices, keeping track of which are train/test
train_idx, test_idx = train_test_split(
    all_indices,
    test_size=0.1,
    stratify=Y,
    random_state=42)

# Use indices to subset X and Y
X_train, X_test = X[train_idx], X[test_idx]
Y_train, Y_test = Y[train_idx], Y[test_idx]

  Y_train, Y_test = Y[train_idx], Y[test_idx]


In [None]:
# Add the RF classifier prediction
rf_clf = joblib.load('8_Classifiers/Models/best_rf_classifier.pkl')

Y_pred_rf = rf_clf.predict(X[test_idx])
Y_test = Y[test_idx]

# Get obs_names (row labels) for test set
test_obs_names = adata.obs_names[test_idx]

# Assign NaNs first
adata.obs['rf_predicted_cell_type'] = np.nan

# Now assign predictions using aligned obs_names
adata.obs.loc[test_obs_names, 'rf_predicted_cell_type'] = Y_pred_rf

adata.write_h5ad('8_Classifiers/Data/Input/pred_adata_myeloid_.h5ad')

  Y_test = Y[test_idx]
 'Macrophages']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  adata.obs.loc[test_obs_names, 'rf_predicted_cell_type'] = Y_pred_rf


In [None]:
# Add the LR classifier predictions
lr_clf = joblib.load('8_Classifiers/Models/best_lr_model.pkl')

Y_pred_lr = lr_clf.predict(X[test_idx])
Y_test = Y[test_idx]

# Get obs_names (row labels) for test set
test_obs_names = adata.obs_names[test_idx]

# Assign NaNs first
adata.obs['lr_predicted_cell_type'] = np.nan

# Now assign predictions using aligned obs_names
adata.obs.loc[test_obs_names, 'lr_predicted_cell_type'] = Y_pred_lr

adata.write_h5ad('8_Classifiers/Data/Input/pred_adata_myeloid_.h5ad')

  Y_test = Y[test_idx]
 'Macrophages']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  adata.obs.loc[test_obs_names, 'lr_predicted_cell_type'] = Y_pred_lr


In [None]:
# Add the SVM classifier predictions
svm_clf = joblib.load('8_Classifiers/Models//best_svm_classifier.pkl')

Y_pred_svm = svm_clf.predict(X[test_idx])
Y_test = Y[test_idx]

# Get obs_names (row labels) for test set
test_obs_names = adata.obs_names[test_idx]

# Assign NaNs first
adata.obs['svm_predicted_cell_type'] = np.nan

# Now assign predictions using aligned obs_names
adata.obs.loc[test_obs_names, 'svm_predicted_cell_type'] = Y_pred_svm

adata.write_h5ad('8_Classifiers/Data/Output/pred_adata_myeloid.h5ad')

  Y_test = Y[test_idx]
 'Macrophages']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  adata.obs.loc[test_obs_names, 'svm_predicted_cell_type'] = Y_pred_svm
