In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import uniform, randint
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
import anndata as ad
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# 01 - Load data

In [None]:
# Read the AnnData object whose .obs holds the 'broad_cell_type' annotations.
# NOTE(review): `adata` is used by the following cells but no visible cell loads
# it — confirm where it is defined so that Restart & Run All works.
# NOTE(review): absolute, user-specific path; consider a configurable data dir.
adatab=ad.read_h5ad('/home/local.hcpa.ufrgs.br/tkruger/V02_Glioblastoma_atlas/adatas/adata_4.h5ad')

In [None]:
# De-duplicate the observation (row) names of both objects before matching them
for _adata_obj in (adata, adatab):
    _adata_obj.obs_names_make_unique()

In [None]:
# Observation names present in both adata and adatab.
# np.intersect1d returns the common values sorted and de-duplicated.
match = np.intersect1d(adata.obs_names, adatab.obs_names)

In [None]:
# Keep only the observations of adata that also exist in adatab.
# Indexing an AnnData object returns a view; .copy() materialises the subset so
# that the new .obs column written in the next cell lands on a real object
# instead of triggering an implicit view-to-copy conversion.
adata = adata[match].copy()

In [None]:
# Copy the 'broad_cell_type' label of every cell from adatab onto adata,
# looking each label up by observation name.
adata.obs['broad_cell_type'] = adata.obs.index.map(adatab.obs['broad_cell_type'])

In [None]:
# Expression matrix of the matched cells.
# It may not be a NumPy array yet; the next cell converts it when needed.
X = adata.X

In [None]:
# Guarantee a dense NumPy array: anything that is not already an ndarray is
# converted through its .toarray() method.
X_array = X if isinstance(X, np.ndarray) else X.toarray()

In [None]:
# Class labels (one per row of adata), taken from the column mapped above
y = adata.obs['broad_cell_type']

In [None]:
# One-column table of labels with a fresh 0..n-1 index, in the same row order as y
df = pd.DataFrame({'label': y.values})

In [None]:
# Save the full (not yet split) dense matrix and its labels to the working directory.
# NOTE(review): X_array is dense, so main_X.npy may be large.
np.save('main_X.npy', X_array)
# NOTE(review): y is a pandas Series; if its values are strings the array is
# presumably stored with object dtype and needs allow_pickle=True to load — confirm.
np.save('main_Y.npy', y)
# Written with the default settings, so the index is saved as an unnamed first column.
df.to_csv('main_df.csv')

In [None]:
# Reload the label table from disk.
# The file was written with its index as the first column; read that column back
# as the index so the frame does not gain a spurious 'Unnamed: 0' data column.
df = pd.read_csv('main_df.csv', index_col=0)

# 02 - Define Model 

In [None]:
# Fixed XGBoost hyper-parameters used to build the model below.
# NOTE(review): presumably the outcome of a hyper-parameter search that is not
# part of this notebook (RandomizedSearchCV is imported but never called here)
# — confirm and record the provenance.
best_learning_rate = 0.22959818254342154
best_max_depth = 7
best_n_estimators = 70

In [None]:
# Assemble the classifier settings in one place, then instantiate the model.
xgb_params = dict(
    # tuned values defined in the previous cell
    learning_rate=best_learning_rate,
    max_depth=best_max_depth,
    n_estimators=best_n_estimators,
    # fixed settings
    # NOTE(review): `use_label_encoder` is reported as deprecated by recent
    # XGBoost releases — confirm against the installed version.
    use_label_encoder=False,
    eval_metric='logloss',
    n_jobs=1,
    verbosity=1,
)
model = XGBClassifier(**xgb_params)


# 03 - Generate subsets

In [None]:
# Draw a class-balanced subset: up to N_PER_CLASS rows for every label.
# - random_state makes the draw reproducible across kernel restarts.
# - min(len(g), N_PER_CLASS) keeps a class with fewer rows than requested from
#   raising "Cannot take a larger sample than population"; such a class
#   contributes all of its rows (and is then under-represented).
# The groupby/apply result is indexed by (label, original row index); level 1 is
# the original row index of every sampled row.
N_PER_CLASS = 2000
SAMPLING_SEED = 42

sampled_indices = (
    df.groupby('label')
    .apply(
        lambda g: g.sample(n=min(len(g), N_PER_CLASS), random_state=SAMPLING_SEED)
    )
    .index.get_level_values(1)
)

In [None]:
# Pull the sampled rows out of the matrix and the label vector.
# sampled_indices holds index labels of df; they are used here as row positions,
# which works because df was built from y in the same order with a 0..n-1 index.
X_sampled = X_array[sampled_indices]
# Drop the observation-name index so labels are addressed by position from here on
y_sampled = y.iloc[sampled_indices].reset_index(drop=True)

In [None]:
# Shuffle rows and labels together (the sampled rows come out grouped by label).
# Seeded so that, like the split below, a re-run produces the same ordering.
X_sampled, y_sampled = shuffle(X_sampled, y_sampled, random_state=42)

In [None]:
# Hold out 20% of the sampled rows for testing; stratify on the labels so each
# class keeps the same share in both parts, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_sampled, y_sampled, test_size=0.2, stratify=y_sampled, random_state=42
)

In [None]:
# Per-row totals of each split, kept as column vectors (keepdims=True) so they
# broadcast against the matrices in the normalization step.
row_sums_train, row_sums_test = (
    matrix.sum(axis=1, keepdims=True) for matrix in (X_train, X_test)
)

In [None]:
# Overwrite zero totals with 1, in place, so that all-zero rows do not cause a
# division by zero when the matrices are normalized.
np.putmask(row_sums_train, row_sums_train == 0, 1)
np.putmask(row_sums_test, row_sums_test == 0, 1)

In [None]:
# Normalize both matrices row-wise: divide every row by its total and rescale so
# that each non-empty row sums to 1000 (the factor applied is 1000, not 1000k).
X_train_normalized = X_train / row_sums_train * 1000
X_test_normalized = X_test / row_sums_test * 1000

In [None]:
# log(1 + x) transform of the normalized values, applied to both splits
X_train_log1p, X_test_log1p = map(
    np.log1p, (X_train_normalized, X_test_normalized)
)

In [None]:
# Standardize the features. The scaler learns its statistics from the training
# matrix only and is then applied to that same matrix.
scaler = StandardScaler().fit(X_train_log1p)
X_train_scaled = scaler.transform(X_train_log1p)

In [None]:
# Per-feature statistics learned from the training data; reused to scale the test set
means = scaler.mean_
stds = scaler.scale_ 

In [None]:
# Encoder that maps class labels to integer codes (fitted in the next cell)
label_encoder = LabelEncoder()

In [None]:
# Fit the encoder on the training labels and convert them to integer codes
train_labels_encoded = label_encoder.fit_transform(y_train)

In [None]:
# Train the classifier on the standardized training matrix and the encoded labels
model.fit(X_train_scaled, train_labels_encoded)

In [None]:
# Scale the test matrix with the statistics learned from the training data.
# Calling the fitted scaler applies the same (x - mean) / scale formula without
# depending on separately extracted copies of its attributes.
X_test_scaled = scaler.transform(X_test_log1p)

In [None]:
# Predicted integer class codes for the held-out rows
y_pred = model.predict(X_test_scaled)


In [None]:
# Encode the labels with the encoder that was already fitted before training.
# Re-fitting it here is redundant and could silently change the label-to-code
# mapping the trained model relies on if y_train were modified in between.
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [None]:
# Overall accuracy on the held-out rows
accuracy = accuracy_score(y_test_encoded, y_pred)

# Per-class precision / recall / F1. Rows are labelled with the original class
# names instead of the integer codes; `labels` pins the rows to the encoder's
# class order so names and codes cannot get out of step.
class_names = label_encoder.classes_.astype(str)
report = classification_report(
    y_test_encoded,
    y_pred,
    labels=np.arange(len(class_names)),
    target_names=class_names,
)

In [None]:
# classification_report returns preformatted text, so print it to keep the layout
print(report)