In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve,auc, accuracy_score,classification_report,confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GroupShuffleSplit


In [2]:

#reading the data in 
na_strings = ["NA"] 

df = (pd.read_csv("/Users/ryansilva/Downloads/GSE171524 Processed Data.csv.gz", header=0, index_col=0, compression="infer").T)
df.index.name = "NAME"
#now reading in the data
meta = pd.read_csv("/Users/ryansilva/Downloads/GSE171524 Lung Metadata.txt", sep="\t", skiprows=[1])
meta["covid"] = (meta["disease__ontology_label"] == "COVID-19").astype(int)
meta = meta.set_index("NAME")


  meta = pd.read_csv("/Users/ryansilva/Downloads/GSE171524 Lung Metadata.txt", sep="\t", skiprows=[1])


In [None]:

#now using the indices to propelry identify which cells come from which donor (to prevent data leakage)
#also using indices to identify covid status of a cell for proper evaluation
df = df[df.index.isin(meta.index)]
#stripping in case data read in affected names with hidden characters
df.index = df.index.str.strip()
meta.index = meta.index.str.strip()
df["donor_id"] = meta.loc[df.index, "donor_id"]
df["covid"] = meta.loc[df.index, "covid"]

#train test split by features for group shuffle to account for not splitting up donor cells across sets
X = df.drop(columns=["donor_id", "covid"])
y = df["covid"].values
groups = df["donor_id"].values
#train test sets
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))
#train test sets
X_train = X.iloc[train_idx]
y_train = y[train_idx]
X_test = X.iloc[test_idx]
y_test = y[test_idx]

#have to do manual class weighting when using sgd, not built in
class_weights = compute_class_weight(class_weight="balanced", classes=np.array([0, 1]), y=y_train)
cw_dict = {0: class_weights[0], 1: class_weights[1]}

#cv not possible with the scale of the data, too computationally intensive
#next best option is the use a grid search for the alpha value
param_grid = [0.001]
#to compare models later
results = {}
models = {}

for alpha in param_grid:
    print(f"Current alpha being evaluated = {alpha}")
    model = SGDClassifier(
        #uses logistic regression
        loss="log_loss",
        #to help reduce parameter dimensionality, >2000 parameters
        penalty="l1",
        alpha=alpha,
        max_iter=1000,
        class_weight=cw_dict,
        random_state=42
    )
    
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred)
    
    results[alpha] = auc
    models[alpha] = model
    print(f"Current Alpha: {alpha} — Yielded Test is AUROC: {auc}")

#now checking what the best final was, based on the highest result (best auc score)
best_alpha = max(results, key=results.get)
final_model = models[best_alpha]
y_pred_best = final_model.predict(X_test)
y_pred_proba_best = final_model.predict_proba(X_test)[:, 1]

# Evaluate metrics using auc score and the accuracy score
accuracy = accuracy_score(y_test, y_pred_best)
auc = roc_auc_score(y_test, y_pred_proba_best)
print(f"Optimal model alpha = {best_alpha}. Calculated AUC score for this value was:{results[best_alpha]}. Accuracy score was: {accuracy}")
print("Classification Report:")
classification_report(y_test,y_pred_best)
#now for conclusion, useful to know which genes were prioritized (weren't penalized as heavily by L1)
coefs = final_model.coef_.flatten()
gene_importance = pd.DataFrame({
    "gene": X.columns,
    "weight": coefs,
    "abs_weight": np.abs(coefs)
}).sort_values("weight", ascending=False)

print("Genes with most absolute weight in the final model:")
print(gene_importance.head(10))




In [None]:

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_best)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

# ROC Curve & AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_best)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label=f'AUC = {roc_auc}')
plt.plot([0, 1], [0, 1], 'k--') 
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()