In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Lasso
import matplotlib.pyplot as plt
from pathlib import Path
from scipy.stats import ttest_ind
import statsmodels.stats.multitest as smm

### Lasso model on TF Perturb-seq data vs. unperturbed control cells

In [23]:
# `Path` and `pd` come from the notebook's top import cell; they are not
# re-imported here so that every dependency stays declared in one place.

# Define base directory
base_dir = Path("...")

# Define file paths
x_path = base_dir / "../X_TFname_NTC.csv"
y_path = base_dir / "../Y_TFname_NTC.csv"

# Load data; the first CSV column is used as the row index of each table
X = pd.read_csv(x_path, index_col=0)
Y = pd.read_csv(y_path, index_col=0)

# train_test_split pairs the rows of X and Y by position, and the DEG cell
# looks rows of X up by Y's index labels, so both tables must list the same
# rows in the same order. Fail early instead of silently mislabelling rows.
if not X.index.equals(Y.index):
    raise ValueError(
        "X and Y must share the same row index in the same order "
        f"(X has {len(X)} rows, Y has {len(Y)} rows)."
    )

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# L1-penalised ("lasso") logistic regression.
lasso = LogisticRegression(penalty='l1', solver='liblinear', max_iter=10000, random_state=42)

# Y is a DataFrame, so flatten it to the 1-D label vector the estimator expects.
# Assumes Y holds a single label column (the DEG cell reads 'V1') - TODO confirm.
lasso.fit(X_train, np.ravel(Y_train))

# Hard class labels, kept because the following histogram cell plots `Y_pred`.
Y_pred = lasso.predict(X_test)

# ROC AUC is a ranking metric: it needs a continuous score per row, not the
# thresholded labels. Use the predicted probability of the second class in
# `lasso.classes_` (label 1 when the labels are 0/1).
Y_score = lasso.predict_proba(X_test)[:, 1]

auc = roc_auc_score(np.ravel(Y_test), Y_score)
print(f"AUC : {auc:.2f}")


In [None]:
# Histogram of the hard class labels stored in `Y_pred`.
plt.hist(Y_pred)
plt.xlabel('Predicted label')
plt.ylabel('Frequency')
plt.title('Distribution of Predicted Labels')
plt.show()  # render the figure instead of echoing the (counts, bins, patches) tuple

### Transfer the model to unseen unperturbed data

In [None]:
# Folder that contains the unperturbed dataset
input_path1 = "..."

# Read the table; the first CSV column becomes the row index.
unperturbed_csv = f"{input_path1}/unperturbed_x.csv"
unperturbed_x = pd.read_csv(unperturbed_csv, index_col=0)

# Reuse the already-fitted scaler and classifier: scale, then predict hard labels.
unperturbed_x_scaled = scaler.transform(unperturbed_x)
Y_unperturbed = lasso.predict(unperturbed_x_scaled)

# One predicted label per row, keyed by the original row index.
Y_unperturbed_df = pd.DataFrame(
    Y_unperturbed,
    index=unperturbed_x.index,
    columns=['Predicted_Label'],
)
plt.hist(Y_unperturbed_df)

In [None]:
# Rows whose predicted label is above 0.5.
is_predisposed = Y_unperturbed_df['Predicted_Label'] > 0.5
predisposed_unperturbed = Y_unperturbed_df[is_predisposed]

# Report: selected rows, total rows, and the selected share in percent.
n_predisposed = predisposed_unperturbed.shape[0]
n_unperturbed = Y_unperturbed_df.shape[0]
print(n_predisposed)
print(n_unperturbed)
print(100*n_predisposed/n_unperturbed)

In [None]:
# Probability of the second class in `lasso.classes_` for every unperturbed row.
class1_proba = lasso.predict_proba(unperturbed_x_scaled)
Y_unperturbed_proba = class1_proba[:, 1]

# NOTE: this rebinds `Y_unperturbed_df`; it now holds probabilities, not hard labels.
Y_unperturbed_df = pd.DataFrame(
    Y_unperturbed_proba,
    index=unperturbed_x.index,
    columns=['Predicted_Probability'],
)

# Horizontal histogram: probability on the y-axis, bin counts on the x-axis.
hist_style = dict(bins=50, color='skyblue', edgecolor='black', orientation='horizontal')
plt.hist(Y_unperturbed_df, **hist_style)
plt.ylabel('Predicted Probability')
plt.xlabel('Frequency')
plt.title('Distribution of Predicted Probabilities (D2)')
plt.xlim(0, 4500)  # fixed count axis; bars longer than 4500 are cut off
plt.show()

### Is the predisposed set separable from the real bifurcated state?

In [None]:
# Rows whose predicted probability exceeds 0.5.
above_half = Y_unperturbed_df['Predicted_Probability'] > 0.5
F = Y_unperturbed_df[above_half]

# Print the number of selected rows and their share of all rows in percent.
n_selected = F.shape[0]
print(n_selected)
print(100*n_selected/Y_unperturbed_df.shape[0])

In [11]:
# Gene expression matrices of the cells at the two final bifurcated states
# (X1 -> state 1, X2 -> state 2); the first CSV column is the row index.
x1_csv = f"{input_path1}/X1.csv"
x2_csv = f"{input_path1}/X2.csv"
X1 = pd.read_csv(x1_csv, index_col=0)
X2 = pd.read_csv(x2_csv, index_col=0)


In [12]:

def build_dataset(X_pos, X_neg, positive_label=1, negative_label=0):
    """Stack two tables row-wise and label each row by the table it came from.

    Parameters
    ----------
    X_pos, X_neg : pandas.DataFrame
        Tables to concatenate along the rows; rows of ``X_pos`` come first.
    positive_label, negative_label
        Label given to every row of ``X_pos`` / ``X_neg`` respectively.

    Returns
    -------
    tuple
        ``(X, y)``: the concatenated DataFrame and a Series named ``"Label"``
        that shares its index.
    """
    n_pos, n_neg = len(X_pos), len(X_neg)
    stacked = pd.concat([X_pos, X_neg], axis=0)

    # Labels follow the concatenation order: positives first, then negatives.
    labels = [positive_label for _ in range(n_pos)]
    labels.extend(negative_label for _ in range(n_neg))

    y = pd.Series(labels, index=stacked.index, name="Label")
    return stacked, y

# `predisposed_unperturbed` only holds the predicted labels of the selected rows.
# Fetch the matching rows of the expression matrix so that the negative class has
# the same kind of columns as X1/X2; concatenating the label table itself would
# produce a frame full of NaNs.
predisposed_unperturbed_X = unperturbed_x[unperturbed_x.index.isin(predisposed_unperturbed.index)]

# X1 vs predisposed_unperturbed
X_X1_predisposed_unperturbed, Y_X1_predisposed_unperturbed = build_dataset(
    X1, predisposed_unperturbed_X
)

# X2 vs predisposed_unperturbed
X_X2_predisposed_unperturbed, Y_X2_predisposed_unperturbed = build_dataset(
    X2, predisposed_unperturbed_X
)


### Classification model

In [None]:
# X_X1_predisposed_unperturbed can be replaced by any other datasets
X_train, X_test, Y_train, Y_test = train_test_split(X_X1_predisposed_unperturbed, Y_X1_predisposed_unperturbed, test_size=0.2, random_state=42)

# Use a scaler of its own: refitting the shared `scaler` here would overwrite the
# statistics that the transfer cells rely on if they are re-run afterwards.
scaler_gc = StandardScaler()
X_train = scaler_gc.fit_transform(X_train)
X_test = scaler_gc.transform(X_test)

# Fixed random_state, matching the other LogisticRegression models in this notebook.
lasso_gc = LogisticRegression(penalty='l1', solver='liblinear', max_iter=10000, random_state=42)
lasso_gc.fit(X_train, Y_train)

# Hard labels for F1; continuous scores for ROC AUC (a ranking metric that should
# not be computed from thresholded predictions).
Y_pred = lasso_gc.predict(X_test)
Y_score = lasso_gc.predict_proba(X_test)[:, 1]

auc = roc_auc_score(Y_test, Y_score)
print(f"AUC : {auc:.2f}")

print(f1_score(Y_test, Y_pred))

# Differentially expressed genes (DEGs)

In [None]:

# Seeds NumPy's global RNG (kept from the original cell; nothing below draws from it).
np.random.seed(42)

# NOTE(review): DEGs are selected on all rows of X, i.e. before the train/test
# split of the DEG-only model, so that model's test metrics can be optimistic.
# Consider selecting DEGs on the training rows only.

# Separate KO and control groups
ko_cells = Y[Y['V1'] == 1].index
control_cells = Y[Y['V1'] == 0].index

expr_ko = X.loc[ko_cells]
expr_ctrl = X.loc[control_cells]

# Welch's t-test for every gene (column) in a single vectorised call
_, p_values = ttest_ind(
    expr_ko.to_numpy(dtype=float),
    expr_ctrl.to_numpy(dtype=float),
    axis=0,
    equal_var=False,
)
p_values = np.asarray(p_values, dtype=float)

# log2 fold change of the group means, with small offset to avoid log(0)
mean_ko = expr_ko.mean(axis=0).to_numpy(dtype=float)
mean_ctrl = expr_ctrl.mean(axis=0).to_numpy(dtype=float)
log2_fc = np.log2((mean_ko + 1e-6) / (mean_ctrl + 1e-6))

# Multiple testing correction (Benjamini-Hochberg).
# Genes whose test is undefined (e.g. zero variance in both groups) get a NaN
# p-value. They are left out of the correction so they cannot contaminate the
# adjusted p-values of the other genes, and they stay NaN in the results table.
adj_pvals = np.full(p_values.shape, np.nan)
testable = np.isfinite(p_values)
if testable.any():
    adj_pvals[testable] = smm.multipletests(p_values[testable], method='fdr_bh')[1]

# Create DEG results table
deg_results = pd.DataFrame({
    'gene': X.columns,
    'log2_fold_change': log2_fc,
    'p_value': p_values,
    'adj_p_value': adj_pvals
})

# Filter DEGs (adjust thresholds as needed); NaN entries never pass the comparisons
deg_results_filtered = deg_results[(deg_results['adj_p_value'] < 0.05) & (deg_results['log2_fold_change'].abs() > 1)]

print(deg_results_filtered.sort_values("adj_p_value").head())

## Subset gene expression matrix to only have the DEGs
X_subset = X[deg_results_filtered['gene'].values]
print(X_subset.shape)

## Train the LASSO using only DEGs

In [None]:

# Same 80/20 split and seed as the all-gene model, restricted to the DEG columns.
X_train, X_test, Y_train, Y_test = train_test_split(X_subset, Y, test_size=0.2, random_state=42)

# Fresh scaler fitted on the training rows only; the transfer cell below reuses it.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

lasso = LogisticRegression(penalty='l1', solver='liblinear', max_iter=10000, random_state=42)
# Y is a DataFrame, so flatten it to the 1-D label vector the estimator expects.
# Assumes Y holds a single label column ('V1') - TODO confirm.
lasso.fit(X_train, np.ravel(Y_train))

# Hard labels for F1; continuous scores for ROC AUC (a ranking metric that should
# not be computed from thresholded predictions).
Y_pred = lasso.predict(X_test)
Y_score = lasso.predict_proba(X_test)[:, 1]

auc = roc_auc_score(np.ravel(Y_test), Y_score)
print(f"AUC : {auc:.2f}")

print(f1_score(np.ravel(Y_test), Y_pred))

## Transferring the model to unperturbed cells

In [None]:
# Keep only the DEG columns of the unperturbed gene expression matrix.
deg_gene_names = deg_results_filtered['gene'].values
unperturbed_x_subset = unperturbed_x[deg_gene_names]
print(unperturbed_x_subset.shape)

# Scale with the already-fitted scaler, then predict hard labels.
unperturbed_x_subset_scaled = scaler.transform(unperturbed_x_subset)
Y_unperturbed = lasso.predict(unperturbed_x_subset_scaled)

# One predicted label per row, keyed by the original row index.
Y_unperturbed_df = pd.DataFrame(
    Y_unperturbed,
    index=unperturbed_x_subset.index,
    columns=['Predicted_Label'],
)

plt.hist(Y_unperturbed)

In [None]:
# Rows whose predicted label is above 0.5.
label_above_half = Y_unperturbed_df['Predicted_Label'] > 0.5
predisposed_unperturbed = Y_unperturbed_df[label_above_half]

# Report: selected rows, total rows, and the selected share in percent.
selected_count = predisposed_unperturbed.shape[0]
total_count = Y_unperturbed_df.shape[0]
print(selected_count)
print(total_count)
print(100*selected_count/total_count)


In [None]:
# Probability of the second class in `lasso.classes_` for every unperturbed row.
proba_all_classes = lasso.predict_proba(unperturbed_x_subset_scaled)
Y_unperturbed_proba = proba_all_classes[:, 1]

# NOTE: this rebinds `Y_unperturbed_df`; it now holds probabilities, not hard labels.
Y_unperturbed_df = pd.DataFrame(
    Y_unperturbed_proba,
    index=unperturbed_x_subset.index,
    columns=['Predicted_Probability'],
)

# Horizontal histogram: probability on the y-axis, bin counts on the x-axis.
proba_hist_kwargs = dict(bins=50, color='skyblue', edgecolor='black', orientation='horizontal')
plt.hist(Y_unperturbed_df, **proba_hist_kwargs)
plt.ylabel('Predicted Probability')
plt.xlabel('Frequency')
plt.title('Distribution of Predicted Probabilities (D2)')
plt.xlim(0, 4500)  # fixed count axis; bars longer than 4500 are cut off
plt.show()


In [None]:
# Expression rows (DEG columns only) of the cells flagged as predisposed;
# these are later combined with X1 and X2.
in_predisposed = unperturbed_x_subset.index.isin(predisposed_unperturbed.index)
predisposed_unperturbed_X = unperturbed_x_subset[in_predisposed]

# Restrict both final-state matrices to the same DEG columns.
genes = deg_results_filtered['gene'].values
X1_subset, X2_subset = X1[genes], X2[genes]

def build_dataset(X_pos, X_neg, positive_label=1, negative_label=0):
    """Stack two tables row-wise and label each row by the table it came from.

    The signature matches the earlier ``build_dataset`` definition in this
    notebook, so re-running this cell no longer replaces it with a version
    that lacks the label parameters. With the default labels the result is
    the same as the previous two-argument form.

    Parameters
    ----------
    X_pos, X_neg : pandas.DataFrame
        Tables to concatenate along the rows; rows of ``X_pos`` come first.
    positive_label, negative_label
        Label given to every row of ``X_pos`` / ``X_neg`` respectively.

    Returns
    -------
    tuple
        ``(X, y)``: the concatenated DataFrame and a Series named ``"Label"``
        that shares its index.
    """
    X = pd.concat([X_pos, X_neg], axis=0)
    y = pd.Series(
        [positive_label] * len(X_pos) + [negative_label] * len(X_neg),
        index=X.index,
        name="Label"
    )
    return X, y

# Final state 1 (positive class) vs. predisposed unperturbed cells (negative class)
X_X1_predisposed_unperturbed, Y_X1_predisposed_unperturbed = build_dataset(
    X_pos=X1_subset,
    X_neg=predisposed_unperturbed_X,
)

# Final state 2 (positive class) vs. predisposed unperturbed cells (negative class)
X_X2_predisposed_unperturbed, Y_X2_predisposed_unperturbed = build_dataset(
    X_pos=X2_subset,
    X_neg=predisposed_unperturbed_X,
)


## Classification model

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X_X1_predisposed_unperturbed, Y_X1_predisposed_unperturbed, test_size=0.2, random_state=42)

# Use a scaler of its own: refitting the shared `scaler` here would overwrite the
# statistics that the transfer cells rely on if they are re-run afterwards.
scaler_gc = StandardScaler()
X_train = scaler_gc.fit_transform(X_train)
X_test = scaler_gc.transform(X_test)

lasso_gc = LogisticRegression(penalty='l1', solver='liblinear', max_iter=10000, random_state=42)
lasso_gc.fit(X_train, Y_train)

# Hard labels for F1; continuous scores for ROC AUC (a ranking metric that should
# not be computed from thresholded predictions).
Y_pred = lasso_gc.predict(X_test)
Y_score = lasso_gc.predict_proba(X_test)[:, 1]

auc = roc_auc_score(Y_test, Y_score)
print(f"AUC : {auc:.2f}")

print(f1_score(Y_test, Y_pred))
