In [3]:
from google.colab import drive
import pandas as pd

# --- Attach Google Drive so the saved checkpoint file is reachable ---
print("Mounting Google Drive...")
drive.mount('/content/drive')

# --- Restore the master table from its parquet checkpoint ---
print("Loading master DataFrame from checkpoint.")
save_path = "/content/drive/My Drive/CPJUMP1_master_data_BATCH_1.parquet"
final_master_df = pd.read_parquet(save_path)

# --- Partition the columns into features vs. metadata ---
# A column counts as a feature when its name starts with one of these
# prefixes; every remaining column is treated as metadata.
well_col = 'Metadata_Well'
feature_prefixes = ['Cells_', 'Cytoplasm_', 'Nuclei_']
feature_cols = [
    name for name in final_master_df.columns
    if name.startswith(tuple(feature_prefixes))
]
meta_cols = [
    name for name in final_master_df.columns
    if not name.startswith(tuple(feature_prefixes))
]

# Metadata-only view of the master table.
meta_df_subset = final_master_df[meta_cols]

# --- Report what was loaded ---
print(" Checkpoint Loaded. Ready for analysis. ")
print(f"Loaded {len(final_master_df)} rows and {len(final_master_df.columns)} columns.")

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading master DataFrame from checkpoint.
 Checkpoint Loaded. Ready for analysis. 
Loaded 97656 rows and 5810 columns.


In [1]:
!pip install umap-learn



In [5]:
import plotly.express as px
import umap
from sklearn.preprocessing import StandardScaler


# Cap on the number of rows embedded with UMAP; the full table is used
# unchanged when it is not larger than this cap.
n_samples = 1000
plot_df = (
    final_master_df.sample(n=n_samples, random_state=42)
    if len(final_master_df) > n_samples
    else final_master_df
)

print(f"Running UMAP on a sample of {len(plot_df)} rows.")

# Standardise the feature columns before computing the embedding.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(plot_df[feature_cols])

# Project the scaled features to 2 dimensions (seeded for repeatability).
reducer = umap.UMAP(n_components=2, random_state=42)
features_umap = reducer.fit_transform(features_scaled)

# Assemble the plotting table: embedding coordinates next to the metadata.
# The embedding frame is indexed 0..n-1, while the sampled rows still carry
# their original index labels, so the metadata index is reset to make the
# column-wise concat line up row by row.
umap_df = pd.concat(
    [
        pd.DataFrame(features_umap, columns=['UMAP 1', 'UMAP 2']),
        plot_df[meta_cols].reset_index(drop=True),
    ],
    axis=1,
)

print("Plotting...")

# Interactive scatter of the embedding, coloured by timepoint, with the
# perturbation details shown on hover.
fig = px.scatter(
    data_frame=umap_df,
    x='UMAP 1',
    y='UMAP 2',
    color='Timepoint',
    hover_data=['perturbation', 'gene', 'pert_type'],
)
fig.show()

Running UMAP on a sample of 1000 rows.


  warn(


Plotting...
