In [1]:
import pandas as pd

file_path = '../data/04-predictions/protein_clusters.csv'

# Load the CSV into a DataFrame
try:
    clusters_df = pd.read_csv(file_path)
    print("Successfully loaded the cluster data.")
    print("DataFrame shape:", clusters_df.shape)
    print("\nHere's a preview of the data:")
    print(clusters_df.head())
    
    print("\nUnique cluster labels found:")
    print(clusters_df['Cluster'].unique())
    
except FileNotFoundError:
    print(f"Error: Could not find the file at '{file_path}'. Make sure the path is correct.")

Successfully loaded the cluster data.
DataFrame shape: (5159, 2)

Here's a preview of the data:
  GeneSymbol    Cluster
0     CRYBB2  Cluster 5
1       RAF1  Cluster 5
2     GUCA1A  Cluster 5
3      BECN1  Cluster 5
4       OCRL  Cluster 5

Unique cluster labels found:
['Cluster 5' 'Cluster 2' 'Cluster 1' 'Cluster 0' 'Cluster 3' 'Cluster 4']


In [2]:
# --- Let's analyze Cluster 0 (the Blue cluster) ---

target_cluster = 'Cluster 0'

# Filter the DataFrame to get only the rows for the target cluster
cluster_0_df = clusters_df[clusters_df['Cluster'] == target_cluster]

print(f"Found {len(cluster_0_df)} proteins in {target_cluster}.")

# --- Get the Gene List for Metascape ---

# Extract the 'GeneSymbol' column as a list of strings
gene_list_for_metascape = cluster_0_df['GeneSymbol'].tolist()

# Print the first 10 genes as a sample
print("\nSample of genes in this cluster:", gene_list_for_metascape[:10])

# To use in Metascape, you need the full list.
# The following line will print the entire list as a single block of text
# that you can easily copy and paste.
print("\n--- Gene List for Metascape (Copy the text below) ---")
print('\n'.join(gene_list_for_metascape))

Found 263 proteins in Cluster 0.

Sample of genes in this cluster: ['ASMTL', 'DNAJC16', 'TOR1AIP2', 'GOLM2', 'ASCC1', 'GNPTG', 'PCDHB2', 'FAM163B', 'ASPRV1', 'LRRN1']

--- Gene List for Metascape (Copy the text below) ---
ASMTL
DNAJC16
TOR1AIP2
GOLM2
ASCC1
GNPTG
PCDHB2
FAM163B
ASPRV1
LRRN1
TMCC3
CLK2
ADH5
BLVRA
DNAJB2
DNAJA2
DAPP1
AGFG1
AAGAB
DES
CAP1
AARS1
SRI
CCDC50
OTUD5
PITPNB
GLRX2
CCT5
LANCL2
ZNF334
TPPP2
GRAP
CSRP2
LCN10
HDHD2
GSTK1
ARL6IP5
PTGR1
ARHGAP45
MYOM3
ARHGEF1
CRYZ
LANCL1
BDNF
ANXA7
AGAP2
KIAA0040
MZF1
ARHGAP5
BOLA1
MAPK10
CNPY4
AKT1
NRXN3
GPN1
GDI1
ATXN3
RCN3
ADAMTSL1
ANXA4
BAG1
AASDHPPT
ZADH2
ABHD10
ACSF2
ANXA11
CHMP2A
ENSA
CAB39
PCNP
BPNT1
VAT1
RTN4IP1
FABP9
CHMP1A
LGALSL
DTD2
FKBP3
AKAP7
ALKBH3
DDX19A
INHBA|INHBC
DSTN
SH3BGRL2
MREG
LRRC59
DTD1
PDAP1
CRIPT
PET117
MDGA2
HDHD3
SEZ6L
ITGA2B|ITGB3
RAB3C
VSTM1
LY6D
OXSR1
ITGAV|ITGB3
TATDN1
LZTFL1
CPTP
PTRHD1
TTC33
TPPP3
AP1AR
ADGRL3
CTDSPL
RAB24
RAB23
PLEKHF2
FN3K
TBCEL
CTDSP1
CIAO1
RWDD4
SAMSN1
SYT6
CD8A|CD8B
UBE2Q1
C1GA

In [3]:

target_cluster = 'Cluster 1'

# Filter the DataFrame to get only the rows for the target cluster
cluster_1_df = clusters_df[clusters_df['Cluster'] == target_cluster]

print(f"Found {len(cluster_1_df)} proteins in {target_cluster}.")

# --- Get the Gene List for Metascape ---

# Extract the 'GeneSymbol' column as a list of strings
gene_list_for_metascape = cluster_1_df['GeneSymbol'].tolist()

# Print the first 10 genes as a sample
print("\nSample of genes in this cluster:", gene_list_for_metascape[:10])

# To use in Metascape, you need the full list.
# The following line will print the entire list as a single block of text
# that you can easily copy and paste.
print("\n--- Gene List for Metascape (Copy the text below) ---")
print('\n'.join(gene_list_for_metascape))

Found 69 proteins in Cluster 1.

Sample of genes in this cluster: ['APRT', 'KIR2DS2', 'EFCAB14', 'LRCH4', 'BLK', 'RBM24', 'MAGEB10', 'PHF3', 'DHX8', 'BRPF1']

--- Gene List for Metascape (Copy the text below) ---
APRT
KIR2DS2
EFCAB14
LRCH4
BLK
RBM24
MAGEB10
PHF3
DHX8
BRPF1
PDZD7
DIRAS3
ALOX15B
NOVA1
GMEB2
CARHSP1
ANP32A
LTBP4
CYB561D1
OTOR
USH1C
AMPD2
ARL9
ITLN1
DAP
LGALS13
PLPBP
TEF
ELAVL2
PHOSPHO2
BCL7A
TXNRD3NB
ITGAL|ITGB2
UBE2L3|UBB
C2orf73
EXOG
FAM102B
PGA4
SPANXN3
RAB37
FAM9B
INO80E
RFPL3
CPNE6
PPM1F
CRACR2A
BCL11A
CYP2C19
ALOX5
AGRP
HPGD
MFNG
ENDOU
ACP6
COL13A1
ANTKMT
FAM171B
CDHR3
ERAP2
LYG1
MAN2B2
B3GNT8
RCN1
DEFB136
SPATA31D4
C7orf69
ZNF275
ARID1A
PCDHB10
