In [2]:
import pandas as pd

# Build a gene-by-sample expression matrix from the long-format TCGA-BRCA
# table and write it out as a tab-delimited text file.

# Destination file. The confirmation message at the end prints this same
# value, so the reported name always matches the file actually written.
matrix_path = "expression_dataclean.txt"

# Load only needed columns (0 = SampleID, 1 = GeneID, 5 = Expression)
df = pd.read_csv("data/TCGA-BRCA-gitoolsweb-data.txt", sep="\t", usecols=[0, 1, 5])

# Rename columns for clarity
df.columns = ["SampleID", "GeneID", "Expression"]

# Pivot into matrix: rows = genes, columns = samples.
# DataFrame.pivot raises ValueError if a (GeneID, SampleID) pair is duplicated.
mat = df.pivot(index="GeneID", columns="SampleID", values="Expression")

# Replace missing values with 0 (optional).
# NOTE(review): this makes "not measured" indistinguishable from a true 0 --
# confirm that is acceptable for the downstream comparison.
mat = mat.fillna(0)

# Save as tab-delimited txt
mat.to_csv(matrix_path, sep="\t")

print("Matrix saved as", matrix_path, "with shape:", mat.shape)


Matrix saved as expression_matrix.txt with shape: (20380, 889)


In [3]:
import pandas as pd

# Derive a per-sample Yes/No PIK3CA mutation flag from the long-format
# TCGA-BRCA table and save it as a two-column tab-delimited file.

# Load only necessary columns (0 = SampleID, 1 = GeneID, 3 = MutationFlag)
df = pd.read_csv("data/TCGA-BRCA-gitoolsweb-data.txt", sep="\t", usecols=[0, 1, 3])

# Rename columns for clarity
df.columns = ["SampleID", "GeneID", "Mutation"]

# Filter only PIK3CA rows. The explicit .copy() gives an independent frame,
# so adding a column below does not raise SettingWithCopyWarning.
pik3ca = df[df["GeneID"] == "PIK3CA"].copy()

# Map "1" -> "Yes"; every other value (including "-") -> "No".
# Comparing on str(x) keeps the check valid whether the flag was parsed as
# text or as an integer.
pik3ca["PIK3CA_mutated"] = pik3ca["Mutation"].map(lambda x: "Yes" if str(x) == "1" else "No")

# Keep only sample ID and result
result = pik3ca[["SampleID", "PIK3CA_mutated"]]

# Save as tab-delimited txt
result.to_csv("PIK3CA_mutation_status.txt", sep="\t", index=False)

print("Table saved as PIK3CA_mutation_status.txt with shape:", result.shape)


Table saved as PIK3CA_mutation_status.txt with shape: (843, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pik3ca["PIK3CA_mutated"] = pik3ca["Mutation"].map(lambda x: "Yes" if str(x) == "1" else "No")


In [4]:
import pandas as pd

# Extract the gene identifier and the three corrected p-value columns from
# the group-comparison results and save them as a compact table.

# Destination file. The confirmation message at the end prints this same
# value, so the reported name always matches the file actually written.
# NOTE(review): later cells load "data/results_pvalues.txt", while this cell
# writes into the working directory -- move the file or align the two paths
# so the notebook runs top to bottom on a fresh kernel.
pvalues_path = "results_pvalues.txt"

# Load the file
df = pd.read_csv("data/PIK3CA_mutated_Group_Comparison-results-1-data.txt", sep="\t", header=0)

# Select the desired columns by their positions (0-based).
# Spreadsheet letters for reference: 1=B, 6=G, 7=H, 8=I
new_df = df.iloc[:, [1, 6, 7, 8]]

# Rename columns for clarity
new_df.columns = ["GeneID", "Corrected_Left_P", "Corrected_Right_P", "Corrected_2Tail_P"]

# Save to a new tab-delimited txt file
new_df.to_csv(pvalues_path, sep="\t", index=False)

print("New table saved as", pvalues_path, "with shape:", new_df.shape)


New table saved as selected_pvalues.txt with shape: (20377, 4)


In [8]:
import pandas as pd

# Count genes whose corrected one-sided p-values fall below the
# significance threshold in the PIK3CA group-comparison table.
df = pd.read_csv("data/results_pvalues.txt", sep="\t")

# Coerce both one-sided p-value columns to numeric (scientific notation is
# parsed; anything unparseable becomes NaN and never passes the test below).
for p_col in ("Corrected_Right_P", "Corrected_Left_P"):
    df[p_col] = pd.to_numeric(df[p_col], errors="coerce")

# Significance threshold
alpha = 0.05

# Right-tail p-value below alpha -> counted as over-expressed
over_expressed = df["Corrected_Right_P"].lt(alpha).sum()

# Left-tail p-value below alpha -> counted as under-expressed
under_expressed = df["Corrected_Left_P"].lt(alpha).sum()

print("Number of over-expressed genes in mutated PIK3CA tumors:", over_expressed)
print("Number of under-expressed genes in mutated PIK3CA tumors:", under_expressed)



Number of over-expressed genes in mutated PIK3CA tumors: 2794
Number of under-expressed genes in mutated PIK3CA tumors: 3366


In [1]:
import pandas as pd
import gseapy as gp
import matplotlib.pyplot as plt
import seaborn as sns

# Pick the 100 genes with the smallest corrected two-tailed p-values, save
# the list, and submit it to Enrichr against three gene-set libraries.
df = pd.read_csv("data/results_pvalues.txt", sep="\t")

# Two-tailed p-values as numbers (scientific notation handled; unparseable
# entries become NaN, which sort_values places last by default).
df["Corrected_2Tail_P"] = pd.to_numeric(df["Corrected_2Tail_P"], errors="coerce")

# Ascending sort, then keep the first 100 rows
top_genes = df.sort_values(by="Corrected_2Tail_P", ascending=True).iloc[:100]

# Plain Python list of gene identifiers for Enrichr
gene_list = top_genes["GeneID"].tolist()

# One gene per line, no header and no index column
top_genes["GeneID"].to_csv("top100_genes.txt", index=False, header=False)

print("Top 100 genes saved to top100_genes.txt")


# Arguments shared by all three Enrichr queries
enrichr_common = dict(
    gene_list=gene_list,
    organism='Human',  # or 'Mouse'
    cutoff=0.05,       # adjusted p-value cutoff
)

# GO Biological Process enrichment (results written under GO_results/)
go_enrich = gp.enrichr(gene_sets=['GO_Biological_Process_2021'],
                       outdir='GO_results',
                       **enrichr_common)

# KEGG pathway enrichment (results written under KEGG_results/)
kegg_enrich = gp.enrichr(gene_sets=['KEGG_2021_Human'],
                         outdir='KEGG_results',
                         **enrichr_common)

# TF enrichment (results written under TF_results/)
tf_enrich = gp.enrichr(gene_sets=['ChEA_2022'],
                       outdir='TF_results',
                       **enrichr_common)






Top 100 genes saved to top100_genes.txt
