# Common Pipeline to Process All 20 Cancer types Datasets
## The following script will:

### . Dynamically load and merge clinical, genomic, and sample data.
### . Preprocess the data by handling missing values, encoding categorical variables, and normalizing numerical features.
### . Train machine learning models to classify cancer types or predict survival outcomes.
### . Save preprocessed data and evaluation results for further analysis.

## BLCA

In [1]:
import pandas as pd

# Define file paths
patient_data_path = "data_clinical_patient.txt"  # Replace with the actual path
sample_data_path = "data_clinical_sample.txt"    # Replace with the actual path

# Function to read and display columns
def get_columns(file_path):
    try:
        # Read the file into a DataFrame
        # Skip rows with metadata headers (adjust skiprows as needed based on file structure)
        df = pd.read_csv(file_path, sep="\t", comment="#", low_memory=False)
        # Return column names
        return df.columns.tolist()
    except Exception as e:
        return f"Error reading file: {e}"

# Get columns for both files
patient_columns = get_columns(patient_data_path)
sample_columns = get_columns(sample_data_path)

# Print the results
print("Columns in data_clinical_patient.txt:")
print(patient_columns)

print("\nColumns in data_clinical_sample.txt:")
print(sample_columns)

Columns in data_clinical_patient.txt:
['PATIENT_ID', 'OTHER_PATIENT_ID', 'PRIMARY_SITE_PATIENT', 'DISEASE_TYPE', 'PROJECT_NAME', 'PROJECT_ID', 'SEX', 'RACE', 'ETHNICITY', 'VITAL_STATUS', 'YEAR_OF_DEATH', 'SMOKING_PACK_YEARS', 'PRIMARY_DIAGNOSIS', 'YEAR_OF_DIAGNOSIS', 'PATH_M_STAGE', 'BIOPSY_SITE', 'AJCC_STAGING_EDITION', 'ICD_10', 'AGE', 'CLIN_T_STAGE', 'PATH_STAGE', 'MORPHOLOGY', 'PATH_T_STAGE', 'PRIOR_TREATMENT', 'PATH_N_STAGE', 'PRIOR_MALIGNANCY', 'PROJECT_STATE', 'OS_STATUS', 'OS_MONTHS', 'DFS_STATUS', 'DFS_MONTHS']

Columns in data_clinical_sample.txt:
['PATIENT_ID', 'SAMPLE_ID', 'ONCOTREE_CODE', 'CANCER_TYPE', 'CANCER_TYPE_DETAILED', 'OTHER_SAMPLE_ID', 'SAMPLE_TYPE', 'IS_FFPE']


In [2]:
import pandas as pd

# Load the files
patient_df = pd.read_csv("data_clinical_patient.txt", sep="\t", comment="#")
sample_df = pd.read_csv("data_clinical_sample.txt", sep="\t", comment="#")

# Merge the files
merged_df = pd.merge(patient_df, sample_df, on="PATIENT_ID", how="inner")

In [3]:
# Define relevant columns
relevant_columns = [
    "PATIENT_ID", "SAMPLE_ID", "SEX", "RACE", "ETHNICITY", "AGE", 
    "VITAL_STATUS", "OS_STATUS", "OS_MONTHS", "DFS_STATUS", "DFS_MONTHS",
    "PRIMARY_DIAGNOSIS", "PATH_STAGE", "PATH_T_STAGE", "PATH_N_STAGE", "PATH_M_STAGE",
    "CANCER_TYPE", "CANCER_TYPE_DETAILED", "SAMPLE_TYPE", "IS_FFPE"
]

# Filter the merged DataFrame
clinical_data = merged_df[relevant_columns]

In [4]:
# Replace "not reported" or blank values with NaN
clinical_data.replace(["not reported", ""], pd.NA, inplace=True)

# Drop rows with excessive missing data (optional)
clinical_data.dropna(thresh=int(0.7 * len(relevant_columns)), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinical_data.replace(["not reported", ""], pd.NA, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinical_data.dropna(thresh=int(0.7 * len(relevant_columns)), inplace=True)


In [5]:
# Save to a tab-delimited file
clinical_data.to_csv("combined_clinical_data.csv", index=False, sep="\t")
print("Combined clinical data saved to combined_clinical_data.csv")

Combined clinical data saved to combined_clinical_data.csv


In [6]:
import pandas as pd

# Load the combined clinical data
data_path = "combined_clinical_data.csv"  # Replace with the actual path
df = pd.read_csv(data_path, sep="\t")

In [7]:
import numpy as np

# Replace blanks with NaN
df.replace(["", "not reported"], np.nan, inplace=True)

# Impute numerical columns with the median
numerical_cols = ["AGE", "OS_MONTHS", "DFS_MONTHS"]
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Impute categorical columns with the mode
categorical_cols = ["RACE", "ETHNICITY", "PATH_STAGE", "PATH_T_STAGE", "PATH_N_STAGE", "PATH_M_STAGE"]
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Drop rows with excessive missing data (optional)
df.dropna(thresh=int(0.7 * len(df.columns)), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [8]:
# One-hot encode categorical variables
categorical_cols = ["SEX", "RACE", "ETHNICITY", "VITAL_STATUS", "OS_STATUS", "DFS_STATUS",
                    "PRIMARY_DIAGNOSIS", "PATH_STAGE", "PATH_T_STAGE", "PATH_N_STAGE", "PATH_M_STAGE",
                    "CANCER_TYPE", "CANCER_TYPE_DETAILED", "SAMPLE_TYPE", "IS_FFPE"]

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Normalize numerical features
scaler = MinMaxScaler()
numerical_cols = ["AGE", "OS_MONTHS", "DFS_MONTHS"]
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [11]:
output_file = "preprocessed_clinical_data.csv"
df.to_csv(output_file, index=False, sep="\t")

print(f"Preprocessed clinical data saved to {output_file}")

Preprocessed clinical data saved to preprocessed_clinical_data.csv


In [12]:
import pandas as pd

# Load the preprocessed clinical data
data_path = "preprocessed_clinical_data.csv"  # Replace with the actual path
df = pd.read_csv(data_path, sep="\t")

In [29]:
print(df.head())       # View the first few rows
print(df.info())       # Check data types and missing values
print(df.describe())   # Summary statistics for numerical columns

     PATIENT_ID         SAMPLE_ID       AGE  OS_MONTHS  DFS_MONTHS  SEX_Male  \
0  TCGA-3C-AAAU  TCGA-3C-AAAU-01A  0.460317   0.470739    0.211314     False   
1  TCGA-3C-AALI  TCGA-3C-AALI-01A  0.380952   0.465862    0.468093     False   
2  TCGA-3C-AALJ  TCGA-3C-AALJ-01A  0.571429   0.171969    0.172277     False   
3  TCGA-3C-AALK  TCGA-3C-AALK-01A  0.412698   0.168950    0.089060     False   
4  TCGA-4H-AAAK  TCGA-4H-AAAK-01A  0.380952   0.041222    0.040673     False   

   RACE_ASIAN  RACE_BLACK OR AFRICAN AMERICAN  RACE_WHITE  \
0       False                           False        True   
1       False                            True       False   
2       False                            True       False   
3       False                            True       False   
4       False                           False        True   

   ETHNICITY_NOT HISPANIC OR LATINO  ...  PATH_N_STAGE_N2a  PATH_N_STAGE_N3  \
0                              True  ...             False            Fal

In [13]:
# Input features (exclude non-predictive columns like PATIENT_ID and SAMPLE_ID)
X = df.drop(columns=["PATIENT_ID", "SAMPLE_ID"])

# Target labels (e.g., cancer type based on PRIMARY_DIAGNOSIS)
diagnosis_columns = [col for col in df.columns if col.startswith("PRIMARY_DIAGNOSIS_")]
y = df[diagnosis_columns]

In [14]:
print(diagnosis_columns)

['PRIMARY_DIAGNOSIS_Papillary adenocarcinoma, NOS', 'PRIMARY_DIAGNOSIS_Papillary transitional cell carcinoma', 'PRIMARY_DIAGNOSIS_Squamous cell carcinoma, NOS', 'PRIMARY_DIAGNOSIS_Transitional cell carcinoma']


In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
print(y_test.all(axis=1).value_counts())

False    83
Name: count, dtype: int64


In [17]:
import tensorflow as tf

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(y_train.shape[1], activation="softmax")  # Output layer
])

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])



In [18]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=32)

Epoch 1/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.5752 - loss: 1.1730 - val_accuracy: 0.8193 - val_loss: 0.6697
Epoch 2/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8382 - loss: 0.5849 - val_accuracy: 0.8193 - val_loss: 0.4578
Epoch 3/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8350 - loss: 0.3882 - val_accuracy: 0.8193 - val_loss: 0.3597
Epoch 4/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8546 - loss: 0.2849 - val_accuracy: 0.8795 - val_loss: 0.2576
Epoch 5/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9710 - loss: 0.1682 - val_accuracy: 0.9518 - val_loss: 0.1861
Epoch 6/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9664 - loss: 0.1225 - val_accuracy: 0.9759 - val_loss: 0.1368
Epoch 7/50
[1m11/11[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x19bfcf669f0>

In [19]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9901 - loss: 0.0469    
Test Accuracy: 0.99


In [20]:
import pandas as pd

# Load preprocessed clinical data
clinical_data_path = "preprocessed_clinical_data.csv"
clinical_df = pd.read_csv(clinical_data_path, sep="\t")

In [21]:
# Inspect the File
with open("data_mutations.txt", "r") as f:
    for i, line in enumerate(f):
        print(line.strip())
        if i >= 5:  # Print only the first 5 lines
            break

#genome_nexus_version: 1.0.2
#isoform: mskcc
Hugo_Symbol	Entrez_Gene_Id	Center	NCBI_Build	Chromosome	Start_Position	End_Position	Strand	Consequence	Variant_Classification	Variant_Type	Reference_Allele	Tumor_Seq_Allele1	Tumor_Seq_Allele2	dbSNP_RS	dbSNP_Val_Status	Tumor_Sample_Barcode	Matched_Norm_Sample_Barcode	Match_Norm_Seq_Allele1	Match_Norm_Seq_Allele2	Tumor_Validation_Allele1	Tumor_Validation_Allele2	Match_Norm_Validation_Allele1	Match_Norm_Validation_Allele2	Verification_Status	Validation_Status	Mutation_Status	Sequencing_Phase	Sequence_Source	Validation_Method	Score	BAM_File	Sequencer	t_ref_count	t_alt_count	n_ref_count	n_alt_count	HGVSc	HGVSp	HGVSp_Short	Transcript_ID	RefSeq	Protein_position	Codons	Exon_Number	genomic_location_explanation	Annotation_Status
UTS2	10911	BI	GRCh38	1	7847808	7847808	+	missense_variant	Missense_Mutation	SNP	C	C	G			TCGA-2F-A9KO-01A	TCGA-2F-A9KO-11A									Somatic							122	26			ENST00000361696.10:c.333G>C	p.Lys111Asn	p.K111N	ENST00000361696	NM_006786

In [22]:
import pandas as pd

# Path to the mutation data file
mutation_data_path = "data_mutations.txt"

# Attempt to load the file with inferred delimiter and skipping bad lines
try:
    mutation_df = pd.read_csv(mutation_data_path, sep=None, engine="python", on_bad_lines="skip")
except Exception as e:
    print(f"Error loading file: {e}")
    mutation_df = None

# Inspect the loaded data
if mutation_df is not None:
    print(mutation_df.head())
    print("Shape of the dataset:", mutation_df.shape)
else:
    print("Failed to load the dataset.")

                              #genome_nexus_version:  1.0.2
0                                          #isoform:  mskcc
1  Hugo_Symbol\tEntrez_Gene_Id\tCenter\tNCBI_Buil...   None
2  UTS2\t10911\tBI\tGRCh38\t1\t7847808\t7847808\t...   None
3  PRAMEF2\t65122\tBI\tGRCh38\t1\t12861625\t12861...   None
4  ARHGEF19\t128272\tBI\tGRCh38\t1\t16208035\t162...   None
Shape of the dataset: (116686, 2)


In [23]:
import pandas as pd

# Load the file with flexible parsing
mutation_data_path = "data_mutations.txt"
try:
    mutation_df = pd.read_csv(
        mutation_data_path,
        sep="\t",          # Assuming tab-delimited
        comment="#",       # Ignore lines starting with "#"
        on_bad_lines="skip"  # Skip problematic rows
    )
except Exception as e:
    print(f"Error loading file: {e}")

In [24]:
# Display the first few rows
print(mutation_df.head())

# Check for missing values
print(mutation_df.isnull().sum())

# Check column names
print(mutation_df.columns)

  Hugo_Symbol  Entrez_Gene_Id Center NCBI_Build Chromosome  Start_Position  \
0        UTS2         10911.0     BI     GRCh38          1         7847808   
1     PRAMEF2         65122.0     BI     GRCh38          1        12861625   
2    ARHGEF19        128272.0     BI     GRCh38          1        16208035   
3       PADI6        353238.0     BI     GRCh38          1        17372260   
4     LDLRAD2        401944.0     BI     GRCh38          1        21815971   

   End_Position Strand         Consequence Variant_Classification  ...  \
0       7847808      +    missense_variant      Missense_Mutation  ...   
1      12861625      +    missense_variant      Missense_Mutation  ...   
2      16208035      +  synonymous_variant                 Silent  ...   
3      17372260      +    missense_variant      Missense_Mutation  ...   
4      21815971      +  synonymous_variant                 Silent  ...   

                         HGVSc        HGVSp HGVSp_Short    Transcript_ID  \
0  ENST000

In [25]:
filtered_df = mutation_df[mutation_df["Variant_Classification"] == "Missense_Mutation"]

In [26]:
gene_counts = mutation_df["Hugo_Symbol"].value_counts()

In [27]:
# Count mutations per gene
gene_counts = mutation_df["Hugo_Symbol"].value_counts()

# Display top 10 genes
print(gene_counts.head(10))

Hugo_Symbol
TTN       463
TP53      232
MUC16     203
KMT2D     151
ARID1A    129
KDM6A     117
MACF1     115
SYNE1     113
KMT2C     101
RYR2       98
Name: count, dtype: int64


In [28]:
# Count occurrences of each consequence
consequence_counts = mutation_df["Consequence"].value_counts()

# Display top 10 consequences
print(consequence_counts.head(10))

Consequence
missense_variant                            70850
synonymous_variant                          27900
stop_gained                                  6838
frameshift_variant                           1981
missense_variant,splice_region_variant       1743
non_coding_transcript_exon_variant           1129
splice_acceptor_variant                      1026
intron_variant                                992
splice_region_variant,synonymous_variant      616
downstream_gene_variant                       483
Name: count, dtype: int64


### merge clinical and genomic datasets

In [48]:
import pandas as pd

# Define file paths
patient_data_path = "data_clinical_patient.txt"
sample_data_path = "data_clinical_sample.txt"
mutation_data_path = "data_mutations.txt"
cna_data_path = "data_cna.txt"

# Load the datasets
patient_df = pd.read_csv(patient_data_path, sep="\t", comment="#")
sample_df = pd.read_csv(sample_data_path, sep="\t", comment="#")
mutation_df = pd.read_csv(mutation_data_path, sep="\t", comment="#")

# Load CNA data and transpose it
cna_df = pd.read_csv(cna_data_path, sep="\t")
cna_df = cna_df.set_index("Entrez_Gene_Id").T.reset_index()
cna_df.rename(columns={"index": "SAMPLE_ID"}, inplace=True)

In [49]:
# Merge clinical patient and sample data
merged_df = pd.merge(patient_df, sample_df, on="PATIENT_ID", how="inner")

In [50]:
# Merge mutation data
merged_df = pd.merge(merged_df, mutation_df, left_on="SAMPLE_ID", right_on="Tumor_Sample_Barcode", how="left")

# Merge CNA data
merged_df = pd.merge(merged_df, cna_df, on="SAMPLE_ID", how="left")

MemoryError: Unable to allocate 25.5 GiB for an array with shape (38321, 89144) and data type float64

#### The error we are encountering:
#### MemoryError: Unable to allocate 25.5 GiB for an array with shape (38321, 89144) and data type float64 indicates that your system is running out of memory while attempting to merge the CNA data (cna_df) with the existing merged_df. This issue arises because merging large datasets creates intermediate arrays in memory, which can exceed the available RAM.

In [61]:
from sklearn.preprocessing import OneHotEncoder

# Use sparse_output=True to generate a sparse matrix
encoder = OneHotEncoder(sparse_output=True)
encoded_categorical = encoder.fit_transform(data.select_dtypes(include=["object"]))

# Convert sparse matrix to DataFrame
encoded_df = pd.DataFrame.sparse.from_spmatrix(
    encoded_categorical,
    columns=encoder.get_feature_names_out(data.select_dtypes(include=["object"]).columns)
)

# Concatenate encoded data with the original DataFrame
data = pd.concat([data.select_dtypes(exclude=["object"]), encoded_df], axis=1)

In [63]:
# Load CNA data without transposing
cna_data_path = r"C:\Users\RAMAVATH SANTHOSH\OneDrive\Documents\ALL SEMs\SEM6\FINAL_DISSERTATION_WORK\Datasets\blca_tcga_gdc\data_cna.txt"

# Use chunking to process the file
chunk_size = 1000  # Define chunk size based on available memory
chunks = []

for chunk in pd.read_csv(cna_data_path, sep="\t", chunksize=chunk_size):
    # Set Entrez_Gene_Id as the index
    chunk.set_index("Entrez_Gene_Id", inplace=True)
    chunks.append(chunk)

# Combine all chunks into a single DataFrame
cna_df = pd.concat(chunks, axis=0)

In [64]:
# List of relevant genes
relevant_genes = ["PIK3CA", "TP53", "TTN"]  # Add more genes as needed

# Load only relevant rows
cna_data_path = r"C:\Users\RAMAVATH SANTHOSH\OneDrive\Documents\ALL SEMs\SEM6\FINAL_DISSERTATION_WORK\Datasets\blca_tcga_gdc\data_cna.txt"

# Use chunking to process the file
chunk_size = 1000
chunks = []

for chunk in pd.read_csv(cna_data_path, sep="\t", chunksize=chunk_size):
    # Filter rows for relevant genes
    filtered_chunk = chunk[chunk["Entrez_Gene_Id"].isin(relevant_genes)]
    chunks.append(filtered_chunk)

# Combine all chunks into a single DataFrame
cna_df = pd.concat(chunks, axis=0)

In [65]:
from scipy.sparse import csr_matrix

# Load CNA data in chunks and convert to sparse format
cna_data_path = r"C:\Users\RAMAVATH SANTHOSH\OneDrive\Documents\ALL SEMs\SEM6\FINAL_DISSERTATION_WORK\Datasets\blca_tcga_gdc\data_cna.txt"
chunk_size = 1000
sparse_chunks = []

for chunk in pd.read_csv(cna_data_path, sep="\t", chunksize=chunk_size):
    # Convert chunk to sparse matrix
    sparse_chunk = csr_matrix(chunk.drop(columns=["Entrez_Gene_Id"]).values)
    sparse_chunks.append(sparse_chunk)

# Combine sparse chunks into a single sparse matrix
from scipy.sparse import vstack
cna_sparse = vstack(sparse_chunks)

In [66]:
import scipy.sparse

# Save the sparse matrix to a file
sparse_matrix_path = r"C:\Users\RAMAVATH SANTHOSH\OneDrive\Documents\ALL SEMs\SEM6\FINAL_DISSERTATION_WORK\Datasets\blca_tcga_gdc\cna_sparse_matrix.npz"
scipy.sparse.save_npz(sparse_matrix_path, cna_sparse)

print(f"Sparse matrix saved to: {sparse_matrix_path}")

Sparse matrix saved to: C:\Users\RAMAVATH SANTHOSH\OneDrive\Documents\ALL SEMs\SEM6\FINAL_DISSERTATION_WORK\Datasets\blca_tcga_gdc\cna_sparse_matrix.npz


In [67]:
cna_sparse = scipy.sparse.load_npz(sparse_matrix_path)

In [68]:
gene_ids = []

for chunk in pd.read_csv(cna_data_path, sep="\t", chunksize=chunk_size):
    gene_ids.extend(chunk["Entrez_Gene_Id"].tolist())
    sparse_chunk = csr_matrix(chunk.drop(columns=["Entrez_Gene_Id"]).values)
    sparse_chunks.append(sparse_chunk)

# Combine sparse chunks
cna_sparse = vstack(sparse_chunks)

# Convert gene IDs to a DataFrame
gene_df = pd.DataFrame({"Entrez_Gene_Id": gene_ids})

In [70]:
# Load the first chunk of the CNA data to inspect column headers
first_chunk = next(pd.read_csv(cna_data_path, sep="\t", chunksize=1))
cna_sample_ids = first_chunk.columns[1:]  # Exclude the first column (Entrez_Gene_Id)

# Compare sample IDs
print("Clinical Sample IDs:", len(clinical_df["SAMPLE_ID"].unique()))
print("CNA Sample IDs:", len(cna_sample_ids))

# Check for mismatches
mismatched_ids = set(clinical_df["SAMPLE_ID"]) - set(cna_sample_ids)
if mismatched_ids:
    print(f"Mismatched Sample IDs: {mismatched_ids}")
else:
    print("No mismatched Sample IDs.")

Clinical Sample IDs: 413
CNA Sample IDs: 392
Mismatched Sample IDs: {'TCGA-G2-A2EC-01A', 'TCGA-DK-A2I2-01A', 'TCGA-E5-A4TZ-01A', 'TCGA-DK-A1AG-01A', 'TCGA-UY-A78P-01A', 'TCGA-E7-A8O8-01A', 'TCGA-DK-A3IM-01A', 'TCGA-DK-AA6P-01A', 'TCGA-FD-A43N-01A', 'TCGA-DK-A3IV-01A', 'TCGA-C4-A0F7-01A', 'TCGA-C4-A0F1-01A', 'TCGA-XF-A8HE-01A', 'TCGA-GV-A3QH-01A', 'TCGA-DK-A2I6-01A', 'TCGA-DK-A3IN-01A', 'TCGA-BT-A2LD-01A', 'TCGA-C4-A0F0-01A', 'TCGA-DK-A3IU-01A', 'TCGA-UY-A78M-01A', 'TCGA-GC-A4ZW-01A'}


In [71]:
# Find common sample IDs
common_sample_ids = set(clinical_df["SAMPLE_ID"]).intersection(set(cna_sample_ids))

# Filter clinical data
clinical_df = clinical_df[clinical_df["SAMPLE_ID"].isin(common_sample_ids)]

# Filter CNA data
filtered_chunks = []
for chunk in pd.read_csv(cna_data_path, sep="\t", chunksize=chunk_size):
    filtered_chunk = chunk[["Entrez_Gene_Id"] + list(common_sample_ids)]
    filtered_chunks.append(filtered_chunk)

# Combine filtered chunks into a single DataFrame
cna_filtered_df = pd.concat(filtered_chunks, axis=0)

# Convert to sparse matrix
sparse_chunks = []
for chunk in filtered_chunks:
    sparse_chunk = csr_matrix(chunk.drop(columns=["Entrez_Gene_Id"]).values)
    sparse_chunks.append(sparse_chunk)

# Combine sparse chunks
cna_sparse = vstack(sparse_chunks)

# Update gene IDs
gene_ids = cna_filtered_df["Entrez_Gene_Id"].tolist()
gene_df = pd.DataFrame({"Entrez_Gene_Id": gene_ids})

In [72]:
assert cna_sparse.shape[1] == len(clinical_df["SAMPLE_ID"]), "Mismatch between CNA data columns and sample IDs"
print("Dimensions aligned successfully!")

Dimensions aligned successfully!


In [73]:
# Create a DataFrame for the sparse matrix
cna_dense = pd.DataFrame.sparse.from_spmatrix(cna_sparse, columns=list(common_sample_ids), index=gene_df["Entrez_Gene_Id"])

# Transpose the DataFrame to match clinical data format
cna_dense = cna_dense.T

# Merge with clinical data
merged_data = pd.merge(clinical_df, cna_dense, left_on="SAMPLE_ID", right_index=True, how="inner")

print("Merged data shape:", merged_data.shape)

Merged data shape: (392, 38329)


In [None]:
# Create a DataFrame for the sparse matrix
cna_dense = pd.DataFrame.sparse.from_spmatrix(cna_sparse, columns=list(common_sample_ids), index=gene_df["Entrez_Gene_Id"])

# Transpose the DataFrame to match clinical data format
cna_dense = cna_dense.T

# Merge with clinical data
merged_data = pd.merge(clinical_df, cna_dense, left_on="SAMPLE_ID", right_index=True, how="inner")

print("Merged data shape:", merged_data.shape)

# Extract the top 10 rows
top_10_rows = merged_data.head(10)

# Save the top 10 rows to a file
output_file = "merged_data.txt"
top_10_rows.to_csv(output_file, sep="\t", index=False)

print(f"Top 10 rows of merged data saved to {output_file}")