# Common Pipeline to Process All 20 Cancer types Datasets
## The following script will:

### . Dynamically load and merge clinical, genomic, and sample data.
### . Preprocess the data by handling missing values, encoding categorical variables, and normalizing numerical features.
### . Train machine learning models to classify cancer types or predict survival outcomes.
### . Save preprocessed data and evaluation results for further analysis.

## BREAST INVASIVE CARCINOMA

In [18]:
import pandas as pd

# Define file paths
patient_data_path = "data_clinical_patient.txt"  # Replace with the actual path
sample_data_path = "data_clinical_sample.txt"    # Replace with the actual path

# Function to read and display columns
def get_columns(file_path):
    try:
        # Read the file into a DataFrame
        # Skip rows with metadata headers (adjust skiprows as needed based on file structure)
        df = pd.read_csv(file_path, sep="\t", comment="#", low_memory=False)
        # Return column names
        return df.columns.tolist()
    except Exception as e:
        return f"Error reading file: {e}"

# Get columns for both files
patient_columns = get_columns(patient_data_path)
sample_columns = get_columns(sample_data_path)

# Print the results
print("Columns in data_clinical_patient.txt:")
print(patient_columns)

print("\nColumns in data_clinical_sample.txt:")
print(sample_columns)

Columns in data_clinical_patient.txt:
['PATIENT_ID', 'OTHER_PATIENT_ID', 'PRIMARY_SITE_PATIENT', 'DISEASE_TYPE', 'PROJECT_NAME', 'PROJECT_ID', 'SEX', 'RACE', 'ETHNICITY', 'VITAL_STATUS', 'YEAR_OF_DEATH', 'PRIMARY_DIAGNOSIS', 'YEAR_OF_DIAGNOSIS', 'PATH_M_STAGE', 'BIOPSY_SITE', 'AJCC_STAGING_EDITION', 'ICD_10', 'AGE', 'PATH_STAGE', 'MORPHOLOGY', 'PATH_T_STAGE', 'PRIOR_TREATMENT', 'PATH_N_STAGE', 'PRIOR_MALIGNANCY', 'PROJECT_STATE', 'OS_STATUS', 'OS_MONTHS', 'DFS_STATUS', 'DFS_MONTHS']

Columns in data_clinical_sample.txt:
['PATIENT_ID', 'SAMPLE_ID', 'ONCOTREE_CODE', 'CANCER_TYPE', 'CANCER_TYPE_DETAILED', 'OTHER_SAMPLE_ID', 'SAMPLE_TYPE', 'IS_FFPE']


In [19]:
import pandas as pd

# Load the files
patient_df = pd.read_csv("data_clinical_patient.txt", sep="\t", comment="#")
sample_df = pd.read_csv("data_clinical_sample.txt", sep="\t", comment="#")

# Merge the files
merged_df = pd.merge(patient_df, sample_df, on="PATIENT_ID", how="inner")

In [20]:
# Define relevant columns
relevant_columns = [
    "PATIENT_ID", "SAMPLE_ID", "SEX", "RACE", "ETHNICITY", "AGE", 
    "VITAL_STATUS", "OS_STATUS", "OS_MONTHS", "DFS_STATUS", "DFS_MONTHS",
    "PRIMARY_DIAGNOSIS", "PATH_STAGE", "PATH_T_STAGE", "PATH_N_STAGE", "PATH_M_STAGE",
    "CANCER_TYPE", "CANCER_TYPE_DETAILED", "SAMPLE_TYPE", "IS_FFPE"
]

# Filter the merged DataFrame
clinical_data = merged_df[relevant_columns]

In [21]:
# Replace "not reported" or blank values with NaN
clinical_data.replace(["not reported", ""], pd.NA, inplace=True)

# Drop rows with excessive missing data (optional)
clinical_data.dropna(thresh=int(0.7 * len(relevant_columns)), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinical_data.replace(["not reported", ""], pd.NA, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinical_data.dropna(thresh=int(0.7 * len(relevant_columns)), inplace=True)


In [22]:
# Save to a tab-delimited file
clinical_data.to_csv("combined_clinical_data.csv", index=False, sep="\t")
print("Combined clinical data saved to combined_clinical_data.csv")

Combined clinical data saved to combined_clinical_data.csv


In [23]:
import pandas as pd

# Load the combined clinical data
data_path = "combined_clinical_data.csv"  # Replace with the actual path
df = pd.read_csv(data_path, sep="\t")

In [24]:
import numpy as np

# Replace blanks with NaN
df.replace(["", "not reported"], np.nan, inplace=True)

# Impute numerical columns with the median
numerical_cols = ["AGE", "OS_MONTHS", "DFS_MONTHS"]
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Impute categorical columns with the mode
categorical_cols = ["RACE", "ETHNICITY", "PATH_STAGE", "PATH_T_STAGE", "PATH_N_STAGE", "PATH_M_STAGE"]
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Drop rows with excessive missing data (optional)
df.dropna(thresh=int(0.7 * len(df.columns)), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [25]:
# One-hot encode categorical variables
categorical_cols = ["SEX", "RACE", "ETHNICITY", "VITAL_STATUS", "OS_STATUS", "DFS_STATUS",
                    "PRIMARY_DIAGNOSIS", "PATH_STAGE", "PATH_T_STAGE", "PATH_N_STAGE", "PATH_M_STAGE",
                    "CANCER_TYPE", "CANCER_TYPE_DETAILED", "SAMPLE_TYPE", "IS_FFPE"]

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [26]:
from sklearn.preprocessing import MinMaxScaler

# Normalize numerical features
scaler = MinMaxScaler()
numerical_cols = ["AGE", "OS_MONTHS", "DFS_MONTHS"]
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [27]:
output_file = "preprocessed_clinical_data.csv"
df.to_csv(output_file, index=False, sep="\t")

print(f"Preprocessed clinical data saved to {output_file}")

Preprocessed clinical data saved to preprocessed_clinical_data.csv


In [28]:
import pandas as pd

# Load the preprocessed clinical data
data_path = "preprocessed_clinical_data.csv"  # Replace with the actual path
df = pd.read_csv(data_path, sep="\t")

In [29]:
print(df.head())       # View the first few rows
print(df.info())       # Check data types and missing values
print(df.describe())   # Summary statistics for numerical columns

     PATIENT_ID         SAMPLE_ID       AGE  OS_MONTHS  DFS_MONTHS  SEX_Male  \
0  TCGA-3C-AAAU  TCGA-3C-AAAU-01A  0.460317   0.470739    0.211314     False   
1  TCGA-3C-AALI  TCGA-3C-AALI-01A  0.380952   0.465862    0.468093     False   
2  TCGA-3C-AALJ  TCGA-3C-AALJ-01A  0.571429   0.171969    0.172277     False   
3  TCGA-3C-AALK  TCGA-3C-AALK-01A  0.412698   0.168950    0.089060     False   
4  TCGA-4H-AAAK  TCGA-4H-AAAK-01A  0.380952   0.041222    0.040673     False   

   RACE_ASIAN  RACE_BLACK OR AFRICAN AMERICAN  RACE_WHITE  \
0       False                           False        True   
1       False                            True       False   
2       False                            True       False   
3       False                            True       False   
4       False                           False        True   

   ETHNICITY_NOT HISPANIC OR LATINO  ...  PATH_N_STAGE_N2a  PATH_N_STAGE_N3  \
0                              True  ...             False            Fal

In [30]:
# Input features (exclude non-predictive columns like PATIENT_ID and SAMPLE_ID)
X = df.drop(columns=["PATIENT_ID", "SAMPLE_ID"])

# Target labels (e.g., cancer type based on PRIMARY_DIAGNOSIS)
diagnosis_columns = [col for col in df.columns if col.startswith("PRIMARY_DIAGNOSIS_")]
y = df[diagnosis_columns]

In [31]:
print(diagnosis_columns)

['PRIMARY_DIAGNOSIS_Apocrine adenocarcinoma', 'PRIMARY_DIAGNOSIS_Basal cell carcinoma, NOS', 'PRIMARY_DIAGNOSIS_Carcinoma, NOS', 'PRIMARY_DIAGNOSIS_Cribriform carcinoma, NOS', 'PRIMARY_DIAGNOSIS_Infiltrating duct and lobular carcinoma', 'PRIMARY_DIAGNOSIS_Infiltrating duct carcinoma, NOS', 'PRIMARY_DIAGNOSIS_Infiltrating duct mixed with other types of carcinoma', 'PRIMARY_DIAGNOSIS_Infiltrating lobular mixed with other types of carcinoma', 'PRIMARY_DIAGNOSIS_Intraductal micropapillary carcinoma', 'PRIMARY_DIAGNOSIS_Intraductal papillary adenocarcinoma with invasion', 'PRIMARY_DIAGNOSIS_Large cell neuroendocrine carcinoma', 'PRIMARY_DIAGNOSIS_Lobular carcinoma, NOS', 'PRIMARY_DIAGNOSIS_Medullary carcinoma, NOS', 'PRIMARY_DIAGNOSIS_Metaplastic carcinoma, NOS', 'PRIMARY_DIAGNOSIS_Mucinous adenocarcinoma', 'PRIMARY_DIAGNOSIS_Paget disease and infiltrating duct carcinoma of breast', 'PRIMARY_DIAGNOSIS_Papillary carcinoma, NOS', 'PRIMARY_DIAGNOSIS_Phyllodes tumor, malignant', 'PRIMARY_DIAGNO

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
print(y_test.all(axis=1).value_counts())

False    221
Name: count, dtype: int64


In [34]:
import tensorflow as tf

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(y_train.shape[1], activation="softmax")  # Output layer
])

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])



In [35]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=32)

Epoch 1/50
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.3622 - loss: 2.6429 - val_accuracy: 0.7195 - val_loss: 1.1310
Epoch 2/50
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7285 - loss: 0.9805 - val_accuracy: 0.8914 - val_loss: 0.6444
Epoch 3/50
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8833 - loss: 0.6209 - val_accuracy: 0.8914 - val_loss: 0.4664
Epoch 4/50
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8815 - loss: 0.4872 - val_accuracy: 0.8914 - val_loss: 0.3722
Epoch 5/50
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8830 - loss: 0.3767 - val_accuracy: 0.9140 - val_loss: 0.3098
Epoch 6/50
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9176 - loss: 0.2946 - val_accuracy: 0.9276 - val_loss: 0.2651
Epoch 7/50
[1m28/28[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x1f5c4915a60>

In [37]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9982 - loss: 0.0323     
Test Accuracy: 1.00


In [38]:
import pandas as pd

# Load preprocessed clinical data
clinical_data_path = "preprocessed_clinical_data.csv"
clinical_df = pd.read_csv(clinical_data_path, sep="\t")

In [40]:
# Inspect the File
with open("data_mutations.txt", "r") as f:
    for i, line in enumerate(f):
        print(line.strip())
        if i >= 5:  # Print only the first 5 lines
            break

#genome_nexus_version: 1.0.2
#isoform: mskcc
Hugo_Symbol	Entrez_Gene_Id	Center	NCBI_Build	Chromosome	Start_Position	End_Position	Strand	Consequence	Variant_Classification	Variant_Type	Reference_Allele	Tumor_Seq_Allele1	Tumor_Seq_Allele2	dbSNP_RS	dbSNP_Val_Status	Tumor_Sample_Barcode	Matched_Norm_Sample_Barcode	Match_Norm_Seq_Allele1	Match_Norm_Seq_Allele2	Tumor_Validation_Allele1	Tumor_Validation_Allele2	Match_Norm_Validation_Allele1	Match_Norm_Validation_Allele2	Verification_Status	Validation_Status	Mutation_Status	Sequencing_Phase	Sequence_Source	Validation_Method	Score	BAM_File	Sequencer	t_ref_count	t_alt_count	n_ref_count	n_alt_count	HGVSc	HGVSp	HGVSp_Short	Transcript_ID	RefSeq	Protein_position	Codons	Exon_Number	genomic_location_explanation	Annotation_Status
KDM1A	23028	WUGSC	GRCh38	1	23083363	23083363	+	stop_retained_variant	Silent	SNP	G	G	A			TCGA-3C-AAAU-01A	TCGA-3C-AAAU-10A									Somatic							22	3			ENST00000356634.7:c.2558G>A	p.Ter853=	p.*853=	ENST00000356634	NM_015013.4	8

In [41]:
import pandas as pd

# Path to the mutation data file
mutation_data_path = "data_mutations.txt"

# Attempt to load the file with inferred delimiter and skipping bad lines
try:
    mutation_df = pd.read_csv(mutation_data_path, sep=None, engine="python", on_bad_lines="skip")
except Exception as e:
    print(f"Error loading file: {e}")
    mutation_df = None

# Inspect the loaded data
if mutation_df is not None:
    print(mutation_df.head())
    print("Shape of the dataset:", mutation_df.shape)
else:
    print("Failed to load the dataset.")

                              #genome_nexus_version:  1.0.2
0                                          #isoform:  mskcc
1  Hugo_Symbol\tEntrez_Gene_Id\tCenter\tNCBI_Buil...   None
2  KDM1A\t23028\tWUGSC\tGRCh38\t1\t23083363\t2308...   None
3  DDAH1\t23576\tWUGSC\tGRCh38\t1\t85324885\t8532...   None
4  GATA3\t2625\tWUGSC\tGRCh38\t10\t8073911\t80739...   None
Shape of the dataset: (89014, 2)


In [42]:
import pandas as pd

# Load the file with flexible parsing
mutation_data_path = "data_mutations.txt"
try:
    mutation_df = pd.read_csv(
        mutation_data_path,
        sep="\t",          # Assuming tab-delimited
        comment="#",       # Ignore lines starting with "#"
        on_bad_lines="skip"  # Skip problematic rows
    )
except Exception as e:
    print(f"Error loading file: {e}")

In [43]:
# Display the first few rows
print(mutation_df.head())

# Check for missing values
print(mutation_df.isnull().sum())

# Check column names
print(mutation_df.columns)

  Hugo_Symbol  Entrez_Gene_Id Center NCBI_Build Chromosome  Start_Position  \
0       KDM1A           23028  WUGSC     GRCh38          1        23083363   
1       DDAH1           23576  WUGSC     GRCh38          1        85324885   
2       GATA3            2625  WUGSC     GRCh38         10         8073911   
3      MALAT1          378938  WUGSC     GRCh38         11        65505435   
4      MIPOL1          145282  WUGSC     GRCh38         14        37369580   

   End_Position Strand                         Consequence  \
0      23083363      +               stop_retained_variant   
1      85324885      +             splice_acceptor_variant   
2       8073912      +                  frameshift_variant   
3      65505437      +  non_coding_transcript_exon_variant   
4      37369580      +                    missense_variant   

  Variant_Classification  ...                              HGVSc  \
0                 Silent  ...        ENST00000356634.7:c.2558G>A   
1            Splice_Si

In [44]:
filtered_df = mutation_df[mutation_df["Variant_Classification"] == "Missense_Mutation"]

In [45]:
gene_counts = mutation_df["Hugo_Symbol"].value_counts()

In [46]:
# Count mutations per gene
gene_counts = mutation_df["Hugo_Symbol"].value_counts()

# Display top 10 genes
print(gene_counts.head(10))

Hugo_Symbol
PIK3CA    374
TTN       365
TP53      344
MUC16     168
CDH1      134
GATA3     128
MAP3K1    126
RYR2      101
KMT2C     100
SYNE1      91
Name: count, dtype: int64


In [47]:
# Count occurrences of each consequence
consequence_counts = mutation_df["Consequence"].value_counts()

# Display top 10 consequences
print(consequence_counts.head(10))

Consequence
missense_variant                          53635
synonymous_variant                        19332
stop_gained                                4674
frameshift_variant                         4059
missense_variant,splice_region_variant     1565
splice_acceptor_variant                     762
non_coding_transcript_exon_variant          712
intron_variant                              576
inframe_deletion                            573
splice_donor_variant                        549
Name: count, dtype: int64


### merge clinical and genomic datasets

In [48]:
import pandas as pd

# Define file paths
patient_data_path = "data_clinical_patient.txt"
sample_data_path = "data_clinical_sample.txt"
mutation_data_path = "data_mutations.txt"
cna_data_path = "data_cna.txt"

# Load the datasets
patient_df = pd.read_csv(patient_data_path, sep="\t", comment="#")
sample_df = pd.read_csv(sample_data_path, sep="\t", comment="#")
mutation_df = pd.read_csv(mutation_data_path, sep="\t", comment="#")

# Load CNA data and transpose it
cna_df = pd.read_csv(cna_data_path, sep="\t")
cna_df = cna_df.set_index("Entrez_Gene_Id").T.reset_index()
cna_df.rename(columns={"index": "SAMPLE_ID"}, inplace=True)

In [49]:
# Merge clinical patient and sample data
merged_df = pd.merge(patient_df, sample_df, on="PATIENT_ID", how="inner")

In [50]:
# Merge mutation data
merged_df = pd.merge(merged_df, mutation_df, left_on="SAMPLE_ID", right_on="Tumor_Sample_Barcode", how="left")

# Merge CNA data
merged_df = pd.merge(merged_df, cna_df, on="SAMPLE_ID", how="left")

MemoryError: Unable to allocate 25.5 GiB for an array with shape (38321, 89144) and data type float64

#### The error we are encountering:
#### MemoryError: Unable to allocate 25.5 GiB for an array with shape (38321, 89144) and data type float64 indicates that your system is running out of memory while attempting to merge the CNA data (cna_df) with the existing merged_df. This issue arises because merging large datasets creates intermediate arrays in memory, which can exceed the available RAM.

In [61]:
from sklearn.preprocessing import OneHotEncoder

# Use sparse_output=True to generate a sparse matrix
encoder = OneHotEncoder(sparse_output=True)
encoded_categorical = encoder.fit_transform(data.select_dtypes(include=["object"]))

# Convert sparse matrix to DataFrame
encoded_df = pd.DataFrame.sparse.from_spmatrix(
    encoded_categorical,
    columns=encoder.get_feature_names_out(data.select_dtypes(include=["object"]).columns)
)

# Concatenate encoded data with the original DataFrame
data = pd.concat([data.select_dtypes(exclude=["object"]), encoded_df], axis=1)

In [63]:
# Load CNA data without transposing
cna_data_path = r"C:\Users\RAMAVATH SANTHOSH\OneDrive\Documents\ALL SEMs\SEM6\FINAL_DISSERTATION_WORK\Datasets\blca_tcga_gdc\data_cna.txt"

# Use chunking to process the file
chunk_size = 1000  # Define chunk size based on available memory
chunks = []

for chunk in pd.read_csv(cna_data_path, sep="\t", chunksize=chunk_size):
    # Set Entrez_Gene_Id as the index
    chunk.set_index("Entrez_Gene_Id", inplace=True)
    chunks.append(chunk)

# Combine all chunks into a single DataFrame
cna_df = pd.concat(chunks, axis=0)

In [64]:
# List of relevant genes
relevant_genes = ["PIK3CA", "TP53", "TTN"]  # Add more genes as needed

# Load only relevant rows
cna_data_path = r"C:\Users\RAMAVATH SANTHOSH\OneDrive\Documents\ALL SEMs\SEM6\FINAL_DISSERTATION_WORK\Datasets\blca_tcga_gdc\data_cna.txt"

# Use chunking to process the file
chunk_size = 1000
chunks = []

for chunk in pd.read_csv(cna_data_path, sep="\t", chunksize=chunk_size):
    # Filter rows for relevant genes
    filtered_chunk = chunk[chunk["Entrez_Gene_Id"].isin(relevant_genes)]
    chunks.append(filtered_chunk)

# Combine all chunks into a single DataFrame
cna_df = pd.concat(chunks, axis=0)

In [65]:
from scipy.sparse import csr_matrix

# Load CNA data in chunks and convert to sparse format
cna_data_path = r"C:\Users\RAMAVATH SANTHOSH\OneDrive\Documents\ALL SEMs\SEM6\FINAL_DISSERTATION_WORK\Datasets\blca_tcga_gdc\data_cna.txt"
chunk_size = 1000
sparse_chunks = []

for chunk in pd.read_csv(cna_data_path, sep="\t", chunksize=chunk_size):
    # Convert chunk to sparse matrix
    sparse_chunk = csr_matrix(chunk.drop(columns=["Entrez_Gene_Id"]).values)
    sparse_chunks.append(sparse_chunk)

# Combine sparse chunks into a single sparse matrix
from scipy.sparse import vstack
cna_sparse = vstack(sparse_chunks)

In [66]:
import scipy.sparse

# Save the sparse matrix to a file
sparse_matrix_path = r"C:\Users\RAMAVATH SANTHOSH\OneDrive\Documents\ALL SEMs\SEM6\FINAL_DISSERTATION_WORK\Datasets\blca_tcga_gdc\cna_sparse_matrix.npz"
scipy.sparse.save_npz(sparse_matrix_path, cna_sparse)

print(f"Sparse matrix saved to: {sparse_matrix_path}")

Sparse matrix saved to: C:\Users\RAMAVATH SANTHOSH\OneDrive\Documents\ALL SEMs\SEM6\FINAL_DISSERTATION_WORK\Datasets\blca_tcga_gdc\cna_sparse_matrix.npz


In [67]:
cna_sparse = scipy.sparse.load_npz(sparse_matrix_path)

In [68]:
gene_ids = []

for chunk in pd.read_csv(cna_data_path, sep="\t", chunksize=chunk_size):
    gene_ids.extend(chunk["Entrez_Gene_Id"].tolist())
    sparse_chunk = csr_matrix(chunk.drop(columns=["Entrez_Gene_Id"]).values)
    sparse_chunks.append(sparse_chunk)

# Combine sparse chunks
cna_sparse = vstack(sparse_chunks)

# Convert gene IDs to a DataFrame
gene_df = pd.DataFrame({"Entrez_Gene_Id": gene_ids})

In [70]:
# Load the first chunk of the CNA data to inspect column headers
first_chunk = next(pd.read_csv(cna_data_path, sep="\t", chunksize=1))
cna_sample_ids = first_chunk.columns[1:]  # Exclude the first column (Entrez_Gene_Id)

# Compare sample IDs
print("Clinical Sample IDs:", len(clinical_df["SAMPLE_ID"].unique()))
print("CNA Sample IDs:", len(cna_sample_ids))

# Check for mismatches
mismatched_ids = set(clinical_df["SAMPLE_ID"]) - set(cna_sample_ids)
if mismatched_ids:
    print(f"Mismatched Sample IDs: {mismatched_ids}")
else:
    print("No mismatched Sample IDs.")

Clinical Sample IDs: 413
CNA Sample IDs: 392
Mismatched Sample IDs: {'TCGA-G2-A2EC-01A', 'TCGA-DK-A2I2-01A', 'TCGA-E5-A4TZ-01A', 'TCGA-DK-A1AG-01A', 'TCGA-UY-A78P-01A', 'TCGA-E7-A8O8-01A', 'TCGA-DK-A3IM-01A', 'TCGA-DK-AA6P-01A', 'TCGA-FD-A43N-01A', 'TCGA-DK-A3IV-01A', 'TCGA-C4-A0F7-01A', 'TCGA-C4-A0F1-01A', 'TCGA-XF-A8HE-01A', 'TCGA-GV-A3QH-01A', 'TCGA-DK-A2I6-01A', 'TCGA-DK-A3IN-01A', 'TCGA-BT-A2LD-01A', 'TCGA-C4-A0F0-01A', 'TCGA-DK-A3IU-01A', 'TCGA-UY-A78M-01A', 'TCGA-GC-A4ZW-01A'}


In [71]:
# Find common sample IDs
common_sample_ids = set(clinical_df["SAMPLE_ID"]).intersection(set(cna_sample_ids))

# Filter clinical data
clinical_df = clinical_df[clinical_df["SAMPLE_ID"].isin(common_sample_ids)]

# Filter CNA data
filtered_chunks = []
for chunk in pd.read_csv(cna_data_path, sep="\t", chunksize=chunk_size):
    filtered_chunk = chunk[["Entrez_Gene_Id"] + list(common_sample_ids)]
    filtered_chunks.append(filtered_chunk)

# Combine filtered chunks into a single DataFrame
cna_filtered_df = pd.concat(filtered_chunks, axis=0)

# Convert to sparse matrix
sparse_chunks = []
for chunk in filtered_chunks:
    sparse_chunk = csr_matrix(chunk.drop(columns=["Entrez_Gene_Id"]).values)
    sparse_chunks.append(sparse_chunk)

# Combine sparse chunks
cna_sparse = vstack(sparse_chunks)

# Update gene IDs
gene_ids = cna_filtered_df["Entrez_Gene_Id"].tolist()
gene_df = pd.DataFrame({"Entrez_Gene_Id": gene_ids})

In [72]:
assert cna_sparse.shape[1] == len(clinical_df["SAMPLE_ID"]), "Mismatch between CNA data columns and sample IDs"
print("Dimensions aligned successfully!")

Dimensions aligned successfully!


In [73]:
# Create a DataFrame for the sparse matrix
cna_dense = pd.DataFrame.sparse.from_spmatrix(cna_sparse, columns=list(common_sample_ids), index=gene_df["Entrez_Gene_Id"])

# Transpose the DataFrame to match clinical data format
cna_dense = cna_dense.T

# Merge with clinical data
merged_data = pd.merge(clinical_df, cna_dense, left_on="SAMPLE_ID", right_index=True, how="inner")

print("Merged data shape:", merged_data.shape)

Merged data shape: (392, 38329)


In [None]:
# Create a DataFrame for the sparse matrix
cna_dense = pd.DataFrame.sparse.from_spmatrix(cna_sparse, columns=list(common_sample_ids), index=gene_df["Entrez_Gene_Id"])

# Transpose the DataFrame to match clinical data format
cna_dense = cna_dense.T

# Merge with clinical data
merged_data = pd.merge(clinical_df, cna_dense, left_on="SAMPLE_ID", right_index=True, how="inner")

print("Merged data shape:", merged_data.shape)

# Extract the top 10 rows
top_10_rows = merged_data.head(10)

# Save the top 10 rows to a file
output_file = "merged_data.txt"
top_10_rows.to_csv(output_file, sep="\t", index=False)

print(f"Top 10 rows of merged data saved to {output_file}")

In [75]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Step 1: Drop irrelevant columns
irrelevant_columns = ["PATIENT_ID", "SAMPLE_ID", "OTHER_SAMPLE_ID"]
merged_data.drop(columns=irrelevant_columns, inplace=True)

# Step 2: Handle missing data
numerical_cols = merged_data.select_dtypes(include=["float64", "int64"]).columns
categorical_cols = merged_data.select_dtypes(include=["object"]).columns

# Impute numerical columns with median
for col in numerical_cols:
    merged_data[col].fillna(merged_data[col].median(), inplace=True)

# Impute categorical columns with mode
for col in categorical_cols:
    merged_data[col].fillna(merged_data[col].mode()[0], inplace=True)

# Step 3: Encode categorical variables
encoder = OneHotEncoder(sparse=True)
encoded_categorical = encoder.fit_transform(merged_data[categorical_cols])
encoded_df = pd.DataFrame.sparse.from_spmatrix(
    encoded_categorical, columns=encoder.get_feature_names_out(categorical_cols)
)
merged_data = pd.concat([merged_data.drop(columns=categorical_cols), encoded_df], axis=1)

# Step 4: Normalize numerical features
scaler = StandardScaler()
merged_data[numerical_cols] = scaler.fit_transform(merged_data[numerical_cols])

# Step 5: Feature selection
X = merged_data.drop(columns=["TARGET_COLUMN"])  # Replace "TARGET_COLUMN" with your actual target
y = merged_data["TARGET_COLUMN"]

# Select top 100 features based on ANOVA F-statistic
selector = SelectKBest(score_func=f_classif, k=100)
X_selected = selector.fit_transform(X, y)

# Step 6: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Step 7: Train a Random Forest classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

TypeError: cannot perform median with type Sparse[float64, 0]