## Preprocess the copy-number variation (CNV) dataset

In [1]:
import pandas as pd
import numpy as np
import os

We first need the path to our folder containing case-organized data and the destination for storing the processed CNV data.

In [2]:
# Root folder that is listed for cases: each entry is treated as one case, and
# CNV files are looked up in its "cnv" sub-folder.
# NOTE(review): the leading "..." looks like a placeholder - replace it with the real location.
ORGANIZED_BY_CASE_PATH = ".../TCGA/data_by_cases"
# CSV file the processed CNV matrix is written to.
DESTINATION_DATA_PATH = ".../TCGA/data_processed/PRCSD_cnv_data.csv"


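We use the following function to read in a single CNV file. It expects a whitespace-delimited table whose header row contains `gene_id` and `copy_number` columns (matched case-insensitively), and it should be adapted to the format of the CNV data used for your project.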

In [3]:
def read_cnv(filepath, case_id):
    """Read one CNV file and return it as a single-row DataFrame.

    The file is parsed as a whitespace-delimited table whose first non-blank
    line is the header. All text is upper-cased before parsing, so the header
    must contain ``gene_id`` and ``copy_number`` columns (in any letter case).

    Parameters
    ----------
    filepath : str
        Path of the CNV file to read.
    case_id : str
        Identifier stored in the ``CASE_ID`` column of the returned row.

    Returns
    -------
    pandas.DataFrame
        A single row: a ``CASE_ID`` column followed by one column per gene ID.
        Copy numbers are kept as strings; a missing copy number is ``"-1"``.
    """
    with open(filepath) as f:
        # split() tokenises on any run of whitespace. Blank lines are skipped
        # because they would otherwise become an all-missing row (and, after
        # the transpose below, a bogus gene column).
        rows = [line.upper().split() for line in f if line.strip()]
    # transform 2d array into dataframe; short rows are padded with missing values
    matrix = pd.DataFrame(rows)
    # the first row holds the column names: promote it to the header ...
    matrix.columns = matrix.iloc[0]
    # ... and drop that header row from the data
    matrix = matrix.drop(0)
    # Replace missing values with "-1". The result is assigned back because an
    # in-place fillna on a column selection does not update the parent frame
    # when pandas Copy-on-Write is active.
    matrix["COPY_NUMBER"] = matrix["COPY_NUMBER"].fillna("-1")
    # transpose so that genes become columns, leaving one COPY_NUMBER row
    matrix = matrix[["GENE_ID", "COPY_NUMBER"]].set_index("GENE_ID").transpose()
    # label that row with the case ID and expose the label as a CASE_ID column
    return (
        matrix.rename(index={"COPY_NUMBER": case_id})
        .rename_axis(index="CASE_ID")
        .reset_index()
    )

In [None]:
def preprocess_cnv(by_case_path):
    """Build a case-by-gene CNV matrix from a folder of case sub-folders.

    Every sub-folder of ``by_case_path`` is treated as one case. The first
    file (in sorted order) of its ``cnv`` sub-folder is read with
    ``read_cnv``; cases without such a file are reported and skipped. The
    per-case rows are concatenated and indexed by ``case_id``.

    As preliminary feature reduction, a column is dropped when it holds a
    single unique value across all cases or when it has a missing value,
    i.e. the ``"-1"`` placeholder or a NaN introduced by concatenating cases
    whose files do not list the same genes.

    Parameters
    ----------
    by_case_path : str
        Folder containing one sub-folder per case.

    Returns
    -------
    pandas.DataFrame
        Rows are cases (index ``case_id``), columns are the retained gene IDs.

    Raises
    ------
    ValueError
        If no case under ``by_case_path`` has a CNV file.
    """
    per_case_frames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the output reproducible between runs and machines.
    for case in sorted(os.listdir(by_case_path)):
        case_dir = os.path.join(by_case_path, case)
        if not os.path.isdir(case_dir):
            # stray files next to the case folders are not cases
            continue
        cnv_dir = os.path.join(case_dir, "cnv")
        # A missing "cnv" folder is handled like an empty one. Hidden entries
        # (names starting with ".") are ignored so they are never parsed as data.
        cnv_files = (
            sorted(name for name in os.listdir(cnv_dir) if not name.startswith("."))
            if os.path.isdir(cnv_dir)
            else []
        )
        if not cnv_files:
            print(f"{case} has no CNV expression data")
            continue
        per_case_frames.append(read_cnv(os.path.join(cnv_dir, cnv_files[0]), case))

    if not per_case_frames:
        # pd.concat would raise a ValueError too, but without saying where to look
        raise ValueError(f"No CNV files were found under {by_case_path!r}")

    all_cnv_data = pd.concat(per_case_frames)
    # reset index to case ID
    all_cnv_data = all_cnv_data.rename(columns={"CASE_ID": "case_id"}).set_index("case_id")

    # A column is uninformative when it has one unique value (NaN counts as a value) ...
    is_constant = all_cnv_data.nunique(dropna=False) == 1
    # ... and incomplete when any case has the "-1" placeholder or a NaN.
    has_missing = (all_cnv_data == "-1").any() | all_cnv_data.isna().any()
    drop_mask = is_constant | has_missing
    to_drop = drop_mask[drop_mask].index

    print(f"{len(to_drop)} columns in data will be dropped, out of {len(all_cnv_data.columns)}")
    return all_cnv_data.drop(columns=to_drop)



In [None]:
# Build the CNV matrix from every case folder under ORGANIZED_BY_CASE_PATH.
cnv_data = preprocess_cnv(ORGANIZED_BY_CASE_PATH)
# Persist the matrix as CSV; the case_id index is written as the first column.
cnv_data.to_csv(DESTINATION_DATA_PATH)

---