# **Constructing GEO dataset (all_data_geo and all_labels_geo)**

The purpose of this Python notebook is to construct the datasets used to evaluate the previously generated XGBoost models. It produces two datasets: "all_data_geo", which contains the samples (gene expression values normalized as TPM) from the ten cancer types covered in this thesis, and "all_labels_geo", which contains the label (type of cancer) of each sample.

In [1]:
# Import the libraries needed to construct the datasets
import pandas as pd
import pickle
import os

In [2]:
# Locations of the caches that persist the manual labelling decisions
# between runs of this notebook.
ignored_tissues_geo_file_path = "ignored_tissues_list_geo.pkl"
class_mapping_geo_file_path = "class_mapping_geo.pkl"

# class mapping: raw tissue description -> label entered by the user
class_mapping_geo = {}
# tissue descriptions the user chose to ignore (empty answer at the prompt)
ignored_tissues_geo = []

# Load each cache only if its own file exists. The two files are checked
# independently so that a missing ignore-list does not abort the cell when
# only the class mapping has been saved (and vice versa).
# NOTE: pickle.load executes arbitrary code from the file; only load caches
# that were produced by this notebook.

if os.path.exists(ignored_tissues_geo_file_path):

    # load the list of ignored tissues from a pickle file
    with open(ignored_tissues_geo_file_path, 'rb') as f:
        ignored_tissues_geo = pickle.load(f)

if os.path.exists(class_mapping_geo_file_path):

    # load the tissue -> label mapping from a pickle file
    with open(class_mapping_geo_file_path, 'rb') as f:
        class_mapping_geo = pickle.load(f)

In [3]:
def _read_expression(annot, tpm):
    """Load the TPM table and return it as a samples x gene-symbols DataFrame."""
    # READ ANNOTATION FILE (to get Gene Symbol)
    df_symbols = pd.read_csv(annot, sep='\t', dtype={"ChrStart": str, "ChrStop": str})

    # READ TPM FILE
    df_tpm = pd.read_csv(tpm, sep='\t')

    # Replace GeneID with Symbol.
    # NOTE(review): symbols are matched to TPM rows by row position (both
    # frames carry the default RangeIndex), not by GeneID - this assumes the
    # two files list the genes in the same order; TODO confirm.
    df_tpm = df_tpm.drop(columns="GeneID")
    df_tpm.insert(0, "Symbol", df_symbols["Symbol"])

    # Symbols become the columns and sample IDs the row index.
    return df_tpm.set_index("Symbol").transpose()


def _parse_characteristic(cell):
    """Return the lower-cased value of one `key: value` characteristics cell."""
    parts = cell.strip().split(":")
    # A cell without a colon has no key part; use the whole cell instead of
    # failing with IndexError.
    value = parts[1] if len(parts) > 1 else parts[0]
    return value.strip().replace('"', '').lower()


def _read_sample_tissues(serie):
    """Map each sample ID in the series matrix to its tissue description."""
    sample_ids = None
    tissues = None

    with open(serie, 'r') as file:
        for line in file:
            # Only the first occurrence of each header line is used.
            if sample_ids is None and line.startswith("!Sample_geo_accession"):
                sample_ids = [gsm.strip('"') for gsm in line.split()[1:]]
            elif tissues is None and line.startswith("!Sample_characteristics_ch1"):
                tissues = [_parse_characteristic(cell) for cell in line.split("\t")[1:]]

            if sample_ids is not None and tissues is not None:
                break

    sample_ids = sample_ids or []
    tissues = tissues or []
    if len(sample_ids) != len(tissues):
        raise ValueError(
            f"{serie}: found {len(sample_ids)} sample IDs but "
            f"{len(tissues)} characteristics values"
        )

    return dict(zip(sample_ids, tissues))


def _resolve_label(tissue):
    """Return the class label for a tissue, or None if the tissue is ignored.

    Unknown tissues are resolved interactively; the answer is remembered in
    the module-level `class_mapping_geo` / `ignored_tissues_geo`.
    """
    if tissue in ignored_tissues_geo:
        return None

    if tissue in class_mapping_geo:
        return class_mapping_geo[tissue]

    # request user to insert the label
    mapping = input(f'Insert label for \"{tissue}\": ')
    if mapping == "":
        # An empty answer means "ignore this tissue from now on".
        ignored_tissues_geo.append(tissue)
        return None

    class_mapping_geo[tissue] = mapping
    return mapping


def extract_data(serie, annot, tpm):
    """Build the expression matrix and the labels for one GEO series.

    Parameters
    ----------
    serie : path of the series matrix text file; the first
        `!Sample_geo_accession` and `!Sample_characteristics_ch1` lines
        provide the sample IDs and their tissue descriptions.
    annot : path of the tab-separated annotation file with a `Symbol` column.
    tpm : path of the tab-separated TPM file with a `GeneID` column followed
        by one column per sample.

    Returns
    -------
    (data, labels) : `data` holds one row per kept sample (indexed by sample
        ID) and one column per gene symbol; `labels` has a single `Tissue`
        column whose i-th row is the label of the i-th row of `data`.
        Both are empty when no sample is kept.

    Raises
    ------
    ValueError : if the series matrix lists a different number of sample IDs
        and characteristics values.

    Notes
    -----
    Samples are kept only if they appear in both files and their tissue is
    not ignored. Each expression row is paired with the label of the same
    sample ID, so TPM samples missing from the series matrix cannot shift
    the labels. The function may prompt the user via `input()` and mutates
    the module-level `class_mapping_geo` and `ignored_tissues_geo`.
    """
    all_data_geo = _read_expression(annot, tpm)
    tissue_by_sample = _read_sample_tissues(serie)

    kept_positions = []
    filtered_labels = []

    for position, sample_id in enumerate(all_data_geo.index):
        # Skip expression rows that have no entry in the series matrix.
        if sample_id not in tissue_by_sample:
            continue

        label = _resolve_label(tissue_by_sample[sample_id])
        if label is None:
            continue

        kept_positions.append(position)
        filtered_labels.append(label)

    # Select all kept rows at once instead of concatenating one-row frames.
    data = all_data_geo.iloc[kept_positions]

    # Create a DataFrame with only the values of the "Tissue" column
    labels = pd.DataFrame({"Tissue": filtered_labels})

    return (data, labels)


In [4]:
# GEO accession numbers to load, grouped by the sub-directory of
# ./data/data-geo/ that holds their files.
datasets = {
    "bladder": ["GSE229410", "GSE216037"],
    "brain": ["GSE119834", "GSE213527"],
    "breast": ["GSE147356"],
    "colon": ["GSE180440", "GSE146009"],
    "liver": ["GSE144269", "GSE77314"],
    "lung": ["GSE60052"],
    "pancreas": ["GSE131050-GPL18573", "GSE131050-GPL16791"],
    "prostate": ["GSE229904"],
    "skin": ["GSE142441"],
    "cancers": ["GSE184398"],
}

# One expression frame and one label frame per accession, in load order.
all_tpms = []
all_labels = []

for cancer_type, accession_numbers in datasets.items():
    for accession_number in accession_numbers:

        # The three input files of this accession
        serie = f"./data/data-geo/{cancer_type}/{accession_number}_series_matrix.txt"
        annot = f"./data/data-geo/{cancer_type}/Human.GRCh38.p13.annot_{accession_number}.tsv"
        tpm = f"./data/data-geo/{cancer_type}/{accession_number}_norm_counts_TPM_GRCh38.p13_NCBI.tsv"

        tpm_frame, label_frame = extract_data(serie, annot, tpm)
        all_tpms.append(tpm_frame)
        all_labels.append(label_frame)

# Stack every accession row-wise into a single matrix / label table.
all_tpms_df = pd.concat(all_tpms, axis=0)
all_labels_df = pd.concat(all_labels, axis=0)

print(all_tpms_df.shape)
print(all_labels_df.shape)

(2098, 39376)
(2098, 1)


In [5]:
all_labels_file_path = "all_labels_geo.pkl"
all_data_file_path = "all_data_geo.pkl"

# Everything this notebook persists, written in this order:
#   1. the combined expression matrix and its labels,
#   2. the labelling decisions (ignored tissues and tissue -> label mapping)
#      so that later runs do not have to prompt for them again.
for output_path, payload in (
    (all_data_file_path, all_tpms_df),
    (all_labels_file_path, all_labels_df),
    (ignored_tissues_geo_file_path, ignored_tissues_geo),
    (class_mapping_geo_file_path, class_mapping_geo),
):
    with open(output_path, 'wb') as f:
        pickle.dump(payload, f)
