In [316]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np


In [317]:
pd.set_option('display.max_rows', 500)

In [318]:
def read_files_in_directory(directory):
    file_paths = []
    for root, _, files in os.walk(directory):

        for file in files:
            if file[-3:] == "tsv":
                file_path = os.path.join(root, file)
                file_paths.append(file_path)
    return file_paths


In [319]:
directory_path = "data/Protein_Expression"
files = read_files_in_directory(directory_path)
len(files)

7906

In [320]:
def get_case_id_from_filename(file):
    index = file.find("TCGA")
    return file[index:index+12]

In [321]:
def extract_protein_expression(file):
    df = pd.read_csv(file, sep='\t')
    subset_df = df[['protein_expression', 'AGID']]
    transposed_df = subset_df.set_index('AGID').T
    transposed_df = transposed_df.reset_index(drop=True)
    transposed_df.insert(0, 'case_submitter_id', [get_case_id_from_filename(file)])
    return transposed_df

In [322]:
def merge_cases(files,extractor):
    merged = extractor(files[0])
    for file in files[1:]:
        to_be_merge = extractor(file)
        merged = pd.concat([to_be_merge, merged], axis=0, ignore_index=True)
    return merged
    

In [323]:
# merged_rppa = merge_cases(files,extract_protein_expression)
# merged_rppa.to_csv("full_rppa_raw.csv",index=False)


In [324]:
merged_rppa = pd.read_csv('full_rppa_raw.csv')
merged_rppa.shape

(7906, 488)

In [325]:
def calculate_column_statistics(df):
    results = pd.DataFrame(index=df.columns, columns=['Mean', 'Variance', 'NumUniqueValues', 'NumNaNs'])
    for column in df.columns:
        if column not in ["case_submitter_id"]:
            results.at[column, 'Mean'] = df[column].mean()
            results.at[column, 'Variance'] = df[column].var()
            results.at[column, 'NumUniqueValues'] = df[column].nunique()
            results.at[column, 'NumNaNs'] = df[column].isna().sum()
    return results

In [326]:
statistics= calculate_column_statistics(merged_rppa)
statistics["NumNaNs"].value_counts()

NumNaNs
406     238
0       216
7252     12
7146     10
5335      4
6964      1
4691      1
1000      1
4237      1
2417      1
6110      1
1415      1
Name: count, dtype: int64

In [327]:
7906*0.2

1581.2

among all 488 proteins, for 20% Missingness, 7906*0.2 = 1581.2, we drop 12 + 10 + 4 + 1 + 1 + 1 + 1 + 1 =31 proteins since too many missing. For other 238 + 1(1000) + 1(1415) preotins consider inputation.

Drop

In [328]:
def extract_rows_by_nan(df, nan_threshold=1581):
    columns = [column for column in df.columns if df[column].isna().sum() >= nan_threshold]
    return columns

In [329]:
empmpty_columns = extract_rows_by_nan(merged_rppa,1581)
merged_rppa = merged_rppa.drop(columns=empmpty_columns)
merged_rppa.shape

(7906, 457)

Inputation

In [330]:
path = "data/clinical.cart.2024-06-25/clinical.tsv"
df = pd.read_csv(path,sep='\t')

In [331]:
df['days_to_death'] = pd.to_numeric(df['days_to_death'], errors='coerce')
df['days_to_last_follow_up'] = pd.to_numeric(df['days_to_last_follow_up'], errors='coerce')

In [332]:
colums = ["ajcc_pathologic_stage","vital_status","days_to_death","days_to_last_follow_up","case_submitter_id","primary_diagnosis"]
deduplicate = df[df.index %2 !=0].sort_values(by='case_submitter_id')

In [333]:
merged_rppa = pd.merge(merged_rppa, deduplicate[colums], on='case_submitter_id', how='inner')
merged_rppa["ajcc_pathologic_stage"].unique()

array(['Stage IV', 'Stage I', "'--", 'Stage III', 'Stage IIB', 'Stage IA',
       'Stage II', 'Stage IIA', 'Stage IIIA', 'Stage IB', 'Stage IIIB',
       'Stage IVA', 'Stage 0', 'Stage IIIC', 'Stage IIC', 'Stage X',
       'Stage IVB', 'Stage IVC', 'Stage IS', 'Not Reported'], dtype=object)

In [334]:
def encode_stage_column(df, column_name):
    # Define mapping dictionary
    stage_mapping = {
        "Stage I": 0,
        "Stage IA": 1,
        "Stage IB": 2,
        "Stage IS": 3,
        "Stage II": 4,
        "Stage IIA": 5,
        "Stage IIB": 6,
        "Stage IIC": 7,
        "Stage III": 8,
        "Stage IIIA": 9,
        "Stage IIIB": 10,
        "Stage IIIC": 11,
        "Stage IV": 12,
        "Stage IVA": 13,
        "Stage IVB": 14,
        "Stage IVC": 15,
        "Stage X": 16,
        "Stage 0":17,
        "'--": -1,
        "Not Reported": -1
        
            }
    
    # Apply mapping to the column
    df[column_name] = df[column_name].map(stage_mapping)
    
    return df

In [335]:
merged_rppa_before_inputation = encode_stage_column(merged_rppa,"ajcc_pathologic_stage")

In [336]:
proteins_only_cols = merged_rppa_before_inputation.columns.drop(["case_submitter_id", 'ajcc_pathologic_stage', 'vital_status', 'days_to_death',
       'days_to_last_follow_up', 'primary_diagnosis'])

In [337]:
statistics= calculate_column_statistics(merged_rppa_before_inputation[proteins_only_cols])
statistics["NumNaNs"].value_counts()

NumNaNs
405     238
0       216
990       1
1414      1
Name: count, dtype: int64

In [338]:
merged_rppa_before_inputation.shape

(7895, 462)

7906-7895 = 11 cases are dropped while merging since lack of clinical information

among proteins that need to be inputated, drop cases if pathologic_stage not reported

In [339]:
def extract_columns_by_nan_num(df, nan_threshold):
    columns = [column for column in df.columns if df[column].isna().sum() == nan_threshold]
    return columns

In [340]:
protein_0f_1414_missing = extract_columns_by_nan_num(merged_rppa_before_inputation,1414)
protein_0f_990_missing = extract_columns_by_nan_num(merged_rppa_before_inputation,990)
protein_0f_405_missing = extract_columns_by_nan_num(merged_rppa_before_inputation,405)


In [341]:
def drop_rows(df, columns_to_check):
    all_nan = df[columns_to_check].isnull().all(axis=1)
    stage_minus_one = df["ajcc_pathologic_stage"] == -1
    to_drop = all_nan & stage_minus_one
    df = df[~to_drop]
    return df


In [342]:
merged_df_no_inputation = drop_rows(merged_rppa_before_inputation,protein_0f_1414_missing)
merged_df_no_inputation = drop_rows(merged_rppa_before_inputation,protein_0f_990_missing)
merged_df_no_inputation = drop_rows(merged_rppa_before_inputation,protein_0f_405_missing)

In [343]:
merged_df_no_inputation.shape

(7784, 462)

7895 - 7784 = 111 are dropped , good, most proteins that need inputation have ajcc_pathologic_stage avaliable

In [None]:
def impute_missing_with_group_median(merged_df_no_inputation, reference_column, columns_to_impute):

    for col in columns_to_impute:

        median_values = merged_df_no_inputation.groupby(reference_column)[col].median()

        for index, row in merged_df_no_inputation[merged_df_no_inputation[col].isnull()].iterrows():

            
            label_value = row[reference_column]

            merged_df_no_inputation.at[index, col] = median_values[label_value]
    return merged_df_no_inputation