In [21]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np


In [42]:
pd.set_option('display.max_rows', 500)

In [30]:
def read_files_in_directory(directory):
    file_paths = []
    for root, _, files in os.walk(directory):

        for file in files:
            if file[-3:] == "tsv":
                file_path = os.path.join(root, file)
                file_paths.append(file_path)
    return file_paths


In [32]:
directory_path = "data/Protein_Expression"
files = read_files_in_directory(directory_path)
len(files)

7906

In [36]:
def get_case_id_from_filename(file):
    index = file.find("TCGA")
    return file[index:index+12]

In [37]:
def extract_protein_expression(file):
    df = pd.read_csv(file, sep='\t')
    subset_df = df[['protein_expression', 'AGID']]
    transposed_df = subset_df.set_index('AGID').T
    transposed_df = transposed_df.reset_index(drop=True)
    transposed_df.insert(0, 'case_submitter_id', [get_case_id_from_filename(file)])
    return transposed_df

In [39]:
def merge_cases(files,extractor):
    merged = extractor(files[0])
    for file in files[1:]:
        to_be_merge = extractor(file)
        merged = pd.concat([to_be_merge, merged], axis=0, ignore_index=True)
    return merged
    

In [40]:
# merged_rppa = merge_cases(files,extract_protein_expression)
# merged_rppa.to_csv("full_rppa_raw.csv",index=False)


In [41]:
merged_rppa = pd.read_csv('full_rppa_raw.csv')
merged_rppa.shape

(7906, 488)

In [45]:
def calculate_column_statistics(df):
    results = pd.DataFrame(index=df.columns, columns=['Mean', 'Variance', 'NumUniqueValues', 'NumNaNs'])
    for column in df.columns:
        if column not in ["case_submitter_id"]:
            results.at[column, 'Mean'] = df[column].mean()
            results.at[column, 'Variance'] = df[column].var()
            results.at[column, 'NumUniqueValues'] = df[column].nunique()
            results.at[column, 'NumNaNs'] = df[column].isna().sum()
    return results

In [None]:
statistics= calculate_column_statistics(merged_rppa)
statistics

In [47]:
def extract_rows_by_nan(df, nan_threshold=353):
    columns = [column for column in df.columns if df[column].isna().sum() == nan_threshold]
    return columns

In [51]:
statistics

Unnamed: 0,Mean,Variance,NumUniqueValues,NumNaNs
case_submitter_id,,,,
AGID00100,0.115085,0.051381,7688.0,0.0
AGID00111,0.076493,0.052172,7699.0,0.0
AGID00101,-0.052685,0.211264,7753.0,0.0
AGID00001,-0.074658,0.287058,7753.0,0.0
AGID00002,-0.112266,0.11996,7686.0,0.0
AGID00003,-0.047794,0.52758,7775.0,0.0
AGID00443,-0.03222,0.054605,7698.0,0.0
AGID00120,-0.427699,0.440076,7704.0,0.0
AGID00004,-0.154013,0.481266,7749.0,0.0


In [49]:
statistics= calculate_column_statistics(merged_rppa)
statistics["NumNaNs"].unique()

array([nan, 0, 406, 6964, 4691, 1000, 7252, 7146, 5335, 4237, 2417, 6110,
       1415], dtype=object)