In [255]:
"""
# ****************************************************************************
#                       DATA ORGANIZATION and FORMATING
# ****************************************************************************

In this script we organize the radiation datasets: 
 - type-format data that is in unusual type (see dates)
 - renames and change column order where necessary
 - remove redundant columns, 
 - ensure that all datasets have the same "observations" (measured samples)
Finally save them as DataFrame.csv which can then be easily shared across 
scripts and programming languages

# ****************************************************************************
"""


import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import nan


# ****************************************************************************
#              Clinical, Mmetabolites and Lipoproteins tables
# ****************************************************************************
# Create the major dataframes using the following files:
# List of clinical table with features and timepoints
# => ClinicalData_Klinisk.csv ------------ [n: 1028] 
#        Patients list and clinical data  (sheets "Klinik fra Randi")
#        - (including medications, treatments etc...)
# => ClinicalData_Oversikt.csv ----------- [n: 250] 
#        Sample list of all timepoint measurements (sheet "Oversikt")
#        - (in theory 5 each patient)
# => Lipoproteinene_Combined2.csv -------- [n: 1026]
#        Lipoprotein measurements (blood serum)
# => Metabolittene_comb_final_clean.csv -- [n: 1028]
#        Metabolites measurments (blood serum)

# ------ LOAD data-tables ------
path_Klinisk  = "/Users/mattesa/molbreastlab-storage/work/Radiation_study/Matteo/ClinicalData_Klinisk.csv" 
path_Oversikt = "/Users/mattesa/molbreastlab-storage/work/Radiation_study/Matteo/ClinicalData_Oversikt.csv" 
path_Lipopro  = "/Users/mattesa/molbreastlab-storage/work/Radiation_study/Matteo/Lipoproteinene_Combined2.csv" 
path_Metabol  = "/Users/mattesa/molbreastlab-storage/work/Radiation_study/Matteo/Metabolittene_comb_final_clean.csv" 
root_path = os.path.dirname(path_Lipopro)
             
# Load clinical data files, Lipoprotein and Metabolites measurments files 
# NOTE: There are some redundant columns!
CD_klinisk = pd.read_csv( path_Klinisk,  header=1, index_col=False)
CD_oversik = pd.read_csv( path_Oversikt, header=0, index_col=False)
LP_measure = pd.read_csv( path_Lipopro,  header=0, index_col=False)
MB_measure = pd.read_csv( path_Metabol,  header=0, index_col=False)


# ------ PRUNING the data ------
# Extract the timepoints as defined by Guro. We could also redifine by looking 
# dates or use dates to find matching measurments between different dataframes 
sele_oversik  = [ "Samplenames lipo", "Names date modified", "Samling date", 
                  "Dato stråleterapi", 
                  "Timepoint", "Timepoint_kodet",
                  "Alder", "BMI", 
                  "HEr2 ", "ER ", "PGR","kjemo", "herceptin" , "hormornbeh",
                  "FatPAS.1", "FatPAS.2", "FatPAS.3", "FatPAS.4", "FatPAS.5"]
CD_samples  = CD_oversik.loc[:,sele_oversik]

# Simplify the names of the Metabolites dataframe columns, removing the unit
names = MB_measure.columns
MB_measure.columns = [xx.split(" (RawConc)")[0] for xx in names]

# Rename column e.g. "Samplenames lipo" in CD_oversik === "name" in LP_measure
CD_samples = CD_samples.rename(columns={"Samplenames lipo": "Samplename"})
LP_measure = LP_measure.rename(columns={"name": "Samplename"})
MB_measure = MB_measure.rename(columns={"Sample Name": "Samplename"})
CD_samples = CD_samples.rename(columns={"HEr2 ": "HER2"})
CD_samples = CD_samples.rename(columns={"ER ": "ER"})

# Remove quality control measurments: strings starting with "QC"
mask_QC = ["QC" not in xx   for xx in CD_samples["Samplename"]]
CD_samples = CD_samples.loc[ mask_QC, : ]
mask_QC = ["QC" not in xx   for xx in LP_measure["Samplename"]]
LP_measure = LP_measure.loc[ mask_QC, : ]
mask_QC = ["QC" not in xx   for xx in MB_measure["Samplename"]]
MB_measure = MB_measure.loc[ mask_QC, : ]

# Insert a "PatientID" column
pID = [ xx.split("_")[0] for xx in CD_samples["Samplename"]]
CD_samples.insert( loc=1, column="PatientID", value=pID)
pID = [ xx.split("_")[0] for xx in LP_measure["Samplename"]]
LP_measure.insert( loc=1, column="PatientID", value=pID)
pID = [ xx.split("_")[0] for xx in MB_measure["Samplename"]]
MB_measure.insert( loc=1, column="PatientID", value=pID)


# ! Ensure that the different DFs (CD, LP and MB) have the exact same set of 
#   samples in their list. Use "Samplename" compare and remove those samples
#   that appear only in one dataframe.
# NOTE: check manually on the list what we are actually removing !!!
#       Specifically, LP have less samples than MP and CD
# Use set-difference to find missing samples in LP_measure, then reomve those
# from the list. Finally, subset DFs selecting only common samples
miss_aa = np.setdiff1d( CD_samples["Samplename"], LP_measure["Samplename"])
miss_bb = np.setdiff1d( MB_measure["Samplename"], LP_measure["Samplename"])
miss_cc = np.setdiff1d( LP_measure["Samplename"], CD_samples["Samplename"])
miss_nn = miss_aa.tolist() + miss_bb.tolist() + miss_cc.tolist()  
list_nn = np.setdiff1d( CD_samples["Samplename"], miss_nn )
CD_samples = CD_samples.loc[CD_samples["Samplename"].isin(list_nn)]
list_nn = np.setdiff1d( MB_measure["Samplename"], miss_nn )
MB_measure = MB_measure.loc[MB_measure["Samplename"].isin(list_nn)]
list_nn = np.setdiff1d( LP_measure["Samplename"], miss_nn )
LP_measure = LP_measure.loc[LP_measure["Samplename"].isin(list_nn)]


# At this point we have the exact same entries in both DataFrames. 
# (Check by running np.setdiff1d in all combinations between the 3 DFs)
# Sort-by name columns "Samplename" and then remove uncategorized timepoints.
CD_samples = CD_samples.sort_values("Samplename")
LP_measure = LP_measure.sort_values("Samplename")
MB_measure = MB_measure.sort_values("Samplename")

# Correct Syntax errors in Time column ('pre ' (with space) instead of 'pre')
# idx = CD_samples.index[CD_samples['Timepoint']== 'pre '].tolist()
# CD_samples.loc[idx] = CD_samples.loc[idx]['Timepoint'] = 'pre'

# Remove the samples tagged "pre" in HER2 column
#mask = np.array(CD_samples["PGR"] == -100)
# Remove 'nan' values (CD_samples['Timepoint']== '?')
mask = CD_samples["Timepoint_kodet"].isnull().values
CD_samples = CD_samples.loc[ ~mask, : ]
LP_measure = LP_measure.loc[ ~mask, : ]
MB_measure = MB_measure.loc[ ~mask, : ]


# ------ REORGANIZE and SORT variables ------
# Now all DFs the sample measurements sorted in the same order (Samplename)
# Thus, we reindex the rows in all DFs to be the same.
CD_samples = CD_samples.reset_index(drop=True)
LP_measure = LP_measure.reset_index(drop=True)
MB_measure = MB_measure.reset_index(drop=True)

# Reorganize the "covariates" order so that we remove redundant (colinear) 
# factors and we can plot them together based on meaningful grouping
# and order (e.g. class of molecules)
# TG  = tryglyceride
# CH  = total cholesterol
# FC  = free cholesterol
# (EC = esterified cholesterol) 
# PL  = phosppholipids
# AB  = ApoB  (=== particle numbers, for specific class)
# A1  = ApoA (only HDL)
remove_LP_vars = [
                  "LDHD", "ABA1", "TBPN", "VLPN", "IDPN", "LDPN",
                  "L1PN", "L2PN", "L3PN", "L4PN", "L5PN", "L6PN"
                  ]

vars_LP_order = [  
                'V1TG', 'V1CH', 'V1FC', 'V1PL', 
                'V2TG', 'V2CH', 'V2FC', 'V2PL', 
                'V3TG', 'V3CH', 'V3FC', 'V3PL', 
                'V4TG', 'V4CH', 'V4FC', 'V4PL', 
                'V5TG', 'V5CH', 'V5FC', 'V5PL', 

                'IDTG', 'IDCH', 'IDFC', 'IDPL',

                'L1TG', 'L1CH', 'L1FC', 'L1PL',
                'L2TG', 'L2CH', 'L2FC', 'L2PL',
                'L3TG', 'L3CH', 'L3FC', 'L3PL',
                'L4TG', 'L4CH', 'L4FC', 'L4PL',
                'L5TG', 'L5CH', 'L5FC', 'L5PL',
                'L6TG', 'L6CH', 'L6FC', 'L6PL',

                'H1TG', 'H1CH', 'H1FC', 'H1PL', 'H1A1', 'H1A2',
                'H2TG', 'H2CH', 'H2FC', 'H2PL', 'H2A1', 'H2A2',
                'H3TG', 'H3CH', 'H3FC', 'H3PL', 'H3A1', 'H3A2',
                'H4TG', 'H4CH', 'H4FC', 'H4PL', 'H4A1', 'H4A2',
                ]
vars_MB_order = [ 'Lysine', 'Histidine', 'Glutamic acid',
                  'Glutamine', 'Asparagine', 'Threonine', 
                  'Glycine', 'Proline',
                  'Phenylalanine', 'Leucine', 'Alanine', 'Tyrosine', 'Valine', 'Isoleucine', 'Methionine',                  
                  'N,N-Dimethylglycine', 'Ornithine', 'Sarcosine',                 
                  '2-Hydroxybutyric acid', '3-Hydroxybutyric acid', '2-Oxoglutaric acid', '2-Aminobutyric acid',
                  'Pyruvic acid', 'Citric acid', 'Acetic acid', 'Lactic acid', 'Succinic acid', 'Formic acid', 'Acetoacetic acid',                  
                  'K-EDTA','Ca-EDTA',                 
                  'Acetone', 'Glycerol',                 
                  'D-Galactose', 'Glucose',                 
                  'Dimethylsulfone',                   
                  'Choline', 'Creatine', 'Creatinine'
                 ]

data = LP_measure.iloc[:,4:]
# Simplify the index (covariates) names by removing the "unit"
names = data.columns.values
names = [ xx.split(" [")[0] for xx in names]
data.columns = names
data = data.drop(remove_LP_vars, axis = 1)
LP_measure = pd.concat( [LP_measure.iloc[:,0:4], data], axis = 1 )

# select and sort only the variables in vars_LP_order
#data = data.loc[:,vars_LP_order] 
#data = data.reindex(vars_LP_order, axis = 1)

data = MB_measure.iloc[:,2:]
data = data.reindex(vars_MB_order, axis = 1)
MB_measure = pd.concat( [MB_measure.iloc[:,0:2], data], axis = 1 )

# Quick check to ensure that "Samplename" in all three dataframe are in the 
# same ordered list
'''
colname = "Samplename"
DF = pd.DataFrame(data =[ CD_samples[colname],  LP_measure[colname],  MB_measure[colname]]).T
DF.columns = ["CD_samples", "LP_measure", "MB_measure"]
DF["TF"] = 0
for ii in range(len(DF)):
    lst = DF.iloc[ii,:-1].values.tolist()
    res = all(ele == lst[0] for ele in lst)
    DF.iloc[ii,3] = res    
DF.loc[DF["TF"]==False, :]
'''

# ------ SAVE ------
# Save as Python-Ready .csv file
CD_samples.to_csv( root_path + "/PyDF_Sample_ClinikData.csv"  , header=True, index=False)
LP_measure.to_csv( root_path + "/PyDF_Lipoproteine.csv", header=True, index=False)
MB_measure.to_csv( root_path + "/PyDF_Metabolittene.csv", header=True, index=False)



# ****************************************************************************
#                                Survival data
# ****************************************************************************
# Create a dataframes using survival file. 
# => Radiationstudy_survival_modified_MS.xlsx ------------ [n: 250] 
#        Patients list with survival and follow-up data
# Survival data format requires more ad-hoc working; thus we work on it here separately.
# NOTE:
# Survival state:  1= lever,  2= tilbakefall, 3= kreftdød,     4= annen død 
#                  1= alive,  2= relapse,     3= cancer death, 4= other death 

#------ LOAD SURVIVAL data-tables ------
# List of patients and importantly the survival. We need to load as .xlsx 
# because there are "comment" columns with "commas" that alter the table when 
# uploaded as .csv file
path_Survival = "/Users/mattesa/molbreastlab-storage/work/Radiation_study/Matteo/Radiationstudy_survival_modified_MS.xlsx" 
CD_survival = pd.read_excel( path_Survival,  header=0, index_col=False )

# ------ PRUNING the data ------
# We could exclude columns: 
# 6 - Komorbidity - a description and requires manual conversion to some categorical data type
CD_survival = CD_survival.iloc[:, 0:22]
CD_survival['PatientID'] = CD_survival['PatientID'].astype(str)

# Uniformize the date from strings to datetime64 
# NOTE: NAN strings (possibly no follow up due to death) are convered into
#       NaT in datetime64. To find NaT use .isnull() method
CD_survival["Date_StartRadio"] = pd.to_datetime(CD_survival["Date_StartRadio"], infer_datetime_format=True, errors='coerce')
CD_survival["Date_10yCheck"]   = pd.to_datetime(CD_survival["Date_10yCheck"], infer_datetime_format=True, errors='coerce')
CD_survival["Date_Death"]      = pd.to_datetime(CD_survival["Date_Death"], infer_datetime_format=True, errors='coerce')
CD_survival["Date_Relapse"]    = pd.to_datetime(CD_survival["Date_Relapse"], infer_datetime_format=True, errors='coerce')
CD_survival["Date_QOLquestionnaire"] = pd.to_datetime(CD_survival["Date_QOLquestionnaire"], infer_datetime_format=True, errors='coerce')

# Replace NaN and convert columns into simple 0-1 values
mask    = CD_survival["N_Participant"].isnull()
CD_survival.loc[mask, "N_Participant"] = 0
mask    = CD_survival["N_Participant"].isnull()
CD_survival.loc[mask, "N_Participant"] = 0
mask    = CD_survival["NewCancer"].isnull()
CD_survival.loc[mask, "NewCancer"] = 0


# ------ SAVE ------
# Save as Python-Ready .csv file
CD_survival.to_csv( root_path + "/PyDF_Patients_Survival.csv" , header=True, index=False)



# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# ------ COMBINE survival data into with CD_samples ------

Surv_Colns = [  "Death_All_5y", "Death_All_7y", "Death_All_10y", 
                "Death_Recur_5y", "Death_Recur_7y", "Death_Recur_10y", 
                "BC_Death_Recur_5y", "BC_Death_Recur_7y", "BC_Death_Recur_10y"
             ]
# Initialize empty columns
for dd in Surv_Colns:
    CD_samples[dd] = 9

# Add one by one survival data to each row of CD_sample, based on PatientID
for dd in Surv_Colns:
    for cc in CD_samples.index.values:        
        cc_patient = CD_samples.loc[cc, "PatientID"]
        # Consider that some may be "nan" elements
        if len( CD_survival.loc[ CD_survival.loc[:,"PatientID"] == cc_patient, dd] ) == 0 : 
            S_value = nan
        else :
            S_value = CD_survival.loc[ CD_survival.loc[:,"PatientID"] == cc_patient, dd].values[0]        
        CD_samples.loc[ cc , dd ] = S_value
# Display and check that it was succesful
#CD_survival.loc[ CD_survival.loc[:,"Death_Recur_10y"] == 1, "PatientID"].unique()
#CD_samples.loc[  CD_samples.loc[:,"Death_Recur_10y"] == 1, "PatientID"].unique()


# ------ SAVE ------
# Save updated CD_samples .csv file
CD_samples.to_csv( root_path + "/PyDF_Sample_ClinikData.csv"  , header=True, index=False)



# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
print( "CD_samples  - ", len(CD_samples))
print( "CD_survival - ", len(CD_samples))
print( "LP_measure  - ", len(CD_samples))
print( "MB_measure  - ", len(CD_samples))



CD_samples  -  985
CD_survival -  985
LP_measure  -  985
MB_measure  -  985


In [None]:
# Set the variables names and colors based on the data_type used for the model
data_type = "LP"
if  data_type == "LP":
    vars_order = [  
                    'V1TG', 'V1CH', 'V1FC', 'V1PL', 
                    'V2TG', 'V2CH', 'V2FC', 'V2PL', 
                    'V3TG', 'V3CH', 'V3FC', 'V3PL', 
                    'V4TG', 'V4CH', 'V4FC', 'V4PL', 
                    'V5TG', 'V5CH', 'V5FC', 'V5PL', 

                    'IDTG', 'IDCH', 'IDFC', 'IDPL',

                    'L1TG', 'L1CH', 'L1FC', 'L1PL',
                    'L2TG', 'L2CH', 'L2FC', 'L2PL',
                    'L3TG', 'L3CH', 'L3FC', 'L3PL',
                    'L4TG', 'L4CH', 'L4FC', 'L4PL',
                    'L5TG', 'L5CH', 'L5FC', 'L5PL',
                    'L6TG', 'L6CH', 'L6FC', 'L6PL',

                    'H1TG', 'H1CH', 'H1FC', 'H1PL', 'H1A1', 'H1A2',
                    'H2TG', 'H2CH', 'H2FC', 'H2PL', 'H2A1', 'H2A2',
                    'H3TG', 'H3CH', 'H3FC', 'H3PL', 'H3A1', 'H3A2',
                    'H4TG', 'H4CH', 'H4FC', 'H4PL', 'H4A1', 'H4A2',
        
                    'TPTG', 'TPCH', 'TPA1', 'TPA2', 'TPAB',        
                    'VLCH', 'LDCH', 'HDCH',  
                    'VLTG', 'LDTG', 'HDTG',  
                    'VLFC', 'LDFC', 'HDFC', 
                    'VLPL', 'LDPL', 'HDPL', 'HDA1', 'HDA2', 
                    'VLAB', 'IDAB', 'LDAB', 
                    'L1AB', 'L2AB', 'L3AB', 'L4AB', 'L5AB', 'L6AB'
                    ]


    # Create a concatenated list of RGB values to assign vars_MB_order colors 
    # NOTE 1 : use * operator for list concatenation and *n for repeation
    # NOTE 2 : the indexes of the DF (variables) )were reorganized before based on vars_LP_order list
    vars_color = [*[[.6,.1,.0]]*4 , *[[.7,.2,.0]]*4 , *[[.8,.3,.0]]*4 , *[[.9,.4,.0]]*4, *[[1 ,.5,.0]]*4,
                  *[[.2,.8,.2]]*4 , 
                  *[[.7,.7,.0]]*4 , *[[.8,.8,.0]]*4 , *[[.9,.9,.0]]*4 , 
                  *[[1.,.7,.0]]*4 , *[[1.,.8,.0]]*4 , *[[1.,.9,.0]]*4 ,
                  *[[.1,.3,.8]]*6 , *[[.1,.4,.9]]*6 , *[[.1,.5,1.]]*6 , *[[.1,.6,1.]]*6,
                  *[[.8,.9,.8]]*5 ,
                  *[[.7,.6,.6]]*23 
    ]   
    
    
elif data_type == "MB":    
    vars_order = [  
                    'Lysine', 'Histidine', 'Glutamic acid',
                    'Glutamine', 'Asparagine', 'Threonine', 
                    'Glycine', 'Proline',
                    'Phenylalanine', 'Leucine', 'Alanine', 'Tyrosine', 'Valine', 'Isoleucine', 'Methionine',                  
                    'N,N-Dimethylglycine', 'Ornithine', 'Sarcosine',                 
                    '2-Hydroxybutyric acid', '3-Hydroxybutyric acid', '2-Oxoglutaric acid', '2-Aminobutyric acid',
                    'Pyruvic acid', 'Citric acid', 'Acetic acid', 'Lactic acid', 'Succinic acid', 'Formic acid', 'Acetoacetic acid',                  
                    'K-EDTA','Ca-EDTA',                 
                    'Acetone', 'Glycerol',                 
                    'D-Galactose', 'Glucose',                 
                    'Dimethylsulfone',                   
                    'Choline', 'Creatine', 'Creatinine'
                 ]
    # Create a concatenated list of RGB values to assign vars_MB_order colors 
    # NOTE 1 : use * operator for list concatenation and *n for repeation
    # NOTE 2 : the indexes of the DF (variables) )were reorganized before based on vars_MB_order list
    vars_color = [*[[0,0.6,1]]*3 , *[[0, 0.4, 1]]*3 , *[[0, 0.2, 1]]*2 , *[[0, 0, 1]]*7,
                  *[[1,0.8,0.2]]*3 , 
                  *[[1,0.4,0.2]]*4 , 
                  *[[1,0.0,0.2]]*7 , 
                  *[[0.2, 0.5, 0.2]]*2 , *[[0.1, 0.6, 0.1]]*2 , *[[0.0, 0.6, 0.0]]*2 , *[[.8, .8, 0]]*1, *[[0, 0.7, 0.2]]*3 ] 

"""
R = []
G = []
B = []
for xx in vars_color:
    R.append( xx[0] )
    G.append( xx[1] )
    B.append( xx[2] )

tableVars_Order_RGB = pd.DataFrame(data = np.array([vars_order, R, G, B]).T, columns = ["LP_names", "R", "G", "B"])
tableVars_Order_RGB
# --_ SAVE --- Save as Python-Ready .csv file
tableVars_Order_RGB.to_csv( root_path + "/table"+data_type+"_Order_RGB.csv", header=True, index=False)
"""