In [254]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import nan

# --------- IMPORT ---------
# Data patients klinical and survival data. Then combine the two dataframes
root_path     = "/Users/mattesa/molbreastlab-storage/work/Radiation_study/Matteo"
path_Klinisk  = "/Users/mattesa/molbreastlab-storage/work/Radiation_study/Matteo/ClinicalData_Klinisk.csv" 
CD_Klinisk = pd.read_csv( path_Klinisk, header=1, index_col=False)

CD_Klinisk.rename( columns = {"PID": "PatientID"}, inplace=True)

path_Survival = "/Users/mattesa/molbreastlab-storage/work/Radiation_study/Matteo/Radiationstudy_survival_modified_MS.xlsx" 
CD_survival = pd.read_excel( path_Survival,  header=0, index_col=False )

CD_survival = CD_survival.iloc[:, 0:22]
CD_survival['PatientID'] = CD_survival['PatientID'].astype(str)

# We sort by the patient_ID column, which is has same length and same elements
# This ensure that we can then simply combine the two table
CD_survival = CD_survival.sort_values(by='PatientID')
CD_Klinisk  = CD_Klinisk.sort_values(by='PatientID')
CD_Combined = pd.concat([CD_Klinisk, CD_survival.iloc[:,12:]], axis=1)

# CD_Combined

In [276]:
# --------- CREATE COLUMNS VALUES ---------
# Given a specific cohort/dataset, calculate all the variables of interest and
# collect the data as list, which will then become a unique columns in the
# final table
# Interesting cohorts are -
#   Death_All_5y
#   Death_All_10y
#   Death_Recur_5y
#   Death_Recur_10y
#   BC_Death_Recur_5y
#   BC_Death_Recur_10y

cohort = "Death_All_10y"
mask = CD_Combined[cohort] == 1
sele = CD_Combined[mask]

lsNames = []
lsCount = []
lsFract = []

print("Cohort:  ", cohort)
print("Size:    ", sele.shape[0])
# Cohort Size
lsNames.append( "Cohort Size" )
lsCount.append( sele.shape[0] )
lsFract.append( "" )

# ***************************************************************************
lsNames.append( "----" )
lsCount.append( "----" )
lsFract.append( "----" )

# Age data
lsNames.append( "Age_Mean" )
lsCount.append( sele["aldBL"].mean() )
lsFract.append( "" )
lsNames.append( "Age_StDev" )
lsCount.append( sele["aldBL"].std() )
lsFract.append( "" )
lsNames.append( "Age_Range" )
lsCount.append( str(sele["aldBL"].min())+" - "+str(sele["aldBL"].max()) )
lsFract.append( "" )

# BMI Data
lsNames.append( "BMI_Mean" )
lsCount.append( sele["BMI"].mean() )
lsFract.append( "" )
lsNames.append( "BMI_StDev" )
lsCount.append( sele["BMI"].std() )
lsFract.append( "" )
lsNames.append( "BMI_Range" )
lsCount.append( str(sele["BMI"].min())+" - "+str(sele["BMI"].max()) )
lsFract.append( "" )

# ***************************************************************************
lsNames.append( "----" )
lsCount.append( "----" )
lsFract.append( "----" )

# Breast Side Data
lsNames.append( "Breast Right" )
lsCount.append( (sele["Side"] == 0).sum() )
lsFract.append( (sele["Side"] == 0).sum() /sele.shape[0] *100 )
lsNames.append( "Breast Left" )
lsCount.append( (sele["Side"] == 1).sum() )
lsFract.append( (sele["Side"] == 1).sum() /sele.shape[0] *100 )
lsNames.append( "Breast Bilateral" )
lsCount.append( (sele["Side"] == 2).sum() )
lsFract.append( (sele["Side"] == 2).sum() /sele.shape[0] *100 )
# Cancer Histology Data
lsNames.append( "Hist. IDC" )
lsCount.append( (sele["hist"] == 1).sum() )
lsFract.append( (sele["hist"] == 1).sum() /sele.shape[0] *100 )
lsNames.append( "Hist. ILC" )
lsCount.append( (sele["hist"] == 2).sum() )
lsFract.append( (sele["hist"] == 2).sum() /sele.shape[0] *100 )
lsNames.append( "Hist. DCIS" )
lsCount.append( (sele["hist"] == 3).sum() )
lsFract.append( (sele["hist"] == 3).sum() /sele.shape[0] *100 )
lsNames.append( "Hist. LCIS" )
lsCount.append( (sele["hist"] == 4).sum() )
lsFract.append( (sele["hist"] == 4).sum() /sele.shape[0] *100 )
lsNames.append( "Hist. Tubular" )
lsCount.append( (sele["hist"] == 5).sum() )
lsFract.append( (sele["hist"] == 5).sum() /sele.shape[0] *100 )
lsNames.append( "Hist. Mucinous" )
lsCount.append( (sele["hist"] == 6).sum() )
lsFract.append( (sele["hist"] == 6).sum() /sele.shape[0] *100 )
lsNames.append( "Hist. Adeno carc." )
lsCount.append( (sele["hist"] == 7).sum() )
lsFract.append( (sele["hist"] == 7).sum() /sele.shape[0] *100 )
lsNames.append( "Hist. Metaplastic" )
lsCount.append( (sele["hist"] == 8).sum() )
lsFract.append( (sele["hist"] == 8).sum() /sele.shape[0] *100 )
lsNames.append( "Hist. Medullary" )
lsCount.append( (sele["hist"] == 9).sum() )
lsFract.append( (sele["hist"] == 9).sum() /sele.shape[0] *100 )
# Cancer Stage AJCC Data
lsNames.append( "AJCC Stage 0" )
lsCount.append( (sele["AJCC"] == 0).sum() )
lsFract.append( (sele["AJCC"] == 0).sum() /sele.shape[0] *100 )
lsNames.append( "AJCC Stage 1" )
lsCount.append( (sele["AJCC"] == 1).sum() )
lsFract.append( (sele["AJCC"] == 1).sum() /sele.shape[0] *100 )
lsNames.append( "AJCC Stage 2" )
lsCount.append( ((sele["AJCC"] == 2) | (sele["AJCC"] == 3)).sum() )
lsFract.append( ((sele["AJCC"] == 2) | (sele["AJCC"] == 3)).sum() /sele.shape[0] *100 )
lsNames.append( "AJCC Stage 3" )
lsCount.append( (sele["AJCC"] >= 4).sum() )
lsFract.append( (sele["AJCC"] >= 4).sum() /sele.shape[0] *100 )

# Tumor Growth Data
lsNames.append( "Tumor Growth -" )
lsCount.append( (sele["tumorinnv"] == 0).sum() )
lsFract.append( (sele["tumorinnv"] == 0).sum() /sele.shape[0] *100 )
lsNames.append( "Tumor Growth +" )
lsCount.append( (sele["tumorinnv"] == 1).sum() )
lsFract.append( (sele["tumorinnv"] == 1).sum() /sele.shape[0] *100 )
lsNames.append( "TumorSize_Mean" )
lsCount.append( sele["tumorstr"].mean() )
lsFract.append( "" )
lsNames.append( "TumorSize_StDev" )
lsCount.append( sele["tumorstr"].std() )
lsFract.append( "" )

# Surgery Type Data
lsNames.append( "Brystvev" )
lsCount.append( (sele["kir"] == 0).sum() )
lsFract.append( (sele["kir"] == 0).sum() /sele.shape[0] *100 )
lsNames.append( "Radikal" )
lsCount.append( (sele["kir"] == 1).sum() )
lsFract.append( (sele["kir"] == 1).sum() /sele.shape[0] *100 )
# Comorbidity General
lsNames.append( "Chronic Comorbidity +" )
lsCount.append( (sele["Comorb"] == 0).sum() )
lsFract.append( (sele["Comorb"] == 0).sum() /sele.shape[0] *100 )
lsNames.append( "Chronic Comorbidity -" )
lsCount.append( (sele["Comorb"] == 1).sum() )
lsFract.append( (sele["Comorb"] == 1).sum() /sele.shape[0] *100 )

# Comorbidity Specific
lsNames.append( "Cb - None" )
lsCount.append( (sele["Comorb_spes"] == 5).sum() )
lsFract.append( (sele["Comorb_spes"] == 5).sum() /sele.shape[0] *100 )
lsNames.append( "Cb - Anxiety/Depression" )
lsCount.append( (sele["Comorb_spes"] == 1).sum() )
lsFract.append( (sele["Comorb_spes"] == 1).sum() /sele.shape[0] *100 )
lsNames.append( "Cb - Diabetes" )
lsCount.append( (sele["Comorb_spes"] == 2).sum() )
lsFract.append( (sele["Comorb_spes"] == 2).sum() /sele.shape[0] *100 )
lsNames.append( "Cb - Stroke" )
lsCount.append( (sele["Comorb_spes"] == 3).sum() )
lsFract.append( (sele["Comorb_spes"] == 3).sum() /sele.shape[0] *100 )
lsNames.append( "Cb - Cardiovascular" )
lsCount.append( (sele["Comorb_spes"] == 4).sum() )
lsFract.append( (sele["Comorb_spes"] == 4).sum() /sele.shape[0] *100 )
lsNames.append( "Cb - Lund disease" )
lsCount.append( (sele["Comorb_spes"] == 6).sum()  )
lsFract.append( (sele["Comorb_spes"] == 6).sum() /sele.shape[0] *100 )
lsNames.append( "Cb - MS Hypothyroidism" )
lsCount.append( (sele["Comorb_spes"] == 7).sum() )
lsFract.append( (sele["Comorb_spes"] == 7).sum() /sele.shape[0] *100 )
lsNames.append( "Cb - Muscle and Joints" )
lsCount.append( (sele["Comorb_spes"] == 8).sum() )
lsFract.append( (sele["Comorb_spes"] == 8).sum() /sele.shape[0] *100 )
lsNames.append( "Cb - Narcolepsy" )
lsCount.append( (sele["Comorb_spes"] == 9).sum()  )
lsFract.append( (sele["Comorb_spes"] == 9).sum() /sele.shape[0] *100 )
lsNames.append( "Cb - Polio-senskader" )
lsCount.append( (sele["Comorb_spes"] == 10).sum() )
lsFract.append( (sele["Comorb_spes"] == 10).sum() /sele.shape[0] *100 )

# ***************************************************************************
lsNames.append( "----" )
lsCount.append( "----" )
lsFract.append( "----" )

# ER Data
lsNames.append( "ER +" )
lsCount.append( (sele["ER"] == 1).sum() )
lsFract.append( (sele["ER"] == 1).sum() /sele.shape[0] *100 )
lsNames.append( "ER -" )
lsCount.append( (sele["ER"] == 2).sum() )
lsFract.append( (sele["ER"] == 2).sum() /sele.shape[0] *100 )
lsNames.append( "ER, DCIS" )
lsCount.append( (sele["ER"] == 3).sum() )
lsFract.append( (sele["ER"] == 3).sum() /sele.shape[0] *100 )
lsNames.append( "ER, Insufficient" )
lsCount.append( (sele["ER"] == 4).sum() )
lsFract.append( (sele["ER"] == 4).sum() /sele.shape[0] *100 )

# Her2 Data
lsNames.append( "Her2 -" )
lsCount.append( (sele["HER2"] == 0).sum() )
lsFract.append( (sele["HER2"] == 0).sum() /sele.shape[0] *100 )
lsNames.append( "Her2 +" )
lsCount.append( (sele["HER2"] == 1).sum() )
lsFract.append( (sele["HER2"] == 1).sum() /sele.shape[0] *100 )
lsNames.append( "Her2 Unknown" )
lsCount.append( (sele["HER2"] == 2).sum() )
lsFract.append( (sele["HER2"] == 2).sum() /sele.shape[0] *100 )

# PGR Data
lsNames.append( "PGR -" )
lsCount.append( (sele["PgR"] == 2).sum() )
lsFract.append( (sele["PgR"] == 2).sum() /sele.shape[0] *100 )
lsNames.append( "PGR +" )
lsCount.append( (sele["PgR"] == 3).sum() )
lsFract.append( (sele["PgR"] == 3).sum() /sele.shape[0] *100 )
lsNames.append( "PGR unknown" )
lsCount.append( (sele["PgR"] == 1).sum() )
lsFract.append( (sele["PgR"] == 1).sum() /sele.shape[0] *100 )

# Herceptin Data
lsNames.append( "Herceptin - None" )
lsCount.append( (sele["Hercep"] == 0).sum() )
lsFract.append( (sele["Hercep"] == 0).sum() /sele.shape[0] *100 )
lsNames.append( "Herceptin - Before" )
lsCount.append( (sele["Hercep"] == 1).sum()  )
lsFract.append( (sele["Hercep"] == 1).sum() /sele.shape[0] *100 )
lsNames.append( "Herceptin - After" )
lsCount.append( (sele["Hercep"] == 2).sum() )
lsFract.append( (sele["Hercep"] == 2).sum() /sele.shape[0] *100 )

# ***************************************************************************
lsNames.append( "----" )
lsCount.append( "----" )
lsFract.append( "----" )

# Kjemoterapi Data
lsNames.append( "Kjemoterapi - None" )
lsCount.append( (sele["kjemo"] == 0).sum() )
lsFract.append( (sele["kjemo"] == 0).sum() /sele.shape[0] *100 )
lsNames.append( "Kjemoterapi - Adjuvant" )
lsCount.append( (sele["kjemo"] == 1).sum() )
lsFract.append( (sele["kjemo"] == 1).sum() /sele.shape[0] *100 )
lsNames.append( "Kjemoterapi - Neoadjuvant" )
lsCount.append( (sele["kjemo"] == 2).sum() )
lsFract.append( (sele["kjemo"] == 2).sum() /sele.shape[0] *100 )
lsNames.append( "Kjemoterapi - Adj + Neoadj" )
lsCount.append( (sele["kjemo"] == 3).sum() )
lsFract.append( (sele["kjemo"] == 3).sum() /sele.shape[0] *100 )
# Kjemo-Type Data
lsNames.append( "Kjemotype - None" )
lsCount.append( (sele["kjemo_type"] == 0).sum() )
lsFract.append( (sele["kjemo_type"] == 0).sum() /sele.shape[0] *100 )
lsNames.append( "Kjemotype - Fec60" )
lsCount.append( (sele["kjemo_type"] == 1).sum() )
lsFract.append( (sele["kjemo_type"] == 1).sum() /sele.shape[0] *100 )
lsNames.append( "Kjemotype - Fec100" )
lsCount.append( (sele["kjemo_type"] == 2).sum() )
lsFract.append( (sele["kjemo_type"] == 2).sum() /sele.shape[0] *100 )
lsNames.append( "Kjemotype - Fec/Tax" )
lsCount.append( (sele["kjemo_type"] == 3).sum() )
lsFract.append( (sele["kjemo_type"] == 3).sum() /sele.shape[0] *100 )
lsNames.append( "Kjemotype - Ec" )
lsCount.append( (sele["kjemo_type"] == 4).sum() )
lsFract.append( (sele["kjemo_type"] == 4).sum() /sele.shape[0] *100 )
lsNames.append( "Kjemotype - Annet" )
lsCount.append( (sele["kjemo_type"] == 5).sum() )
lsFract.append( (sele["kjemo_type"] == 5).sum() /sele.shape[0] *100 )

# Hormon Data
lsNames.append( "Hormon +" )
lsCount.append( (sele["Horm_type"] == 0).sum())
lsFract.append( (sele["Horm_type"] == 0).sum() /sele.shape[0] *100 )
lsNames.append( "Hormon -" )
lsCount.append( (sele["Horm_type"] == 1).sum() )
lsFract.append( (sele["Horm_type"] == 1).sum() /sele.shape[0] *100 )
# Hormon
lsNames.append( "Hormon - None" )
lsCount.append( (sele["Horm_type"] == 0).sum() )
lsFract.append( (sele["Horm_type"] == 0).sum() /sele.shape[0] *100 )
lsNames.append( "Hormon - Tamoxifen Before" )
lsCount.append( (sele["Horm_type"] == 1).sum()  )
lsFract.append( (sele["Horm_type"] == 1).sum() /sele.shape[0] *100 )
lsNames.append( "Hormon - Tamoxifen After" )
lsCount.append( (sele["Horm_type"] == 2).sum()  )
lsFract.append( (sele["Horm_type"] == 2).sum() /sele.shape[0] *100 )
lsNames.append( "Hormon - Aromatase Before" )
lsCount.append( (sele["Horm_type"] == 3).sum())
lsFract.append( (sele["Horm_type"] == 3).sum() /sele.shape[0] *100 )
lsNames.append( "Hormon - Aromatase After" )
lsCount.append( (sele["Horm_type"] == 4).sum() )
lsFract.append( (sele["Horm_type"] == 4).sum() /sele.shape[0] *100 )

# ***************************************************************************
lsNames.append( "----" )
lsCount.append( "----" )
lsFract.append( "----" )

# T_Stad Data
lsNames.append( "Tstad - T1" )
lsCount.append( (sele["Tstad"] == 1).sum() )
lsFract.append( (sele["Tstad"] == 1).sum() /sele.shape[0] *100 )
lsNames.append( "Tstad - T2" )
lsCount.append( (sele["Tstad"] == 2).sum() )
lsFract.append( (sele["Tstad"] == 2).sum() /sele.shape[0] *100 )
lsNames.append( "Tstad - T3" )
lsCount.append( (sele["Tstad"] == 3).sum() )
lsFract.append( (sele["Tstad"] == 3).sum() /sele.shape[0] *100 )
lsNames.append( "Tstad - T4" )
lsCount.append( (sele["Tstad"] == 4).sum() )
lsFract.append( (sele["Tstad"] == 4).sum() /sele.shape[0] *100 )
lsNames.append( "Tstad - Tis" )
lsCount.append( (sele["Tstad"] == 5).sum() )
lsFract.append( (sele["Tstad"] == 5).sum() /sele.shape[0] *100 )

# N_Stad Data
lsNames.append( "Nstad - N0" )
lsCount.append( (sele["Nstad"] == 1).sum() )
lsFract.append( (sele["Nstad"] == 1).sum() /sele.shape[0] *100 )
lsNames.append( "Nstad - N1" )
lsCount.append( (sele["Nstad"] == 2).sum() )
lsFract.append( (sele["Nstad"] == 2).sum() /sele.shape[0] *100 )
lsNames.append( "Nstad - N2" )
lsCount.append( (sele["Nstad"] == 3).sum()  )
lsFract.append( (sele["Nstad"] == 3).sum() /sele.shape[0] *100 )
lsNames.append( "Nstad - N3" )
lsCount.append( (sele["Nstad"] == 4).sum() )
lsFract.append( (sele["Nstad"] == 4).sum() /sele.shape[0] *100 )
lsNames.append( "Nstad - DCIS or Unknown" )
lsCount.append( (sele["Nstad"] == 5).sum() )
lsFract.append( (sele["Nstad"] == 5).sum() /sele.shape[0] *100 )

# ***************************************************************************
lsNames.append( "----" )
lsCount.append( "----" )
lsFract.append( "----" )

# Number of lymph nodes removed
lsNames.append( "Lymphonomed removed - Mean" )
lsCount.append( sele["lymf_fj"].mean() )
lsFract.append( "" )
lsNames.append( "Lymphonomed removed - StDev" )
lsCount.append( sele["lymf_fj"].std() )
lsFract.append( "" )
# Number of lymph nodes affected
lsNames.append( "Lymphonomed affected - Mean" )
lsCount.append( sele["lymf_aff"].mean() )
lsFract.append( "" )
lsNames.append( "Lymphonomed affected - StDev" )
lsCount.append( sele["lymf_aff"].std() )
lsFract.append( "" )

# Radiation Extent Data
lsNames.append( "RT - Local" )
lsCount.append( (sele["RT"] == 0).sum()  )
lsFract.append( (sele["RT"] == 0).sum() /sele.shape[0] *100 )
lsNames.append( "RT - Locoregional" )
lsCount.append( (sele["RT"] == 1).sum()  )
lsFract.append( (sele["RT"] == 1).sum() /sele.shape[0] *100 )

# HjerteSnittDose data
lsNames.append( "Hjerte Snitt Dose" )
lsCount.append( sele["HjerteSnittDose"].mean() )
lsFract.append( "" )
lsNames.append( "Hjerte Snitt Dose" )
lsCount.append( sele["HjerteSnittDose"].std() )
lsFract.append( "" )
lsNames.append( "Hjerte Snitt Dose" )
lsCount.append( str(sele["HjerteSnittDose"].min())+" - "+str(sele["HjerteSnittDose"].max()) )
lsFract.append( "" )

# HjerteV25 data
lsNames.append( "Hjerte V25" )
lsCount.append( sele["HjerteV25"].mean() )
lsFract.append( "" )
lsNames.append( "Hjerte V25" )
lsCount.append( sele["HjerteV25"].std() )
lsFract.append( "" )
lsNames.append( "Hjerte V25" )
lsCount.append( str(sele["HjerteV25"].min())+" - "+str(sele["HjerteV25"].max()) )
lsFract.append( "" )

# HjerteV40 data
lsNames.append( "Hjerte V40" )
lsCount.append( sele["HjerteV40"].mean() )
lsFract.append( "" )
lsNames.append( "Hjerte V40" )
lsCount.append( sele["HjerteV40"].std() )
lsFract.append( "" )
lsNames.append( "Hjerte V40" )
lsCount.append( str(sele["HjerteV40"].min())+" - "+str(sele["HjerteV40"].max()) )
lsFract.append( "" )


Cohort:   Death_All_10y
Size:     37


In [277]:
# RUN only 1st time - initialize empty DataFrame
# dfGroups = pd.DataFrame( data = [], index= lsNames)

dfGroups["Death_All_10y - Count"] = lsCount
dfGroups["Death_All_10y - Perct"] = lsFract
# dfGroups["Survived_All_5y"]     = dfVals
# dfGroups["Survived_All_10y"]     = dfVals
# dfGroups["Survived_All+Rec_5y"]  = dfVals
# dfGroups["Survived_All+Rec_10y"] = dfVals
# dfGroups["Survived_BC+Rec_5y"]   = dfVals
# dfGroups["Survived_BC+Rec_10y"]  = dfVals        

In [279]:
# Save as Python-Ready .csv file
dfGroups.to_csv( root_path + "/Table_Cohort.csv"  , header=True, index=True)

In [None]:
"""mednames = sele["medisin"].unique()
mednames = [ xx for xx in mednames if type(xx) != float ]
newnames = []
for nn in mednames:
    sub = nn.split(",")
    
    if type(nn) != float: 
        if len(sub) >=2:
            for xx in sub:
                newnames.append( xx.strip() )
        else:
            newnames.append( sub )

#pd.DataFrame( data=counts, index=mednames )"""

In [278]:
dfGroups

Unnamed: 0,Cohort - Count,Cohort - Perct,Survived_All_5y - Count,Survived_All_5y - Perct,Death_All_5y - Count,Death_All_5y - Perct,Survived_All_10y - Count,Survived_All_10y - Perct,Death_All_10y - Count,Death_All_10y - Perct
Cohort Size,250,,235,,14,,212,,37,
----,----,----,----,----,----,----,----,----,----,----
Age_Mean,58.104,,58.0255,,59.8571,,57.7736,,60.1622,
Age_StDev,9.84973,,9.30119,,17.1682,,8.75066,,14.7036,
Age_Range,28 - 89,,28 - 89,,36 - 88,,28 - 79,,36 - 89,
...,...,...,...,...,...,...,...,...,...,...
Hjerte V25,6.75921,,6.94925,,2.0806,,7.279,,2.10768,
Hjerte V25,-100.0 - 6.4,,-100.0 - 6.4,,0.0 - 6.2,,-100.0 - 6.4,,0.0 - 6.35,
Hjerte V40,0.48308,,0.443362,,1.18429,,0.382028,,1.07514,
Hjerte V40,6.48258,,6.67826,,1.24804,,7.02047,,1.16685,
