In [154]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import nan

# --------- IMPORT ---------
# Load the clinical ("Klinisk") table and the survival table, then combine
# them row-by-row into a single DataFrame with one row per patient.
root_path     = "/Users/mattesa/molbreastlab-storage/work/Radiation_study/Matteo"
path_Klinisk  = os.path.join( root_path, "ClinicalData_Klinisk.csv" )
path_Survival = os.path.join( root_path, "Radiationstudy_survival_modified_MS_newfollowuptime.xlsx" )

# header=1: the column names of the clinical file are on its second line
CD_Klinisk = pd.read_csv( path_Klinisk, header=1, index_col=False)
CD_Klinisk = CD_Klinisk.rename( columns = {"PID": "PatientID"} )

# Only the first 22 columns of the survival sheet are used
CD_survival = pd.read_excel( path_Survival,  header=0, index_col=False )
CD_survival = CD_survival.iloc[:, 0:22]

# Give the key the same dtype in both tables, otherwise the two sorts below
# can order the same IDs differently (e.g. 2 < 10 as numbers, "10" < "2" as text)
CD_Klinisk['PatientID']  = CD_Klinisk['PatientID'].astype(str)
CD_survival['PatientID'] = CD_survival['PatientID'].astype(str)

# Sort both tables by patient ID AND renumber the rows. pd.concat(axis=1)
# pairs rows by index label, not by position, and sort_values keeps the
# original labels: without reset_index the sort would have no effect on
# which rows get combined.
CD_survival = CD_survival.sort_values(by='PatientID').reset_index(drop=True)
CD_Klinisk  = CD_Klinisk.sort_values(by='PatientID').reset_index(drop=True)

# Fail loudly instead of silently attaching survival data to the wrong patient
if not CD_Klinisk['PatientID'].is_unique:
    raise ValueError("Duplicated PatientID in the clinical table: rows cannot be paired unambiguously")
if not CD_Klinisk['PatientID'].equals( CD_survival['PatientID'] ):
    raise ValueError("Clinical and survival tables do not contain the same PatientID values")

# Append the survival columns (from the 13th column onwards) to the clinical data
CD_Combined = pd.concat([CD_Klinisk, CD_survival.iloc[:,12:]], axis=1)

# CD_Combined

In [155]:
def chi2_test( data_A, data_B ):
    """Pearson chi-square test of independence between two groups.

    Tests whether a categorical variable is distributed differently in
    group A and group B. Under H0 every cell of the (level x group)
    contingency table has an expected frequency (Ev); the statistic sums
    (Ov - Ev)**2 / Ev over all cells, with (r-1) x (c-1) degrees of freedom.

    Parameters
    ----------
    data_A, data_B : pandas.Series (or anything pd.Series accepts)
        Category codes of the patients in group A and in group B.
        Missing values (NaN/None) are ignored.

    Returns
    -------
    float
        p-value of the test, computed without Yates' continuity correction.

    Raises
    ------
    ValueError
        If neither group contains any non-missing observation, or if scipy
        rejects the table (e.g. a zero expected frequency).
    """
    # The original imported `chisquare` but called `chi2_contingency`,
    # which raised NameError on a fresh kernel.
    from scipy.stats import chi2_contingency

    # dropna also works for object columns, where np.isnan raises TypeError
    clean_A = pd.Series(data_A).dropna()
    clean_B = pd.Series(data_B).dropna()

    levels = pd.concat([clean_A, clean_B], ignore_index=True).rename("Level")
    groups = pd.Series(["A"] * len(clean_A) + ["B"] * len(clean_B), name="Group")

    # Contingency table of observed counts: one row per level that occurs at
    # least once, one column per non-empty group (no margins needed)
    data_crosstab = pd.crosstab(levels, groups)
    if data_crosstab.size == 0:
        raise ValueError("chi2_test needs at least one non-missing observation")

    _, p_value, _, _ = chi2_contingency( data_crosstab.to_numpy(), correction=False)

    return float(p_value)

In [156]:
# --------- CREATE COLUMNS VALUES ---------
# For every cohort of interest, compute all the descriptive variables and
# store them as two columns ("<name> - Count" and "<name> - Perct") of the
# final table dfGroups. Every cohort produces the same rows in the same order.
# Interesting cohorts are -
#   Death_All_5y
#   Death_All_10y
#   Death_Recur_5y
#   Death_Recur_10y
#   BC_Death_Recur_5y
#   BC_Death_Recur_10y


def _summarise_selection( sele ):
    """Compute the descriptive rows of the cohort table for one selection.

    Parameters
    ----------
    sele : pandas.DataFrame
        Rows of CD_Combined belonging to the group being described.

    Returns
    -------
    (names, counts, fracts) : tuple of three lists of equal length
        Row label, count (or mean / st.dev / "min - max" text) and
        percentage of the selection ("" where a percentage is meaningless,
        "----" on separator rows).
    """
    names, counts, fracts = [], [], []
    n_patients = sele.shape[0]

    def add( name, count, fract="" ):
        names.append( name )
        counts.append( count )
        fracts.append( fract )

    def separator():
        add( "----", "----", "----" )

    def tally( name, selected ):
        # Number of patients matching the boolean mask, and the share of the
        # WHOLE selection they represent (missing values stay in the denominator)
        n_selected = selected.sum()
        add( name, n_selected, n_selected /n_patients *100 )

    def levels( column, labelled_codes ):
        # One row per (label, code) pair of a coded categorical column
        for name, code in labelled_codes:
            tally( name, sele[column] == code )

    def describe( column, mean_name, std_name, range_name=None ):
        # Mean and standard deviation, optionally followed by a "min - max" row
        add( mean_name, sele[column].mean() )
        add( std_name,  sele[column].std() )
        if range_name is not None:
            add( range_name, str(sele[column].min())+" - "+str(sele[column].max()) )

    # Cohort Size
    add( "Cohort Size", n_patients )

    # ***************************************************************************
    separator()
    describe( "aldBL", "Age_Mean", "Age_StDev", "Age_Range" )
    describe( "BMI",   "BMI_Mean", "BMI_StDev", "BMI_Range" )

    # ***************************************************************************
    separator()
    # Breast Side Data
    levels( "Side", [ ("Breast Right", 0), ("Breast Left", 1), ("Breast Bilateral", 2) ] )
    # Cancer Histology Data
    levels( "hist", [ ("Hist. IDC", 1), ("Hist. ILC", 2), ("Hist. DCIS", 3),
                      ("Hist. LCIS", 4), ("Hist. Tubular", 5), ("Hist. Mucinous", 6),
                      ("Hist. Adeno carc.", 7), ("Hist. Metaplastic", 8),
                      ("Hist. Medullary", 9) ] )
    # Cancer Stage AJCC Data (codes 2 and 3 are reported together, as are all codes >= 4)
    tally( "AJCC Stage 0", sele["AJCC"] == 0 )
    tally( "AJCC Stage 1", sele["AJCC"] == 1 )
    tally( "AJCC Stage 2", (sele["AJCC"] == 2) | (sele["AJCC"] == 3) )
    tally( "AJCC Stage 3", sele["AJCC"] >= 4 )

    # Tumor Growth Data
    levels( "tumorinnv", [ ("Tumor Growth -", 0), ("Tumor Growth +", 1) ] )
    describe( "tumorstr", "TumorSize_Mean", "TumorSize_StDev" )

    # Surgery Type Data
    levels( "kir", [ ("Brystvev", 0), ("Radikal", 1) ] )
    # Comorbidity General
    # NOTE(review): "+" is mapped to code 0 and "-" to code 1 -- confirm against the codebook
    levels( "Comorb", [ ("Chronic Comorbidity +", 0), ("Chronic Comorbidity -", 1) ] )
    # Comorbidity Specific
    levels( "Comorb_spes", [ ("Cb - None", 5), ("Cb - Anxiety/Depression", 1),
                             ("Cb - Diabetes", 2), ("Cb - Stroke", 3),
                             ("Cb - Cardiovascular", 4), ("Cb - Lung disease", 6),
                             ("Cb - MS Hypothyroidism", 7), ("Cb - Muscle and Joints", 8),
                             ("Cb - Narcolepsy", 9), ("Cb - Polio-senskader", 10) ] )

    # ***************************************************************************
    separator()
    # Receptor status and Herceptin
    levels( "ER",     [ ("ER +", 1), ("ER -", 2), ("ER, DCIS", 3), ("ER, Insufficient", 4) ] )
    levels( "HER2",   [ ("Her2 -", 0), ("Her2 +", 1), ("Her2 Unknown", 2) ] )
    levels( "PgR",    [ ("PGR -", 2), ("PGR +", 3), ("PGR unknown", 1) ] )
    levels( "Hercep", [ ("Herceptin - None", 0), ("Herceptin - Before", 1),
                        ("Herceptin - After", 2) ] )

    # ***************************************************************************
    separator()
    # Kjemoterapi (chemotherapy) Data
    levels( "kjemo", [ ("Kjemoterapi - None", 0), ("Kjemoterapi - Adjuvant", 1),
                       ("Kjemoterapi - Neoadjuvant", 2), ("Kjemoterapi - Adj + Neoadj", 3) ] )
    # Kjemo-Type Data: none (0) versus any regimen (>= 1)
    tally( "Kjemotype -", sele["kjemo_type"] == 0 )
    tally( "Kjemotype +", sele["kjemo_type"] >= 1 )

    # Hormon Data: any hormone therapy (>= 1) versus none (0).
    # The original counted code 0 as "Hormon +" (the same code as
    # "Hormon - None" below) and only code 1 as "Hormon -".
    tally( "Hormon +", sele["Horm_type"] >= 1 )
    tally( "Hormon -", sele["Horm_type"] == 0 )
    # Hormon type: codes 1/2 = Tamoxifen, 3/4 = Aromatase (before/after)
    tally( "Hormon - None",      sele["Horm_type"] == 0 )
    tally( "Hormon - Tamoxifen", (sele["Horm_type"] == 1) | (sele["Horm_type"] == 2) )
    tally( "Hormon - Aromatase", (sele["Horm_type"] == 3) | (sele["Horm_type"] == 4) )

    # ***************************************************************************
    separator()
    # T_Stad / N_Stad Data
    levels( "Tstad", [ ("Tstad - T1", 1), ("Tstad - T2", 2), ("Tstad - T3", 3),
                       ("Tstad - T4", 4), ("Tstad - Tis", 5) ] )
    levels( "Nstad", [ ("Nstad - N0", 1), ("Nstad - N1", 2), ("Nstad - N2", 3),
                       ("Nstad - N3", 4), ("Nstad - DCIS or Unknown", 5) ] )

    # ***************************************************************************
    separator()
    # Number of lymph nodes removed
    describe( "lymf_fj", "Lymphonodes removed - Mean", "Lymphonodes removed - StDev" )
    # If ANY lymph node was affected
    tally( "Lymphonodes affected - None", sele["lymf_aff"] == 0 )
    tally( "Lymphonodes affected - >=1",  sele["lymf_aff"] >= 1 )
    # Number of lymph nodes affected
    describe( "lymf_aff", "Lymphonodes affected - Mean", "Lymphonodes affected - StDev" )

    # Radiation Extent Data
    levels( "RT", [ ("RT - Local", 0), ("RT - Locoregional", 1) ] )

    # Heart dose data. The three rows of each variable used to share one
    # label, making mean, st.dev and range indistinguishable in the table.
    describe( "HjerteSnittDose", "Hjerte Snitt Dose_Mean", "Hjerte Snitt Dose_StDev",
              "Hjerte Snitt Dose_Range" )
    describe( "HjerteV25", "Hjerte V25_Mean", "Hjerte V25_StDev", "Hjerte V25_Range" )
    describe( "HjerteV40", "Hjerte V40_Mean", "Hjerte V40_StDev", "Hjerte V40_Range" )

    return names, counts, fracts


# Initialize empty DataFrame to store final data
dfGroups = pd.DataFrame( data = [] )

# The loop goes through all cohorts (survival columns) we want to describe.
# Each survival column is used twice, once for the 0 (survived) group and once
# for the 1 (dead) group; "Cohort" stands for the whole, unfiltered population
# (its entry in `conditions` is ignored).
cohorts     = [ "Cohort", "Death_All_5y", "Death_All_5y", "Death_All_10y", "Death_All_10y"  ]
conditions  = [0, 0, 1, 0, 1]           # 0 = survived,  1 = dead
colSaveName = [ "Cohort", "Survive_All_5y", "Death_All_5y", "Survive_All_10y", "Death_All_10y"  ]

for cc in range(len(cohorts)):

    if cohorts[cc] == "Cohort":
        sele = CD_Combined
    else:
        mask = CD_Combined[cohorts[cc]] == conditions[cc]
        sele = CD_Combined[mask]

    print("Cohort:  ", cohorts[cc], " --> ", conditions[cc])
    print("Size:    ", sele.shape[0])

    lsNames, lsCount, lsFract = _summarise_selection( sele )

    dfGroups[ colSaveName[cc]+" - Count"] = lsCount
    dfGroups[ colSaveName[cc]+" - Perct"] = lsFract

# Replace the numeric row index with the row labels (identical for every cohort)
dfGroups.index = lsNames

Cohort:   Cohort  -->  0
Size:     250
Cohort:   Death_All_5y  -->  0
Size:     235
Cohort:   Death_All_5y  -->  1
Size:     14
Cohort:   Death_All_10y  -->  0
Size:     209
Cohort:   Death_All_10y  -->  1
Size:     40


In [159]:
# --------- SELECT THE TWO GROUPS TO COMPARE ---------
# Group A and group B are the two outcomes of one survival column:
#   A = rows where the column is 0, B = rows where it is 1.
# Survival columns of interest are -
#   Death_All_5y
#   Death_All_10y

import scipy.stats as stats

cohort_A = "Death_All_10y"
cohort_B = "Death_All_10y"

# Boolean row masks over the combined table
mask_A = CD_Combined[cohort_A].eq(0)
mask_B = CD_Combined[cohort_B].eq(1)

# Patients of each group
sele_A = CD_Combined.loc[mask_A]
sele_B = CD_Combined.loc[mask_B]



In [160]:
# --------- P-VALUES: GROUP A vs GROUP B ---------
# Builds `pval` with exactly one entry per row of dfGroups, in the same row
# order as the summary table:
#   - continuous variables  -> Welch t-test, stored on the "Mean" row
#   - categorical variables -> chi-square test, stored on the first level's row
#   - every other row       -> ""
# ls_A / ls_B keep, row by row, the data that went into each test.

def _any_vs_none( codes ):
    """Collapse every value >= 1 into 1 (0 = none, 1 = any); NaN is left untouched."""
    merged = codes.copy()
    merged[ codes >= 1 ] = 1
    return merged

def _merge_ajcc( codes ):
    """Group AJCC codes like the summary table: 2|3 -> 3 ("Stage 2"), >= 4 -> 4 ("Stage 3")."""
    merged = codes.copy()
    merged[ (codes == 2) | (codes == 3) ] = 3
    merged[ codes >= 4 ] = 4
    return merged

def _merge_comorbidity( codes ):
    """Keep codes 4 and 5, pool every other recorded code into 0.

    Missing values stay missing: the original `data[~mask] = 0` also turned
    NaN into 0, counting patients without information as "other".
    """
    merged = codes.copy()
    merged[ codes.notna() & ~codes.isin([4, 5]) ] = 0
    return merged

def _merge_hormone( codes ):
    """Pool before/after codes: 1|2 -> 1 (Tamoxifen), 3|4 -> 2 (Aromatase)."""
    merged = codes.copy()
    merged[ (codes == 1) | (codes == 2) ] = 1
    merged[ (codes == 3) | (codes == 4) ] = 2
    return merged

def _welch_pvalue( values_A, values_B ):
    """p-value of Welch's t-test (unequal variances) on the non-missing values.

    ttest_ind propagates NaN by default, so a single missing measurement
    would turn the p-value into NaN; missing values are dropped first.
    """
    clean_A = values_A[ ~pd.isna(values_A) ]
    clean_B = values_B[ ~pd.isna(values_B) ]
    return stats.ttest_ind(a=clean_A, b=clean_B, equal_var=False).pvalue


# One tuple per tested variable or separator:
#   ( label appended to lsNames (None = no label),
#     test: None | "t" | "chi2",
#     column of CD_Combined,
#     recoding applied before a chi2 test (None = use the codes as they are),
#     number of empty rows that follow, to stay aligned with dfGroups )
comparisons = [
    ( "Cohort Size",                 None,   None,              None,               0 ),
    ( "----",                        None,   None,              None,               0 ),
    ( "Age_Mean",                    "t",    "aldBL",           None,               2 ),
    ( "BMI_Mean",                    "t",    "BMI",             None,               2 ),
    ( "----",                        None,   None,              None,               0 ),
    ( "Breast Right",                "chi2", "Side",            None,               2 ),
    ( "Hist. IDC",                   "chi2", "hist",            None,               8 ),
    ( "AJCC Stage 0",                "chi2", "AJCC",            _merge_ajcc,        3 ),
    ( "Tumor Growth -",              "chi2", "tumorinnv",       None,               1 ),
    ( None,                          "t",    "tumorstr",        None,               1 ),
    ( "Brystvev",                    "chi2", "kir",             None,               1 ),
    ( "Chronic Comorbidity +",       "chi2", "Comorb",          None,               1 ),
    ( "Cb - None",                   "chi2", "Comorb_spes",     _merge_comorbidity, 9 ),
    ( "----",                        None,   None,              None,               0 ),
    ( "ER +",                        "chi2", "ER",              None,               3 ),
    ( "Her2 -",                      "chi2", "HER2",            None,               2 ),
    ( "PGR -",                       "chi2", "PgR",             None,               2 ),
    ( "Herceptin - None",            "chi2", "Hercep",          None,               2 ),
    ( "----",                        None,   None,              None,               0 ),
    ( "Kjemoterapi - None",          "chi2", "kjemo",           None,               3 ),
    ( "Kjemotype - None",            "chi2", "kjemo_type",      _any_vs_none,       1 ),
    # The two "Hormon +/-" rows describe none vs any hormone therapy, so the
    # test uses the same two levels (the original tested all five raw codes)
    ( "Hormon +",                    "chi2", "Horm_type",       _any_vs_none,       1 ),
    ( "Hormon - None",               "chi2", "Horm_type",       _merge_hormone,     2 ),
    ( "----",                        None,   None,              None,               0 ),
    ( "Tstad - T1",                  "chi2", "Tstad",           None,               4 ),
    ( "Nstad - N0",                  "chi2", "Nstad",           None,               4 ),
    ( "----",                        None,   None,              None,               0 ),
    ( "Lymphonodes removed - Mean",  "t",    "lymf_fj",         None,               1 ),
    ( "Lymphonodes affected - None", "chi2", "lymf_aff",        _any_vs_none,       1 ),
    ( "Lymphonodes affected - Mean", "t",    "lymf_aff",        None,               1 ),
    ( "RT - Local",                  "chi2", "RT",              None,               1 ),
    ( "Hjerte Snitt Dose",           "t",    "HjerteSnittDose", None,               2 ),
    ( "Hjerte V25",                  "t",    "HjerteV25",       None,               2 ),
    ( "Hjerte V40",                  "t",    "HjerteV40",       None,               2 ),
]

lsNames = []
ls_A = []
ls_B = []
pval = []

print("Cohort A:  ", cohort_A)
print("Size A:    ", sele_A.shape[0])
print("Cohort B:  ", cohort_B)
print("Size B:    ", sele_B.shape[0])

for label, test, column, recode, n_blank in comparisons:
    if label is not None:
        lsNames.append( label )

    if test is None:
        # Cohort size and separator rows carry no test
        data_A, data_B, p_value = "", "", ""
    elif test == "t":
        data_A = sele_A[column].values
        data_B = sele_B[column].values
        p_value = _welch_pvalue( data_A, data_B )
    else:
        data_A = sele_A[column] if recode is None else recode( sele_A[column] )
        data_B = sele_B[column] if recode is None else recode( sele_B[column] )
        p_value = chi2_test( data_A, data_B )

    ls_A.append( data_A )
    ls_B.append( data_B )
    pval.append( p_value )

    # Empty entries for the remaining rows of the same variable
    ls_A.extend( [""] * n_blank )
    ls_B.extend( [""] * n_blank )
    pval.extend( [""] * n_blank )

# The p-values are assigned by position, so the row counts must agree
if len(pval) != dfGroups.shape[0]:
    raise ValueError( "pval has " + str(len(pval)) + " entries but dfGroups has "
                      + str(dfGroups.shape[0]) + " rows" )

dfGroups["p-value_"+cohort_A] = pval

Cohort A:   Death_All_10y
Size A:     209
Cohort B:   Death_All_10y
Size B:     40


In [161]:
# Export the finished cohort table (row labels and column names included)
# as a .csv file that can be read back with pandas
table_path = f"{root_path}/Table_Cohort.csv"
dfGroups.to_csv( table_path, index=True, header=True )

In [152]:
# Disabled scratch cell: everything below is wrapped in a single string
# literal, so none of it is executed. It recodes Comorb_spes and AJCC for the
# two groups, rebuilds the contingency table by hand, computes the chi-square
# statistic cell by cell and prints it next to scipy's chi2_contingency result.
# NOTE(review): the hand-made p-value is taken from stats.norm.cdf, not from
# the chi-square distribution, and the "My Chi2" print shows critical_value
# rather than chi_square -- the chi2_contingency output at the end is the one
# to trust if this cell is ever re-enabled.
"""
data_A = sele_A["Comorb_spes"].copy()
mask = (sele_A["Comorb_spes"] == 5) | (sele_A["Comorb_spes"] == 4) # all others
data_A[ ~mask ] = 0
data_B = sele_B["Comorb_spes"].copy()
mask = (sele_B["Comorb_spes"] == 5) | (sele_B["Comorb_spes"] == 4) # all others
data_B[ ~mask ] = 0

data_A = sele_A["AJCC"].copy()
mask = (sele_A["AJCC"] == 2) | (sele_A["AJCC"] == 3)   # correct Stage 2
data_A[ mask ] = 3
mask = (sele_A["AJCC"] >= 4)                           # correct Stage 3
data_A[ mask ] = 4
data_B = sele_B["AJCC"].copy()
mask = (sele_B["AJCC"] == 2) | (sele_B["AJCC"] == 3)   # correct Stage 2
data_B[ mask ] = 3
mask = (sele_B["AJCC"] >= 4)                           # correct Stage 3
data_B[ mask ] = 4

from scipy.stats import chi2_contingency

# -------------------------------------------------------------------------------------
cat = np.union1d( data_B.unique(), data_A.unique() )
# Remove nan from categories and make sure that the values are Integers
# (NB: cat values are used as index of DF, so they cannot be floats)
cat = cat[~np.isnan(cat)]
cat = cat.astype(int)

data = []
for cc in cat:
    data.append( [ [cc, "A"] for xx in range((data_A==cc).sum()) ] )
    data.append( [ [cc, "B"] for xx in range((data_B==cc).sum()) ] )

data = [item for sublist in data for item in sublist]      # Flatten list by one level
df = pd.DataFrame(data, columns = ['Level', 'Group']) 
# Create contingency table
data_crosstab = pd.crosstab(df['Level'],
                            df['Group'],
                           margins=True, margins_name="Total")
# Calcualtion of Chisquare test statistics
chi_square = 0
myExp = []
rows = df['Level'].unique()
columns = df['Group'].unique()
for i in columns:
    for j in rows:
        Ov = data_crosstab[i][j]
        Ev = data_crosstab[i]['Total'] * data_crosstab['Total'][j] / data_crosstab['Total']['Total']
        chi_square += (Ov-Ev)**2/Ev
        print(i," - ",j, " :  ",(Ov-Ev)**2/Ev)
        myExp.append(Ev)

# Now we can calculate the p-value and critical values for hypothesis testing in the decision rule
degree_freedom = (len(rows)-1)*(len(columns)-1)
p_value = 1 - stats.norm.cdf(chi_square, degree_freedom)

# NB: alpha value could be an input
alpha = 0.05
critical_value = stats.chi2.ppf(1-alpha, degree_freedom)


print("My Chi2:    ", critical_value )
print("My p-value: ", p_value )
print("dof    :    ", degree_freedom )
print("---------------------- " )


# -------------------------------------------------------------------------------------

   
chi2, p, dof, ex = chi2_contingency( data_crosstab.iloc[:-1,:-1], correction=False)
print("Chi2   :    ", chi2 )
print("p-value:    ", p )
print("dof    :    ", dof )
"""

A  -  0  :   0.07960526315789486
A  -  1  :   0.23870700372979708
A  -  3  :   0.0625
A  -  4  :   3.335526315789473
B  -  0  :   0.424561403508772
B  -  1  :   1.273104019892251
B  -  3  :   0.3333333333333333
B  -  4  :   17.789473684210527
My Chi2:     7.814727903251179
My p-value:  0.0
dof    :     3
---------------------- 
Chi2   :     23.536811023622047
p-value:     3.12074448335557e-05
dof    :     3
