In [45]:
import pandas as pd
import copy
import pandas as pd
%matplotlib notebook
%config InlineBackend.print_figure_kwargs={'bbox_inches': None}
import pandas as pd
from IPython.display import display
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)  
%matplotlib inline
import numpy as np
from IPython.display import Image

In [46]:
def eln_2017_classification(df_merge):
    """
    Add the ELN 2017 AML risk group to a copy of ``df_merge`` as column 'eln_2017'.

    Rules are evaluated in a fixed order and a row keeps the label of the
    first rule it matches (later rules only look at rows that are still
    unlabelled). A feature counts as present when its column equals 1.

    Evaluation order:
        1. t_9_11                                             -> 'intermediate'
        2. complex, TP53, t_6_9, MLL_PTD (or MLL when the frame has no
           MLL_PTD column), t_9_22, inv_3,
           any of minus5 / del_5q / minus7 / minus17,
           NPM1 == 0 together with ITD == 1                   -> 'adverse'
        3. t_8_21, inv_16, CEBPA_bi,
           NPM1 == 1 together with ITD == 0                   -> 'favorable'
        4. every column whose name starts with 'RUNX1', then 'ASXL1',
           then 'minus' or 'del'                              -> 'adverse'
        5. all remaining rows                                 -> 'intermediate'

    Required columns: t_9_11, complex, TP53, t_6_9, MLL_PTD or MLL, t_9_22,
    inv_3, minus5, del_5q, minus7, minus17, NPM1, ITD, t_8_21, inv_16,
    CEBPA_bi. The FLT3-ITD status is read from the column named 'ITD'
    (not 'FLT3_ITD'). Columns 't_16_16' and 't_3_3' are NOT consulted by
    this implementation.

    :param df_merge: pandas.DataFrame with the columns listed above; it is
        not modified.
    :return: pandas.DataFrame, deep copy of df_merge with an 'eln_2017'
        column holding 'adverse', 'favorable' or 'intermediate'
    :raises KeyError: if a required column is missing
    """
    df = df_merge.copy(deep=True)

    # MLL_PTD takes precedence over MLL when both could describe the lesion.
    mll_column = 'MLL_PTD' if 'MLL_PTD' in df.columns.values else 'MLL'

    required = ['t_9_11', 'complex', 'TP53', 't_6_9', mll_column, 't_9_22',
                'inv_3', 'minus5', 'del_5q', 'minus7', 'minus17', 'NPM1',
                'ITD', 't_8_21', 'inv_16', 'CEBPA_bi']
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError("eln_2017_classification: missing required column(s): "
                       + ", ".join(missing))

    # The column holds strings, so create it with object dtype. Writing
    # strings into a float64 all-NaN column relies on implicit upcasting,
    # which pandas deprecated in 2.1 (PDEP-6).
    df['eln_2017'] = pd.Series(np.nan, index=df.index, dtype=object)

    def label_unclassified(condition, risk):
        # First match wins: only rows without a label can receive one.
        still_open = df['eln_2017'].isnull()
        df.loc[still_open & condition, 'eln_2017'] = risk

    def columns_starting_with(*prefixes):
        # Hotspot mutations are spread over several columns per gene, so
        # genes are matched by column-name prefix.
        return [c for c in df.columns if c.startswith(prefixes)]

    npm1_mutated = df['NPM1'] == 1
    npm1_wild_type = df['NPM1'] == 0
    itd_present = df['ITD'] == 1
    itd_absent = df['ITD'] == 0

    # 1. t(9;11) is checked before every adverse marker.
    label_unclassified(df['t_9_11'] == 1, 'intermediate')

    # 2. Adverse markers.
    for name in ('complex', 'TP53', 't_6_9', mll_column, 't_9_22', 'inv_3'):
        label_unclassified(df[name] == 1, 'adverse')
    label_unclassified(
        (df['minus5'] == 1) | (df['del_5q'] == 1)
        | (df['minus7'] == 1) | (df['minus17'] == 1),
        'adverse')
    label_unclassified(npm1_wild_type & itd_present, 'adverse')

    # 3. Favorable markers (only reached by rows without an adverse marker above).
    for name in ('t_8_21', 'inv_16', 'CEBPA_bi'):
        label_unclassified(df[name] == 1, 'favorable')
    label_unclassified(npm1_mutated & itd_absent, 'favorable')

    # 4. Remaining adverse markers, matched by column-name prefix.
    late_adverse = (columns_starting_with('RUNX1')
                    + columns_starting_with('ASXL1')
                    + columns_starting_with('minus', 'del'))
    for name in late_adverse:
        label_unclassified(df[name] == 1, 'adverse')

    # 5. Everything still unlabelled is intermediate. (This subsumes the
    #    NPM1/ITD-specific intermediate rules, which assigned the same label.)
    df.loc[df['eln_2017'].isnull(), 'eln_2017'] = 'intermediate'

    return df

In [49]:
def eln_2017_classification_ratio(df_merge, ratio_threshold=50):
    """
    Add the ELN 2017 AML risk group, taking the ITD ratio into account.

    Works on a deep copy of ``df_merge`` and adds two columns:
        - 'eln_2017': 'adverse', 'favorable' or 'intermediate'
        - 'rules':    label of the rule that decided the row

    Rules are evaluated in a fixed order and a row keeps the result of the
    first rule it matches. A feature counts as present when its column
    equals 1. An ITD is "high ratio" when ``Clin_Ratio >= ratio_threshold``
    and "low ratio" when ``Clin_Ratio < ratio_threshold``; a missing (NaN)
    Clin_Ratio is neither, so such ITD-positive rows fall through to later
    rules.

    Evaluation order:
        1. t_9_11                                             -> 'intermediate'
        2. complex, TP53, t_6_9, MLL_PTD (or MLL when the frame has no
           MLL_PTD column), t_9_22, inv_3,
           any of minus5 / del_5q / minus7 / minus17,
           NPM1 == 0 with high-ratio ITD                      -> 'adverse'
        3. t_8_21, inv_16, CEBPA_bi,
           NPM1 == 1 with no ITD or low-ratio ITD             -> 'favorable'
        4. every column whose name starts with 'RUNX1', then 'ASXL1',
           then 'minus' or 'del'                              -> 'adverse'
        5. NPM1 == 0 with no ITD or low-ratio ITD,
           NPM1 == 1 with high-ratio ITD,
           all remaining rows ('others remaining')            -> 'intermediate'

    Required columns: t_9_11, complex, TP53, t_6_9, MLL_PTD or MLL, t_9_22,
    inv_3, minus5, del_5q, minus7, minus17, NPM1, ITD, Clin_Ratio, t_8_21,
    inv_16, CEBPA_bi. Columns 't_16_16' and 't_3_3' are NOT consulted.

    :param df_merge: pandas.DataFrame with the columns listed above; it is
        not modified.
    :param ratio_threshold: cut-off applied to Clin_Ratio (default 50, the
        value previously hard-coded; presumably a percentage - confirm
        against the unit used in the Clin_Ratio column).
    :return: pandas.DataFrame, deep copy of df_merge with 'eln_2017' and
        'rules' columns
    :raises KeyError: if a required column is missing
    """
    df = df_merge.copy(deep=True)

    # MLL_PTD takes precedence over MLL when the frame provides it.
    mll_column = 'MLL_PTD' if 'MLL_PTD' in df.columns.values else 'MLL'

    required = ['t_9_11', 'complex', 'TP53', 't_6_9', mll_column, 't_9_22',
                'inv_3', 'minus5', 'del_5q', 'minus7', 'minus17', 'NPM1',
                'ITD', 'Clin_Ratio', 't_8_21', 'inv_16', 'CEBPA_bi']
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError("eln_2017_classification_ratio: missing required column(s): "
                       + ", ".join(missing))

    # Both output columns hold strings, so create them with object dtype.
    # Writing strings into a float64 all-NaN column relies on implicit
    # upcasting, which pandas deprecated in 2.1 (PDEP-6).
    df['eln_2017'] = pd.Series(np.nan, index=df.index, dtype=object)
    df['rules'] = pd.Series(np.nan, index=df.index, dtype=object)

    def apply_rule(condition, rule, risk):
        # First match wins: only rows without a risk group are touched, and
        # the rule label is written for exactly the rows that get the group.
        hit = df['eln_2017'].isnull() & condition
        df.loc[hit, 'rules'] = rule
        df.loc[hit, 'eln_2017'] = risk

    npm1_mutated = df['NPM1'] == 1
    npm1_wild_type = df['NPM1'] == 0
    itd_present = df['ITD'] == 1
    itd_absent = df['ITD'] == 0
    # Two separate comparisons (not a negation) so that NaN ratios are
    # neither high nor low.
    itd_high_ratio = itd_present & (df['Clin_Ratio'] >= ratio_threshold)
    itd_absent_or_low = itd_absent | (itd_present & (df['Clin_Ratio'] < ratio_threshold))

    # 1. t(9;11) is checked before every adverse marker.
    apply_rule(df['t_9_11'] == 1, 't_9_11', 'intermediate')

    # 2. Adverse markers; the rule label is the column name.
    for name in ('complex', 'TP53', 't_6_9', mll_column, 't_9_22', 'inv_3'):
        apply_rule(df[name] == 1, name, 'adverse')
    apply_rule(
        (df['minus5'] == 1) | (df['del_5q'] == 1)
        | (df['minus7'] == 1) | (df['minus17'] == 1),
        'minus5 or del5q or minus7 or minus17', 'adverse')
    # NOTE: rule labels are kept byte-for-byte (including the stray ')' here
    # and the trailing space further down) so existing outputs stay comparable.
    apply_rule(npm1_wild_type & itd_high_ratio,
               'NPM1wt and ITD mut with high ratio)', 'adverse')

    # 3. Favorable markers.
    for name in ('t_8_21', 'inv_16', 'CEBPA_bi'):
        apply_rule(df[name] == 1, name, 'favorable')
    apply_rule(npm1_mutated & itd_absent_or_low,
               'NPM1 mut and (ITD wt or ITD mut with low ratio)', 'favorable')

    # 4. Hotspot mutations are spread over several columns per gene, so
    #    these markers are matched by column-name prefix.
    prefix_rules = (
        (('RUNX1',), 'RUNX1'),
        (('ASXL1',), 'ASXL1'),
        (('minus', 'del'), 'minus or del'),
    )
    for prefixes, rule in prefix_rules:
        for name in [c for c in df.columns if c.startswith(prefixes)]:
            apply_rule(df[name] == 1, rule, 'adverse')

    # 5. Intermediate group.
    apply_rule(npm1_wild_type & itd_absent_or_low,
               'NPM1 wt and (ITD wt or ITD mut with low ratio)', 'intermediate')
    apply_rule(npm1_mutated & itd_high_ratio,
               'NPM1 mut and ITD mut with high ratio ', 'intermediate')

    leftover = df['eln_2017'].isnull()
    df.loc[leftover, 'rules'] = 'others remaining'
    df.loc[leftover, 'eln_2017'] = 'intermediate'

    return df

In [50]:
# Load the NEJM table, derive the column names the classifier expects from
# the del_5 / del_17 columns, classify, then remove the helper columns again.
# The classification without ratio is used: we do not have the clinical ratio.
df = (
    pd.read_table("nejm_data.tsv", sep =" ")
    .assign(
        minus5=lambda frame: frame["del_5"],
        del_5q=lambda frame: frame["del_5"],
        minus17=lambda frame: frame["del_17"],
    )
    .pipe(eln_2017_classification)
    # NOTE(review): 'minus7' is not created in this cell, so dropping it
    # removes a column that came from the input file - confirm this is intended.
    .drop(columns=["minus7", "minus5", "del_5q", "minus17"])
)

# Space-separated output; the index is written as well (to_csv default).
df.to_csv("nejm_data_eln.tsv", sep=" ")

In [38]:
# df_final=pd.read_table("df_final_with_multiple_comp.tsv",sep=" ")
# df_master = pd.read_table("../../../data/initial_dataset/Master_04_10_2019.csv",sep=",",low_memory=False)
# df_ITD = pd.read_table("../../../data/initial_dataset/ITD_merge.100419.csv",sep=",",low_memory=False)
# df_master.drop(['complex'],axis=1,inplace=True)
# #df_master.rename(columns={'eln_2017':'initial_eln_2017'},inplace=True)
# df_master.set_index('data_pd',inplace=True)
# df_ITD.set_index('data_pd',inplace=True)
# df_complete = df_final[['ITD','complex']].merge(pd.DataFrame(df_ITD['Clin_Ratio']),left_index=True,right_index=True)
# df_complete = df_complete.merge(df_master,left_index=True,right_index=True)
# df_complete[['Clin_Ratio','bm_blasts','wbc']] = df_complete[['Clin_Ratio','bm_blasts','wbc']].replace('na',np.NaN)
# df_complete.Clin_Ratio = pd.to_numeric(df_complete.Clin_Ratio)
# df_complete[['wbc','bm_blasts']] = df_complete[['wbc','bm_blasts']].fillna(df_complete[['wbc','bm_blasts']].mean())
# df_complete.Clin_Ratio = df_complete[['wbc','bm_blasts','Clin_Ratio']].Clin_Ratio.interpolate(method='linear')

In [39]:
# Number of rows per ELN 2017 risk group in the classified frame.
df["eln_2017"].value_counts()

adverse         490
favorable       347
intermediate    339
Name: eln_2017, dtype: int64

In [None]:

# df_eln = pd.DataFrame(eln_2017_classification_ratio(df_complete).eln_2017)
# df_eln = pd.concat([df_eln,pd.get_dummies(df_eln)],axis=1)
# df_eln
# df_eln.to_csv("../../../data/updated_dataset/eln_final.tsv",sep="\t")