In [1]:
import pandas as pd
import numpy as np

# Show floats with two decimals. No '%' suffix: this formatter is applied to every
# float pandas displays (ages, BMI, describe() statistics), not only to the
# percentage columns, whose names already end in '_%'.
pd.options.display.float_format = '{:.2f}'.format

## Exploration of the identifier and demographic-1 features

- **identifier** features
    - **encounter_id**: Unique, non-repeating values. 
    - **hospital_id-icu_id**: Whether a patient is diagnosed with Diabetes Mellitus should not depend on the hospital_id or its icu_id. Each hospital will have unique icu_ids. **DROP**
- **demographic-1** features
    - age, height, weight, bmi, gender, ethnicity
    - **readmission_status**: is 0 for all rows. **DROP**
    - Compare the training and test datasets
    - Compare values in the training set where the dm=1 and dm=0
- **demographic-2** features *TODO*
    - readmission_status, elective_surgery, ....
- **labs & vitals category**: *TODO*
    - They have max and min values. Should these be replaced with average?



## Training dataset

In [2]:
# Load the labelled training data.
df_train = pd.read_csv("data/TrainingWiDS2021.csv")
# NOTE(review): this describe() result is discarded - a cell only displays its last expression.
df_train.describe()
print(df_train.columns)

# Load the unlabelled test data; this describe() is the last expression, so it is the table shown.
df_test = pd.read_csv("data/UnlabeledWiDS2021.csv")
df_test.describe()

Index(['Unnamed: 0', 'encounter_id', 'hospital_id', 'age', 'bmi',
       'elective_surgery', 'ethnicity', 'gender', 'height',
       'hospital_admit_source',
       ...
       'h1_pao2fio2ratio_max', 'h1_pao2fio2ratio_min', 'aids', 'cirrhosis',
       'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma',
       'solid_tumor_with_metastasis', 'diabetes_mellitus'],
      dtype='object', length=181)


Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,height,icu_id,pre_icu_los_days,readmission_status,...,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis
count,10234.00%,10234.00%,10234.00%,10234.00%,9219.00%,10234.00%,9933.00%,10234.00%,10234.00%,10234.00%,...,1928.00%,1422.00%,1422.00%,10234.00%,10234.00%,10234.00%,10234.00%,10234.00%,10234.00%,10234.00%
mean,5117.50%,140498.78%,10092.39%,62.85%,29.12%,0.20%,169.28%,677.82%,0.83%,0.00%,...,156.77%,254.69%,246.26%,0.00%,0.01%,0.01%,0.02%,0.01%,0.00%,0.02%
std,2954.45%,3182.46%,57.01%,17.85%,8.34%,0.40%,10.81%,304.12%,2.41%,0.00%,...,105.62%,132.09%,130.75%,0.03%,0.11%,0.10%,0.15%,0.08%,0.06%,0.13%
min,1.00%,135000.00%,10001.00%,18.00%,14.90%,0.00%,137.00%,82.00%,-0.21%,0.00%,...,30.00%,42.62%,41.42%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
25%,2559.25%,137742.25%,10043.00%,52.00%,23.70%,0.00%,162.50%,451.00%,0.03%,0.00%,...,79.00%,151.40%,144.05%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
50%,5117.50%,140503.50%,10085.00%,65.00%,27.60%,0.00%,170.00%,687.00%,0.13%,0.00%,...,118.00%,234.00%,224.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
75%,7675.75%,143252.75%,10145.00%,76.00%,32.60%,0.00%,177.80%,962.00%,0.43%,0.00%,...,206.07%,345.52%,336.52%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
max,10234.00%,146000.00%,10199.00%,96.00%,69.94%,1.00%,195.60%,1111.00%,65.95%,0.00%,...,518.06%,672.82%,651.66%,1.00%,1.00%,1.00%,1.00%,1.00%,1.00%,1.00%


In [3]:
def _describe(df, df_type):
    m_total = 'Entries - Total:'
    m_features = "Features:"

    print("-"*40)
    print(f"{df_type} dataset - details")
    print("-"*40)
    print(f"{m_total:33} {df.shape[0]:6}")

    df_dm_1 = df_dm_0 = pd.DataFrame()
    if df_type != "Testing":
        df_dm_1  = df[df['diabetes_mellitus']==1]; df_dm_1.to_csv(f"{df_type}_dm_1.csv")
        df_dm_0  = df[df['diabetes_mellitus']==0]; df_dm_0.to_csv(f"{df_type}_dm_0.csv")
        print(f"Entries - diabetes_mellitus = 1: {df_dm_1.shape[0]:7} {100 * df_dm_1.shape[0]/df.shape[0]:0.2f}%")
        print(f"Entries - diabetes_mellitus = 0: {df_dm_0.shape[0]:7} {100 * df_dm_0.shape[0]/df.shape[0]:0.2f}%")
    print(f"{m_features:33} {len(df.columns):6}")
    return df.shape[0], df_dm_1, df_dm_0


In [127]:
def _col_describe(df):
    for col in ['age', 'gender', 'ethnicity', 'weight', 'height', 'bmi']:
        print(f"{col:10}: {df[df[col].isnull()].shape[0]:4}")
        
    print("readmission_status", df_train['readmission_status'].unique())

In [5]:
AGE_BINS = [-0.1, 0, 15.9, 20, 40, 60, 90, 120]
def _agedist(df, df_name):
    total_count = df.shape[0]
    nan_count = df[df['age'].isnull()].shape[0]
    nan_pc = nan_count/total_count

    ages = df['age'].unique()
    ages.sort()
    # Since age has 0.0 values, these should be ignored/replaced
    min_age = ages[1]
    print(f"Invalid age: {df[df['age']==0.0]['age'].count()} Min: {df['age'].min():0.2f} Max: {df['age'].max():0.2f}")

    # Group the ages based on the age-groups
    bins = pd.cut(df['age'], AGE_BINS)
    df_age = df.groupby(bins)['age'].agg(['count'])
    df_age = df_age.reset_index()

    # Append the null value counts
    df_age = df_age.append(pd.DataFrame([['null', nan_count]], columns = ['age', f'count']))

    # Add a percentage column
    df_age[f'{df_name}_%'] = 100 * df_age['count']/total_count
    df_age = df_age.rename(columns = {'count':f"{df_name}_count"})

    print(df_age)
    print(f"Verification:   {df_name}_count: {df_age[f'{df_name}_count'].sum()} "
          f"{df_name}_%: {df_age[f'{df_name}_%'].sum():0.1f}%")
    return df_age, nan_count, nan_pc

In [6]:
# underweight, normal, overweight, obese
BMI_BINS = [0, 18.5, 24.9, 30, 70] 
BMI_LABELS = [    '[0.0-18.5] underweight',
                            '[18.5-24.9] normal weight',
                            '[30-70] over weight',
                            '[70<] obese']
def _bmidist(df, df_name):
    total_count = df.shape[0]
    nan_count = df[df['bmi'].isnull()].shape[0]
    nan_pc = nan_count/total_count
    print(f"Min: {df['bmi'].min():0.2f} Max: {df['bmi'].max():0.2f}")

    # Group the values based on the bmi-categories
    bins = pd.cut(df['bmi'], BMI_BINS, labels=BMI_LABELS)
    df_bmi = df.groupby(bins)['bmi'].agg(['count'])
    df_bmi = df_bmi.reset_index()

    # Append the null value counts
    df_bmi = df_bmi.append(pd.DataFrame([['null', nan_count]], columns = ['bmi', f'count']))

    # Add a percentage column
    df_bmi[f"{df_name}_%"] = 100 * df_bmi['count']/total_count
    df_bmi = df_bmi.rename(columns = {'count':f"{df_name}_count"})

    print(df_bmi)
    print(f"Verification:   {df_name}_count: {df_bmi[f'{df_name}_count'].sum()} "
          f"{df_name}_%: {df_bmi[f'{df_name}_%'].sum():0.1f}%")
    return df_bmi, nan_count, nan_pc

In [104]:
def _categoricalvardist(df, df_name, catvar, total_count=-1):
    if total_count == -1:
        total_count = df.shape[0]
        
    nan_count = df[df[catvar].isnull()].shape[0]
    nan_pc = nan_count/total_count

    df_catvar = df.groupby([catvar]).count()['encounter_id'].sort_values(ascending=False).reset_index()
    df_catvar = df_catvar.rename(columns = {'encounter_id':'count'})

    # Append the null value counts
    df_catvar = df_catvar.append(pd.DataFrame([['null', nan_count]], columns = [catvar, f'count']))

    # Add a percentage column
    df_catvar[f"{df_name}_%"] = 100 * df_catvar['count']/total_count
    df_catvar = df_catvar.rename(columns = {'count':f"{df_name}_count"})

    print(df_catvar)
    print(f"Verification:   {df_name}_count: {df_catvar[f'{df_name}_count'].sum()} "
          f"{df_name}_%: {df_catvar[f'{df_name}_%'].sum():0.1f}%")
    return df_catvar, nan_count, nan_pc

In [100]:
def _compare(feature, df1, df2, col1, col2):
    df = df1.join(df2.set_index(feature), on=feature)
    print(f"{feature}: {col1}: {df1[f'{col1}'].sum():0.1f}% {col2}: {df2[f'{col2}'].sum():0.1f}%")
    return df

## Train Dataset

In [10]:
# Summarise the training set and keep its dm=1 / dm=0 subsets for the comparisons below.
train_count, df_train_dm_1, df_train_dm_0 = _describe(df_train, "Training")

----------------------------------------
Training dataset - details
----------------------------------------
Entries - Total:                  130157
Entries - diabetes_mellitus = 1:   28151 21.63%
Entries - diabetes_mellitus = 0:  102006 78.37%
Features:                            181


In [128]:
# Null counts of the demographic columns in the training set.
_col_describe(df_train)


age       : 4988
gender    :   66
ethnicity : 1587
weight    : 3463
height    : 2077
bmi       : 4490
readmission_status [0]


In [102]:
# Age, BMI, gender and ethnicity distributions of the full training set.
# nan_count / nan_pc are assigned twice, so after this cell they hold the ethnicity values only.
df_train_age, train_age_nan, train_age_nanpc = _agedist(df_train, "train")
df_train_bmi, train_bmi_nan, train_bmi_nanpc = _bmidist(df_train, "train")
df_train_gender, nan_count, nan_pc = _categoricalvardist(df_train, 'train', 'gender')
df_train_ethnicity, nan_count, nan_pc = _categoricalvardist(df_train, 'train', 'ethnicity')

Invalid age: 30 Min: 0.00 Max: 89.00
             age  train_count  train_%
0    (-0.1, 0.0]           30    0.02%
1    (0.0, 15.9]            0    0.00%
2   (15.9, 20.0]         1605    1.23%
3   (20.0, 40.0]        13648   10.49%
4   (40.0, 60.0]        37003   28.43%
5   (60.0, 90.0]        72883   56.00%
6  (90.0, 120.0]            0    0.00%
0           null         4988    3.83%
Verification:   train_count: 130157 train_%: 100.0%
Min: 14.84 Max: 67.81
                         bmi  train_count  train_%
0     [0.0-18.5] underweight         5323    4.09%
1  [18.5-24.9] normal weight        36015   27.67%
2        [30-70] over weight        38010   29.20%
3                [70<] obese        46319   35.59%
0                       null         4490    3.45%
Verification:   train_count: 130157 train_%: 100.0%
_categoricalvardist
  gender  train_count  train_%
0      M        70518   54.18%
1      F        59573   45.77%
0   null           66    0.05%
Verification:   train_count: 130157 

### Train - diabetes_mellitus = 1

In [109]:
# Same four distributions, restricted to training rows with diabetes_mellitus == 1.
df_train_dm_1_age, train_dm_1_age_nan, train_dm_1_age_nanpc = _agedist(df_train_dm_1, "train_dm_1")
df_train_dm_1_bmi, train_dm_1_bmi_nan, train_dm_1_bmi_nanpc = _bmidist(df_train_dm_1, "train_dm_1")
df_train_dm_1_gender, train_dm_1_gender_nan, train_dm_1_gender_nanpc = _categoricalvardist(df_train_dm_1, 'train_dm_1', 'gender')
df_train_dm_1_ethnicity, train_dm_1_ethnicity_nan, train_dm_1_ethnicity_nanpc = _categoricalvardist(df_train_dm_1, 'train_dm_1', 'ethnicity')

Invalid age: 3 Min: 0.00 Max: 89.00
             age  train_dm_1_count  train_dm_1_%
0    (-0.1, 0.0]                 3         0.01%
1    (0.0, 15.9]                 0         0.00%
2   (15.9, 20.0]               183         0.65%
3   (20.0, 40.0]              1644         5.84%
4   (40.0, 60.0]              7579        26.92%
5   (60.0, 90.0]             18082        64.23%
6  (90.0, 120.0]                 0         0.00%
0           null               660         2.34%
Verification:   train_dm_1_count: 28151 train_dm_1_%: 100.0%
Min: 14.84 Max: 67.81
                         bmi  train_dm_1_count  train_dm_1_%
0     [0.0-18.5] underweight               597         2.12%
1  [18.5-24.9] normal weight              5268        18.71%
2        [30-70] over weight              7503        26.65%
3                [70<] obese             13868        49.26%
0                       null               915         3.25%
Verification:   train_dm_1_count: 28151 train_dm_1_%: 100.0%
  gender  tra

### Train - diabetes_mellitus = 0

In [110]:
# Same four distributions, restricted to training rows with diabetes_mellitus == 0.
df_train_dm_0_age, train_dm_0_age_nan, train_dm_0_age_nanpc = _agedist(df_train_dm_0, "train_dm_0")
df_train_dm_0_bmi, train_dm_0_bmi_nan, train_dm_0_bmi_nanpc = _bmidist(df_train_dm_0, "train_dm_0")
df_train_dm_0_gender, train_dm_0_gender_nan, train_dm_0_gender_nanpc = _categoricalvardist(df_train_dm_0, 'train_dm_0', 'gender')
df_train_dm_0_ethnicity, train_dm_0_ethnicity_nan, train_dm_0_ethnicity_nanpc = _categoricalvardist(df_train_dm_0, 'train_dm_0', 'ethnicity')

Invalid age: 27 Min: 0.00 Max: 89.00
             age  train_dm_0_count  train_dm_0_%
0    (-0.1, 0.0]                27         0.03%
1    (0.0, 15.9]                 0         0.00%
2   (15.9, 20.0]              1422         1.39%
3   (20.0, 40.0]             12004        11.77%
4   (40.0, 60.0]             29424        28.85%
5   (60.0, 90.0]             54801        53.72%
6  (90.0, 120.0]                 0         0.00%
0           null              4328         4.24%
Verification:   train_dm_0_count: 102006 train_dm_0_%: 100.0%
Min: 14.84 Max: 67.81
                         bmi  train_dm_0_count  train_dm_0_%
0     [0.0-18.5] underweight              4726         4.63%
1  [18.5-24.9] normal weight             30747        30.14%
2        [30-70] over weight             30507        29.91%
3                [70<] obese             32451        31.81%
0                       null              3575         3.50%
Verification:   train_dm_0_count: 102006 train_dm_0_%: 100.0%
  gender  

## Test Dataframe

In [129]:
# Summarise the test set; "Testing" makes _describe skip the diabetes_mellitus split,
# so the two returned frames are discarded.
test_count, _, _ = _describe(df_test, "Testing")
_col_describe(df_test)

----------------------------------------
Testing dataset - details
----------------------------------------
Entries - Total:                   10234
Features:                            180
age       :    0
gender    :    5
ethnicity :  204
weight    :  908
height    :  301
bmi       : 1015
readmission_status [0]


In [94]:
# Age, BMI, gender and ethnicity distributions of the test set.
df_test_age, test_age_nan, test_age_nanpc = _agedist(df_test, "test")
df_test_bmi, test_bmi_nan, test_bmi_nanpc = _bmidist(df_test, "test")
df_test_gender, test_gender_nan, test_gender_nanpc = _categoricalvardist(df_test, 'test', 'gender')
df_test_ethnicity, test_ethnicity_nan, test_ethnicity_nanpc = _categoricalvardist(df_test, 'test', 'ethnicity')

Invalid age: 0 Min: 18.00 Max: 96.00
             age  test_count  test_%
0    (-0.1, 0.0]           0   0.00%
1    (0.0, 15.9]           0   0.00%
2   (15.9, 20.0]         180   1.76%
3   (20.0, 40.0]        1075  10.50%
4   (40.0, 60.0]        2939  28.72%
5   (60.0, 90.0]        5753  56.21%
6  (90.0, 120.0]         287   2.80%
0           null           0   0.00%
Verification:   test_count: 10234 test_%: 100.0%
Min: 14.90 Max: 69.94
                         bmi  test_count  test_%
0     [0.0-18.5] underweight         388   3.79%
1  [18.5-24.9] normal weight        2651  25.90%
2        [30-70] over weight        2863  27.98%
3                [70<] obese        3317  32.41%
0                       null        1015   9.92%
Verification:   test_count: 10234 test_%: 100.0%
_categoricalvardist
  gender  test_count  test_%
0      M        5525  53.99%
1      F        4704  45.96%
0   null           5   0.05%
Verification:   test_count: 10234 test_%: 100.0%
_categoricalvardist
          e

In [66]:
def _comparedf(df1, df2, col1, col2, feature):
    df = df1.join(df2.set_index(feature), on=feature)
    print(f"{feature}: {col1}: {df[f'{col1}'].sum():.2f}% {col2}: {df[f'{col2}'].sum():.2f}%")
    return df

In [82]:
# Age distribution: dm=1 vs dm=0 within the training set.
_compare('age', df_train_dm_1_age, df_train_dm_0_age, 'train_dm_1_%', 'train_dm_0_%')

age: train_dm_1_%: 100.0% train_dm_0_%: 100.00%


Unnamed: 0,age,train_dm_1_count,train_dm_1_%,train_dm_0_count,train_dm_0_%
0,"(-0.1, 0.0]",3,0.01%,27,0.03%
1,"(0.0, 15.9]",0,0.00%,0,0.00%
2,"(15.9, 20.0]",183,0.65%,1422,1.39%
3,"(20.0, 40.0]",1644,5.84%,12004,11.77%
4,"(40.0, 60.0]",7579,26.92%,29424,28.85%
5,"(60.0, 90.0]",18082,64.23%,54801,53.72%
6,"(90.0, 120.0]",0,0.00%,0,0.00%
0,,660,2.34%,4328,4.24%


In [84]:
# BMI distribution: dm=1 vs dm=0 within the training set.
_compare('bmi', df_train_dm_1_bmi, df_train_dm_0_bmi, 'train_dm_1_%', 'train_dm_0_%')

bmi: train_dm_1_%: 100.0% train_dm_0_%: 100.0%


Unnamed: 0,bmi,train_dm_1_count,train_dm_1_%,train_dm_0_count,train_dm_0_%
0,[0.0-18.5] underweight,597,2.12%,4726,4.63%
1,[18.5-24.9] normal weight,5268,18.71%,30747,30.14%
2,[30-70] over weight,7503,26.65%,30507,29.91%
3,[70<] obese,13868,49.26%,32451,31.81%
0,,915,3.25%,3575,3.50%


In [111]:
# Gender distribution: dm=1 vs dm=0 within the training set.
_compare('gender', df_train_dm_1_gender, df_train_dm_0_gender, 'train_dm_1_%', 'train_dm_0_%')

gender: train_dm_1_%: 100.0% train_dm_0_%: 100.0%


Unnamed: 0,gender,train_dm_1_count,train_dm_1_%,train_dm_0_count,train_dm_0_%
0,M,15420,54.78%,55098,54.01%
1,F,12726,45.21%,46847,45.93%
0,,5,0.02%,61,0.06%


In [112]:
# Ethnicity distribution: dm=1 vs dm=0 within the training set.
_compare('ethnicity', df_train_dm_1_ethnicity, df_train_dm_0_ethnicity, 'train_dm_1_%', 'train_dm_0_%')

ethnicity: train_dm_1_%: 100.0% train_dm_0_%: 100.0%


Unnamed: 0,ethnicity,train_dm_1_count,train_dm_1_%,train_dm_0_count,train_dm_0_%
0,Caucasian,20605,73.19%,79631,78.07%
1,African American,3654,12.98%,10257,10.06%
2,Other/Unknown,1595,5.67%,4666,4.57%
3,Hispanic,1243,4.42%,3806,3.73%
4,Asian,538,1.91%,1660,1.63%
5,Native American,294,1.04%,621,0.61%
0,,222,0.79%,1365,1.34%


## Features Train DM = 1 Vs DM = 0
How are the features in the case where the dm=1 in comparison with the case where dm=0?

In [67]:
# Age distribution: dm=1 vs dm=0 within the training set (via _comparedf).
_comparedf(df_train_dm_1_age, df_train_dm_0_age, 'train_dm_1_%', 'train_dm_0_%', 'age')

age: train_dm_1_%: 100.00% train_dm_0_%: 100.00%


Unnamed: 0,age,train_dm_1_count,train_dm_1_%,train_dm_0_count,train_dm_0_%
0,"(-0.1, 0.0]",3,0.01%,27,0.03%
1,"(0.0, 15.9]",0,0.00%,0,0.00%
2,"(15.9, 20.0]",183,0.65%,1422,1.39%
3,"(20.0, 40.0]",1644,5.84%,12004,11.77%
4,"(40.0, 60.0]",7579,26.92%,29424,28.85%
5,"(60.0, 90.0]",18082,64.23%,54801,53.72%
6,"(90.0, 120.0]",0,0.00%,0,0.00%
0,,660,2.34%,4328,4.24%


In [101]:
# Age distribution: training set vs test set.
_compare('age', df_train_age, df_test_age, 'train_%', 'test_%')

age: train_%: 100.0% test_%: 100.0%


Unnamed: 0,age,train_count,train_%,test_count,test_%
0,"(-0.1, 0.0]",30,0.02%,0,0.00%
1,"(0.0, 15.9]",0,0.00%,0,0.00%
2,"(15.9, 20.0]",1605,1.23%,180,1.76%
3,"(20.0, 40.0]",13648,10.49%,1075,10.50%
4,"(40.0, 60.0]",37003,28.43%,2939,28.72%
5,"(60.0, 90.0]",72883,56.00%,5753,56.21%
6,"(90.0, 120.0]",0,0.00%,287,2.80%
0,,4988,3.83%,0,0.00%


In [None]:
# BMI distribution: dm=1 vs dm=0 within the training set (via _comparedf).
_comparedf(df_train_dm_1_bmi, df_train_dm_0_bmi, 'train_dm_1_%', 'train_dm_0_%', 'bmi')

**Observations**: 
- In the **40-60yrs** group, the percentage of people is almost the same among those with dm and those without dm.
- In the **60-90yrs** group, the percentage of people is higher among those with dm than among those without dm.
- In the **Obese** category, the percentage of people is higher among those with dm than among those without dm.
- In the other age and bmi categories, the percentage of people is lower among those with dm than among those without dm.
- The distribution of genders with and without dm is similar.

In [88]:
# BMI distribution: training set vs test set.
_compare('bmi', df_train_bmi, df_test_bmi, 'train_%', 'test_%')

bmi: train_%: 100.00000000000001% test_%: 100.0%


Unnamed: 0,bmi,train_count,train_%,test_count,test_%
0,[0.0-18.5] underweight,5323,4.09%,388,3.79%
1,[18.5-24.9] normal weight,36015,27.67%,2651,25.90%
2,[30-70] over weight,38010,29.20%,2863,27.98%
3,[70<] obese,46319,35.59%,3317,32.41%
0,,4490,3.45%,1015,9.92%


## Features Train Vs Test


In [61]:
# Age distribution: training set vs test set (via _comparedf).
_comparedf(df_train_age, df_test_age, 'train_%', 'test_%', 'age')

age: train_%: 100.00% test_%: 100.00%


Unnamed: 0,age,train_count,train_%,test_count,test_%
0,"(-0.1, 0.0]",30,0.02%,0,0.00%
1,"(0.0, 15.9]",0,0.00%,0,0.00%
2,"(15.9, 20.0]",1605,1.23%,180,1.76%
3,"(20.0, 40.0]",13648,10.49%,1075,10.50%
4,"(40.0, 60.0]",37003,28.43%,2939,28.72%
5,"(60.0, 90.0]",72883,56.00%,5753,56.21%
6,"(90.0, 120.0]",0,0.00%,287,2.80%
0,,4988,3.83%,0,0.00%


In [86]:
# Gender distribution: training set vs test set.
_compare('gender', df_train_gender, df_test_gender, 'train_%', 'test_%')

gender: train_%: 99.99999999999999% test_%: 100.0%


Unnamed: 0,gender,train_count,train_%,test_count,test_%
0,M,70518,54.18%,5525,53.99%
1,F,59573,45.77%,4704,45.96%
0,,66,0.05%,5,0.05%


In [87]:
# Ethnicity distribution: training set vs test set.
_compare('ethnicity', df_train_ethnicity, df_test_ethnicity, 'train_%', 'test_%')

ethnicity: train_%: 100.00000000000001% test_%: 100.0%


Unnamed: 0,ethnicity,train_count,train_%,test_count,test_%
0,Caucasian,100236,77.01%,7939,77.57%
1,African American,13911,10.69%,931,9.10%
2,Other/Unknown,6261,4.81%,435,4.25%
3,Hispanic,5049,3.88%,471,4.60%
4,Asian,2198,1.69%,171,1.67%
5,Native American,915,0.70%,83,0.81%
0,,1587,1.22%,204,1.99%


In [62]:
# BMI distribution: training set vs test set.
# _comparedf takes (df1, df2, col1, col2, feature); the three-argument call raised TypeError.
_comparedf(df_train_bmi, df_test_bmi, 'train_%', 'test_%', 'bmi')


bmi: train_%: 100.00% test_%: 100.00%


Unnamed: 0,bmi,train_count,train_%,test_count,test_%
0,[0.0-18.5] underweight,5323,4.09%,388,3.79%
1,[18.5-24.9] normal weight,36015,27.67%,2651,25.90%
2,[30-70] over weight,38010,29.20%,2863,27.98%
3,[70<] obese,46319,35.59%,3317,32.41%
0,,4490,3.45%,1015,9.92%


In [63]:
# Gender distribution: training set vs test set.
# _comparedf takes (df1, df2, col1, col2, feature); the three-argument call raised TypeError.
_comparedf(df_train_gender, df_test_gender, 'train_%', 'test_%', 'gender')

gender: train_%: 100.00% test_%: 100.00%


Unnamed: 0,gender,train_count,train_%,test_count,test_%
0,M,70518,54.18%,5525,53.99%
1,F,59573,45.77%,4704,45.96%
0,,66,0.05%,5,0.05%


In [64]:
# Ethnicity distribution: training set vs test set.
# _comparedf takes (df1, df2, col1, col2, feature); the three-argument call raised TypeError.
_comparedf(df_train_ethnicity, df_test_ethnicity, 'train_%', 'test_%', 'ethnicity')

ethnicity: train_%: 100.00% test_%: 100.00%


Unnamed: 0,ethnicity,train_count,train_%,test_count,test_%
0,Caucasian,100236,77.01%,7939,77.57%
1,African American,13911,10.69%,931,9.10%
2,Other/Unknown,6261,4.81%,435,4.25%
3,Hispanic,5049,3.88%,471,4.60%
4,Asian,2198,1.69%,171,1.67%
5,Native American,915,0.70%,83,0.81%
0,,1587,1.22%,204,1.99%


**Presence of invalid and missing values and handling :**
- **age**:
    -  In the training dataset, there are 30 invalid ages. Remove these rows
    - In the training dataset, there are null values for age. Handling?
- **gender**: In both training and test datasets, there are null values for gender. Fill "Unknown"
- **ethnicity**: In both training and test datasets, there are null values for ethnicity. Fill "Unknown"
- **weight**: Fill Average weight of the age, gender & ethnicity
- **height**: Fill Average height of the age, gender & ethnicity
- **bmi**: 
    - In both training and test datasets, there are null values for bmi. 
    - Calculate bmi based on weight and height. Here, there are 3 possibilities - only weight is present, only height is present, or both weight and height are missing.
    -  Should weight and height be replaced only with bmi?

**Findings**
- Distribution of age across categories is similar for training and test datasets
- The distribution of bmi across categories is similar in training and test data sets

===========================X===========================X===========================X===========================X

*Work in Progress*

===========================X===========================X===========================X===========================X

- Combine features - ratio, difference 

In [114]:
# Number of encounters per hospital_id in the training set, smallest first.
df_train.groupby(['hospital_id']).count()['encounter_id'].sort_values()

hospital_id
25        2
130       2
93        6
95        6
23        7
       ... 
7      2944
86     2962
188    3075
19     3885
118    4306
Name: encounter_id, Length: 204, dtype: int64

In [117]:
# Inspect every training row recorded for hospital_id 4.
df_train.loc[df_train['hospital_id']==4]

Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,...,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
8393,8394,173617,4,56.00%,23.72%,0,Caucasian,M,177.80%,Emergency Department,...,,,0,0,0,0,0,0,0,0
8570,8571,188717,4,,29.26%,0,,F,160.00%,Operating Room,...,,,0,0,0,0,0,0,0,0
8634,8635,276136,4,26.00%,28.67%,0,Other/Unknown,F,172.70%,Emergency Department,...,666.67%,654.81%,0,0,0,0,0,0,0,0
9522,9523,251919,4,,19.55%,0,Caucasian,F,152.40%,Emergency Department,...,,,0,0,0,0,0,0,0,0
9638,9639,197182,4,74.00%,47.95%,0,Caucasian,F,165.10%,Floor,...,,,0,0,0,0,0,0,0,0
9723,9724,275265,4,,26.19%,0,Caucasian,M,177.80%,Floor,...,,,0,0,0,0,0,0,0,0
9859,9860,177497,4,65.00%,30.79%,0,Caucasian,F,170.20%,Floor,...,,,0,0,0,0,0,0,0,0


In [118]:
# Number of training rows per icu_id, largest first.
df_train.groupby('icu_id')['icu_id'].agg(['count']).sort_values(by = 'count', ascending=False)

Unnamed: 0_level_0,count
icu_id,Unnamed: 1_level_1
1019,1344
646,1312
653,1296
876,1280
413,1219
...,...
365,2
241,2
989,2
302,2


In [None]:
# Recompute BMI as weight / (height / 100)^2; height is divided by 100 first
# (presumably cm -> m, weight in kg - TODO confirm units against the data dictionary).
# Uses df_train throughout: `df` is never defined in this notebook, so the cell
# raised NameError on a fresh kernel.
df_train['bmi_calc'] = df_train['weight']/(df_train['height']/100)**2

In [None]:
# Keep only the BMI-related columns, then the rows where at least one of them is missing.
# Reads df_train: `df` is never defined in this notebook.
df_bmi = df_train[['weight', 'height', 'bmi', 'bmi_calc']]
df_bmi_1  = df_bmi[df_bmi.isna().any(axis=1)]

In [None]:
# Display the rows that have at least one missing BMI-related value.
df_bmi_1

BMI calculations - relation to diabetes
$\text{bmi} = \dfrac{\text{weight (kg)}}{[\text{height (m)}]^2}$

Adults:
For adults, the interpretation of BMI does not depend on sex or age. 
For adults 20 years old and older, BMI is interpreted using standard weight status categories.
BMI	Weight Status
Below 18.5	Underweight
18.5 – 24.9	Normal or Healthy Weight
25.0 – 29.9	Overweight
30.0 and Above	Obese


In [None]:
# Shape of the rows where height, weight, or both are missing
# (only-height, only-weight and both-missing together cover "either is missing").
height_missing = df_bmi_1['height'].isnull()
weight_missing = df_bmi_1['weight'].isnull()
df_bmi_1[height_missing | weight_missing].shape

In [None]:
# Shape of the rows where bmi itself is missing, for comparison with the previous cell.
df_bmi_1[ df_bmi_1['bmi'].isnull()].shape

In [None]:
# Wherever either weight or height or both are not available, bmi is also not available.
# Isn't bmi dependent on gender?
# There are no rows where height and weight are null and bmi is not null.
# Each print shows the rows of df_bmi_1 for one combination of missing/present
# height, weight and bmi.
# height missing, weight missing, bmi present
print(df_bmi_1[df_bmi_1['height'].isnull() & df_bmi_1['weight'].isnull() & df_bmi_1['bmi'].notnull()])
# height present, weight missing, bmi present
print(df_bmi_1[df_bmi_1['height'].notnull() & df_bmi_1['weight'].isnull() & df_bmi_1['bmi'].notnull()])
# height missing, weight present, bmi present
print(df_bmi_1[df_bmi_1['height'].isnull() & df_bmi_1['weight'].notnull() & df_bmi_1['bmi'].notnull()])
# height present, weight present, bmi missing
print(df_bmi_1[df_bmi_1['height'].notnull() & df_bmi_1['weight'].notnull() & df_bmi_1['bmi'].isnull()])

In [None]:
# Count of female training rows per BMI category.
# Uses df_train and the BMI_BINS / BMI_LABELS constants: `df`, `bmi_bins` and
# `bmi_labels` are never defined in this notebook.
df_bmi_F = df_train[df_train['gender']=='F']
bins = pd.cut(df_bmi_F['bmi'], BMI_BINS, labels=BMI_LABELS)
# observed=False keeps categories that have no rows.
df_bmi_F = df_bmi_F.groupby(bins, observed=False)['bmi'].agg(['count'])
df_bmi_F = df_bmi_F.reset_index()
df_bmi_F = df_bmi_F.rename(columns = {'count':'Female'})
df_bmi_F

In [None]:
# Combine the male and female per-category counts and add each category's share of the total.
# NOTE(review): df_bmi_M is not defined in any visible cell - presumably the male
# counterpart of df_bmi_F with a 'Male' column; confirm and add the cell that builds it.
df_bmi = df_bmi_M.join(df_bmi_F.set_index('bmi'), on= 'bmi')
df_bmi['total'] = df_bmi['Male'] + df_bmi['Female']
df_bmi['%'] = 100 * df_bmi['total'] / df_bmi['total'].sum()
df_bmi

**Conclusions**
- Remove the rows with age = 0
- Categorize bmi into 4 categories


In [None]:
 # Pivot of age (rows) against diabetes_mellitus (columns) on the training set.
 # Reads df_train: `df` is never defined in this notebook.
 # NOTE(review): np.sum adds up the age values in each cell; if a row count per
 # age was intended, aggfunc='count' is needed - confirm.
 pd.pivot_table(df_train, values='age', index=['age'],
                    columns=['diabetes_mellitus'], aggfunc=np.sum, fill_value=0)

In [None]:
# encounter_ids are unique and there are no repeats:
# occurrences per encounter_id, largest first (all 1 when ids are unique).
# Reads df_train: `df` is never defined in this notebook.
df_temp = df_train.groupby(['encounter_id'])['encounter_id'].count().sort_values(ascending=False)
print(df_temp)

In [None]:
def getAllValues(df, col):
    """Print each distinct value of ``col`` with its row count and percentage.

    Rows are counted through their non-null ``encounter_id``; rows where
    ``col`` is missing are excluded, so the percentages are relative to the
    rows that have a value. Nothing is returned.
    """
    value_counts = (
        df.groupby([col])
        .count()['encounter_id']
        .sort_values(ascending=False)
        .reset_index()
        .rename(columns = {'encounter_id':'count'})
    )
    counted_rows = value_counts['count'].sum()
    value_counts['%'] = 100 * value_counts['count'] / counted_rows
    print(value_counts)

In [None]:
# Columns - string type
# Value counts of each string column, training set first, then test set.
# Reads df_train: `df` is never defined in this notebook.
for  col in ['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_type', 'icu_stay_type']:
    getAllValues(df_train, col)
    getAllValues(df_test, col)
    print()

In [None]:
# Columns - binary type
# Value counts of each binary column, training set first, then test set.
# Reads df_train: `df` is never defined in this notebook.
for col in ['elective_surgery', 'readmission_status', 'diabetes_mellitus' ]:
    getAllValues(df_train, col)
    # diabetes_mellitus is skipped for the test set.
    if col != 'diabetes_mellitus':
        getAllValues(df_test, col)
    print()

In [None]:
# Columns - nan data
# Missing-value count and percentage for every training column, in df_train.columns order.
# Reads df_train: `df` is never defined in this notebook.
nan_counts = df_train.isna().sum().tolist()
nan_counts_percent = (100 * df_train.isna().sum() / len(df_train)).tolist()

In [None]:
# One row per training column with its missing-value count and percentage, most missing first.
# Column names come from df_train: `df` is never defined in this notebook.
df_nan = pd.DataFrame({'Variable Name':df_train.columns, 'nan_counts':nan_counts, '%':nan_counts_percent})
df_nan = df_nan.sort_values('nan_counts', ascending = False)
# From here on '%' holds formatted strings such as '3.8%', not numbers.
df_nan['%'] = df_nan['%'].map('{:,.1f}%'.format)
print(df_nan)
df_nan.to_csv("nan_data.csv")
print(df_nan.shape[0])

In [None]:
# Rows with nan data
# Reads df_train (`df` is never defined in this notebook) and uses its own name:
# assigning to df_nan here would overwrite the per-column summary that is later
# merged on 'Variable Name'.
df_rows_with_nan = df_train[df_train.isna().any(axis=1)]
print (f"Rows with nan values: {100 * df_rows_with_nan.shape[0]/df_train.shape[0]:.2f}%")

In [None]:
# Load the data dictionary and show each variable's name, category and data type,
# followed by the number of dictionary entries.
df_dict = pd.read_csv("data/DataDictionaryWiDS2021.csv")
print(df_dict[['Variable Name', 'Category', 'Data Type']])
print(df_dict.shape[0])

In [None]:
# Attach the data-dictionary metadata to df_nan, matching on 'Variable Name',
# then sort by category and variable name and save the result.
# The merge requires df_nan to have a 'Variable Name' column.
df_data_desc = pd.merge(df_nan, df_dict, on="Variable Name")
df_data_desc = df_data_desc.sort_values(by=['Category','Variable Name'])
df_data_desc.to_csv("datadesc.csv")

In [None]:
# Correlation between apache_2_diagnosis and the diabetes_mellitus label.
# Reads df_train: `df` is never defined in this notebook.
df_train[['apache_2_diagnosis', 'diabetes_mellitus']].corr()

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np
# Fit a one-feature linear regression of diabetes_mellitus on arf_apache.
# Reads df_train: `df` is never defined in this notebook.
# NOTE(review): fit() will fail if arf_apache contains missing values - confirm
# whether imputation is needed first.
X = df_train[['arf_apache']]
Y = df_train['diabetes_mellitus']
slm = LinearRegression()
np.set_printoptions(precision=2)
slm.fit(X,Y)

In [None]:
# Predict with the fitted model.
# NOTE(review): slm_hm and X_hm are not defined in any visible cell - presumably
# `slm` and `X` from the previous cell were intended; confirm before running.
Yhat =slm_hm.predict(X_hm)