Imports

In [2]:
import pandas as pd
from pathlib import Path


# 1. Hospital Deaths - Patient Survival Prediction

**Sensitive attributes:** age, gender, ethnicity.

## Raw Dataset

### Load dataset into dataframe

In [3]:
# Read the raw patient-survival CSV into the working dataframe `df`.
df = pd.read_csv(filepath_or_buffer='datasets/patient_survival.csv')

### Data Preparation I

In [5]:
# Preview the first five rows of the raw dataframe.
df.head(n=5)

Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,...,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,Unnamed: 83,hospital_death
0,66154,25312,118,68.0,22.73,0,Caucasian,M,180.3,Floor,...,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,,0
1,114252,59342,81,77.0,27.42,0,Caucasian,F,160.0,Floor,...,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,,0
2,119783,50777,118,25.0,31.95,0,Caucasian,F,172.7,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,,0
3,79267,46918,118,81.0,22.64,1,Caucasian,F,165.1,Operating Room / Recovery,...,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,,0
4,92056,34377,33,19.0,,0,Caucasian,M,188.0,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma,,0


#### Drop irrelevant columns

Drop attributes that do not correlate with patient survival prediction:

In [6]:
# Remove the encounter/patient identifiers, the ICU bookkeeping columns and
# the 'Unnamed: 83' column from `df`. The frame is mutated in place, so
# re-running this cell on the same `df` raises KeyError.
df.drop(
    columns=[
        'encounter_id', 'patient_id',
        'icu_admit_source', 'icu_id', 'icu_stay_type', 'icu_type',
        'Unnamed: 83',
    ],
    inplace=True,
)

Show updated dataframe:

In [7]:
# Preview the first five rows after the column drop.
df.head(n=5)

Unnamed: 0,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,pre_icu_los_days,weight,apache_2_diagnosis,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
0,118,68.0,22.73,0,Caucasian,M,180.3,0.541667,73.9,113.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,0
1,81,77.0,27.42,0,Caucasian,F,160.0,0.927778,70.2,108.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0
2,118,25.0,31.95,0,Caucasian,F,172.7,0.000694,95.3,122.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
3,118,81.0,22.64,1,Caucasian,F,165.1,0.000694,61.7,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,0
4,33,19.0,,0,Caucasian,M,188.0,0.073611,,119.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma,0


#### EXPLORE COLUMNS

**Find the total number of null values:**

In [8]:
# Count every missing cell in the frame: per-column counts first, then their sum.
total_nulls = df.isnull().sum(axis=0).sum()
print("Remaining null values in the dataframe: {}".format(total_nulls))

Remaining null values in the dataframe: 196221


**Find the distribution of null values among attributes:**

In [9]:
# Series indexed by column name holding the number of missing values per column.
null_distribution = df.isnull().sum(axis=0)

**Calculate the percentage of null values per column:**

In [10]:
# Share (in percent) of the dataset's total null count contributed by each
# column, in the same order as `null_distribution`. Note this is relative to
# `total_nulls`, not to the number of rows.
percent_null = list(null_distribution.values * 100 / total_nulls)

**Show sorted results:**

In [11]:
# Tabulate the per-column null count and its share of all nulls, ordered
# from the largest share to the smallest.
null_df = pd.DataFrame(
    data={'nulls': null_distribution.values, 'percent_null': percent_null},
    index=null_distribution.index,
).sort_values(by='percent_null', ascending=False)

null_df

Unnamed: 0,nulls,percent_null
d1_potassium_min,9585,4.884798
d1_potassium_max,9585,4.884798
h1_mbp_noninvasive_min,9084,4.629474
h1_mbp_noninvasive_max,9084,4.629474
apache_4a_icu_death_prob,7947,4.050025
...,...,...
hospital_id,0,0.000000
apache_post_operative,0,0.000000
pre_icu_los_days,0,0.000000
elective_surgery,0,0.000000


**Consider the attributes carrying more than 2% of null values:**

In [12]:
# Keep the shares of the columns that each hold more than 2% of all nulls.
high_nulls = list(filter(lambda share: share > 2, percent_null))

print(
    f'The first {len(high_nulls)} attributes in \"null_df\" (above) '
    f'make up {round(sum(high_nulls), 1)}% of the total null values.'
)

The first 20 attributes in "null_df" (above) make up 65.7% of the total null values.


**Now, only consider the attributes carrying more than 2.5% of null values:**

In [13]:
# Same summary as above, with the stricter 2.5% threshold.
top_nulls = list(filter(lambda share: share > 2.5, percent_null))

print(
    f'The first {len(top_nulls)} attributes in \"null_df\" (above) '
    f'make up {round(sum(top_nulls), 1)}% of the total null values.'
)

The first 12 attributes in "null_df" (above) make up 48.0% of the total null values.


**Limit "null_df" to the 12 most impactful attributes:**

In [14]:
# `null_df` is sorted by descending share, so its first 12 rows are the
# 12 columns above the 2.5% threshold reported in the previous cell.
null_attributes_df = null_df.head(12)
null_attributes_df

Unnamed: 0,nulls,percent_null
d1_potassium_min,9585,4.884798
d1_potassium_max,9585,4.884798
h1_mbp_noninvasive_min,9084,4.629474
h1_mbp_noninvasive_max,9084,4.629474
apache_4a_icu_death_prob,7947,4.050025
apache_4a_hospital_death_prob,7947,4.050025
h1_diasbp_noninvasive_min,7350,3.745776
h1_diasbp_noninvasive_max,7350,3.745776
h1_sysbp_noninvasive_max,7341,3.74119
h1_sysbp_noninvasive_min,7341,3.74119


**We could consider dropping some of these columns to reduce the null values in the dataset. Which ones to drop - if any - still needs to be clarified.**

#### EXPLORE ROWS

**Find all rows with null values:**

In [15]:
# Boolean-mask selection of every row that has at least one missing value.
null_rows_df = df.loc[df.isnull().any(axis=1)]
null_rows_df

Unnamed: 0,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,pre_icu_los_days,weight,apache_2_diagnosis,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
2,118,25.0,31.950000,0,Caucasian,F,172.7,0.000694,95.3,122.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
3,118,81.0,22.640000,1,Caucasian,F,165.1,0.000694,61.7,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,0
4,33,19.0,,0,Caucasian,M,188.0,0.073611,,119.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma,0
6,83,59.0,57.450000,0,Caucasian,F,165.1,0.000694,156.6,108.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0
7,33,70.0,,0,Caucasian,M,165.0,0.002083,,113.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Sepsis,Cardiovascular,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91704,30,38.0,32.992923,0,Caucasian,M,177.8,0.015972,104.3,307.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
91705,195,67.0,28.876843,0,African American,M,182.9,0.213194,96.6,123.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
91707,183,,33.933518,0,Caucasian,F,152.0,-3.593056,78.4,,...,,,,,,,,,,0
91710,195,48.0,27.236914,0,Caucasian,M,170.2,0.046528,78.9,123.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0


**Calculate how many null values each row has:**

In [16]:
# Map each row label (as a string) to the number of missing values in that row.
# The counts come from one vectorised row-wise sum; the previous version
# called `.iloc[i].isna().sum()` once per row in a Python loop, which is far
# slower on tens of thousands of rows and yields the same counts.
nulls_per_row = {
    str(row_label): null_count
    for row_label, null_count in null_rows_df.isna().sum(axis=1).items()
}

**Sort the results in descending order:**

In [17]:
# Order the {row label: null count} mapping from most to fewest nulls.
# sorted() is stable, so rows with equal counts keep their original order.
nulls_per_row_sorted = dict(
    sorted(nulls_per_row.items(), key=lambda pair: pair[1], reverse=True)
)

# One record per patient row: its original row label and its null count.
nulls_per_row_df = pd.DataFrame(
    list(nulls_per_row_sorted.items()),
    columns=['patient_index', 'patient_nulls'],
)

In [20]:
# Display the per-row null counts, sorted from most to fewest nulls.
nulls_per_row_df

Unnamed: 0,patient_index,patient_nulls
0,3011,68
1,33104,68
2,35517,66
3,37148,66
4,17596,64
...,...,...
34722,91613,1
34723,91620,1
34724,91657,1
34725,91662,1


**Reduce "nulls_per_row_df" to the rows which have 5 or more null values (at least 6.4% of null entries)**:

In [21]:
# Keep only the patients with at least 5 missing values. The frame is sorted
# by descending null count, so the surviving rows are its leading rows.
nulls_per_row_df = nulls_per_row_df.loc[nulls_per_row_df['patient_nulls'] >= 5]
nulls_per_row_df

Unnamed: 0,patient_index,patient_nulls
0,3011,68
1,33104,68
2,35517,66
3,37148,66
4,17596,64
...,...,...
11212,90487,5
11213,90517,5
11214,90864,5
11215,91156,5


In [25]:
# Report how many patients are in the high-null group and what share of all
# nulls in the dataset they account for.
len_nlpr = nulls_per_row_df.shape[0]
p_nls = nulls_per_row_df['patient_nulls'].sum()
print(
    f"{len_nlpr} patients out of {len(df)} ({round(len_nlpr*100/len(df), 1)}%) "
    f"contribute with {p_nls} nulls out of {total_nulls} ({round(p_nls*100/total_nulls, 1)}%)."
)

11217 patients out of 91713 (12.2%) contribute with 138498 nulls out of 196221 (70.6%).


**I would recommend removing these 11,217 patients from the dataset instead of removing any columns, because 12% of the patients account for 70% of the total null values in the dataset. Removing columns, by contrast, would be much more complicated, because it requires an expert's opinion on which attributes do not correlate with survival prediction.**

#### CLEANING THE DATASET BASED ON FINDINGS

**Drop rows with null values according to the previous findings:**

In [26]:
# Build the cleaned frame by dropping every high-null patient in one call.
#
# `patient_index` holds the original row positions as strings, so they are
# cast to int and translated to index labels through `df.index[...]`.
# This replaces a loop that issued one in-place `drop` per patient (each
# call rebuilds the frame) and that looked rows up with `patient_index[i]`,
# a label lookup that only works while the filtered frame's index happens
# to be exactly 0..n-1. `df` itself is left untouched.
df_cleaned = df.drop(
    index=df.index[nulls_per_row_df['patient_index'].astype(int).to_numpy()]
)

In [27]:
# Display the cleaned dataframe (original row labels are preserved).
df_cleaned

Unnamed: 0,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,pre_icu_los_days,weight,apache_2_diagnosis,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
0,118,68.0,22.730000,0,Caucasian,M,180.3,0.541667,73.9,113.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,0
1,81,77.0,27.420000,0,Caucasian,F,160.0,0.927778,70.2,108.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0
2,118,25.0,31.950000,0,Caucasian,F,172.7,0.000694,95.3,122.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
5,83,67.0,27.560000,0,Caucasian,M,190.5,0.000694,100.0,301.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Neurological,Neurologic,0
6,83,59.0,57.450000,0,Caucasian,F,165.1,0.000694,156.6,108.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91701,6,21.0,24.874059,0,Caucasian,M,185.4,0.052083,85.5,123.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
91706,121,54.0,19.770448,0,Native American,M,177.8,0.025694,62.5,109.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,0
91708,30,75.0,23.060250,0,Caucasian,M,177.8,0.298611,72.9,113.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,Sepsis,Cardiovascular,0
91709,121,56.0,47.179671,0,Caucasian,F,183.0,0.120139,158.0,113.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,0


**SAVE NEW DATASET**

In [28]:
# Persist the cleaned frame; the row index is not written to the file.
filepath = Path('datasets') / 'clean_patient_survival.csv'
df_cleaned.to_csv(path_or_buf=filepath, index=False)

## Clean Dataset Encoding

### Load clean dataset into new dataframe

In [29]:
# Reload the cleaned dataset from disk; this yields a fresh 0-based index.
clean_df = pd.read_csv(filepath_or_buffer='datasets/clean_patient_survival.csv')

### Data Preparation II

In [30]:
# Display the reloaded clean dataframe.
clean_df

Unnamed: 0,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,pre_icu_los_days,weight,apache_2_diagnosis,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
0,118,68.0,22.730000,0,Caucasian,M,180.3,0.541667,73.9,113.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,0
1,81,77.0,27.420000,0,Caucasian,F,160.0,0.927778,70.2,108.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0
2,118,25.0,31.950000,0,Caucasian,F,172.7,0.000694,95.3,122.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
3,83,67.0,27.560000,0,Caucasian,M,190.5,0.000694,100.0,301.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Neurological,Neurologic,0
4,83,59.0,57.450000,0,Caucasian,F,165.1,0.000694,156.6,108.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80491,6,21.0,24.874059,0,Caucasian,M,185.4,0.052083,85.5,123.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
80492,121,54.0,19.770448,0,Native American,M,177.8,0.025694,62.5,109.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,0
80493,30,75.0,23.060250,0,Caucasian,M,177.8,0.298611,72.9,113.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,Sepsis,Cardiovascular,0
80494,121,56.0,47.179671,0,Caucasian,F,183.0,0.120139,158.0,113.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,0


#### ENCODE CATEGORICAL DATA

*Note: this step should have used scikit-learn's label encoder instead of hand-written mappings.*

In [31]:
# Work on an independent deep copy so `clean_df` keeps the original string labels.
final_df = clean_df.copy(deep=True)

##### **Encode "ethnicity":**

Find unique values:

In [32]:
# Distinct ethnicity labels (NaN included), in order of first appearance.
pd.unique(clean_df['ethnicity'])

array(['Caucasian', nan, 'Hispanic', 'African American', 'Asian',
       'Native American', 'Other/Unknown'], dtype=object)

Legend:
- Caucasian = 0
- Hispanic = 1
- African American = 2
- Asian = 3
- Native American = 4
- Other/Unknown = 5

In [37]:
# replace these values in "final_df"
final_df['ethnicity'] = clean_df['ethnicity'].replace(
    {
        'Caucasian': 0,
        'Hispanic': 1,
        'African American': 2,
        'Asian': 3,
        'Native American': 4,
        'Other/Unknown': 5
    }
)

In [38]:
# Verify the encoding: only numeric codes and NaN should remain.
pd.unique(final_df['ethnicity'])

array([ 0., nan,  1.,  2.,  3.,  4.,  5.])

##### **Encode "gender":**

Legend:
- M(ale) = 0
- F(emale) = 1

In [35]:
# replace these values in "final_df"
final_df['gender'] = clean_df['gender'].replace(
    {
        'M': 0,
        'F': 1
    }
)

In [36]:
# Verify the encoding: only numeric codes and NaN should remain.
pd.unique(final_df['gender'])

array([ 0.,  1., nan])

##### **Encode "apache_3j_bodysystem":**

Find unique values:

In [39]:
# Distinct APACHE III-J body-system labels (NaN included), in order of first appearance.
pd.unique(clean_df['apache_3j_bodysystem'])

array(['Sepsis', 'Respiratory', 'Metabolic', 'Neurological',
       'Cardiovascular', 'Gastrointestinal', 'Genitourinary', 'Trauma',
       nan, 'Hematological', 'Musculoskeletal/Skin', 'Gynecological'],
      dtype=object)

Legend:
- Sepsis = 0
- Respiratory = 1
- Metabolic = 2
- Neurological = 3
- Cardiovascular = 4
- Gastrointestinal = 5
- Genitourinary = 6
- Trauma = 7
- Hematological = 8
- Musculoskeletal/Skin = 9
- Gynecological = 10

In [40]:
# replace these values in "final_df"
final_df['apache_3j_bodysystem'] = clean_df['apache_3j_bodysystem'].replace(
    {
        'Sepsis': 0,
        'Respiratory': 1,
        'Metabolic': 2,
        'Neurological': 3,
        'Cardiovascular': 4,
        'Gastrointestinal': 5,
        'Genitourinary': 6,
        'Trauma': 7,
        'Hematological': 8,
        'Musculoskeletal/Skin': 9,
        'Gynecological': 10
    }
)

In [41]:
# Verify the encoding: only numeric codes and NaN should remain.
pd.unique(final_df['apache_3j_bodysystem'])

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7., nan,  8.,  9., 10.])

##### **Encode "apache_2_bodysystem":**

Find unique values:

In [42]:
# Distinct APACHE II body-system labels (NaN included), in order of first appearance.
pd.unique(clean_df['apache_2_bodysystem'])

array(['Cardiovascular', 'Respiratory', 'Metabolic', 'Neurologic',
       'Gastrointestinal', 'Renal/Genitourinary', 'Trauma', nan,
       'Undefined diagnoses', 'Haematologic', 'Undefined Diagnoses'],
      dtype=object)

Legend:
- Undefined diagnoses or Undefined Diagnoses = 0
- Respiratory = 1
- Metabolic = 2
- Neurologic = 3
- Cardiovascular = 4
- Gastrointestinal = 5
- Renal/Genitourinary = 6
- Trauma = 7
- Haematologic = 8

In [43]:
# replace these values in "final_df"
final_df['apache_2_bodysystem'] = clean_df['apache_2_bodysystem'].replace(
    {
        'Undefined diagnoses': 0,
        'Undefined Diagnoses': 0,
        'Respiratory': 1,
        'Metabolic': 2,
        'Neurologic': 3,
        'Cardiovascular': 4,
        'Gastrointestinal': 5,
        'Renal/Genitourinary': 6,
        'Trauma': 7,
        'Haematologic': 8
    }
)

In [44]:
# Verify the encoding: only numeric codes and NaN should remain.
pd.unique(final_df['apache_2_bodysystem'])

array([ 4.,  1.,  2.,  3.,  5.,  6.,  7., nan,  0.,  8.])

### Check status and save updated dataset

In [45]:
# Distinct column dtypes left in the encoded frame.
pd.unique(final_df.dtypes)

array([dtype('int64'), dtype('float64')], dtype=object)

In [46]:
# Show any columns still stored as strings/objects; a frame with no columns
# means every categorical attribute has been encoded.
final_df.select_dtypes(include=['object'])

0
1
2
3
4
...
80491
80492
80493
80494
80495


**SAVE ENCODED DATASET**

In [53]:
# Persist the encoded frame; the row index is not written to the file.
filepath = Path('datasets') / 'final_patient_survival.csv'
final_df.to_csv(path_or_buf=filepath, index=False)

**No other action required at this stage.**

## Fix final_patient_survival

In [54]:
# Reload the encoded dataset. This rebinds `df`, which until now referred to
# the raw dataframe.
df = pd.read_csv(filepath_or_buffer='datasets/final_patient_survival.csv')

In [55]:
# Preview the first five rows of the encoded dataframe.
df.head(n=5)

Unnamed: 0,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,pre_icu_los_days,weight,apache_2_diagnosis,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
0,118,68.0,22.73,0,0.0,0.0,180.3,0.541667,73.9,113.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0
1,81,77.0,27.42,0,0.0,1.0,160.0,0.927778,70.2,108.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0
2,118,25.0,31.95,0,0.0,1.0,172.7,0.000694,95.3,122.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0
3,83,67.0,27.56,0,0.0,0.0,190.5,0.000694,100.0,301.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0
4,83,59.0,57.45,0,0.0,1.0,165.1,0.000694,156.6,108.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0


##### Describe df

In [56]:
# Pull the "mean" row out of the summary statistics: one mean per column,
# as a Series indexed by column name.
data = df.describe().loc["mean", :]
data

hospital_id                    105.678580
age                             62.284685
bmi                             29.154863
elective_surgery                 0.158020
ethnicity                        0.575859
                                  ...    
lymphoma                         0.004174
solid_tumor_with_metastasis      0.021305
apache_3j_bodysystem             3.131925
apache_2_bodysystem              3.466204
hospital_death                   0.087271
Name: mean, Length: 78, dtype: float64

**Find the columns with null values and attach each column's mean, rounded to the nearest integer**

In [58]:
# Map every column that still contains nulls to its mean rounded to the
# nearest integer; these become the fill values in the next step.
# The per-column "has any null" mask is now computed once for the whole
# frame. The previous version re-evaluated `df.isna().any(axis=0)` for every
# column, rescanning the entire dataframe each time.
# NOTE(review): the rounded mean is also used for the integer-coded
# categorical columns (e.g. ethnicity, gender) - confirm this is intended.
null_cols = {
    str(attribute): round(data[attribute])
    for attribute, has_null in df.isna().any(axis=0).items()
    if has_null
}

In [60]:
# Display the {column name: rounded mean} fill values.
null_cols

{'age': 62,
 'bmi': 29,
 'ethnicity': 1,
 'gender': 0,
 'height': 170,
 'weight': 84,
 'apache_2_diagnosis': 183,
 'apache_3j_diagnosis': 536,
 'gcs_eyes_apache': 3,
 'gcs_motor_apache': 5,
 'gcs_unable_apache': 0,
 'gcs_verbal_apache': 4,
 'heart_rate_apache': 100,
 'map_apache': 88,
 'resprate_apache': 26,
 'temp_apache': 36,
 'd1_mbp_noninvasive_max': 105,
 'd1_mbp_noninvasive_min': 65,
 'd1_resprate_max': 29,
 'd1_resprate_min': 13,
 'd1_spo2_max': 99,
 'd1_spo2_min': 91,
 'd1_temp_max': 37,
 'd1_temp_min': 36,
 'h1_diasbp_max': 76,
 'h1_diasbp_min': 63,
 'h1_diasbp_noninvasive_max': 76,
 'h1_diasbp_noninvasive_min': 63,
 'h1_heartrate_max': 93,
 'h1_heartrate_min': 84,
 'h1_mbp_max': 92,
 'h1_mbp_min': 80,
 'h1_mbp_noninvasive_max': 92,
 'h1_mbp_noninvasive_min': 80,
 'h1_resprate_max': 23,
 'h1_resprate_min': 17,
 'h1_spo2_max': 98,
 'h1_spo2_min': 95,
 'h1_sysbp_noninvasive_max': 133,
 'h1_sysbp_noninvasive_min': 116,
 'd1_glucose_max': 175,
 'd1_glucose_min': 115,
 'd1_potassiu

**FILL NULL VALUES WITH MEAN**

In [61]:
# Fill the nulls of every column listed in `null_cols` with that column's
# fill value, in one frame-level call (a dict `value` is applied per column).
# The previous `df[col].fillna(..., inplace=True)` loop is chained
# assignment: pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write enabled it no longer modifies `df` at all.
df = df.fillna(value=null_cols)

In [62]:
# Per-column flag: True where a column still contains at least one null.
check_nulls_col = df.isna().any(axis=0)
# Test the *values* with .any(). `True in check_nulls_col` checks membership
# in the Series index (the column names), so it returned False regardless of
# whether any nulls remained.
bool(check_nulls_col.any())

False

In [63]:
# Per-row flags materialised as a plain list, so that `in` tests the values
# rather than the index labels.
check_nulls_row = list(df.isna().any(axis=1))
True in check_nulls_row

False

### Save

In [64]:
# Persist the null-free frame; the row index is not written to the file.
filepath = Path('datasets') / 'final_patient_survival_mean_filled.csv'
df.to_csv(path_or_buf=filepath, index=False)

## Reduce size of the dataset attributes to help prediction

In [65]:
# Reload the mean-filled dataset as the working dataframe `df`.
df = pd.read_csv(filepath_or_buffer='datasets/final_patient_survival_mean_filled.csv')

### Reduce

In [66]:
# List all column names before choosing which attributes to drop.
df.columns

Index(['hospital_id', 'age', 'bmi', 'elective_surgery', 'ethnicity', 'gender',
       'height', 'pre_icu_los_days', 'weight', 'apache_2_diagnosis',
       'apache_3j_diagnosis', 'apache_post_operative', 'arf_apache',
       'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_unable_apache',
       'gcs_verbal_apache', 'heart_rate_apache', 'intubated_apache',
       'map_apache', 'resprate_apache', 'temp_apache', 'ventilated_apache',
       'd1_diasbp_max', 'd1_diasbp_min', 'd1_diasbp_noninvasive_max',
       'd1_diasbp_noninvasive_min', 'd1_heartrate_max', 'd1_heartrate_min',
       'd1_mbp_max', 'd1_mbp_min', 'd1_mbp_noninvasive_max',
       'd1_mbp_noninvasive_min', 'd1_resprate_max', 'd1_resprate_min',
       'd1_spo2_max', 'd1_spo2_min', 'd1_sysbp_max', 'd1_sysbp_min',
       'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min', 'd1_temp_max',
       'd1_temp_min', 'h1_diasbp_max', 'h1_diasbp_min',
       'h1_diasbp_noninvasive_max', 'h1_diasbp_noninvasive_min',
       'h1_heartrate_max'

In [67]:
# Shrink the feature set by removing the columns listed below from `df`.
# The frame is mutated in place, so re-running this cell on the same `df`
# raises KeyError.
df.drop(
    columns=[
        # hospital identifier and APACHE diagnosis codes
        'hospital_id', 'apache_2_diagnosis', 'apache_3j_diagnosis',
        # APACHE covariates
        'apache_post_operative', 'arf_apache',
        'gcs_eyes_apache', 'gcs_motor_apache',
        'gcs_unable_apache', 'gcs_verbal_apache',
        'heart_rate_apache', 'intubated_apache', 'map_apache',
        'resprate_apache', 'temp_apache', 'ventilated_apache',
        # d1_* vitals
        'd1_diasbp_max', 'd1_diasbp_min',
        'd1_diasbp_noninvasive_max', 'd1_diasbp_noninvasive_min',
        'd1_heartrate_max', 'd1_heartrate_min',
        'd1_mbp_max', 'd1_mbp_min',
        'd1_mbp_noninvasive_max', 'd1_mbp_noninvasive_min',
        'd1_resprate_max', 'd1_resprate_min',
        'd1_spo2_max', 'd1_spo2_min',
        'd1_sysbp_max', 'd1_sysbp_min',
        'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min',
        'd1_temp_max', 'd1_temp_min',
        # h1_* vitals
        'h1_diasbp_max', 'h1_diasbp_min',
        'h1_diasbp_noninvasive_max', 'h1_diasbp_noninvasive_min',
        'h1_heartrate_max', 'h1_heartrate_min',
        'h1_mbp_max', 'h1_mbp_min',
        'h1_mbp_noninvasive_max', 'h1_mbp_noninvasive_min',
        'h1_resprate_max', 'h1_resprate_min',
        'h1_spo2_max', 'h1_spo2_min',
        'h1_sysbp_max', 'h1_sysbp_min',
        'h1_sysbp_noninvasive_max', 'h1_sysbp_noninvasive_min',
        # d1_* glucose and potassium
        'd1_glucose_max', 'd1_glucose_min',
        'd1_potassium_max', 'd1_potassium_min',
        # APACHE death-probability columns and body-system codes
        'apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob',
        'apache_3j_bodysystem', 'apache_2_bodysystem',
    ],
    inplace=True,
)

In [68]:
# Shorten two column names in place.
df.rename(
    columns=dict(
        pre_icu_los_days="pre_icu_days",
        solid_tumor_with_metastasis="solid_tumor",
    ),
    inplace=True,
)

**SAVE**

In [69]:
# Persist the reduced frame; the row index is not written to the file.
filepath = Path('datasets') / 'final_patient_survival_reduced.csv'
df.to_csv(path_or_buf=filepath, index=False)