In [1]:
import pandas as pd

# Load the raw CSV as-is; the cleaning cells below work on a separate `data` frame.
data_unedited = pd.read_csv("data_unedited.csv")

# data cleaning

first let's check what these data look like right out of the box:

In [2]:
# Display the raw frame to inspect its layout before any cleaning.
data_unedited

Unnamed: 0.1,Unnamed: 0,age,sex,Alb,PLT,WBC,CRP,APACHE II,SOFA,McCabe,...,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71
0,1.0,79.0,M,2.3,10.8,4000.0,17.8,24.0,8.0,1.0,...,,,,,,,,,,
1,2.0,83.0,M,4.4,13.5,10200.0,8.9,16.0,6.0,1.0,...,,,,,,,,,,
2,3.0,70.0,M,2.7,10.8,5300.0,25.3,22.0,7.0,1.0,...,,,,,,,,,,
3,4.0,61.0,M,3.3,8.8,1800.0,22.2,26.0,7.0,1.0,...,,,,,,,,,,
4,5.0,81.0,M,3.1,26.2,10600.0,17.0,19.0,4.0,1.0,...,,,,,,,,,,
5,6.0,79.0,M,3.4,37.9,13200.0,21.0,18.0,5.0,1.0,...,,,,,,,,,,
6,7.0,83.0,M,2.8,18.0,1500.0,54.9,30.0,11.0,1.0,...,,,,,,,,,,
7,8.0,70.0,M,2.8,18.5,14100.0,18.4,15.0,5.0,1.0,...,,,,,,,,,,
8,9.0,65.0,F,3.2,35.4,7200.0,10.4,18.0,3.0,1.0,...,,,,,,,,,,
9,10.0,72.0,M,2.9,26.8,19800.0,29.4,20.0,5.0,1.0,...,,,,,,,,,,


A little messy. There are a bunch of dummy columns after the pertinent ones, named `Unnamed: <number>`, and there are about 19 dummy rows at the end filled with NaNs. First order of business will be getting rid of those.

In [12]:
# Keep the first 197 rows (the remaining rows are the NaN padding noted above)
# and discard the placeholder columns: "Unnamed: 0" plus "Unnamed: 22"
# through "Unnamed: 71".
data = data_unedited.iloc[:197].drop(
    columns=["Unnamed: 0", *(f"Unnamed: {idx}" for idx in range(22, 72))]
)

In [13]:
# Display the trimmed frame to check that the padding rows and columns are gone.
data

Unnamed: 0,age,sex,Alb,PLT,WBC,CRP,APACHE II,SOFA,McCabe,PaO2/FiO2,...,CT score,PEEP,PIP,TV,DARDS = 1,days,death = 1,days.1,ventilator weaning = 1,VFD
0,79.0,M,2.3,10.8,4000.0,17.8,24.0,8.0,1.0,108.0,...,191.6,24.0,n.d.,n.d.,0.0,21.0,1.0,28.0,0.0,0.0
1,83.0,M,4.4,13.5,10200.0,8.9,16.0,6.0,1.0,78.0,...,213.3,5.0,10,360,0.0,21.0,1.0,28.0,0.0,0.0
2,70.0,M,2.7,10.8,5300.0,25.3,22.0,7.0,1.0,70.9,...,221.7,18.0,24,525,0.0,8.0,1.0,28.0,0.0,0.0
3,61.0,M,3.3,8.8,1800.0,22.2,26.0,7.0,1.0,59.2,...,211.6,10.0,24,480,0.0,11.0,1.0,28.0,0.0,0.0
4,81.0,M,3.1,26.2,10600.0,17.0,19.0,4.0,1.0,83.6,...,234.9,5.0,10,625,0.0,6.0,1.0,28.0,0.0,0.0
5,79.0,M,3.4,37.9,13200.0,21.0,18.0,5.0,1.0,71.5,...,236.7,5.0,25,460,0.0,12.0,1.0,28.0,0.0,0.0
6,83.0,M,2.8,18.0,1500.0,54.9,30.0,11.0,1.0,96.2,...,180.5,22.0,n.d.,n.d.,0.0,1.0,1.0,28.0,0.0,0.0
7,70.0,M,2.8,18.5,14100.0,18.4,15.0,5.0,1.0,78.2,...,311.6,10.0,25,n.d.,0.0,13.0,1.0,28.0,0.0,0.0
8,65.0,F,3.2,35.4,7200.0,10.4,18.0,3.0,1.0,194.0,...,231.6,8.0,23,440,0.0,28.0,0.0,10.0,1.0,18.0
9,72.0,M,2.9,26.8,19800.0,29.4,20.0,5.0,1.0,74.0,...,233.1,5.0,25,n.d.,0.0,22.0,1.0,16.0,1.0,12.0


Now we have the records we're supposed to have, but there are quite a few `n.d.` entries. These are strings sitting in otherwise numeric columns, so we will first coerce them into NaN and then figure out what to do with them from there.

In [15]:
import numpy as np

# The "n.d." placeholders are plain strings; swap them for real NaN so that
# pandas treats those cells as missing.
data.replace(to_replace="n.d.", value=np.nan, inplace=True)

so now let's see how many missing values we have in each column, this is probably important to figuring out how we would like to impute the values

In [18]:
# Per-column count of missing values (`isna` is the same check as `isnull`,
# and summing down the rows is the default axis).
data.isna().sum()

age                        0
sex                        0
Alb                        0
PLT                        0
WBC                        0
CRP                        0
APACHE II                  0
SOFA                       0
McCabe                     0
PaO2/FiO2                  0
LDH                        0
CT score                   0
PEEP                       0
PIP                       43
TV                        58
DARDS = 1                  0
days                       0
death = 1                  0
days.1                     0
ventilator weaning = 1     0
VFD                        0
dtype: int64

Also, even though our data is behaving nicely now, pandas is still punishing us for the inconsistently typed data on import: notice that both columns with missing values (`PIP` and `TV`) are `object` rather than `float64`. Let's fix that so we can use column-wise statistical methods.

In [21]:
# List each column's dtype; string-typed columns show up as `object`.
data.dtypes

age                       float64
sex                        object
Alb                       float64
PLT                       float64
WBC                       float64
CRP                       float64
APACHE II                 float64
SOFA                      float64
McCabe                    float64
PaO2/FiO2                 float64
LDH                       float64
CT score                  float64
PEEP                      float64
PIP                        object
TV                         object
DARDS = 1                 float64
days                      float64
death = 1                 float64
days.1                    float64
ventilator weaning = 1    float64
VFD                       float64
dtype: object

In [23]:
# PIP and TV are still typed `object` (they held "n.d." strings on import);
# parse both as numbers, then list the dtypes to confirm the conversion.
data = data.assign(
    PIP=pd.to_numeric(data["PIP"]),
    TV=pd.to_numeric(data["TV"]),
)
data.dtypes

age                       float64
sex                        object
Alb                       float64
PLT                       float64
WBC                       float64
CRP                       float64
APACHE II                 float64
SOFA                      float64
McCabe                    float64
PaO2/FiO2                 float64
LDH                       float64
CT score                  float64
PEEP                      float64
PIP                       float64
TV                        float64
DARDS = 1                 float64
days                      float64
death = 1                 float64
days.1                    float64
ventilator weaning = 1    float64
VFD                       float64
dtype: object

Now that that's taken care of, we can fill those NaN values with a summary statistic of our choice. Here we use each column's mean.

In [28]:
# Mean-impute the missing values: every NaN is replaced by the mean of its own
# column. `numeric_only=True` keeps the string column `sex` out of the mean
# computation; since pandas 2.0 a bare `data.mean()` no longer skips
# non-numeric columns silently and raises a TypeError instead.
# Reassigning (rather than `inplace=True`) keeps the cell safe to re-run.
data = data.fillna(data.mean(numeric_only=True))
# Re-count missing values per column; every count should now be zero.
data.isnull().sum(axis = 0)

age                       0
sex                       0
Alb                       0
PLT                       0
WBC                       0
CRP                       0
APACHE II                 0
SOFA                      0
McCabe                    0
PaO2/FiO2                 0
LDH                       0
CT score                  0
PEEP                      0
PIP                       0
TV                        0
DARDS = 1                 0
days                      0
death = 1                 0
days.1                    0
ventilator weaning = 1    0
VFD                       0
dtype: int64

just testing that the module works fine

In [11]:
from utils import intake_data

In [13]:
# Smoke-test the packaged loader: count missing values per column in what it returns.
# NOTE(review): the meaning of the argument `0` is defined in utils.intake_data,
# which is not visible here — confirm.
intake_data(0).isnull().sum(axis=0)

age                       0
sex                       0
Alb                       0
PLT                       0
WBC                       0
CRP                       0
APACHE II                 0
SOFA                      0
McCabe                    0
PaO2/FiO2                 0
LDH                       0
CT score                  0
PEEP                      0
PIP                       0
TV                        0
DARDS = 1                 0
days                      0
death = 1                 0
days.1                    0
ventilator weaning = 1    0
VFD                       0
dtype: int64