# Title

In [3]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from numpy import cov

import sklearn as sk
from sklearn import neighbors
from sklearn import ensemble
from sklearn import svm
from scipy.stats import pearsonr

import matplotlib.pyplot as plt

import math

# Load the raw, unedited dataset from a CSV file in the working directory.
# It is kept under its own name so the cleaning steps below never overwrite it.
data_unedited = pd.read_csv("data_unedited.csv")

# Data Intake and Cleaning

First, let's take a look at the fresh-out-of-the-box data: 

In [8]:
# Show the raw frame exactly as loaded, before any cleaning.
data_unedited

Unnamed: 0.1,Unnamed: 0,age,sex,Alb,PLT,WBC,CRP,APACHE II,SOFA,McCabe,...,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71
0,1.0,79.0,M,2.3,10.8,4000.0,17.8,24.0,8.0,1.0,...,,,,,,,,,,
1,2.0,83.0,M,4.4,13.5,10200.0,8.9,16.0,6.0,1.0,...,,,,,,,,,,
2,3.0,70.0,M,2.7,10.8,5300.0,25.3,22.0,7.0,1.0,...,,,,,,,,,,
3,4.0,61.0,M,3.3,8.8,1800.0,22.2,26.0,7.0,1.0,...,,,,,,,,,,
4,5.0,81.0,M,3.1,26.2,10600.0,17.0,19.0,4.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,,,,,,,,,,,...,,,,,,,,,,
211,,,,,,,,,,,...,,,,,,,,,,
212,,,,,,,,,,,...,,,,,,,,,,
213,,,,,,,,,,,...,,,,,,,,,,


### Cleaning

It's a little messy. After the pertinent columns there are a bunch of dummy columns named `Unnamed: N` (for some number N), and there are also 19 dummy rows filled with NaNs. Let's start by getting rid of those dummy columns and rows. 

In [6]:
# Keep only the first 197 rows (everything after them is treated as filler),
# then drop the row-number-like "Unnamed: 0" column together with the
# placeholder columns "Unnamed: 22" through "Unnamed: 71" in a single pass.
placeholder_cols = ["Unnamed: 0"] + [f"Unnamed: {idx}" for idx in range(22, 72)]
data = data_unedited.iloc[:197].drop(columns=placeholder_cols)

In [9]:
# Inspect the frame after trimming the rows and dropping the placeholder columns.
data

Unnamed: 0,age,sex,Alb,PLT,WBC,CRP,APACHE II,SOFA,McCabe,PaO2/FiO2,...,CT score,PEEP,PIP,TV,DARDS = 1,days,death = 1,days.1,ventilator weaning = 1,VFD
0,79.0,M,2.3,10.8,4000.0,17.8,24.0,8.0,1.0,108.0,...,191.6,24.0,n.d.,n.d.,0.0,21.0,1.0,28.0,0.0,0.0
1,83.0,M,4.4,13.5,10200.0,8.9,16.0,6.0,1.0,78.0,...,213.3,5.0,10,360,0.0,21.0,1.0,28.0,0.0,0.0
2,70.0,M,2.7,10.8,5300.0,25.3,22.0,7.0,1.0,70.9,...,221.7,18.0,24,525,0.0,8.0,1.0,28.0,0.0,0.0
3,61.0,M,3.3,8.8,1800.0,22.2,26.0,7.0,1.0,59.2,...,211.6,10.0,24,480,0.0,11.0,1.0,28.0,0.0,0.0
4,81.0,M,3.1,26.2,10600.0,17.0,19.0,4.0,1.0,83.6,...,234.9,5.0,10,625,0.0,6.0,1.0,28.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,80.0,F,2.9,27.4,10600.0,13.0,15.0,3.0,1.0,160.0,...,349.7,8.0,9,450,1.0,28.0,0.0,12.0,1.0,16.0
193,63.0,F,2.7,10.3,10800.0,41.0,31.0,7.0,1.0,55.0,...,329.8,8.0,30,550,0.0,1.0,1.0,28.0,0.0,0.0
194,84.0,F,3.1,15.7,7500.0,13.4,16.0,7.0,1.0,121.0,...,343.0,10.0,8,320,1.0,28.0,0.0,7.0,1.0,21.0
195,69.0,F,2.3,12.7,15100.0,14.4,24.0,13.0,1.0,151.3,...,356.4,10.0,21,340,1.0,28.0,0.0,4.0,1.0,24.0


Great. Now we have to deal with the n.d. records and turn them into something we can work with - namely, NaNs. 

In [10]:
# Turn the "n.d." text placeholders into real missing values (NaN).
# `data` is rebound to the returned frame instead of using inplace=True, so the
# cell is safe to re-run and does not mutate a frame an earlier cell displayed.
data = data.replace("n.d.", np.nan)

In [11]:
# Re-display the frame to confirm the "n.d." entries now appear as NaN.
data

Unnamed: 0,age,sex,Alb,PLT,WBC,CRP,APACHE II,SOFA,McCabe,PaO2/FiO2,...,CT score,PEEP,PIP,TV,DARDS = 1,days,death = 1,days.1,ventilator weaning = 1,VFD
0,79.0,M,2.3,10.8,4000.0,17.8,24.0,8.0,1.0,108.0,...,191.6,24.0,,,0.0,21.0,1.0,28.0,0.0,0.0
1,83.0,M,4.4,13.5,10200.0,8.9,16.0,6.0,1.0,78.0,...,213.3,5.0,10,360,0.0,21.0,1.0,28.0,0.0,0.0
2,70.0,M,2.7,10.8,5300.0,25.3,22.0,7.0,1.0,70.9,...,221.7,18.0,24,525,0.0,8.0,1.0,28.0,0.0,0.0
3,61.0,M,3.3,8.8,1800.0,22.2,26.0,7.0,1.0,59.2,...,211.6,10.0,24,480,0.0,11.0,1.0,28.0,0.0,0.0
4,81.0,M,3.1,26.2,10600.0,17.0,19.0,4.0,1.0,83.6,...,234.9,5.0,10,625,0.0,6.0,1.0,28.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,80.0,F,2.9,27.4,10600.0,13.0,15.0,3.0,1.0,160.0,...,349.7,8.0,9,450,1.0,28.0,0.0,12.0,1.0,16.0
193,63.0,F,2.7,10.3,10800.0,41.0,31.0,7.0,1.0,55.0,...,329.8,8.0,30,550,0.0,1.0,1.0,28.0,0.0,0.0
194,84.0,F,3.1,15.7,7500.0,13.4,16.0,7.0,1.0,121.0,...,343.0,10.0,8,320,1.0,28.0,0.0,7.0,1.0,21.0
195,69.0,F,2.3,12.7,15100.0,14.4,24.0,13.0,1.0,151.3,...,356.4,10.0,21,340,1.0,28.0,0.0,4.0,1.0,24.0


### Imputation

Let's take a look at how many missing values each feature has so we know what we're working with. 

In [13]:
# Count the missing entries in each column (one total per feature).
data.isna().sum()

age                        0
sex                        0
Alb                        0
PLT                        0
WBC                        0
CRP                        0
APACHE II                  0
SOFA                       0
McCabe                     0
PaO2/FiO2                  0
LDH                        0
CT score                   0
PEEP                       0
PIP                       43
TV                        58
DARDS = 1                  0
days                       0
death = 1                  0
days.1                     0
ventilator weaning = 1     0
VFD                        0
dtype: int64

Just two columns, 'PIP' and 'TV', have missing values. Because these columns originally contained the text placeholder `n.d.`, pandas stores them with the `object` data type. Let's first convert them to a numeric type; then we can decide what to impute the missing values with by comparing the univariate characteristics before imputation against those after imputing with the column mean and with the column median. 

In [18]:
# Convert the two object-typed columns to numeric values, then list every
# column's dtype so the conversion can be checked at a glance.
for numeric_col in ("PIP", "TV"):
    data[numeric_col] = pd.to_numeric(data[numeric_col])
data.dtypes

age                       float64
sex                        object
Alb                       float64
PLT                       float64
WBC                       float64
CRP                       float64
APACHE II                 float64
SOFA                      float64
McCabe                    float64
PaO2/FiO2                 float64
LDH                       float64
CT score                  float64
PEEP                      float64
PIP                       float64
TV                        float64
DARDS = 1                 float64
days                      float64
death = 1                 float64
days.1                    float64
ventilator weaning = 1    float64
VFD                       float64
dtype: object

Here we compute the univariate characteristics of PIP and TV before imputation and collect them in `imput_compare`, which is then displayed as a dataframe. 

In [19]:
def univariate_summary(values):
    """Summarise one numeric Series for the imputation comparison table.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to summarise. NaNs are ignored by the pandas
        reductions used here and are never counted as outliers.

    Returns
    -------
    list of int
        Mean, median, variance, standard deviation, 1st quartile,
        3rd quartile, IQR and number of outliers, in that order. Every
        statistic is truncated to an integer for display only.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # Build the 1.5*IQR fences from the exact quartiles. Truncating the
    # quartiles first would shift the fences and could miscount outliers.
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    num_outliers = int(((values < lower_fence) | (values > upper_fence)).sum())

    return [
        math.trunc(values.mean()),
        math.trunc(values.median()),
        math.trunc(values.var()),
        math.trunc(values.std()),
        int(q1),
        int(q3),
        int(iqr),
        num_outliers,
    ]


# One row per characteristic: [label, PIP value, TV value].
# Kept as a plain list of rows; it is only turned into a dataframe for display.
imput_compare = [['mean'], ['median'], ['variance'], ['std dev'], ['1st quartile'], ['3rd quartile'],
                 ['IQR'], ['num. outliers']]

# Append each feature's pre-imputation statistics to the matching row.
for col in ['PIP', 'TV']:
    for row, value in zip(imput_compare, univariate_summary(data[col])):
        row.append(value)

# Display the table at the end of this initial step.
pd.DataFrame(imput_compare, columns = ["characteristic", 'PIP Pre_Imp', "TV Pre_Imp"])


Unnamed: 0,characteristic,PIP Pre_Imp,TV Pre_Imp
0,mean,25,428
1,median,22,420
2,variance,2412,11095
3,std dev,49,105
4,1st quartile,18,350
5,3rd quartile,25,492
6,IQR,7,142
7,num. outliers,3,1
