# Individual Coding Exercise (ICE) 5 for HUDK4051: Learning Analytics

At the end of this ICE 5, the following topics will be covered:

- simulate MCAR data or MAR data
- implement common missing data imputation methods

In [1]:
#importing necessary packages

import numpy as np
import pandas as pd
from sklearn import datasets

In [2]:
#IRIS is included in sci-kit learn.

irisRaw = datasets.load_iris()

#convert it to a pandas dataframe for better visualization
iris = pd.DataFrame(data=irisRaw.data, columns=irisRaw.feature_names)
iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [3]:
#using MCAR as it's the easiest -- we can simply sample random proportion of the data and replace them with NaN

iris_MCAR = iris.copy()

In [4]:
#set the missingness rate at 30%
missing = 0.3

#only introduce missing rate to the 'sepal length (cm)'
iris_MCAR.loc[iris_MCAR.sample(frac=missing).index, 'sepal length (cm)'] = np.nan

iris_MCAR

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),sepal length(cm)
0,5.1,3.5,1.4,0.2,
1,4.9,3.0,1.4,0.2,
2,4.7,3.2,1.3,0.2,
3,4.6,3.1,1.5,0.2,
4,5.0,3.6,1.4,0.2,
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,
146,6.3,2.5,5.0,1.9,
147,6.5,3.0,5.2,2.0,
148,6.2,3.4,5.4,2.3,


In [17]:
#alternatively, iterate through all columns and introduce 20% missing to all variables
for col in iris_MCAR.columns:
    iris_MCAR.loc[iris_MCAR.sample(frac=missing).index, col] = np.nan

In [6]:
iris_MAR = iris.copy()

In [7]:
#set the missingness rate at 30%
missing = 0.3

#only introduce missing rate to the 'sepal length (cm)'
iris_MAR.loc[iris_MAR.sample(frac=missing, weights = 'sepal width (cm)').index, 'sepal length (cm)'] = np.nan

iris_MAR

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,,3.0,5.2,2.3
146,,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,,3.4,5.4,2.3


## Deletion Methods

Two different types of deletion methods:
- Listwise deletion
- Pairwise deletion

### Listwise deletion

It produces unbiased estimation only when the data is MCAR. However, the reduced sample size leads to reduced power. In python, we can use dropna() to perform a listwise deletion. You can see here, as we introduced 30% of missing into the dataset, the sample size is now 105.

In [9]:
iris_MCAR_listwise = iris_MCAR.copy()
iris_MCAR_listwise.dropna()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),sepal length(cm)


### Pairwise deletion

Alternatively, pairwise deletion skips the null observation when it is needed. Therefore, to avoid the data loss issue in listwise deletion to a certain degree, all methods in pandas such as mean, median, sum, etc. uses pairwise deletion by default. You can see that pairwise deletion performs a bit better on MCAR than MAR data (closer to the true estimation).

In [10]:
iris_MCAR_pairwise = iris_MCAR.copy()
iris_MAR_pairwise = iris_MAR.copy()

print("Mean estimation in MCAR data with pairwise deletion",
      iris_MCAR_pairwise['sepal length (cm)'].mean(), 
     "\nSD estimation in MCAR data with pairwise deletion",
      iris_MCAR_pairwise['sepal length (cm)'].std()
     )
print("Mean estimation in MAR data with pairwise deletion",
      iris_MAR_pairwise['sepal length (cm)'].mean(), 
     "\nSD estimation in MAR data with pairwise deletion",
      iris_MAR_pairwise['sepal length (cm)'].std()
     )
print("Mean estimation in the original data",
      iris['sepal length (cm)'].mean(),
      "\nSD estimation in the original data",
      iris['sepal length (cm)'].std()
     )

Mean estimation in MCAR data with pairwise deletion 5.845714285714285 
SD estimation in MCAR data with pairwise deletion 0.8659175404235595
Mean estimation in MAR data with pairwise deletion 5.8066666666666675 
SD estimation in MAR data with pairwise deletion 0.8170381154213566
Mean estimation in the original data 5.843333333333334 
SD estimation in the original data 0.828066127977863


### Single Imputation
As opposed to deletion, we can impute (or in plain English -- "guess") the missing value with various imputation methods. The simplest ones include mean/median/model imputation. For MCAR data, simple imputation produces unbiased estimation of marginal means, but underestimates variances, and overestimate correlations.

In [11]:
from sklearn.impute import SimpleImputer
iris_MAR_mean = iris_MAR.copy()

In [12]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(iris_MAR)
print(imp.transform(iris_MAR))

[[5.1        3.5        1.4        0.2       ]
 [5.80666667 3.         1.4        0.2       ]
 [4.7        3.2        1.3        0.2       ]
 [5.80666667 3.1        1.5        0.2       ]
 [5.         3.6        1.4        0.2       ]
 [5.4        3.9        1.7        0.4       ]
 [5.80666667 3.4        1.4        0.3       ]
 [5.         3.4        1.5        0.2       ]
 [4.4        2.9        1.4        0.2       ]
 [5.80666667 3.1        1.5        0.1       ]
 [5.4        3.7        1.5        0.2       ]
 [4.8        3.4        1.6        0.2       ]
 [4.8        3.         1.4        0.1       ]
 [4.3        3.         1.1        0.1       ]
 [5.8        4.         1.2        0.2       ]
 [5.7        4.4        1.5        0.4       ]
 [5.4        3.9        1.3        0.4       ]
 [5.1        3.5        1.4        0.3       ]
 [5.7        3.8        1.7        0.3       ]
 [5.1        3.8        1.5        0.3       ]
 [5.4        3.4        1.7        0.2       ]
 [5.1        

### Regression Imputation/Conditional Mean Imputation

A more common way to impute the missing value based on regression/anova models. In this case, you are modeling the dependency through the observed covariate(s). This can work fairly well if you can identify the correct covariate(s) that are related to the missingness.

In [13]:
from sklearn import linear_model
iris_MAR_regression = iris_MAR.copy()

In [14]:
iris_MAR_regression_model = iris_MAR_regression.copy().dropna()

model = linear_model.LinearRegression()
Xs = iris_MAR_regression_model['sepal width (cm)'].values.reshape(-1, 1)
ys = iris_MAR_regression_model['sepal length (cm)'].values.reshape(-1, 1)
model.fit(X = Xs, y = ys)

null_index = iris_MAR_regression['sepal length (cm)'].isnull()

na_result = model.predict(iris_MAR_regression[null_index]['sepal width (cm)'].values.reshape(-1, 1))


iris_MAR_regression.loc[iris_MAR_regression['sepal length (cm)'].isnull(), 'sepal length (cm)'] = na_result.reshape(len(na_result),)

iris_MAR_regression

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.100000,3.5,1.4,0.2
1,5.842092,3.0,1.4,0.2
2,4.700000,3.2,1.3,0.2
3,5.807329,3.1,1.5,0.2
4,5.000000,3.6,1.4,0.2
...,...,...,...,...
145,5.842092,3.0,5.2,2.3
146,6.015909,2.5,5.0,1.9
147,6.500000,3.0,5.2,2.0
148,5.703039,3.4,5.4,2.3


### Nearest neighbours or hot-deck imputation

Similarly, you can use other prediction methods to perform the imputation. One a commonly used method is KNN, as a type of hot-deck imputation. Here is the sci-kit learn implementation. By default, it draws from all the variables.

In [15]:
from sklearn.impute import KNNImputer
iris_MAR_knn = iris_MAR.copy()

In [16]:
knn_imputer = KNNImputer(n_neighbors=2, weights="uniform")

knn_imputer.fit_transform(iris_MAR_knn)

array([[5.1 , 3.5 , 1.4 , 0.2 ],
       [4.6 , 3.  , 1.4 , 0.2 ],
       [4.7 , 3.2 , 1.3 , 0.2 ],
       [4.95, 3.1 , 1.5 , 0.2 ],
       [5.  , 3.6 , 1.4 , 0.2 ],
       [5.4 , 3.9 , 1.7 , 0.4 ],
       [5.15, 3.4 , 1.4 , 0.3 ],
       [5.  , 3.4 , 1.5 , 0.2 ],
       [4.4 , 2.9 , 1.4 , 0.2 ],
       [4.85, 3.1 , 1.5 , 0.1 ],
       [5.4 , 3.7 , 1.5 , 0.2 ],
       [4.8 , 3.4 , 1.6 , 0.2 ],
       [4.8 , 3.  , 1.4 , 0.1 ],
       [4.3 , 3.  , 1.1 , 0.1 ],
       [5.8 , 4.  , 1.2 , 0.2 ],
       [5.7 , 4.4 , 1.5 , 0.4 ],
       [5.4 , 3.9 , 1.3 , 0.4 ],
       [5.1 , 3.5 , 1.4 , 0.3 ],
       [5.7 , 3.8 , 1.7 , 0.3 ],
       [5.1 , 3.8 , 1.5 , 0.3 ],
       [5.4 , 3.4 , 1.7 , 0.2 ],
       [5.1 , 3.7 , 1.5 , 0.4 ],
       [4.6 , 3.6 , 1.  , 0.2 ],
       [5.1 , 3.3 , 1.7 , 0.5 ],
       [5.1 , 3.4 , 1.9 , 0.2 ],
       [5.  , 3.  , 1.6 , 0.2 ],
       [5.  , 3.4 , 1.6 , 0.4 ],
       [5.2 , 3.5 , 1.5 , 0.2 ],
       [5.2 , 3.4 , 1.4 , 0.2 ],
       [4.85, 3.2 , 1.6 , 0.2 ],
       [4.