# Imputation

In [None]:
import pandas as pd
import numpy as np

In [None]:
import statsmodels
from statsmodels.imputation import mice

In [None]:
import random

In [None]:
# Seed Python's built-in `random` module so that the rows picked by
# `random.sample` inside `add_nulls` are the same on every run.
# NOTE(review): this does not seed NumPy's generator; if the imputation draws
# from NumPy's RNG the imputed values may still differ between runs — confirm.
random.seed(10)

## Create data frame

In [None]:
# Load the example dataset from a remote CSV; this cell needs network access.
df = pd.read_csv("http://goo.gl/19NKXV")

In [None]:
df.head()

In [None]:
# Keep an untouched copy of the complete data so that, in the validation
# section, the imputed column means can be compared with the true ones.
original = df.copy()

In [None]:
original.describe().loc['count',:]

**Add some missing values**

In [None]:
def add_nulls(df, n):
    """Return a copy of ``df`` in which ``n`` randomly chosen rows are NaN.

    The input frame is left untouched. Row positions are drawn without
    replacement with Python's ``random`` module, so the selection follows
    whatever ``random.seed`` was set to.
    """
    result = df.copy()
    row_count = result.shape[0]
    chosen_rows = random.sample(range(row_count), n)
    # Every column of each chosen row is blanked out.
    result.iloc[chosen_rows, :] = np.nan
    return result

In [None]:
# Blank out a random subset of rows in each listed column, one column at a
# time. The order is kept so the random draws are consumed in the same sequence.
null_counts = [
    ('Cholesterol', 20),
    ('Smoking', 20),
    ('Education', 20),
    ('Age', 5),
    ('BMI', 5),
]
for column, n_missing in null_counts:
    df[column] = add_nulls(df[[column]], n_missing)

Confirm the presence of null values

In [None]:
df.describe()

**Create categorical variables**

In [None]:
# Convert the discrete variables to pandas categoricals so that the
# dummy-encoding step below expands them into indicator columns.
categorical_columns = ('Gender', 'Smoking', 'Education')
for name in categorical_columns:
    df[name] = df[name].astype('category')

In [None]:
df.dtypes

**Create dummy variables**

In [None]:
# One-hot encode the categorical columns; numeric columns pass through as-is.
# NOTE(review): with the default dummy_na=False, a row whose category is NaN
# gets 0 in all of that variable's indicator columns, so the nulls injected
# into Smoking and Education are presumably no longer seen as missing by the
# imputer — confirm this is intended.
df = pd.get_dummies(df);

## Impute data
Replace the null values using a MICE (Multiple Imputation by Chained Equations) model.

**MICEData class**

In [None]:
# Wrap the frame in statsmodels' MICEData; the cells below use this object's
# `conditional_formula`, `perturb_params`, `impute` and `update_all` members.
imp = mice.MICEData(df)

**Imputation for one feature**

The `conditional_formula` attribute is a dictionary containing the model formula that will be used to impute the data for each column. An entry can be updated to change the imputation model for that column.

In [None]:
imp.conditional_formula['BMI']

In [None]:
before = imp.data.BMI.copy()

The `perturb_params` method must be called before the `impute` method. The `impute` method performs the imputation and updates the specified column in the `data` attribute.

In [None]:
imp.perturb_params('BMI')

In [None]:
imp.impute('BMI')

In [None]:
# NOTE(review): unlike `before`, this is not a copy — it refers to the column
# held by `imp.data`, so later imputation steps (e.g. `update_all`) may change
# what `after` shows. Confirm whether a snapshot (`.copy()`) is wanted.
after = imp.data.BMI

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Overlay BMI before and after imputation: filled red circles are the values
# before imputing, white-faced black circles the values afterwards.
plt.clf()
fig, ax = plt.subplots(1, 1)
series_styles = [
    (before, 'or', dict(label='before', alpha=1, ms=8)),
    (after, 'ok', dict(label='after', alpha=0.8, mfc='w', ms=8)),
]
for series, fmt, style in series_styles:
    ax.plot(series, fmt, **style)
plt.legend();

In [None]:
# Side-by-side summary statistics of BMI before and after imputation.
summary = {'before': before.describe(), 'after': after.describe()}
pd.DataFrame(summary)

In [None]:
before[before != after]

In [None]:
after[before != after]

### Impute all

In [None]:
# Run two imputation cycles over the whole data set.
# NOTE(review): presumably each cycle re-imputes every variable that has
# missing values — confirm against the statsmodels documentation.
imp.update_all(2)

In [None]:
imp.plot_fit_obs('BMI');

In [None]:
imp.plot_fit_obs('Age');

### Validation

In [None]:
original.mean()

In [None]:
# Compare each column's mean in the complete data with its mean after
# imputation and print the relative error as a percentage.
original_means = original.mean()  # computed once rather than on every pass
for col, x in original_means.items():
    if col not in imp.data.columns:
        # Columns that were dummy-encoded no longer exist under their
        # original name in the imputed data, so there is nothing to compare.
        continue
    y = imp.data[col].mean()
    # Divide by the magnitude so a negative mean cannot give a negative error.
    e = abs(x - y) / abs(x)
    # x is the true mean from the complete data, y the mean after imputation.
    print("{:<12}  original={:>8.2f}, imputed={:>8.2f}, error={:>5.2g}%".format(col, x, y, e * 100))

## MICE
This makes it possible to fit a model to data that contains missing values.