<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Missing Data Interactive Plot

_Author: Matt Brems_

---

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# To allow interactive plot.
import ipywidgets
from IPython.display import display

In [2]:
# Simulate 100 people: age is uniform on [20, 60] and income is a noisy
# linear function of age, floored at zero. Results are kept in `df`.

np.random.seed(42)

# Draw order matters for reproducibility: ages first, then the income noise.
age = np.random.uniform(20, 60, size = 100)
income = 15000 + 750 * age + np.random.normal(0, 20000, size = 100)

# Replace anything that is not >= 0 with 0 (incomes cannot be negative).
income = list(np.where(income >= 0, income, 0))

df = pd.DataFrame(dict(income = income, age = age))

In [3]:
# Create function to model missingness according to certain patterns.

def _inverse_value_probabilities(values):
    """
    Convert a column of values into sampling probabilities proportional to 1 / value.

    Smaller positive values get larger probabilities. Values that are zero or
    negative cannot be inverted into a valid weight, so they are given the
    largest weight found among the positive values (i.e. they are treated as
    the "smallest" values). NaN entries get probability 0. If the column has
    no positive values at all, every zero/negative entry is weighted equally.

    Raises:
        ValueError if no entry ends up with a usable (finite, positive) weight.
    """
    values = np.asarray(values, dtype = float)
    weights = np.zeros(values.shape, dtype = float)

    positive = values > 0
    non_positive = values <= 0    # NaN is neither positive nor non-positive, so it keeps weight 0.

    if positive.any():
        weights[positive] = 1.0 / values[positive]
        weights[non_positive] = weights[positive].max()
    else:
        weights[non_positive] = 1.0

    total = weights.sum()
    if not np.isfinite(total) or total <= 0:
        raise ValueError('Cannot build sampling weights: the column has no usable values.')

    return weights / total


def create_missing(missing_type, df, missing_column = 'income', depends_on = 'age', p_missing = 0.01, random_state = 42):
    """
    Creates a missingness indicator column, where data are MCAR (missing completely at random), MAR (missing at random), or NMAR (not missing at random).

    User must specify:
        missing_type = the type of missingness: 'mcar' or 'mar'; any other value (e.g. 'nmar') is treated as NMAR
        df = the pandas DataFrame the user wants to read in for analysis
        missing_column = the name of the column in df that is missing (its values drive the weights for NMAR)
        depends_on = the name of the column in df which affects the missingness (its values drive the weights for MAR)
        p_missing = the proportion of observations that are missing; round(p_missing * number of rows) rows are flagged
        random_state = seed for the private random number generator used to pick the rows

    Function returns:
        a list with one entry per row of df, in row order: 1 if the row is flagged as missing, 0 otherwise

    Raises:
        ValueError if p_missing asks for more rows than df contains.

    Notes:
        - MCAR picks rows uniformly; MAR and NMAR pick rows with probability proportional to 1 / value,
          so rows with small values are more likely to be flagged.
        - Rows are selected by position, so the result does not depend on df's index labels.
        - A private RandomState is used, so NumPy's global random state is left untouched.
    """
    n_rows = df.shape[0]

    # A negative proportion flags nothing.
    n_missing = max(int(round(p_missing * n_rows)), 0)

    if n_missing > n_rows:
        raise ValueError(f'p_missing = {p_missing} asks for {n_missing} missing rows, but df only has {n_rows} rows.')

    flags = np.zeros(n_rows, dtype = int)

    # Nothing to sample (also covers an empty DataFrame).
    if n_missing == 0:
        return flags.tolist()

    rng = np.random.RandomState(random_state)

    # MCAR: every row is equally likely.
    if missing_type == 'mcar':
        probabilities = None

    # MAR: missingness is driven by the observed `depends_on` column.
    elif missing_type == 'mar':
        probabilities = _inverse_value_probabilities(df[depends_on])

    # NMAR: missingness is driven by the values of the missing column itself.
    else:
        probabilities = _inverse_value_probabilities(df[missing_column])

    # Sample distinct row positions in one call instead of drawing rows one at a time.
    positions = rng.choice(n_rows, size = n_missing, replace = False, p = probabilities)
    flags[positions] = 1

    return flags.tolist()

In [4]:
def generate_scatterplot(p_missing, missing_type, missing_column = 'income', depends_on = 'age'):
    """
    Draw the age/income scatterplot with a proportion of the points flagged as missing.

    Observed points are drawn in blue and missing points in faint grey. The plot
    shows the line used to simulate the data ('"True" Line') next to a least-squares
    line fitted to the observed points only ('Observed Line').

    Arguments:
        p_missing = the proportion of observations to flag as missing
        missing_type = the type of missingness, passed through to create_missing
        missing_column = passed through to create_missing
        depends_on = passed through to create_missing

    Side effects:
        Reads the notebook-level DataFrame `df` (columns 'age' and 'income') and
        overwrites its 'missingness' column on every call, then shows the figure.
    """
    # Generate one plot.
    fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (16,9))

    # Set labels and axes.
    ax.set_xlabel("Age", position = (0,0), ha = 'left', fontsize = 25, color = 'grey', alpha = 0.85)
    ax.set_ylabel("Income", position = (0,1), ha = 'right', va = 'top', fontsize = 25, rotation = 0, color = 'grey', alpha = 0.85)

    ax.set_ylim([-1000, 100000])

    # Generate data with proportion p missing.
    df['missingness'] = create_missing(missing_type, df, missing_column = missing_column, depends_on = depends_on, p_missing = p_missing)

    # Split the rows once instead of rebuilding the same masks for every call below.
    observed = df[df['missingness'] == 0]
    missing = df[df['missingness'] == 1]

    # Generate scatterplot. The empty label keeps the missing points out of the legend.
    ax.scatter(observed['age'], observed['income'], s = 35, color = '#185fad', alpha = 0.75, label = 'Observed')
    ax.scatter(missing['age'], missing['income'], s = 35, color = 'grey', alpha = 0.25, label = '')

    # Generate lines of best fit based on observed and missing values.
    x = np.linspace(20, 60)
    ax.plot(x, 15000 + 750 * x, c = 'orange', alpha = 0.7, label = '"True" Line', lw = 3)

    # A regression cannot be fitted to zero rows, so skip the observed line
    # when every observation has been flagged as missing.
    if len(observed) > 0:
        model = LinearRegression().fit(observed[['age']], observed['income'])
        ax.plot(x, model.intercept_ + model.coef_[0] * x, c = '#185fad', alpha = 0.7, label='Observed Line', lw = 3)

    # Generate title and legend.
    ax.set_title(f'Type of Missing Data: {missing_type} \nProportion Missing: {p_missing}', position = (0,1), ha = 'left', fontsize = 25)
    ax.legend(prop={'size': 20}, loc = 2)

    # Hide tick marks: only the shape of the relationship matters here.
    ax.set_xticks([])
    ax.set_yticks([])
    plt.show()

In [5]:
def plot_interact(p_missing = 0, missing_type = 'mcar'):
    """
    Widget callback: redraw the scatterplot for the selected proportion of
    missing data and the selected type of missingness.
    """
    generate_scatterplot(
        p_missing = p_missing,
        missing_type = missing_type,
        missing_column = 'income',
        depends_on = 'age',
    )

# Slider for the proportion missing, dropdown for the type of missingness.
ipywidgets.interact(
    plot_interact,
    p_missing = (0, 0.99, 0.05),
    missing_type = ['mcar', 'mar', 'nmar'],
);

interactive(children=(FloatSlider(value=0.0, description='p_missing', max=0.99, step=0.05), Dropdown(descripti…