# Imputation

Here we want to fill in, or impute, the missing values of the incomplete dataset.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from itertools import combinations

# Notebook-wide pandas settings.
pd.options.display.max_columns = 100  # show up to 100 columns instead of truncating wide frames
pd.set_option('mode.chained_assignment', None)  # silence false-positive chained-assignment warnings

## Read data

In [None]:
# Load the raw table; the first line of the CSV supplies the column names.
data = pd.read_csv(filepath_or_buffer='ElectionsData.csv', header=0)

## Attribute Types

In [None]:
# Collect every object-dtype column (presumably the string-valued attributes --
# confirm against the CSV) and convert each one to pandas' categorical dtype.
# The comparison uses the builtin `object`: the `np.object` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
obj_attr = [col for col in data if data[col].dtype == object]
for attr in obj_attr:
    data[attr] = data[attr].astype('category')

In [None]:
# Print pandas' summary of the frame (per-column dtype and non-null count),
# e.g. to inspect the dtypes after the categorical conversion.
data.info()

## What we're working with

How many values are missing for each instance?

In [None]:
def get_nan_per_row_counter():
    """Tally the rows of the global ``data`` frame by number of missing values.

    Returns:
        A ``Counter`` mapping a missing-value count to the number of rows
        that have exactly that many null cells.
    """
    null_mask = data.isnull()
    missing_per_row = null_mask.sum(axis=1)
    return Counter(missing_per_row.tolist())

def plot_pie_nan_per_row():
    """Show a pie chart of rows grouped by their number of missing values.

    Each slice is labelled with the missing-value count it represents and
    annotated with the number of rows in that group and its percentage.
    The counts come from ``get_nan_per_row_counter()``.
    """
    counter = get_nan_per_row_counter()
    # most_common() yields (missing_count, row_count) pairs, largest group first.
    labels, histogram = zip(*counter.most_common())
    total = sum(histogram)

    def format_slice(pct):
        # Matplotlib hands autopct the slice's percentage; scale it back to
        # the absolute row count so both figures appear on the chart.
        return '{:.0f}  ({:.2f}%)'.format(pct * total / 100, pct)

    # A named function replaces the original `lambda(p): ...`, whose
    # parenthesised parameter is a SyntaxError on Python 3.
    _, ax1 = plt.subplots()
    ax1.pie(histogram, labels=labels,
            colors=['yellowgreen', 'gold', 'lightskyblue', 'lightcoral'],
            explode=[0.1] * len(histogram),
            autopct=format_slice)
    ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.show()

# Render the missing-values-per-row pie chart for the loaded dataset.
plot_pie_nan_per_row()

Naively, we could simply remove the instances with missing data. This, however, would cost us about 20% of our dataset.

Instead, we will impute the missing data.