In [1]:
import pandas as pd
import numpy as np

In [28]:
# Remove all rows with NaN
df = pd.read_csv('Building_Permits.csv', low_memory = False)
df.dropna(inplace = True)

In [29]:
# Column specific
df['Proposed Construction Type'].dropna(inplace = True)

In [30]:
# Impute with Mean or Median
# Note:
# Does not factor correlation between features
# Does not work with categorical features
df = pd.read_csv('Building_Permits.csv', low_memory = False)
df.fillna(df.mean(numeric_only = True), inplace = True)

In [31]:
# Column specific
df['Proposed Construction Type'].fillna((df['Proposed Construction Type'].mean()), inplace = True)

In [33]:
# Replace df.mean with df.median for median

In [37]:
# Impute with Most-Frequent value
# Note:
# Works with categorical features
# Does not factor correlation between features
# Will introduce bias
df = pd.read_csv('Building_Permits.csv', low_memory = False)
df.fillna(df.mode(numeric_only = True).iloc[0], inplace = True)

In [60]:
# Column specific
df['Proposed Construction Type'].fillna(df['Proposed Construction Type'].value_counts().index[0], inplace = True)

In [2]:
# Impute with K-NN
# Note:
# More accurate than mean/median/mode
# Computationally expensive in time and space
# Sensitive to outliers
from sklearn.impute import KNNImputer

df = pd.read_csv('Building_Permits.csv', low_memory = False)
df = df[:10000] # 100,000 seems to be too much
# df[df.select_dtypes([np.number]).columns] # All numeric columns
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean', copy=True, add_indicator=False)
imputed_df = imputer.fit_transform(df[df.select_dtypes([np.number]).columns])
df[df.select_dtypes([np.number]).columns] = imputed_df

In [2]:
# Multi-Imputation with Chained-Equation
# Note:
# Longer than KNN
# Handles uncertainty better
from impyute.imputation.cs import mice

df = pd.read_csv('Building_Permits.csv', low_memory = False)
df = df[:1000] # 10,000 seems to be too much
imputed_df = mice(df[df.select_dtypes([np.number]).columns])
df[df.select_dtypes([np.number]).columns] = imputed_df

  args[0] = args[0].as_matrix()
