# Handling missing values
Many popular predictive models, such as support vector machines, logistic regression, and neural networks, cannot tolerate any missing values. Fitting such a model on data that contains NaN raises an error like the following:

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

Some algorithms can be made robust to missing data, such as k-Nearest Neighbors, which can ignore a column in the distance measure when a value is missing. Naive Bayes can also support missing values when making a prediction. Tree-based algorithms such as random forest and XGBoost can also handle missing values.

Sometimes we drop a row if too many of its columns are missing, or drop a column if a high percentage of its values is missing.
Sometimes we know that a missing value is equivalent to 0, in which case we use fillna(0).

Another way to handle missing values is through binning: put the missing values into a separate bin of their own.


In [2]:
import pandas as pd
import numpy as np

In [8]:
1 + np.nan


nan

In [12]:
# NaN propagates through the ordinary aggregations: one missing element
# makes sum, min and max all return nan
val = np.array([1, np.nan, 3, 4]) 
val.sum(), val.min(), val.max()


(nan, nan, nan)

In [13]:
# the nan-prefixed variants skip NaN entries instead of propagating them
np.nansum(val), np.nanmin(val), np.nanmax(val)

(8.0, 1.0, 4.0)

In [14]:
# small 3x3 frame: row 0 is missing column 1, row 2 is missing column 0
df = pd.DataFrame([[1,      np.nan, 2],
                   [2,      3,      5],
                   [np.nan, 4,      6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [15]:
# default behaviour: drop every row that contains at least one NaN
# (returns a new frame; df itself is not modified)
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [16]:
# drop every column that contains at least one NaN
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [17]:
# add a new column 3 in which every value is NaN
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [19]:
# drop a column only if all of its values are null
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [20]:
# keep a row only if it has at least 3 non-null values (thresh=3)
df.dropna(axis='rows', thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


In [3]:
# series with a single missing value at position 2
s = pd.Series([0, 1, np.nan, 3])
s

0    0.0
1    1.0
2    NaN
3    3.0
dtype: float64

In [4]:
# fill the gap by interpolating between the neighbouring values
# (the default method is linear, so the NaN between 1 and 3 becomes 2)
s.interpolate()

0    0.0
1    1.0
2    2.0
3    3.0
dtype: float64

## Using fillna

In [29]:
# manually impute missing values with pandas
from pandas import read_csv
from numpy import nan
# load the dataset (header=None: the file has no header row, so the
# columns are labelled with integers)
dataset = read_csv('pima-indians-diabetes.csv', header=None)
# mark zero values in columns 1-5 as missing (NaN)
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)
# to fill missing values with the column means, uncomment the next line;
# it is left disabled here so the per-column NaN counts printed below
# show how much data is missing:

#dataset.fillna(dataset.mean(), inplace=True)
# count the number of NaN values in each column
print(dataset.isnull().sum())

# the same mean fill applied to a single column of a DataFrame:
#df['col']=df['col'].fillna(df['col'].mean())


0      0
1      5
2     35
3    227
4    374
5     11
6      0
7      0
8      0
dtype: int64


## Using sklearn impute module

In [6]:

# example of imputing missing values using scikit-learn
from numpy import nan
from numpy import isnan
from pandas import read_csv
from sklearn.impute import SimpleImputer
# load the dataset
dataset = read_csv('pima-indians-diabetes.csv', header=None)
# mark zero values in columns 1-5 as missing (NaN)
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)
# retrieve the numpy array
values = dataset.values
# define the imputer; the available strategies are:
#   "mean": replace missing values using the mean along each column. Can only be used with numeric data.
#   "median": replace missing values using the median along each column. Can only be used with numeric data.
#   "most_frequent": replace missing values using the most frequent value along each column. Can be used with strings or numeric data. If there is more than one such value, only the smallest is returned.
#   "constant": replace missing values with fill_value. Can be used with strings or numeric data.
imputer = SimpleImputer(missing_values=nan, strategy='mean') 
# fit the imputer on the data and fill in the missing values in one step
transformed_values = imputer.fit_transform(values)
# count the total number of NaN values left in the whole array
print('Missing: %d' % isnan(transformed_values).sum())

Missing: 0


## Full example

In [7]:
# end-to-end example: mean imputation followed by an LDA classifier,
# evaluated with k-fold cross-validation
from numpy import nan
from pandas import read_csv
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# load the dataset
dataset = read_csv('pima-indians-diabetes.csv', header=None)
# mark zero values in columns 1-5 as missing (NaN)
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)
# split dataset into inputs (columns 0-7) and output (column 8)
values = dataset.values
X = values[:,0:8]
y = values[:,8]
# define the imputer
imputer = SimpleImputer(missing_values=nan, strategy='mean')
# define the model
lda = LinearDiscriminantAnalysis()
# define the modeling pipeline: impute first, then fit the model.
# Keeping the imputer inside the pipeline means cross_val_score refits it
# on each training split, so the column means are never computed from the
# held-out fold.
pipeline = Pipeline(steps=[('imputer', imputer),('model', lda)])
# define the cross validation procedure: 3 shuffled folds, with a fixed
# random_state so the split is reproducible
kfold = KFold(n_splits=3, shuffle=True, random_state=1)
# evaluate the model (one accuracy score per fold)
result = cross_val_score(pipeline, X, y, cv=kfold, scoring='accuracy')
# report the mean performance
print('Accuracy: %.3f' % result.mean())

Accuracy: 0.762
