In [1]:
# to make the .py script runnable
#!/usr/bin/env python

In [2]:
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('ggplot')

In [3]:
import os

# 2.5 Missing data (done)

## 2.5.1 Detect Missing Values

- Missing values appear as `NaN`. The functions _isnull_ and _notnull_ are used to detect missing values.
- Both return boolean masks that can be used for subsetting.

In [4]:
# Airport-coded Series with two missing entries; the None values are stored
# as NaN, which is why the resulting dtype is float64.
cities = pd.Series(
    [18, None, 5, None, 13],
    index=['DEL', 'BOM', 'BLR', 'DXB', 'BKK'],
)
cities

DEL    18.0
BOM     NaN
BLR     5.0
DXB     NaN
BKK    13.0
dtype: float64

In [5]:
# Underlying data as a NumPy array; to_numpy() is the accessor pandas
# recommends in place of the .values attribute.
cities.to_numpy()

array([18., nan,  5., nan, 13.])

In [None]:
# IPython-only syntax: a trailing `?` displays the object's docstring.
# These lines are not valid plain Python, so they only work inside a notebook/IPython session.
pd.Series.isnull?
pd.Series.notnull?

In [6]:
# Boolean Series with the same index as `cities`: True where the value is missing.
cities.isnull()

DEL    False
BOM     True
BLR    False
DXB     True
BKK    False
dtype: bool

In [7]:
# Use the boolean mask to keep only the entries whose value is missing.
missing_mask = cities.isnull()
cities[missing_mask]

BOM   NaN
DXB   NaN
dtype: float64

In [8]:
# Pair every value with its "is missing" flag to see the mask side by side with the data.
[(value, is_missing) for value, is_missing in zip(cities, cities.isnull())]

[(18.0, False), (nan, True), (5.0, False), (nan, True), (13.0, False)]

In [9]:
# Pair every value with its "is present" flag; notnull() is the inverse of isnull().
[(value, is_present) for value, is_present in zip(cities, cities.notnull())]

[(18.0, True), (nan, False), (5.0, True), (nan, False), (13.0, True)]

In [10]:
# Same selection as cities[cities.isnull()], written with the explicit label-based indexer.
cities.loc[cities.isnull()]

BOM   NaN
DXB   NaN
dtype: float64

In [11]:
# Keep only the entries that have a value (the complement of the isnull() selection).
cities.loc[cities.notnull()]

DEL    18.0
BLR     5.0
BKK    13.0
dtype: float64

In [12]:
# Build a Series from a dict: the keys become the index labels, and the mix of
# int and float values is stored as float64.
my_series_2 = pd.Series({'c': 1, 'd': 0.14, 'e':10, 'f': 2, 'g':-0.5})
# A bare last expression is displayed by the notebook itself; print() is not needed.
my_series_2

c     1.00
d     0.14
e    10.00
f     2.00
g    -0.50
dtype: float64


In [13]:
# Conform the Series to a new set of labels. 'a' and 'z' do not exist in the
# original index, so they come back as NaN; 'c' and 'g' are not requested and
# are dropped. reindex is used instead of .loc because the list contains
# labels that are not present in the index.
index2 = ['a', 'd', 'e', 'f', 'z']
my_series_2 = my_series_2.reindex(index=index2)
my_series_2

a      NaN
d     0.14
e    10.00
f     2.00
z      NaN
dtype: float64

In [14]:
# Entries that survived the reindex with a real value.
my_series_2.loc[my_series_2.notnull()]

d     0.14
e    10.00
f     2.00
dtype: float64

In [15]:
# Entries introduced by the reindex that have no value (NaN).
my_series_2.loc[my_series_2.isnull()]

a   NaN
z   NaN
dtype: float64

## 2.5.2 Strategies for dealing with missing data

1. Drop them if there aren't too many missing values.
2. Impute with 0s (for quantities like amounts), with the mean (for a symmetric distribution), or with the median (for an asymmetric distribution).
3. Impute by cluster mean/median.
4. Use kNN/regression where the variable with missing values is the dependent variable (DV) and the others are the independent variables (IVs), then predict the missing data. More on regression later.

---

### Pay close attention to the consequences of replacing missing data. Why is it missing? Is it random? 

---

In [16]:
# Starting point for the fill examples: the reindexed Series, NaN at 'a' and 'z'.
my_series_2

a      NaN
d     0.14
e    10.00
f     2.00
z      NaN
dtype: float64

In [17]:
# Replace every NaN with the sentinel -999. The result is only displayed here,
# not assigned to any name.
my_series_2.fillna(-999)

a   -999.00
d      0.14
e     10.00
f      2.00
z   -999.00
dtype: float64

In [18]:
# Rebind the name to the filled result so the sentinel values are kept
# in `my_series_2` from this cell onward.
my_series_2 = my_series_2.fillna(-999)
my_series_2

a   -999.00
d      0.14
e     10.00
f      2.00
z   -999.00
dtype: float64

In [19]:
# Back to the `cities` Series, which still has NaN at 'BOM' and 'DXB'.
cities

DEL    18.0
BOM     NaN
BLR     5.0
DXB     NaN
BKK    13.0
dtype: float64

In [20]:
# Forward fill: each missing value takes the previous non-missing value.
# Series.ffill() is used because the `method` keyword of fillna() is deprecated.
cities.ffill()

DEL    18.0
BOM    18.0
BLR     5.0
DXB     5.0
BKK    13.0
dtype: float64

In [21]:
# The forward fill above was not assigned, so `cities` still shows its NaN entries.
cities

DEL    18.0
BOM     NaN
BLR     5.0
DXB     NaN
BKK    13.0
dtype: float64

In [22]:
# Fill the gaps with the median of the non-missing values (18, 5, 13 -> 13).
cities.fillna(cities.median())

DEL    18.0
BOM    13.0
BLR     5.0
DXB    13.0
BKK    13.0
dtype: float64

In [23]:
# Fill the gaps with the mean of the non-missing values ((18 + 5 + 13) / 3 = 12).
cities.fillna(cities.mean())

DEL    18.0
BOM    12.0
BLR     5.0
DXB    12.0
BKK    13.0
dtype: float64

## 2.5.3 Difference between None and NaN

<big><br>

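- `NaN` ("Not a Number") is a special floating-point value (`type(np.nan)` is `float`); pandas uses it as the missing-value marker in numeric data
- `None` is Python's null object (`NoneType`); pandas also treats it as missing, and stores it as `NaN` in a numeric Series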

In [24]:
# NaN is an ordinary Python float, not a separate type.
type(np.nan)

float

In [25]:
bool(np.nan)
# np.nan is truthy: bool() of a float is False only for zero, and NaN is not zero.

True

In [26]:
# None has its own dedicated type, unlike NaN which is a float.
type(None)

NoneType

In [27]:
# None is falsy, in contrast to np.nan which is truthy.
bool(None)

False

In [28]:
# With mixed values (None, int, NaN, str) the Series gets dtype object, and
# None and NaN are each stored as given instead of being unified into NaN.
pd.Series({'a': None, 'c': 101, 'b': np.nan, 'd': 'red'})

a    None
c     101
b     NaN
d     red
dtype: object

In [29]:
# Series methods do not discriminate between None and NaN:
# isnull() flags both 'a' (None) and 'b' (NaN) as missing.
pd.Series({'a': None, 'c': 101, 'b': np.nan, 'd': 'red'}).isnull()

a     True
c    False
b     True
d    False
dtype: bool