# Agenda

1. Dealing with `NaN`
2. `NaN` vs. `NA` and nullable types
3. Interpolation
4. Dealing with bad values

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
# One missing value (NaN) among integers: NaN is a float, so the
# whole series is stored as float64 (see the dtype in the output).
s = Series(data=[10, 20, np.nan, 40, 50])
s

0    10.0
1    20.0
2     NaN
3    40.0
4    50.0
dtype: float64

In [5]:
# An int64 series cannot hold NaN, so this cast is expected to fail.
# The error is the point of the cell, so catch it and print it rather than
# letting it stop a "Restart & Run All" of the notebook.
try:
    s.astype(np.int64)
except ValueError as e:  # pandas raises IntCastingNaNError, a ValueError subclass
    print(f'{type(e).__name__}: {e}')

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

# How do we deal with `NaN`?

1. We can replace it, using `fillna` on a series
2. We can remove it, using `dropna` on a series
3. We can replace it, on a data frame (using `fillna`)
4. We can remove it, on a data frame (using `dropna`)

In [6]:
# show the series again: the NaN at index 2 is still there
s

0    10.0
1    20.0
2     NaN
3    40.0
4    50.0
dtype: float64

In [7]:
# fillna returns a new series with every NaN replaced by the given value;
# s itself is left unchanged
s.fillna(999)

0     10.0
1     20.0
2    999.0
3     40.0
4     50.0
dtype: float64

In [8]:
# more commonly, we replace NaN with something calculated from the data,
# such as the mean (mean() ignores the NaN, so it is 30.0 here)

s.fillna(s.mean())

0    10.0
1    20.0
2    30.0
3    40.0
4    50.0
dtype: float64

In [9]:
# the median can be used the same way; for this data it is also 30.0
s.fillna(s.median())

0    10.0
1    20.0
2    30.0
3    40.0
4    50.0
dtype: float64

In [10]:
# but what if there is nothing obvious that you can (or want to) put in place of the NaN values?
# in such cases, we can remove them instead

s.dropna()    # returns a new series without the NaN rows; inplace=True would modify s itself instead (not recommended)

0    10.0
1    20.0
3    40.0
4    50.0
dtype: float64

In [11]:
# inplace=True does two things:
# (1) it modifies the series/data frame itself
# (2) it returns None

t = s                   # t is a second name for the same series object, not a copy
s.dropna(inplace=True)  # returns None, and every other name that refers to this series (such as t) sees the change as well

In [12]:
# s itself was modified: the NaN row (index 2) is gone
s

0    10.0
1    20.0
3    40.0
4    50.0
dtype: float64

In [13]:
# t refers to the same object as s, so the NaN row is gone here too
t

0    10.0
1    20.0
3    40.0
4    50.0
dtype: float64

In [14]:
# Plain-Python parallel to inplace=True: list.remove() changes the
# list itself and returns None.
mylist = list(range(10, 60, 10))   # [10, 20, 30, 40, 50]
mylist.remove(30)

In [15]:
# the list itself was changed by remove(): 30 is gone
mylist

[10, 20, 40, 50]

In [16]:
# dropping the NaN row did not renumber the index: the labels are now 0, 1, 3, 4
s

0    10.0
1    20.0
3    40.0
4    50.0
dtype: float64

In [18]:
s.iloc[2]  # iloc is positional, like indexing a Python string/list/tuple: position 2 is the third element, 40.0

40.0

In [19]:
# .loc looks up by index *label*, not by position. The label 2 was removed
# together with the NaN row, so this lookup is expected to fail.
# The error is the point of the cell, so catch it and print it rather than
# letting it stop a "Restart & Run All" of the notebook.
try:
    s.loc[2]   # this uses the index that is defined
except KeyError as e:
    print(f'{type(e).__name__}: {e}')

KeyError: 2

In [21]:
# Same values as before, but with string labels 'a'..'e' as the index
# instead of the default 0..4.
s = Series(data=[10, 20, np.nan, 40, 50],
           index=['a', 'b', 'c', 'd', 'e'])
s

a    10.0
b    20.0
c     NaN
d    40.0
e    50.0
dtype: float64

In [22]:
s.dropna(inplace=True)  # modifies s itself and returns None, so this cell displays nothing

In [23]:
# the row labeled 'c' (the NaN) is gone; the remaining labels are unchanged
s

a    10.0
b    20.0
d    40.0
e    50.0
dtype: float64

In [24]:
# Fix the seed so the "random" integers are the same on every run.
np.random.seed(0)

# 4 rows x 5 columns of integers drawn from [-100, 100)
df = DataFrame(
    data=np.random.randint(low=-100, high=100, size=(4, 5)),
    index=['a', 'b', 'c', 'd'],
    columns=['v', 'w', 'x', 'y', 'z'],
)
df

Unnamed: 0,v,w,x,y,z
a,72,-53,17,92,-33
b,95,3,-91,-79,-64
c,-13,-30,-12,40,-42
d,93,-61,-13,74,-12


In [None]:
# let's set some NaN values
# NaN is a float, so convert the integer columns to float64 first: writing NaN
# straight into an int64 column relies on silent upcasting, which newer pandas
# versions deprecate.
df = df.astype(np.float64)

# put NaN in several different cells (not the same cell repeatedly), leaving
# row 'c' and columns 'v' and 'y' complete
df.loc['a', 'w'] = np.nan
df.loc['b', 'x'] = np.nan
df.loc['b', 'z'] = np.nan
df.loc['d', 'x'] = np.nan
