# Agenda

1. Dealing with `NaN`
2. `NaN` vs. `NA` and nullable types
3. Interpolation
4. Dealing with bad values

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
s = Series([10, 20, np.nan, 40, 50])
s

0    10.0
1    20.0
2     NaN
3    40.0
4    50.0
dtype: float64

In [5]:
s.astype(np.int64)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

# How do we deal with `NaN`?

1. We can replace it, using `fillna` on a series
2. We can remove it, using `dropna` on a series
3. We can replace it, on a data frame (using `fillna`)
4. We can remove it, on a data frame (using `dropna`)

In [6]:
s

0    10.0
1    20.0
2     NaN
3    40.0
4    50.0
dtype: float64

In [7]:
s.fillna(999)

0     10.0
1     20.0
2    999.0
3     40.0
4     50.0
dtype: float64

In [8]:
# more common is for us to replace NaN with something calculated

s.fillna(s.mean())

0    10.0
1    20.0
2    30.0
3    40.0
4    50.0
dtype: float64

In [9]:
s.fillna(s.median())

0    10.0
1    20.0
2    30.0
3    40.0
4    50.0
dtype: float64

In [10]:
# but -- what if there isn't anything obvious that you can/want to do with the NaN values?
# in such cases, we can remove them

s.dropna()    # this returns a new series -- if you want (don't!), you can pass inplace=True

0    10.0
1    20.0
3    40.0
4    50.0
dtype: float64

In [11]:
# inplace=True does two things:
# (1) it modifies the series/data frame itself
# (2) it returns None

t = s
s.dropna(inplace=True)  # now, we get back None *and* anyone else who is referring to s will be modified, also

In [12]:
s

0    10.0
1    20.0
3    40.0
4    50.0
dtype: float64

In [13]:
t

0    10.0
1    20.0
3    40.0
4    50.0
dtype: float64

In [14]:
mylist = [10, 20, 30, 40, 50]

mylist.remove(30)

In [15]:
mylist

[10, 20, 40, 50]

In [16]:
s

0    10.0
1    20.0
3    40.0
4    50.0
dtype: float64

In [18]:
s.iloc[2]  # this is the positional index, identical to a Python string/list/tuple

40.0

In [19]:
s.loc[2]   # this uses the index that is defined

KeyError: 2

In [21]:
s = Series([10, 20, np.nan, 40, 50], index=list('abcde'))
s

a    10.0
b    20.0
c     NaN
d    40.0
e    50.0
dtype: float64

In [22]:
s.dropna(inplace=True)

In [23]:
s

a    10.0
b    20.0
d    40.0
e    50.0
dtype: float64

In [24]:
np.random.seed(0)
df = DataFrame(np.random.randint(-100, 100, [4, 5]),
               index=list('abcd'),
               columns=list('vwxyz'))
df

Unnamed: 0,v,w,x,y,z
a,72,-53,17,92,-33
b,95,3,-91,-79,-64
c,-13,-30,-12,40,-42
d,93,-61,-13,74,-12


In [25]:
# let's set some NaN values
df.loc['b', 'x'] = np.nan
df.loc['d', 'z'] = np.nan
df.loc['a', 'z'] = np.nan
df.loc['c', 'y'] = np.nan
df


Unnamed: 0,v,w,x,y,z
a,72,-53,17.0,92.0,
b,95,3,,-79.0,-64.0
c,-13,-30,-12.0,,-42.0
d,93,-61,-13.0,74.0,


In [26]:
df.dtypes

v      int64
w      int64
x    float64
y    float64
z    float64
dtype: object

In [27]:
# what happens if I use fillna?

df.fillna(999)

Unnamed: 0,v,w,x,y,z
a,72,-53,17.0,92.0,999.0
b,95,3,999.0,-79.0,-64.0
c,-13,-30,-12.0,999.0,-42.0
d,93,-61,-13.0,74.0,999.0


In [28]:
df.mean()  # this returns a series, the mean from each column

v    61.750000
w   -35.250000
x    -2.666667
y    29.000000
z   -53.000000
dtype: float64

In [30]:
df.fillna(df.mean())   # replace NaN with the mean for that particular column

Unnamed: 0,v,w,x,y,z
a,72,-53,17.0,92.0,-53.0
b,95,3,-2.666667,-79.0,-64.0
c,-13,-30,-12.0,29.0,-42.0
d,93,-61,-13.0,74.0,-53.0


In [31]:
# if we don't want to calculate, but do want to specify
# a particular value for each column's NaN replacement,
# we can use a dict
df.fillna({'x':999, 'y':888, 'z':777})

Unnamed: 0,v,w,x,y,z
a,72,-53,17.0,92.0,777.0
b,95,3,999.0,-79.0,-64.0
c,-13,-30,-12.0,888.0,-42.0
d,93,-61,-13.0,74.0,777.0


In [32]:
# what about dropna?
# If I run dropna right now
# any row in which there is even a single NaN value is removed

df.dropna()

Unnamed: 0,v,w,x,y,z


In [33]:
df

Unnamed: 0,v,w,x,y,z
a,72,-53,17.0,92.0,
b,95,3,,-79.0,-64.0
c,-13,-30,-12.0,,-42.0
d,93,-61,-13.0,74.0,


In [34]:
# we can tell dropna that we want to keep the rows that contain
# at least a certain number of non-NaN values

df.dropna(thresh=3)   # this means: if there are 3 good values, keep the row

Unnamed: 0,v,w,x,y,z
a,72,-53,17.0,92.0,
b,95,3,,-79.0,-64.0
c,-13,-30,-12.0,,-42.0
d,93,-61,-13.0,74.0,


In [35]:
# it would be nice, sometimes, to say that we want three values, 
# but that they must be in v, w, x, and y.  Meaning: If there's a NaN
# in z, then we should get rid of the row. 

df.dropna(subset=['v', 'w', 'x', 'y'], thresh=3)  # we want 3 values from v, w, x, and y

Unnamed: 0,v,w,x,y,z
a,72,-53,17.0,92.0,
b,95,3,,-79.0,-64.0
c,-13,-30,-12.0,,-42.0
d,93,-61,-13.0,74.0,


In [40]:
# let's say that we need 3 from x, y, and z

df.dropna(subset=['x', 'y', 'z'],  # which columns are we looking at to make our decision
          thresh=2)                # how many non-NaN values we need

Unnamed: 0,v,w,x,y,z
a,72,-53,17.0,92.0,
b,95,3,,-79.0,-64.0
c,-13,-30,-12.0,,-42.0
d,93,-61,-13.0,74.0,


In [42]:
df.dropna(axis='columns')  #get rid of any column that contains a NaN value

Unnamed: 0,v,w
a,72,-53
b,95,3
c,-13,-30
d,93,-61


# Exercise: Means of evens

1. Create a 4x5 data frame containing random integers from -100 to +100.
2. Set the odd numbers to be `NaN`.
3. Replace the `NaN` values with the mean of each column.
4. Instead of replacing (in step 3), now remove any row that contains at least 2 `NaN` values in the first 3 columns.

In [43]:
np.random.seed(0)
df = DataFrame(np.random.randint(-100, 100, [4,5]),
               index=list('abcd'),
               columns=list('vwxyz'))
df

Unnamed: 0,v,w,x,y,z
a,72,-53,17,92,-33
b,95,3,-91,-79,-64
c,-13,-30,-12,40,-42
d,93,-61,-13,74,-12


In [46]:
# how to set the nans on odd numbers

df[df % 2 == 1] = np.nan
df

Unnamed: 0,v,w,x,y,z
a,72.0,,,92.0,
b,,,,,-64.0
c,,-30.0,-12.0,40.0,-42.0
d,,,,74.0,-12.0


In [47]:
df.fillna(df.mean())

Unnamed: 0,v,w,x,y,z
a,72.0,-30.0,-12.0,92.0,-39.333333
b,72.0,-30.0,-12.0,68.666667,-64.0
c,72.0,-30.0,-12.0,40.0,-42.0
d,72.0,-30.0,-12.0,74.0,-12.0


In [48]:
# 4. Instead of replacing (in step 3), now remove any row that contains at least 2 `NaN` values in the first 3 columns.

df.dropna(thresh=3, 
          subset=['v', 'w', 'x'])

Unnamed: 0,v,w,x,y,z


In [49]:
df

Unnamed: 0,v,w,x,y,z
a,72.0,,,92.0,
b,,,,,-64.0
c,,-30.0,-12.0,40.0,-42.0
d,,,,74.0,-12.0


# Nullable values

One of the problems we've seen with `NaN` is that it's a float. If you have it in an integer column, that column becomes a dtype of float. If you have it in a string column, then the column becomes `object`, because that's the only way Pandas can have both strings and floats in the same column.

This is weird! We would like to have a string column with `NaN` and call it a string.

The idea of having such dtypes is known as "nullable types," that stick with their original dtype but allow for null values.

Traditional Pandas does *not* allow for this. But we have two other options that we can use, if we really want:

- Pandas "extension" dtypes, which are provided by Pandas, and are wrappers around NumPy types. These are all nullable, meaning that they can have `pd.NA` (not `NaN`!) in them. This is a different "not a value" value.
- PyArrow, which replaces NumPy under the hood under modern, experimental versions of Pandas, and which will play a growing role in Pandas moving forward.

In [50]:
# example of using extension types, and how that would help

s1 = Series([10, 20, 30], dtype=pd.Int64Dtype())   # this will be a Pandas int64
s2 = Series(['hello', 'out', 'there'], dtype=pd.StringDtype())   # this will be a Pandas string

df = DataFrame({'int':s1, 'str':s2})
df


Unnamed: 0,int,str
0,10,hello
1,20,out
2,30,there


In [51]:
df.dtypes

int             Int64
str    string[python]
dtype: object

In [52]:
df.loc[1, 'int'] = pd.NA
df.loc[2, 'str'] = pd.NA
df

Unnamed: 0,int,str
0,10.0,hello
1,,out
2,30.0,


In [53]:
df.dtypes

int             Int64
str    string[python]
dtype: object

In [54]:
df.isna()   # notice, it's not isnan!

Unnamed: 0,int,str
0,False,False
1,True,False
2,False,True


In [56]:
s = Series([25, np.nan, 30, 31, np.nan, 27])
s

0    25.0
1     NaN
2    30.0
3    31.0
4     NaN
5    27.0
dtype: float64

In [57]:
# I could replace the NaN values with the mean
# that wouldn't be totally wrong.. but it wouldn't' be totally right, either.

# the way to handle this is with *interpolation*
# meaning: Pandas will look at each NaN and calculate the mean from the two values around it, and then fill in the NaN with it

s.interpolate()


0    25.0
1    27.5
2    30.0
3    31.0
4    29.0
5    27.0
dtype: float64

In [58]:
# what if things are a bit trickier?

s = Series([25, 27, np.nan, np.nan, 29, 25, 27])



In [59]:
s.interpolate()

0    25.000000
1    27.000000
2    27.666667
3    28.333333
4    29.000000
5    25.000000
6    27.000000
dtype: float64

In [None]:
# if you have NaN on the edges, it won't handle those 
# and there are a few other functions you can use 