In [1]:
from typing import Dict
import numpy as np
import pandas as pd

In [2]:
#pandas uses np.nan as a sentinel value
float_data = pd.Series([1.2, -3.5, np.nan, 0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [3]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
#NA is a convention from the R programming language to represent empty values.
#None from Python is also a null value
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])

In [5]:
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [6]:
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [7]:
float_data = pd.Series([1, 2, None], dtype='float64')

In [8]:
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [9]:
#NA handling object methods
string_data2 = string_data.copy()

In [10]:
string_data2.notna()

0     True
1    False
2    False
3     True
dtype: bool

In [11]:
string_data2.dropna()

0    aardvark
3     avocado
dtype: object

In [12]:
string_data2.fillna(0)

0    aardvark
1           0
2           0
3     avocado
dtype: object

In [13]:
#forward-fill: each missing entry takes the last valid value before it
#(.ffill() replaces fillna(method='ffill'); the method keyword is deprecated since pandas 2.1)
string_data2.ffill()

0    aardvark
1    aardvark
2    aardvark
3     avocado
dtype: object

In [14]:
#forward-fill, but fill at most one consecutive missing entry per gap
#(.ffill() replaces fillna(method='ffill'); the method keyword is deprecated since pandas 2.1)
string_data2.ffill(limit=1)

0    aardvark
1    aardvark
2        None
3     avocado
dtype: object

In [15]:
#backward-fill: each missing entry takes the next valid value after it (at most one per gap)
#(.bfill() replaces fillna(method='bfill'); the method keyword is deprecated since pandas 2.1)
string_data2.bfill(limit=1)

0    aardvark
1         NaN
2     avocado
3     avocado
dtype: object

In [16]:
#backward-fill one entry per gap, then forward-fill whatever is still missing (again one per gap)
#(.bfill()/.ffill() replace fillna(method=...); the method keyword is deprecated since pandas 2.1)
string_data2.bfill(limit=1).ffill(limit=1)

0    aardvark
1    aardvark
2     avocado
3     avocado
dtype: object

In [17]:
#filtering out missing data
#we can filter by hand, but dropna helps us remove empty values
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [18]:
#series.dropna() or df.dropna(): same as data[data.notna()]
data.dropna() 

0    1.0
2    3.5
4    7.0
dtype: float64

In [19]:
#same as above
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [20]:
#we can also get only the empty values
data[data.isna()]

1   NaN
3   NaN
dtype: float64

In [21]:
#df.dropna() drops, by default, every row that contains at least one NA value.
#It can also be configured to drop, for example, only the rows made up entirely of NA values.
data = pd.DataFrame(
    data=[
        [1., 6.5, 3.],
        [1., np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [22]:
#default behavior: drops any rows with any null values
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [23]:
#passing the how='all' argument drops only rows that are all NA.
#these functions return new objects rather than modifying existing objects
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [24]:
#how='any' reproduces the default behavior
data.dropna(how='any')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [25]:
#to drop columns, use axis='columns' or axis=1
data[4] = np.nan

In [26]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [27]:
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [28]:
#We can get the same result with axis=1
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [29]:
#how='any' will give us an empty DataFrame since all columns have at least one empty value
data.dropna(axis=1, how='any')

0
1
2
3


In [30]:
#axis=0 operates on rows; every row now has an NA in column 4, so how='any' drops all of them
data.dropna(axis=0, how='any') #how='any' gives us another result

Unnamed: 0,0,1,2,4


In [31]:
#Let's say we want to keep only rows with at least a certain number of non-missing values
#thresh=2 (e.g.) keeps rows with at least two non-NA values, i.e. drops rows with fewer than two
df = pd.DataFrame(np.random.standard_normal((7, 3)))
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.573761,,
1,-0.050574,,
2,-0.409324,,0.501515
3,0.381166,,-0.825663
4,-0.693808,0.010317,0.097079
5,0.444679,0.652186,0.564908
6,0.442057,0.076277,0.625419


In [32]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.693808,0.010317,0.097079
5,0.444679,0.652186,0.564908
6,0.442057,0.076277,0.625419


In [33]:
df.dropna(how='all')

Unnamed: 0,0,1,2
0,0.573761,,
1,-0.050574,,
2,-0.409324,,0.501515
3,0.381166,,-0.825663
4,-0.693808,0.010317,0.097079
5,0.444679,0.652186,0.564908
6,0.442057,0.076277,0.625419


In [34]:
df.dropna(how='any')

Unnamed: 0,0,1,2
4,-0.693808,0.010317,0.097079
5,0.444679,0.652186,0.564908
6,0.442057,0.076277,0.625419


In [35]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.409324,,0.501515
3,0.381166,,-0.825663
4,-0.693808,0.010317,0.097079
5,0.444679,0.652186,0.564908
6,0.442057,0.076277,0.625419


In [36]:
#fillna is the workhorse method for filling in missing data
#call fillna with a constant
df.fillna(0)

Unnamed: 0,0,1,2
0,0.573761,0.0,0.0
1,-0.050574,0.0,0.0
2,-0.409324,0.0,0.501515
3,0.381166,0.0,-0.825663
4,-0.693808,0.010317,0.097079
5,0.444679,0.652186,0.564908
6,0.442057,0.076277,0.625419


In [37]:
#dictionary of values can be used for each column
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.573761,0.5,0.0
1,-0.050574,0.5,0.0
2,-0.409324,0.5,0.501515
3,0.381166,0.5,-0.825663
4,-0.693808,0.010317,0.097079
5,0.444679,0.652186,0.564908
6,0.442057,0.076277,0.625419


In [38]:
#the same fill strategies available for reindexing (ffill/bfill) can be used to fill missing data
df = pd.DataFrame(np.random.standard_normal((6, 3)))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
df

Unnamed: 0,0,1,2
0,1.313396,0.539043,1.507764
1,-1.227808,-1.313451,-0.088934
2,1.784916,,0.492993
3,0.219855,,-0.969785
4,1.123184,,
5,-0.016067,,


In [39]:
#ffill propagates the last valid value forward (down each column) into the missing entries
#(.ffill() replaces fillna(method='ffill'); the method keyword is deprecated since pandas 2.1)
df.ffill()

Unnamed: 0,0,1,2
0,1.313396,0.539043,1.507764
1,-1.227808,-1.313451,-0.088934
2,1.784916,-1.313451,0.492993
3,0.219855,-1.313451,-0.969785
4,1.123184,-1.313451,-0.969785
5,-0.016067,-1.313451,-0.969785


In [40]:
#limit caps how many consecutive missing values are filled in each gap
#(.ffill() replaces fillna(method='ffill'); the method keyword is deprecated since pandas 2.1)
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,1.313396,0.539043,1.507764
1,-1.227808,-1.313451,-0.088934
2,1.784916,-1.313451,0.492993
3,0.219855,-1.313451,-0.969785
4,1.123184,,-0.969785
5,-0.016067,,-0.969785


In [41]:
#with limit=1 only the first missing value after each valid one is filled
#(.ffill() replaces fillna(method='ffill'); the method keyword is deprecated since pandas 2.1)
df.ffill(limit=1)

Unnamed: 0,0,1,2
0,1.313396,0.539043,1.507764
1,-1.227808,-1.313451,-0.088934
2,1.784916,-1.313451,0.492993
3,0.219855,,-0.969785
4,1.123184,,-0.969785
5,-0.016067,,


In [42]:
#bfill along axis=1: each missing entry takes the next valid value to its right in the same row
#(.bfill() replaces fillna(method='bfill'); the method keyword is deprecated since pandas 2.1)
df.bfill(axis=1)

Unnamed: 0,0,1,2
0,1.313396,0.539043,1.507764
1,-1.227808,-1.313451,-0.088934
2,1.784916,0.492993,0.492993
3,0.219855,-0.969785,-0.969785
4,1.123184,,
5,-0.016067,,


In [43]:
#row-wise backward fill (axis=1), identical to the previous cell
#(.bfill() replaces fillna(method='bfill'); the method keyword is deprecated since pandas 2.1)
df.bfill(axis=1)

Unnamed: 0,0,1,2
0,1.313396,0.539043,1.507764
1,-1.227808,-1.313451,-0.088934
2,1.784916,0.492993,0.492993
3,0.219855,-0.969785,-0.969785
4,1.123184,,
5,-0.016067,,


In [44]:
#impute missing values with a summary statistic such as the mean (or median)
data = pd.Series([1., np.nan, 3.5, np.nan, 7])

In [45]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [46]:
#Data transformation
#Removing duplicates: build a small frame whose last row repeats the one before it
data = pd.DataFrame(
    {
        'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [47]:
#df.duplicated() returns a boolean Series indicating whether each row is a duplicate of an earlier row
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [48]:
data.duplicated(keep='last') #keep='last' flags every occurrence except the last one, so row 5 (not row 6) is True

0    False
1    False
2    False
3    False
4    False
5     True
6    False
dtype: bool

In [49]:
#df.drop_duplicates() returns a DataFrame that keeps only the rows
#for which duplicated() is False (i.e. the duplicate rows are removed)
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [50]:
data.drop_duplicates(keep='first') #same as above

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [51]:
data.drop_duplicates(keep='last') #unlike above, keeps the last occurrence: row 6 is retained and row 5 is dropped

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
6,two,4


In [52]:
#let's specify a subset of columns to detect ones that are duplicates
#create an additional column
data['v1'] = range(7)

In [53]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [54]:
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [55]:
data.drop_duplicates(subset=['k1'], keep='first')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [56]:
data.drop_duplicates(subset=['k1'], keep='last')

Unnamed: 0,k1,k2,v1
4,one,3,4
6,two,4,6


In [57]:
data.drop_duplicates(subset=['k1'], keep='last', ignore_index=True)

Unnamed: 0,k1,k2,v1
0,one,3,4
1,two,4,6


In [58]:
data.drop_duplicates(subset=['k1'], keep=False)

Unnamed: 0,k1,k2,v1


In [59]:
#passing keep='last' keeps the last occurrence of each duplicated (k1, k2) combination
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [60]:
#Transforming data using a function or a mapping
#Sample frame: one row per meat item together with its weight in ounces
data = pd.DataFrame(
    {
        'food': [
            'bacon', 'pulled pork', 'bacon',
            'pastrami', 'corned beef', 'bacon',
            'pastrami', 'honey ham', 'nova lox',
        ],
        'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [61]:
#Maps each value of the 'food' column to the animal the meat comes from.
#Every food present in the frame needs an entry; a missing key (previously 'corned beef')
#would silently become NaN when the mapping is applied with Series.map.
meat_to_animal: Dict[str, str] = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}