# Missing Data

This notebook shows a few convenient methods for dealing with missing data (NaN values) in pandas:

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a small example frame: 'A' has one gap, 'B' has two, 'C' is complete
df = pd.DataFrame(
    data={
        'A': [1, 2, np.nan],
        'B': [5, np.nan, np.nan],
        'C': [1, 2, 3],
    }
)

In [3]:
# Display the frame; the np.nan entries are the missing values the following cells work on
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [4]:
# dropna() with its defaults spelled out: work row by row ('index') and
# discard a row as soon as it contains any missing value
df.dropna(axis='index', how='any')

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [5]:
# Switching the axis to 'columns' (same as axis=1) drops every column that has a missing value instead
df.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


In [7]:
# 'thresh' is the minimum number of NON-missing values a row needs in order to be kept.
# With thresh=2, any row holding fewer than 2 non-NaN values is dropped (only row 2 here).
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [8]:
# fillna() swaps every missing entry for the value we pass in.
# It returns a new frame, so df itself still contains its NaN values afterwards.
df.fillna('FILL VALUE')

Unnamed: 0,A,B,C
0,1,5,1
1,2,FILL VALUE,2
2,FILL VALUE,FILL VALUE,3


In [9]:
# Missing values can also be filled one column at a time.
# mean() ignores NaN, so this is the average of the known entries in column 'A'.
a_mean = df['A'].mean()
df['A'].fillna(a_mean)

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

# Great Job!