![alt text](pandas.png "Title")

# Dealing with missing data

In [1]:
import pandas as pd
import numpy as np

Pandas uses np.nan to denote missing data. 

In [2]:
# Build a small random DataFrame that will serve as the starting point
# for introducing missing values
row_labels = ["a", "c", "e", "f", "h"]
column_labels = ["one", "two", "three"]
df = pd.DataFrame(
    data=np.random.randn(len(row_labels), len(column_labels)),
    index=row_labels,
    columns=column_labels,
)
df

Unnamed: 0,one,two,three
a,-1.663834,-2.082215,-0.607487
c,-0.910501,-0.292188,-0.774109
e,-0.211021,-0.181116,-0.550121
f,-1.523506,-0.770824,1.431325
h,-0.285099,1.594745,1.256591


In [3]:
# Reindex on a longer list of labels: the labels absent from df ("b", "d", "g")
# get rows filled with NaN (Not a Number)
all_labels = list("abcdefgh")
df2 = df.reindex(all_labels)
df2

Unnamed: 0,one,two,three
a,-1.663834,-2.082215,-0.607487
b,,,
c,-0.910501,-0.292188,-0.774109
d,,,
e,-0.211021,-0.181116,-0.550121
f,-1.523506,-0.770824,1.431325
g,,,
h,-0.285099,1.594745,1.256591


In [4]:
# Missing values can also be created explicitly, with np.nan or with None
# (None is treated like np.nan)
series1 = pd.Series(data=[1, 2, np.nan, 4])
series1

0    1.0
1    2.0
2    NaN
3    4.0
dtype: float64

## np.nan intricacies

In [5]:
# Careful: unlike None, np.nan does not compare equal to itself...
none_equals_none = None == None
nan_equals_nan = np.nan == np.nan
print(none_equals_none)
print(nan_equals_nan)

True
False


In [6]:
# ...which is why testing for a missing value with == does not work!
this_missing_value = series1[2]
print(this_missing_value)
looks_missing = this_missing_value == np.nan  # False, even though the value is NaN
if not looks_missing:
    print(f"Wait, what? {this_missing_value} is NOT a missing ??!")

# we'll see below how to detect them

nan
Wait, what? nan is NOT a missing ??!


In [7]:
# Also, np.nan is a float (a number with decimal digits), which can be annoying if your Series should only contain integers:
series1.info()

# Because we have one missing value, the whole Series is cast to float!

<class 'pandas.core.series.Series'>
RangeIndex: 4 entries, 0 to 3
Series name: None
Non-Null Count  Dtype  
--------------  -----  
3 non-null      float64
dtypes: float64(1)
memory usage: 160.0 bytes


In [8]:
# yes, that's a float :s
type(series1[0])

numpy.float64

In [9]:
# Advanced note: in that case, you may want to force a nullable integer data type
# https://pandas.pydata.org/pandas-docs/stable/user_guide/gotchas.html#gotchas-intna
series2 = pd.Series([1, 2, np.nan, 4], dtype="Int64")
series2

# The missing value is shown as <NA> and doesn't force the Series to be a float

0       1
1       2
2    <NA>
3       4
dtype: Int64

In [10]:
series2.info()

<class 'pandas.core.series.Series'>
RangeIndex: 4 entries, 0 to 3
Series name: None
Non-Null Count  Dtype
--------------  -----
3 non-null      Int64
dtypes: Int64(1)
memory usage: 164.0 bytes


In [11]:
# Now we have an integer
type(series2[0])

numpy.int64

## Comparisons and search for missings

In [12]:
series1

0    1.0
1    2.0
2    NaN
3    4.0
dtype: float64

In [13]:
series2

0       1
1       2
2    <NA>
3       4
dtype: Int64

In [14]:
# let's compare these 2 Series
series1==series2

0    True
1    True
2    <NA>
3    True
dtype: boolean

In [15]:
# The two Series can also be compared inside a DataFrame.
# Building the DataFrame first aligns both Series on the same index
df = pd.DataFrame({"s1": series1, "s2": series2})

# then compare them element-wise and keep the result as a new column
df["compare"] = df["s1"] == df["s2"]
df

Unnamed: 0,s1,s2,compare
0,1.0,1.0,True
1,2.0,2.0,True
2,,,
3,4.0,4.0,True


In [16]:
# Spot any missings in the whole df
df.isnull()

Unnamed: 0,s1,s2,compare
0,False,False,False
1,False,False,False
2,True,True,True
3,False,False,False


In [17]:
# and know immediately if we have at least one null value
df.isnull().values.any() # 'values' is an attribute that exposes the pandas df values as a NumPy array (i.e. a sort of list)

True

In [18]:
# The same check works at the Series level (isna() is an alias of isnull())
s1_is_missing = df["s1"].isna()
df["s1_missing"] = s1_is_missing
df

Unnamed: 0,s1,s2,compare,s1_missing
0,1.0,1.0,True,False
1,2.0,2.0,True,False
2,,,,True
3,4.0,4.0,True,False


In [19]:
# or the other way around: flag the values that are present (notna() is an alias of notnull())
s1_is_present = df["s1"].notna()
df["s1_not_missing"] = s1_is_present
df

Unnamed: 0,s1,s2,compare,s1_missing,s1_not_missing
0,1.0,1.0,True,False,True
1,2.0,2.0,True,False,True
2,,,,True,False
3,4.0,4.0,True,False,True


Note: pandas DataFrames also have a method called isna(). It does exactly the same thing as isnull().

In [20]:
# Use the missing flag as a condition: label the missing values, double the others
doubled_s1 = df["s1"] * 2
df["s1_mod"] = np.where(df["s1_missing"], "Missing", doubled_s1)
df

Unnamed: 0,s1,s2,compare,s1_missing,s1_not_missing,s1_mod
0,1.0,1.0,True,False,True,2.0
1,2.0,2.0,True,False,True,4.0
2,,,,True,False,Missing
3,4.0,4.0,True,False,True,8.0


## Remove missings

In [21]:
# Three Series of different lengths: the DataFrame constructor aligns them on
# their index, and the shorter ones get missing values at the end
usubjid = pd.Series(["01_001", "01_002", None, "01_004", "01_005", "01_006"])
age = pd.Series([25, 46, 37, None, 26])
discontinued = pd.Series([True, False, False, True, False, True, False])
df = pd.DataFrame(
    {
        "usubjid": usubjid,
        "age": age,
        "discontinued": discontinued,
    }
)
df

# now you know why age is represented as float.

Unnamed: 0,usubjid,age,discontinued
0,01_001,25.0,True
1,01_002,46.0,False
2,,37.0,False
3,01_004,,True
4,01_005,26.0,False
5,01_006,,True
6,,,False


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   usubjid       5 non-null      object 
 1   age           4 non-null      float64
 2   discontinued  7 non-null      bool   
dtypes: bool(1), float64(1), object(1)
memory usage: 247.0+ bytes


In [23]:
# None and NaN are treated the same
df.isnull()

Unnamed: 0,usubjid,age,discontinued
0,False,False,False
1,False,False,False
2,True,False,False
3,False,True,False
4,False,False,False
5,False,True,False
6,True,True,False


In [24]:
# Remove records with missings in the whole df. This is not in place!
df.dropna()

# If I need to save this for later use, I can do: 
# df = df.dropna()
# or:
# df.dropna(inplace=True)

Unnamed: 0,usubjid,age,discontinued
0,01_001,25.0,True
1,01_002,46.0,False
4,01_005,26.0,False


https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html

In [25]:
# and the same but looking specifically at one Series. subset can also take a Python list
df.dropna(subset='usubjid')

Unnamed: 0,usubjid,age,discontinued
0,01_001,25.0,True
1,01_002,46.0,False
3,01_004,,True
4,01_005,26.0,False
5,01_006,,True


In [26]:
# For the record, this does the same:
df[ df['usubjid'].notnull() ] # Create an array of boolean and use it to filter the dataframe

Unnamed: 0,usubjid,age,discontinued
0,01_001,25.0,True
1,01_002,46.0,False
3,01_004,,True
4,01_005,26.0,False
5,01_006,,True


Note: With the how parameter ('any'|'all') you can decide whether to drop rows/columns as soon as any value is missing ('any') or only when all values are missing ('all').

In [27]:
# This time, with axis=1, we keep only the columns without NaN. The default is axis=0 (-> look row by row)
df.dropna(axis=1)

Unnamed: 0,discontinued
0,True
1,False
2,False
3,True
4,False
5,True
6,False


## Replace missings

In [28]:
df

Unnamed: 0,usubjid,age,discontinued
0,01_001,25.0,True
1,01_002,46.0,False
2,,37.0,False
3,01_004,,True
4,01_005,26.0,False
5,01_006,,True
6,,,False


https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html#pandas.DataFrame.fillna

In [28]:
# Missings can be replaced with any value you like. fillna() is not in place:
# it returns a new DataFrame
filled = df.fillna("That's missing")
df1 = filled[["usubjid", "age"]]
df1

Unnamed: 0,usubjid,age
0,01_001,25.0
1,01_002,46.0
2,That's missing,37.0
3,01_004,That's missing
4,01_005,26.0
5,01_006,That's missing
6,That's missing,That's missing


In [29]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   usubjid  7 non-null      object
 1   age      7 non-null      object
dtypes: object(2)
memory usage: 240.0+ bytes


In [30]:
# More interestingly, you can apply a forward-fill (aka LOCF).
# ffill() replaces fillna(method='ffill'), which is deprecated in recent pandas versions
df1 = df.ffill()[['usubjid', 'age']]
df1

# you can also back-fill with bfill()

Unnamed: 0,usubjid,age
0,01_001,25.0
1,01_002,46.0
2,01_002,37.0
3,01_004,37.0
4,01_005,26.0
5,01_006,26.0
6,01_006,26.0


In [31]:
# Missing values can also be filled within groups (groupby). Let's create the data first
def create_df():
    """Return a fresh 4-row DataFrame with 'usubjid' and 'age' columns; two of the ages are missing."""
    rows = [[1, 25], [1, None], [2, None], [2, 37]]
    return pd.DataFrame(data=rows, columns=['usubjid', 'age'])


df = create_df()
df

Unnamed: 0,usubjid,age
0,1,25.0
1,1,
2,2,
3,2,37.0


In [32]:
# Forward-fill the age within each subject.
# The groupby ffill() replaces fillna(method='ffill'), which is deprecated in recent pandas versions
df['age'] = df.groupby('usubjid')['age'].ffill()
# same as df['age'] = df.groupby('usubjid').age.ffill()
df

Unnamed: 0,usubjid,age
0,1,25.0
1,1,25.0
2,2,
3,2,37.0


In [33]:
# You probably want to sort the df first and then apply the LOCF.
# sort_values() puts NaN last by default, so within each subject the known age
# comes before the missing one and can be carried forward.
# sort_values() returns a new DataFrame, so no inplace=True is needed.
df = create_df().sort_values(['usubjid', 'age'])
df['age'] = df.groupby('usubjid')['age'].ffill()
df

Unnamed: 0,usubjid,age
0,1,25.0
1,1,25.0
3,2,37.0
2,2,37.0


In [34]:
# Create the series in one chained expression
# (ffill() replaces the deprecated fillna(method='ffill'))
age = (
    create_df()
    .sort_values(['usubjid', 'age'])
    .groupby('usubjid')['age']
    .ffill()
)
age

0    25.0
1    25.0
3    37.0
2    37.0
Name: age, dtype: float64

In [None]:
# Apply the same missing-value treatment to several DataFrames stored in a dict.
# (small illustrative DataFrames; replace them with your own data)
dataframes = {}

dataframes['df1'] = pd.DataFrame({'age': [25, None, 37]})
dataframes['df2'] = pd.DataFrame({'age': [None, 46, None]})

# Looping over a dict yields its keys, so use .items() to reach the DataFrames.
# fillna() needs a fill value and is not in place: store the result back in the dict.
for name, frame in dataframes.items():
    dataframes[name] = frame.fillna(0)

dataframes['df1']


__________________________________________________
Nicolas Dupuis, Methodology and Innovation (IDAR C&SP), 2020+