Handling Missing Data - fillna, interpolate, dropna
---------------------------------------------------

In [1]:
import pandas as pd

In [4]:
# parse_dates - Create as a separate date column.

df = pd.read_csv("01weather_data.csv",parse_dates=['day'])
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [3]:
type(df.day[0])

pandas._libs.tslibs.timestamps.Timestamp

In [3]:
df.set_index('day',inplace=True)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


fillna
------

    Fill all NaN with one specific value

In [5]:
new_df = df.fillna(0)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,0
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,0
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [6]:
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [7]:
# Change based on column values

new_df = df.fillna({
        'temperature': 0,
        'windspeed': 0,
        'event': 'No Event'
    })
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,No Event
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,No Event
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


Use method to determine how to fill na values
---------------------------------------------

In [9]:
# ffill: propagate last valid observation forward to next valid.

new_df = df.fillna(method="ffill")
new_df

  new_df = df.fillna(method="ffill")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [10]:
# backfill / bfill: use next valid observation to fill gap.

new_df = df.fillna(method="bfill")
new_df

  new_df = df.fillna(method="bfill")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,28.0,9.0,Sunny
2017-01-05,28.0,7.0,Snow
2017-01-06,32.0,7.0,Rain
2017-01-07,32.0,8.0,Rain
2017-01-08,34.0,8.0,Sunny
2017-01-09,34.0,8.0,Cloudy
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


Use of axis
-----------

In [11]:
new_df = df.fillna(method="bfill", axis="columns") # axis is either "index" or "columns"
new_df

  new_df = df.fillna(method="bfill", axis="columns") # axis is either "index" or "columns"
  new_df = df.fillna(method="bfill", axis="columns") # axis is either "index" or "columns"


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,9.0,9.0,Sunny
2017-01-05,28.0,Snow,Snow
2017-01-06,7.0,7.0,
2017-01-07,32.0,Rain,Rain
2017-01-08,Sunny,Sunny,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


limit parameter
---------------

In [13]:
new_df = df.fillna(method="ffill",limit=1)
new_df

  new_df = df.fillna(method="ffill",limit=1)


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,,Sunny
2017-01-09,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


interpolate
-----------

The interpolate() method in pandas calculates missing numeric values by using interpolation techniques. By default, it uses linear interpolation, filling in missing values by assuming a straight-line relationship between adjacent known values.

Step 1: Interpolating temperature
---------------------------------

    1. Calculation for each missing value - numeric values only

                   (x − x1)                 
        y = y1 − ------------  * (y2 - y1) 
                   (x2 - x1)                    

    1. x1, y1, x2, y2 are the known points on either side of the missing value.

    2. x is the position of the missing value.

                                        
Interpolation for temperature
-----------------------------
Original values in temperature:
-------------------------------
        [32.0,NaN,28.0,NaN,32.0,NaN,NaN,34.0,40.0]

    1. 2017-01-04 (NaN):

        x1 = 2017-01-01

        x2 = 2017-01-05

        y1 = 32.0

        y2 = 28.0

                        (4−1)
            y = 32.0 + ------- ×(28.0 − 32.0) ==> y = 32.0 + (−4.0 × 0.75) ===> y = 29.0
                        (5−1)

    2. 2017-01-06 (NaN):

        x1 = 2017-01-05

        x2 = 2017-01-07

        y1 = 28.0

        y2 = 32.0

                        (6−5)
            y = 28.0 + ------- ×(32.0 − 28.0) ==> y = 28.0 + (4.0 × 0.5) ===> y = 30.0
                        (7−5)

Interpolation for windspeed:
----------------------------
Original values in windspeed:
-----------------------------

        [6.0,9.0,NaN,7.0,NaN,NaN,NaN,8.0,12.0]

    1. 2017-01-05 (NaN):

        x1 = 2017-01-04

        x2 = 2017-01-06

        y1 = 9.0

        y2 = 7.0

                        (5−4)
            y = 9.0 + ------- ×(7.0 − 9.0) ==> y = 9.0 − (2.0 × 0.5) ===> y 9.0 - 1.0 ===> y = 8.0
                        (6−4)


    2. 2017-01-07 (NaN):

        x1 = 2017-01-06

        x2 = 2017-01-10

        y1 = 7.0

        y2 = 8.0

                        (7−6)
            y = 7.0 + ------- × (8.0 − 7.0) ==> y = 7.0 + (1.0 × 0.25) ===> y = 7.25
                        (10−6)


    3. 2017-01-08 (NaN):

        x1 = 2017-01-06

        x2 = 2017-01-10

        y1 = 7.0

        y2 = 8.0

                        (8−6)
            y = 7.0 + ------- × (8.0 − 7.0) ==> y = 7.0 + (2.0 × 0.25) ===> y = 7.5
                        (10−6)

    4. 2017-01-09 (NaN):

        x1 = 2017-01-06

        x2 = 2017-01-10

        y1 = 7.0

        y2 = 8.0

                        (9−6)
            y = 7.0 + ------- × (8.0 − 7.0) ==> y = 7.0 + (3.0 × 0.25) ===> y = 7.75
                        (10−6)


There are many other methods for interpolation such as quadratic, piecewise_polynomial, cubic etc. Just google "dataframe interpolate" to see complete documentation


In [5]:
# day column is index mandatary

new_df = df.interpolate(method="time") 
new_df

  new_df = df.interpolate(method="time")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,29.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [6]:
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


dropna
------

In [7]:
new_df = df.dropna()
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [12]:
'''
how : {'any', 'all'}, default 'any'
    Determine if row or column is removed from DataFrame, when we have
    at least one NA or all NA.

    * 'any' : If any NA values are present, drop that row or column.
    * 'all' : If all values are NA, drop that row or column.
2017-01-09 row have all NaN value so delete the row
'''

new_df = df.dropna(how='all')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [13]:
'''
thresh : int, optional
    Require that many non-NA values. Cannot be combined with how.
'''

new_df = df.dropna(thresh=1)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


Inserting Missing Dates
-----------------------

In [15]:
dt = pd.date_range("01-01-2017","01-11-2017")
idx = pd.DatetimeIndex(dt)
df.reindex(idx)

Unnamed: 0,temperature,windspeed,event
2017-01-01,32.0,6.0,Rain
2017-01-02,,,
2017-01-03,,,
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
