# Pandas interpolate

Handling missing values using `interpolate` function

In [1]:
import pandas as pd
import numpy as np

`DataFrame.Interpolate()`

syntax:

`DataFrame.Interpolate(method='linear',`<br>
                        `axis=0,`<br>
                        `limit=None,`<br>
                        `inplace=False,`<br>
                        `limit_direction='forward',`<br>
                        `limit_area=None,`<br>
                        `downcast=None, **kwargs)`

parameters:

method : {'linear', 'time', 'index', 'values', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', 'krogh', 'polynomial', 'spline', 'piecewise_polynomial', 'from_derivatives', 'pchip'}

axis : 
{{0 or 'index', 1 or 'columns', None}}, default None

limit : Maximum number of consecutive NaNs to fill. Must be greater than 0.

limit_direction : {‘forward’, ‘backward’, ‘both’}, default ‘forward’

limit_area : None (default) no fill restriction. 

inside :  Only fill NaNs surrounded by valid values (interpolate).

outside : Only fill NaNs outside valid values (extrapolate). 

If limit is specified, consecutive NaNs will be filled in this direction.

inplace

In [2]:
df_temp = pd.read_csv("./datasets/cities_temp_inter.csv")

df_temp

Unnamed: 0,Date,City,State (Abbrev),Temp (C)
0,1-Nov-20,Anaheim,CA,31.0
1,3-Nov-20,Ann Arbor,MI,
2,10-Nov-20,Baltimore,MD,
3,13-Nov-20,Savannah,GA,21.0
4,15-Nov-20,Davenport,IA,
5,20-Nov-20,Baton Rouge,LA,24.0
6,23-Nov-20,Elizabeth,NJ,
7,27-Nov-20,Clearwater,FL,26.0


## Basic interpolation

In [3]:
df_temp.interpolate()

Unnamed: 0,Date,City,State (Abbrev),Temp (C)
0,1-Nov-20,Anaheim,CA,31.0
1,3-Nov-20,Ann Arbor,MI,27.666667
2,10-Nov-20,Baltimore,MD,24.333333
3,13-Nov-20,Savannah,GA,21.0
4,15-Nov-20,Davenport,IA,22.5
5,20-Nov-20,Baton Rouge,LA,24.0
6,23-Nov-20,Elizabeth,NJ,25.0
7,27-Nov-20,Clearwater,FL,26.0


In [5]:
df_temp.interpolate(method='nearest')

Unnamed: 0,Date,City,State (Abbrev),Temp (C)
0,1-Nov-20,Anaheim,CA,31.0
1,3-Nov-20,Ann Arbor,MI,31.0
2,10-Nov-20,Baltimore,MD,21.0
3,13-Nov-20,Savannah,GA,21.0
4,15-Nov-20,Davenport,IA,21.0
5,20-Nov-20,Baton Rouge,LA,24.0
6,23-Nov-20,Elizabeth,NJ,24.0
7,27-Nov-20,Clearwater,FL,26.0


## Using date

Converting a column's data type to datetime to perform time interpolation

In [6]:
df_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            8 non-null      object 
 1   City            8 non-null      object 
 2   State (Abbrev)  8 non-null      object 
 3   Temp (C)        4 non-null      float64
dtypes: float64(1), object(3)
memory usage: 384.0+ bytes


Convert the column to datetime

In [10]:
df_temp['Date'] = df_temp['Date'].astype('datetime64[ns]')

  df_temp['Date'] = df_temp['Date'].astype('datetime64[ns]')


Check each column's dtype

In [11]:
df_temp.dtypes

Date              datetime64[ns]
City                      object
State (Abbrev)            object
Temp (C)                 float64
dtype: object

Check dataframe

In [12]:
df_temp

Unnamed: 0,Date,City,State (Abbrev),Temp (C)
0,2020-11-01,Anaheim,CA,31.0
1,2020-11-03,Ann Arbor,MI,
2,2020-11-10,Baltimore,MD,
3,2020-11-13,Savannah,GA,21.0
4,2020-11-15,Davenport,IA,
5,2020-11-20,Baton Rouge,LA,24.0
6,2020-11-23,Elizabeth,NJ,
7,2020-11-27,Clearwater,FL,26.0


Set the date column as index

In [13]:
df_temp = df_temp.set_index(df_temp['Date'])

In [14]:
df_temp

Unnamed: 0_level_0,Date,City,State (Abbrev),Temp (C)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-11-01,2020-11-01,Anaheim,CA,31.0
2020-11-03,2020-11-03,Ann Arbor,MI,
2020-11-10,2020-11-10,Baltimore,MD,
2020-11-13,2020-11-13,Savannah,GA,21.0
2020-11-15,2020-11-15,Davenport,IA,
2020-11-20,2020-11-20,Baton Rouge,LA,24.0
2020-11-23,2020-11-23,Elizabeth,NJ,
2020-11-27,2020-11-27,Clearwater,FL,26.0


Drop the date column

In [15]:
df_temp_drop = df_temp.drop('Date', axis=1)

In [16]:
df_temp_drop

Unnamed: 0_level_0,City,State (Abbrev),Temp (C)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-11-01,Anaheim,CA,31.0
2020-11-03,Ann Arbor,MI,
2020-11-10,Baltimore,MD,
2020-11-13,Savannah,GA,21.0
2020-11-15,Davenport,IA,
2020-11-20,Baton Rouge,LA,24.0
2020-11-23,Elizabeth,NJ,
2020-11-27,Clearwater,FL,26.0


Interpolate the temperature based on time

In [17]:
df_temp_drop.interpolate(method='time')

Unnamed: 0_level_0,City,State (Abbrev),Temp (C)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-11-01,Anaheim,CA,31.0
2020-11-03,Ann Arbor,MI,29.333333
2020-11-10,Baltimore,MD,23.5
2020-11-13,Savannah,GA,21.0
2020-11-15,Davenport,IA,21.857143
2020-11-20,Baton Rouge,LA,24.0
2020-11-23,Elizabeth,NJ,24.857143
2020-11-27,Clearwater,FL,26.0


Using index

In [18]:
df_temp_drop

Unnamed: 0_level_0,City,State (Abbrev),Temp (C)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-11-01,Anaheim,CA,31.0
2020-11-03,Ann Arbor,MI,
2020-11-10,Baltimore,MD,
2020-11-13,Savannah,GA,21.0
2020-11-15,Davenport,IA,
2020-11-20,Baton Rouge,LA,24.0
2020-11-23,Elizabeth,NJ,
2020-11-27,Clearwater,FL,26.0


In [19]:
df_temp_drop.interpolate(method='index')

Unnamed: 0_level_0,City,State (Abbrev),Temp (C)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-11-01,Anaheim,CA,31.0
2020-11-03,Ann Arbor,MI,29.333333
2020-11-10,Baltimore,MD,23.5
2020-11-13,Savannah,GA,21.0
2020-11-15,Davenport,IA,21.857143
2020-11-20,Baton Rouge,LA,24.0
2020-11-23,Elizabeth,NJ,24.857143
2020-11-27,Clearwater,FL,26.0


Notice the interpolation results using index are the same with time because we set the date column as an index of the dataframe

Let's actually compare with different index

In [23]:
df_temp = df_temp_drop.reset_index()

df_temp

Unnamed: 0,Date,City,State (Abbrev),Temp (C)
0,2020-11-01,Anaheim,CA,31.0
1,2020-11-03,Ann Arbor,MI,
2,2020-11-10,Baltimore,MD,
3,2020-11-13,Savannah,GA,21.0
4,2020-11-15,Davenport,IA,
5,2020-11-20,Baton Rouge,LA,24.0
6,2020-11-23,Elizabeth,NJ,
7,2020-11-27,Clearwater,FL,26.0


In [29]:
df_temp['Temp (C)'].interpolate(method='piecewise_polynomial')

0    31.000000
1    27.666667
2    24.333333
3    21.000000
4    22.500000
5    24.000000
6    25.000000
7    26.000000
Name: Temp (C), dtype: float64

## Interpolate to nearest values

In [31]:
df_temp_drop.interpolate(method='nearest')

Unnamed: 0_level_0,City,State (Abbrev),Temp (C)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-11-01,Anaheim,CA,31.0
2020-11-03,Ann Arbor,MI,31.0
2020-11-10,Baltimore,MD,21.0
2020-11-13,Savannah,GA,21.0
2020-11-15,Davenport,IA,21.0
2020-11-20,Baton Rouge,LA,24.0
2020-11-23,Elizabeth,NJ,24.0
2020-11-27,Clearwater,FL,26.0


## Plynomial interpolation

In [34]:
df_temp = pd.read_csv("./datasets/cities_temp_inter.csv")

df_temp

Unnamed: 0,Date,City,State (Abbrev),Temp (C)
0,1-Nov-20,Anaheim,CA,31.0
1,3-Nov-20,Ann Arbor,MI,
2,10-Nov-20,Baltimore,MD,
3,13-Nov-20,Savannah,GA,21.0
4,15-Nov-20,Davenport,IA,
5,20-Nov-20,Baton Rouge,LA,24.0
6,23-Nov-20,Elizabeth,NJ,
7,27-Nov-20,Clearwater,FL,26.0


In [35]:
df_temp.interpolate(method='polynomial', order=1)

Unnamed: 0,Date,City,State (Abbrev),Temp (C)
0,1-Nov-20,Anaheim,CA,31.0
1,3-Nov-20,Ann Arbor,MI,27.666667
2,10-Nov-20,Baltimore,MD,24.333333
3,13-Nov-20,Savannah,GA,21.0
4,15-Nov-20,Davenport,IA,22.5
5,20-Nov-20,Baton Rouge,LA,24.0
6,23-Nov-20,Elizabeth,NJ,25.0
7,27-Nov-20,Clearwater,FL,26.0


In [36]:
df_temp.interpolate(method='spline', order=1)

Unnamed: 0,Date,City,State (Abbrev),Temp (C)
0,1-Nov-20,Anaheim,CA,31.0
1,3-Nov-20,Ann Arbor,MI,27.693909
2,10-Nov-20,Baltimore,MD,25.200017
3,13-Nov-20,Savannah,GA,21.0
4,15-Nov-20,Davenport,IA,23.068286
5,20-Nov-20,Baton Rouge,LA,24.0
6,23-Nov-20,Elizabeth,NJ,24.553037
7,27-Nov-20,Clearwater,FL,26.0


## Specifying the axis

Interpolate rowwise

In [37]:
df_temp.interpolate(axis=0)

Unnamed: 0,Date,City,State (Abbrev),Temp (C)
0,1-Nov-20,Anaheim,CA,31.0
1,3-Nov-20,Ann Arbor,MI,27.666667
2,10-Nov-20,Baltimore,MD,24.333333
3,13-Nov-20,Savannah,GA,21.0
4,15-Nov-20,Davenport,IA,22.5
5,20-Nov-20,Baton Rouge,LA,24.0
6,23-Nov-20,Elizabeth,NJ,25.0
7,27-Nov-20,Clearwater,FL,26.0


Interpolate columnwise

In [38]:
df_temp.interpolate(axis=1)

TypeError: Cannot interpolate with all object-dtype columns in the DataFrame. Try setting at least one column to a numeric dtype.

It returns an error because we don't have a previous numeric column as interpolation reference

## Set limit

Maximum number of consecutive NaNs to fill. Must be greater than zero

In [40]:
df_temp.interpolate(limit=2)

Unnamed: 0,Date,City,State (Abbrev),Temp (C)
0,1-Nov-20,Anaheim,CA,31.0
1,3-Nov-20,Ann Arbor,MI,27.666667
2,10-Nov-20,Baltimore,MD,24.333333
3,13-Nov-20,Savannah,GA,21.0
4,15-Nov-20,Davenport,IA,22.5
5,20-Nov-20,Baton Rouge,LA,24.0
6,23-Nov-20,Elizabeth,NJ,25.0
7,27-Nov-20,Clearwater,FL,26.0


## Set limit direction

limit_direction : {‘forward’, ‘backward’, ‘both’}, default ‘forward’

In [41]:
df_temp.interpolate(limit_direction='forward')

Unnamed: 0,Date,City,State (Abbrev),Temp (C)
0,1-Nov-20,Anaheim,CA,31.0
1,3-Nov-20,Ann Arbor,MI,27.666667
2,10-Nov-20,Baltimore,MD,24.333333
3,13-Nov-20,Savannah,GA,21.0
4,15-Nov-20,Davenport,IA,22.5
5,20-Nov-20,Baton Rouge,LA,24.0
6,23-Nov-20,Elizabeth,NJ,25.0
7,27-Nov-20,Clearwater,FL,26.0


## Limit area

limit_area : None (default) no fill restriction. 

inside :  Only fill NaNs surrounded by valid values (interpolate).

outside : Only fill NaNs outside valid values (extrapolate). 

If limit is specified, consecutive NaNs will be filled in this direction.

inplace

In [42]:
df_temp.interpolate(limit_area='inside')

Unnamed: 0,Date,City,State (Abbrev),Temp (C)
0,1-Nov-20,Anaheim,CA,31.0
1,3-Nov-20,Ann Arbor,MI,27.666667
2,10-Nov-20,Baltimore,MD,24.333333
3,13-Nov-20,Savannah,GA,21.0
4,15-Nov-20,Davenport,IA,22.5
5,20-Nov-20,Baton Rouge,LA,24.0
6,23-Nov-20,Elizabeth,NJ,25.0
7,27-Nov-20,Clearwater,FL,26.0


## Inplace

Make changes and save the changes to the current dataframe

In [43]:
df_temp.interpolate(method='krogh', inplace=True)

In [44]:
df_temp

Unnamed: 0,Date,City,State (Abbrev),Temp (C)
0,1-Nov-20,Anaheim,CA,31.0
1,3-Nov-20,Ann Arbor,MI,24.485714
2,10-Nov-20,Baltimore,MD,21.464286
3,13-Nov-20,Savannah,GA,21.0
4,15-Nov-20,Davenport,IA,22.157143
5,20-Nov-20,Baton Rouge,LA,24.0
6,23-Nov-20,Elizabeth,NJ,25.592857
7,27-Nov-20,Clearwater,FL,26.0


The dataframe has been changed