## <font color="Yellow"><h4 align="center">Handling Missing Data - fillna, interpolate, dropna</font>

In [9]:
import pandas as pd

# df = pd.read_csv("weather_data.csv")
# type(df.day[0])
# The out would be str since it is a string

# lets convert it into timestamp format
df = pd.read_csv("weather_data.csv",parse_dates=['day'])
type(df.day[0])
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [8]:
# To make date as index and make this change reflect the original file

# df.set_index('day',inplace=True)
# df

# <font color="green"><u>fillna</u></font>

<font color="cyan">**Fill all NaN with one specific value**</font>

In [12]:
new_df = df.fillna(0)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,0.0,9.0,Sunny
2,2017-01-05,28.0,0.0,Snow
3,2017-01-06,0.0,7.0,0
4,2017-01-07,32.0,0.0,Rain
5,2017-01-08,0.0,0.0,Sunny
6,2017-01-09,0.0,0.0,0
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


<font color="cyan">**Fill na using column names and dict, with specific values**</font>

In [29]:
new_df = df.fillna({
        'temperature': 0,
        'windspeed': 0,
        'event': 'No Event'
    })
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,7.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,No Event
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,No Event
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


<font color="cyan">**Use method to determine how to fill na values**</font>

In [17]:
new_df = df.ffill() #ffill means forward fill, means fill it with the previous day's value
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,32.0,9.0,Sunny
2,2017-01-05,28.0,9.0,Snow
3,2017-01-06,28.0,7.0,Snow
4,2017-01-07,32.0,7.0,Rain
5,2017-01-08,32.0,7.0,Sunny
6,2017-01-09,32.0,7.0,Sunny
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [18]:
new_df = df.bfill() #bfill means backward fill, means fill it with the next day's value
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,28.0,9.0,Sunny
2,2017-01-05,28.0,7.0,Snow
3,2017-01-06,32.0,7.0,Rain
4,2017-01-07,32.0,8.0,Rain
5,2017-01-08,34.0,8.0,Sunny
6,2017-01-09,34.0,8.0,Cloudy
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


<font color="cyan">**Use of axis**</font>

In [44]:
pd.set_option('future.no_silent_downcasting', True)
"""
Before: Pandas might automatically downcast certain object dtype arrays during operations, potentially without giving you a warning.
After Setting: Pandas will explicitly handle downcasting, and you will be notified of any changes or issues related to data types.
This helps in ensuring that you are aware of how your data types are being handled, which can prevent unexpected behavior
"""
new_df = df.bfill(axis='columns')  # axis is either "index" or "columns"
new_df = new_df.infer_objects()

# print(df.dtypes)
# print(new_df.dtypes)
new_df


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,9.0,9.0,Sunny
2,2017-01-05,28.0,Snow,Snow
3,2017-01-06,7.0,7.0,
4,2017-01-07,32.0,Rain,Rain
5,2017-01-08,Sunny,Sunny,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


<!-- <font color="purple">**limit parameter**</font> -->

In [55]:
new_df = df.ffill(limit=1) # means if we have two consecutive NaN columns then only one will be filled with previous values and the other one will be NaN, we can set the limit accordingly
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,32.0,9.0,Sunny
2,2017-01-05,28.0,9.0,Snow
3,2017-01-06,28.0,7.0,Snow
4,2017-01-07,32.0,7.0,Rain
5,2017-01-08,32.0,,Sunny
6,2017-01-09,,,Sunny
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


### <font color="green">interpolate</font>
##### it fills the NaN value wrt to the method.

In [85]:
new_df = df.interpolate() # only works for number data, not strings
new_df

  new_df = df.interpolate() # only works for number data, not strings


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,30.0,9.0,Sunny
2,2017-01-05,28.0,8.0,Snow
3,2017-01-06,30.0,7.0,
4,2017-01-07,32.0,7.25,Rain
5,2017-01-08,32.666667,7.5,Sunny
6,2017-01-09,33.333333,7.75,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [90]:
new_df = df.interpolate(limit=1) #not all the NaN are filled, we can control this by limit
new_df

  new_df = df.interpolate(limit=1) #not all the NaN are filled


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,30.0,9.0,Sunny
2,2017-01-05,28.0,8.0,Snow
3,2017-01-06,30.0,7.0,
4,2017-01-07,32.0,7.25,Rain
5,2017-01-08,32.666667,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


**Notice that in above temperature on 2017-01-04 is 29 instead of 30 (in plain linear interpolate)**

**There are many other methods for interpolation such as quadratic, piecewise_polynomial, cubic etc. 
Just google "dataframe interpolate" to see complete documentation**

### <font color="green">dropna</font>

In [63]:
new_df = df.dropna() #drops the rows which has NaN in any of the column
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


##### Effect of axis parameter

In [66]:
new_df = df.dropna(axis=1) #drops the whole column which has NaN in any of its row
new_df

Unnamed: 0,day
0,2017-01-01
1,2017-01-04
2,2017-01-05
3,2017-01-06
4,2017-01-07
5,2017-01-08
6,2017-01-09
7,2017-01-10
8,2017-01-11


In [72]:
new_df = df.dropna(how='all') #will remove only those rows which has all column value as NaN
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [73]:
new_df = df.dropna(thresh=1) # will keep the rows which has atleast one non-NaN value
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


#### Subset parameter in dropna

In [76]:
new_df = df.dropna(subset=["event"]) # will look into the NaN value of a specific column and remove the rows corresponding to that
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


#### inplace parameter in dropna will reflect the change into the original DF

### <font color="cyan">Inserting Missing Dates</font>

In [100]:
df = pd.read_csv("weather_data.csv", parse_dates=["day"])
new_df = df.set_index('day')
new_df

dt = pd.date_range("01-01-2017","01-11-2017")
idx = pd.DatetimeIndex(dt)
new_df = new_df.reindex(idx)
new_df


Unnamed: 0,temperature,windspeed,event
2017-01-01,32.0,6.0,Rain
2017-01-02,,,
2017-01-03,,,
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
