In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('datasets/weather_data.csv')
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,Rain
5,1/8/2017,,,Sunny
6,1/9/2017,,,
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [3]:
type(df.day[0])

str

### checking nan values

In [4]:
#to check if there is any null value
df.isnull()

Unnamed: 0,day,temperature,windspeed,event
0,False,False,False,False
1,False,True,False,False
2,False,False,True,False
3,False,True,False,True
4,False,False,True,False
5,False,True,True,False
6,False,True,True,True
7,False,False,False,False
8,False,False,False,False


In [5]:
print(df.isnull().sum())

day            0
temperature    4
windspeed      4
event          2
dtype: int64


In [6]:
df.isna()

Unnamed: 0,day,temperature,windspeed,event
0,False,False,False,False
1,False,True,False,False
2,False,False,True,False
3,False,True,False,True
4,False,False,True,False
5,False,True,True,False
6,False,True,True,True
7,False,False,False,False
8,False,False,False,False


### Set index and parsing dates

In [7]:
df = pd.read_csv('datasets/weather_data.csv', parse_dates=['day'])
df.set_index('day', inplace=True)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [8]:
type(df.day[0])

AttributeError: 'DataFrame' object has no attribute 'day'

In [9]:
type(df.index)

pandas.core.indexes.datetimes.DatetimeIndex

### Dealing with null values

### if null values are less in number,then we can directly drop them

In [10]:
df.dropna()

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [11]:
df.dropna(how='all')

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [12]:
df.dropna(thresh=2)

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-07,32.0,,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [13]:
df.dropna(axis=0)

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### fill them with other values

In [14]:
df.dtypes

temperature    float64
windspeed      float64
event           object
dtype: object

In [15]:
#we can fill with anything which we want like mean ,median etc
import numpy as np
mean=np.mean(df.windspeed)
df.windspeed.fillna(mean)

day
2017-01-01     6.0
2017-01-04     9.0
2017-01-05     8.4
2017-01-06     7.0
2017-01-07     8.4
2017-01-08     8.4
2017-01-09     8.4
2017-01-10     8.0
2017-01-11    12.0
Name: windspeed, dtype: float64

In [16]:
#Approach 1
newdf=df.fillna(0)
newdf

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,0
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,0
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [17]:
#approach 2
newdf = df.fillna({
    'temperature': 0,
    'windspeed': 0,
    'event': 'no event'
})
newdf

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,no event
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,no event
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


###  forward fill ..carry forward the previous value

In [18]:
newdf=df.fillna(method='ffill')
newdf

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### limit will control how many times a file must be copied

In [19]:
df.fillna(method='ffill',limit=1)

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,,Sunny
2017-01-09,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [20]:
#similarly backward fill
newdf=df.fillna(method='bfill')
newdf

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,28.0,9.0,Sunny
2017-01-05,28.0,7.0,Snow
2017-01-06,32.0,7.0,Rain
2017-01-07,32.0,8.0,Rain
2017-01-08,34.0,8.0,Sunny
2017-01-09,34.0,8.0,Cloudy
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [21]:
#linear interpolate
newdf = df.interpolate(method='linear')
newdf

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [22]:
#interpolate
newdf = df.interpolate(method='time')
newdf

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,29.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### third way is to replace them

In [23]:
df.replace({
      np.nan:'no event'
})

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,no event,9,Sunny
2017-01-05,28,no event,Snow
2017-01-06,no event,7,no event
2017-01-07,32,no event,Rain
2017-01-08,no event,no event,Sunny
2017-01-09,no event,no event,no event
2017-01-10,34,8,Cloudy
2017-01-11,40,12,Sunny


### To fill missing day values

In [24]:
dt=pd.date_range("01-01-2017","01-11-2017")
idx=pd.DatetimeIndex(dt)
weather=df.reindex(idx)
weather

Unnamed: 0,temperature,windspeed,event
2017-01-01,32.0,6.0,Rain
2017-01-02,,,
2017-01-03,,,
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy


In [25]:
weather.interpolate(how='time',inplace=True)
weather

Unnamed: 0,temperature,windspeed,event
2017-01-01,32.0,6.0,Rain
2017-01-02,31.0,7.0,
2017-01-03,30.0,8.0,
2017-01-04,29.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy


### converting categorical data into numerical

In [26]:
#firstly we will fill na values
weather.fillna({
    'event': 'no event'
},inplace=True)
weather

Unnamed: 0,temperature,windspeed,event
2017-01-01,32.0,6.0,Rain
2017-01-02,31.0,7.0,no event
2017-01-03,30.0,8.0,no event
2017-01-04,29.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,no event
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,no event
2017-01-10,34.0,8.0,Cloudy


In [27]:
weather.dtypes

temperature    float64
windspeed      float64
event           object
dtype: object

In [28]:
weather['event'] = weather['event'].astype('category')
print("\n after converting to category \n")
print(weather.dtypes)


 after converting to category 

temperature     float64
windspeed       float64
event          category
dtype: object


In [29]:
weather.event.unique()

[Rain, no event, Sunny, Snow, Cloudy]
Categories (5, object): [Rain, no event, Sunny, Snow, Cloudy]

In [30]:
#Then you can assign the encoded variable to a new column using the cat.codes accessor:
weather["event_cat"] = weather['event'].cat.codes
weather

Unnamed: 0,temperature,windspeed,event,event_cat
2017-01-01,32.0,6.0,Rain,1
2017-01-02,31.0,7.0,no event,4
2017-01-03,30.0,8.0,no event,4
2017-01-04,29.0,9.0,Sunny,3
2017-01-05,28.0,8.0,Snow,2
2017-01-06,30.0,7.0,no event,4
2017-01-07,32.0,7.25,Rain,1
2017-01-08,32.666667,7.5,Sunny,3
2017-01-09,33.333333,7.75,no event,4
2017-01-10,34.0,8.0,Cloudy,0
