In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('Data Preprocessing/dataset/weather_data.txt',parse_dates=['day']) 

In [19]:
df.dtypes

day            datetime64[ns]
temperature           float64
windspeed             float64
event                  object
dtype: object

In [8]:
df.head()

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain


In [9]:
#counting missing values in each column
df.isnull().sum()

day            0
temperature    4
windspeed      4
event          2
dtype: int64

In [16]:
df[df.temperature.isnull()]

Unnamed: 0,day,temperature,windspeed,event
1,2017-01-04,,9.0,Sunny
3,2017-01-06,,7.0,
5,2017-01-08,,,Sunny
6,2017-01-09,,,


In [22]:
df.set_index('day',inplace=True)

KeyError: 'day'

In [23]:
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [24]:
new_df=df.fillna(4)     #every nan will be filled with the number 4

In [25]:
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,4.0,9.0,Sunny
2017-01-05,28.0,4.0,Snow
2017-01-06,4.0,7.0,4
2017-01-07,32.0,4.0,Rain
2017-01-08,4.0,4.0,Sunny
2017-01-09,4.0,4.0,4
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [26]:
new_df=df.fillna({
    'temperature':df.temperature.mean(),
    'windspeed':0,
    'event':'no event'
})
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,33.2,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,33.2,7.0,no event
2017-01-07,32.0,0.0,Rain
2017-01-08,33.2,0.0,Sunny
2017-01-09,33.2,0.0,no event
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [27]:
new_df=df.fillna(method='ffill')      #carry forward the previous values of the data frame into the NaN one
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [28]:
new_df=df.fillna(method='bfill')      #borrow the next value of the data frame
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,28.0,9.0,Sunny
2017-01-05,28.0,7.0,Snow
2017-01-06,32.0,7.0,Rain
2017-01-07,32.0,8.0,Rain
2017-01-08,34.0,8.0,Sunny
2017-01-09,34.0,8.0,Cloudy
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [30]:
new_df=df.fillna(method='ffill',limit=1)    #copies values up till one place only
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,,Sunny
2017-01-09,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [32]:
new_df=df.interpolate()     #fill immediate value with equal intervals
new_df                      #interpolation will not be done on string

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [34]:
new_df=df.interpolate(method='time')       #this will be used only when we have a column as day
                                           #fill the column with the help of linear equation basically
new_df                                     

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,29.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [35]:
new_df=df.dropna()     #drops all the rows which have nan in it

In [36]:
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [37]:
new_df=df.dropna(how="all")        #drop only those rows which have all the columns as nan
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [38]:
new_df=df.dropna(thresh=2)   #drop all rows with 2 nan  (exact 2 nan)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-07,32.0,,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


# replace function

In [51]:
new_df=pd.read_csv('Data Preprocessing/dataset/weather_data2.txt')

In [42]:
new_df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/4/2017,-99999,9,Sunny
2,1/5/2017,28,-99999,Snow
3,1/6/2017,-99999,7,No event
4,1/7/2017,32,-88888,Rain
5,1/11/2017,40,12,No event


In [49]:
newdf=new_df.replace([-99999,-88888],np.nan)
newdf

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,No event
4,1/7/2017,32.0,,Rain
5,1/11/2017,40.0,12.0,No event


In [52]:
new_df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/4/2017,-99999,9,Sunny
2,1/5/2017,28,-99999,Snow
3,1/6/2017,-99999,7,No event
4,1/7/2017,32,-88888,Rain
5,1/11/2017,40,12,No event


In [53]:
newdf=new_df.replace({
    'temperature':-99999,
    'windspeed':[-99999,-88888],
    'event':'No Event'
},np.nan)       #all the values in the dictionary will be replaced by NAN

In [54]:
newdf

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,No event
4,1/7/2017,32.0,,Rain
5,1/11/2017,40.0,12.0,No event


In [56]:
newdf=newdf.replace({
    -99999:np.nan,
    -88888:np.nan,
    'No Event':'Sunny'
})
newdf

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,No event
4,1/7/2017,32.0,,Rain
5,1/11/2017,40.0,12.0,No event


In [57]:
new_df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/4/2017,-99999,9,Sunny
2,1/5/2017,28,-99999,Snow
3,1/6/2017,-99999,7,No event
4,1/7/2017,32,-88888,Rain
5,1/11/2017,40,12,No event


In [59]:
new_df=new_df.replace({
    -99999:np.nan,
    -88888:np.nan,
    'No event':'Sunny'
})
new_df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,Sunny
4,1/7/2017,32.0,,Rain
5,1/11/2017,40.0,12.0,Sunny


# Use Regression

In [60]:
df=pd.read_csv('Data Preprocessing/dataset/weather_data3.txt')

In [61]:
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32F,6mph,Rain
1,1/4/2017,,9mph,Sunny
2,1/5/2017,28,,Snow
3,1/6/2017,,7,
4,1/7/2017,32C,,Rain
5,1/8/2017,,,Sunny
6,1/9/2017,,,
7,1/10/2017,34,8,Cloudy
8,1/11/2017,40,12,Sunny


In [62]:
new_df=df.replace('[A-Za-z]','',regex=True)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,
1,1/4/2017,,9.0,
2,1/5/2017,28.0,,
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,
5,1/8/2017,,,
6,1/9/2017,,,
7,1/10/2017,34.0,8.0,
8,1/11/2017,40.0,12.0,


In [66]:
new_df=df[['temperature','windspeed']].replace('[A-Za-z]','',regex=True)
new_df

Unnamed: 0,temperature,windspeed
0,32.0,6.0
1,,9.0
2,28.0,
3,,7.0
4,32.0,
5,,
6,,
7,34.0,8.0
8,40.0,12.0


In [67]:
new=df.replace({
    'temperature':'[A-Za-z]',
    'windspeed':'[A-Za-z]'
},'',regex=True)

In [68]:
new

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,Rain
5,1/8/2017,,,Sunny
6,1/9/2017,,,
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny
