### Preprocessing of the data
#### Steps involved:
1. Acquiring the dataset
2. Importing required packages
3. Importing the dataset
4. Finding missing values
5. Encoding the categorical data
6. Splitting data
7. Feature scaling

In [2]:
import numpy as np
import pandas as pd
import sklearn

In [3]:
# Location of the weather CSV. NOTE: this is an absolute, machine-specific
# path; it has to be adjusted before the notebook can run anywhere else.
weather_csv_path = r"C:\Users\hp\Desktop\CVT\Datasets-master\weather_data.csv"

# Load the raw file into a DataFrame and display it.
data = pd.read_csv(weather_csv_path)
data

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,-99999,7,Sunny
2,1/3/2017,28,-99999,Snow
3,1/4/2017,-99999,7,0
4,1/5/2017,32,-99999,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


In [4]:
# Per-column placeholder values that the raw file uses where a reading is
# absent; each of them is turned into NaN so pandas treats it as missing.
missing_markers = {
    'temperature': -99999,
    'windspeed': -99999,
    'event': '0',
}
data = data.replace(missing_markers, np.nan)
data

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/2/2017,,7.0,Sunny
2,1/3/2017,28.0,,Snow
3,1/4/2017,,7.0,
4,1/5/2017,32.0,,Rain
5,1/6/2017,31.0,2.0,Sunny
6,1/6/2017,34.0,5.0,


In [5]:
# Show the frame without any row that contains at least one NaN.
# The result is only displayed; `data` itself is left unchanged.
data.dropna(axis=0)

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
5,1/6/2017,31.0,2.0,Sunny


In [6]:
# Show the frame without any column that contains at least one NaN.
# The result is only displayed; `data` itself is left unchanged.
data.dropna(axis='columns')

Unnamed: 0,day
0,1/1/2017
1,1/2/2017
2,1/3/2017
3,1/4/2017
4,1/5/2017
5,1/6/2017
6,1/6/2017


In [7]:
# Drop a row only when every one of its values is NaN.
data.dropna(axis=0, how='all')

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/2/2017,,7.0,Sunny
2,1/3/2017,28.0,,Snow
3,1/4/2017,,7.0,
4,1/5/2017,32.0,,Rain
5,1/6/2017,31.0,2.0,Sunny
6,1/6/2017,34.0,5.0,


In [8]:
# Drop a row as soon as any one of its values is NaN.
data.dropna(axis=0, how='any')

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
5,1/6/2017,31.0,2.0,Sunny


In [10]:
# Fill the gaps in the numeric columns by interpolating between neighbouring
# rows. Only numeric columns are passed to interpolate(): the frame also holds
# text columns ('day', 'event'), and calling interpolate() on object-dtype
# columns is deprecated in recent pandas releases. The text columns are copied
# over untouched, so missing 'event' values remain NaN.
numeric_cols = data.select_dtypes(include='number').columns
int_data = data.copy()
int_data[numeric_cols] = data[numeric_cols].interpolate()
int_data

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/2/2017,30.0,7.0,Sunny
2,1/3/2017,28.0,7.0,Snow
3,1/4/2017,30.0,7.0,
4,1/5/2017,32.0,4.5,Rain
5,1/6/2017,31.0,2.0,Sunny
6,1/6/2017,34.0,5.0,


### Handling missing values using sklearn
### sklearn ==> impute ==> SimpleImputer
1. All occurrences of missing values will be imputed
2. Two-step transformation ==> fit and transform
3. First we need to fit the imputer to the data and then transform the data for the selected columns

In [9]:
from sklearn.impute import SimpleImputer

In [10]:
# Imputer that substitutes every NaN with the mean of the column it sits in.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

In [11]:
# Fit the imputer on the columns at positions 1 and 2
# (temperature and windspeed) so it learns their statistics.
numeric_part = data.iloc[:, 1:3]
imputer.fit(numeric_part)

SimpleImputer()

In [15]:
# Apply the fitted imputer to the columns at positions 1 and 2 and write the
# filled values back into `data` (this overwrites the NaNs in place).
filled_values = imputer.transform(data.iloc[:, 1:3])
data.iloc[:, 1:3] = filled_values
data

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/2/2017,31.4,7.0,Sunny
2,1/3/2017,28.0,5.4,Snow
3,1/4/2017,31.4,7.0,
4,1/5/2017,32.0,5.4,Rain
5,1/6/2017,31.0,2.0,Sunny
6,1/6/2017,34.0,5.0,


In [16]:
# Imputer that substitutes every NaN with the median of the column it sits in.
# Rebinding `imputer` discards the previously configured instance.
imputer = SimpleImputer(strategy='median', missing_values=np.nan)

In [17]:
# Fit the current imputer on the columns at positions 1 and 2
# (temperature and windspeed).
numeric_part = data.iloc[:, 1:3]
imputer.fit(numeric_part)

SimpleImputer(strategy='median')

In [18]:
# Apply the fitted imputer to the columns at positions 1 and 2 and store the
# result back into `data`.
# NOTE(review): these columns were already filled in place by the earlier
# mean-imputation cell, so there is no NaN left for this step to replace; the
# displayed frame is identical to the previous output. To actually compare
# strategies, the imputer would have to run on a copy that still has the NaNs.
filled_values = imputer.transform(data.iloc[:, 1:3])
data.iloc[:, 1:3] = filled_values
data

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/2/2017,31.4,7.0,Sunny
2,1/3/2017,28.0,5.4,Snow
3,1/4/2017,31.4,7.0,
4,1/5/2017,32.0,5.4,Rain
5,1/6/2017,31.0,2.0,Sunny
6,1/6/2017,34.0,5.0,


In [26]:
# The output of this cell shows 'missing_value' in the 'event' column, but no
# remaining cell produces it (presumably the cell that did so was deleted).
# Recreate that step explicitly so a fresh "Restart & Run All" yields the same
# frame: every missing 'event' entry becomes the placeholder 'missing_value'.
# This is a no-op when the column has already been filled.
data = data.fillna({'event': 'missing_value'})
data

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/2/2017,31.4,7.0,Sunny
2,1/3/2017,28.0,5.4,Snow
3,1/4/2017,31.4,7.0,missing_value
4,1/5/2017,32.0,5.4,Rain
5,1/6/2017,31.0,2.0,Sunny
6,1/6/2017,34.0,5.0,missing_value


In [28]:
# Substitute 'Snow' for every unknown 'event' entry.
# The placeholder 'missing_value' is replaced as before. Entries that are
# still NaN are filled as well: on a fresh kernel nothing visible in the
# notebook has created the placeholder, so without this the replacement would
# silently do nothing and the NaN rows would later get all-zero dummy columns.
event_filled = data['event'].replace('missing_value', 'Snow').fillna('Snow')
data = data.assign(event=event_filled)
data

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/2/2017,31.4,7.0,Sunny
2,1/3/2017,28.0,5.4,Snow
3,1/4/2017,31.4,7.0,Snow
4,1/5/2017,32.0,5.4,Rain
5,1/6/2017,31.0,2.0,Sunny
6,1/6/2017,34.0,5.0,Snow


In [29]:
# One-hot encode the 'event' column with pandas' get_dummies(): one indicator
# column per distinct event value.
# dtype=int pins the indicators to 0/1; without it, pandas 2.0 and later
# return boolean True/False columns instead of the integers shown here.
dummy_set = pd.get_dummies(data['event'], dtype=int)
dummy_set

Unnamed: 0,Rain,Snow,Sunny
0,1,0,0
1,0,0,1
2,0,1,0
3,0,1,0
4,1,0,0
5,0,0,1
6,0,1,0


In [30]:
# Append the indicator columns to the right of the original frame, aligning
# on the row index. The original 'event' column is kept alongside them.
frames_side_by_side = [data, dummy_set]
merged_data = pd.concat(frames_side_by_side, axis='columns')
merged_data

Unnamed: 0,day,temperature,windspeed,event,Rain,Snow,Sunny
0,1/1/2017,32.0,6.0,Rain,1,0,0
1,1/2/2017,31.4,7.0,Sunny,0,0,1
2,1/3/2017,28.0,5.4,Snow,0,1,0
3,1/4/2017,31.4,7.0,Snow,0,1,0
4,1/5/2017,32.0,5.4,Rain,1,0,0
5,1/6/2017,31.0,2.0,Sunny,0,0,1
6,1/6/2017,34.0,5.0,Snow,0,1,0


In [31]:
# One-hot encode 'event' directly inside the frame: the 'event' column is
# replaced by prefixed indicator columns. drop_first=True omits the indicator
# of the first category, which is implied when all the others are 0.
# dtype=int pins the indicators to 0/1; without it, pandas 2.0 and later
# return boolean True/False columns instead of the integers shown here.
pd.get_dummies(data, columns=['event'], drop_first=True, dtype=int)

Unnamed: 0,day,temperature,windspeed,event_Snow,event_Sunny
0,1/1/2017,32.0,6.0,0,0
1,1/2/2017,31.4,7.0,0,1
2,1/3/2017,28.0,5.4,1,0
3,1/4/2017,31.4,7.0,1,0
4,1/5/2017,32.0,5.4,0,0
5,1/6/2017,31.0,2.0,0,1
6,1/6/2017,34.0,5.0,1,0
