# DATA PREPROCESSING/DATA PREPARATION

1. Data Cleaning - drop, fillna with mean, median, mode Imputation, changing dtype
2. Data Transformation
    * If data is CONTINUOUS = Standard Scaler, MinMaxScaler, Robust Scaler
    * If data is DISCRETE   = Label Encoder, One hot Encoder

### 1. Import Necessasary libraries

In [1]:
import pandas as pd

### 2. Import data

In [38]:
weather_data_2010 = pd.read_csv('data_clean.csv')
weather_data_2010

Unnamed: 0.1,Unnamed: 0,Ozone,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,1,41.0,190.0,7.4,67,5,1,2010,67,S
1,2,36.0,118.0,8.0,72,5,2,2010,72,C
2,3,12.0,149.0,12.6,74,5,3,2010,74,PS
3,4,18.0,313.0,11.5,62,5,4,2010,62,S
4,5,,,14.3,56,5,5,2010,56,S
...,...,...,...,...,...,...,...,...,...,...
153,154,41.0,190.0,7.4,67,5,1,2010,67,C
154,155,30.0,193.0,6.9,70,9,26,2010,70,PS
155,156,,145.0,13.2,77,9,27,2010,77,S
156,157,14.0,191.0,14.3,75,9,28,2010,75,S


### 3. Data Understanding

#### 3.1 Initial Analysis

In [5]:
weather_data_2010.shape

(158, 10)

In [6]:
weather_data_2010.isna().sum()

Unnamed: 0     0
Ozone         38
Solar.R        7
Wind           0
Temp C         0
Month          0
Day            0
Year           0
Temp           0
Weather        3
dtype: int64

In [7]:
weather_data_2010.dtypes

Unnamed: 0      int64
Ozone         float64
Solar.R       float64
Wind          float64
Temp C         object
Month          object
Day             int64
Year            int64
Temp            int64
Weather        object
dtype: object

In [9]:
weather_data_2010.describe(include = 'all')

Unnamed: 0.1,Unnamed: 0,Ozone,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
count,158.0,120.0,151.0,158.0,158.0,158.0,158.0,158.0,158.0,155
unique,,,,,41.0,6.0,,,,3
top,,,,,81.0,9.0,,,,S
freq,,,,,11.0,34.0,,,,59
mean,79.5,41.583333,185.403974,9.957595,,,16.006329,2010.0,77.727848,
std,45.754781,32.620709,88.723103,3.511261,,,8.997166,0.0,9.377877,
min,1.0,1.0,7.0,1.7,,,1.0,2010.0,56.0,
25%,40.25,18.0,119.0,7.4,,,8.0,2010.0,72.0,
50%,79.5,30.5,197.0,9.7,,,16.0,2010.0,78.5,
75%,118.75,61.5,257.0,11.875,,,24.0,2010.0,84.0,


#### 3.2 EDA

In [11]:
weather_data_2010.Weather.unique()

array(['S', 'C', 'PS', nan], dtype=object)

In [12]:
weather_data_2010.head(50)

Unnamed: 0.1,Unnamed: 0,Ozone,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,1,41.0,190.0,7.4,67,5,1,2010,67,S
1,2,36.0,118.0,8.0,72,5,2,2010,72,C
2,3,12.0,149.0,12.6,74,5,3,2010,74,PS
3,4,18.0,313.0,11.5,62,5,4,2010,62,S
4,5,,,14.3,56,5,5,2010,56,S
5,6,28.0,,14.9,66,5,6,2010,66,C
6,7,23.0,299.0,8.6,65,5,7,2010,65,PS
7,8,19.0,99.0,13.8,59,5,8,2010,59,C
8,9,8.0,19.0,20.1,61,5,9,2010,61,PS
9,10,,194.0,8.6,69,5,10,2010,69,S


### 4. Data Preparation/Preprocessing

### Step 1: Perform Data Cleaning

In [39]:
del weather_data_2010['Unnamed: 0']

In [14]:
weather_data_2010.isna().sum()

Ozone      38
Solar.R     7
Wind        0
Temp C      0
Month       0
Day         0
Year        0
Temp        0
Weather     3
dtype: int64

In [15]:
weather_data_2010.describe()

Unnamed: 0,Ozone,Solar.R,Wind,Day,Year,Temp
count,120.0,151.0,158.0,158.0,158.0,158.0
mean,41.583333,185.403974,9.957595,16.006329,2010.0,77.727848
std,32.620709,88.723103,3.511261,8.997166,0.0,9.377877
min,1.0,7.0,1.7,1.0,2010.0,56.0
25%,18.0,119.0,7.4,8.0,2010.0,72.0
50%,30.5,197.0,9.7,16.0,2010.0,78.5
75%,61.5,257.0,11.875,24.0,2010.0,84.0
max,168.0,334.0,20.7,31.0,2010.0,97.0


### PTR: Client - Shashank, approved to go drop the Ozone feature because of more tha 20% of Nan values

In [40]:
del weather_data_2010['Ozone']

In [18]:
weather_data_2010.isna().sum()

Solar.R    7
Wind       0
Temp C     0
Month      0
Day        0
Year       0
Temp       0
Weather    3
dtype: int64

### PTR: Client - Shashank, approved to go with the Median imputaion for Solar.R feature.

In [41]:
weather_data_2010['Solar.R'] = weather_data_2010['Solar.R'].fillna(value=197)

In [21]:
weather_data_2010.isna().sum()

Solar.R    0
Wind       0
Temp C     0
Month      0
Day        0
Year       0
Temp       0
Weather    3
dtype: int64

### PTR: Client - Shashank, approved to go with the Mode imputaion for Weather.

In [42]:
weather_data_2010['Weather'] = weather_data_2010['Weather'].fillna(value = 'S')

In [24]:
weather_data_2010.isna().sum()

Solar.R    0
Wind       0
Temp C     0
Month      0
Day        0
Year       0
Temp       0
Weather    0
dtype: int64

### OBSERVATION:

We can see that there is no Null entries in my data. Now we have to check for the datatypes if it is int or float.

In [25]:
weather_data_2010.dtypes

Solar.R    float64
Wind       float64
Temp C      object
Month       object
Day          int64
Year         int64
Temp         int64
Weather     object
dtype: object

In [27]:
weather_data_2010.head(50)

Unnamed: 0,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,190.0,7.4,67,5,1,2010,67,S
1,118.0,8.0,72,5,2,2010,72,C
2,149.0,12.6,74,5,3,2010,74,PS
3,313.0,11.5,62,5,4,2010,62,S
4,197.0,14.3,56,5,5,2010,56,S
5,197.0,14.9,66,5,6,2010,66,C
6,299.0,8.6,65,5,7,2010,65,PS
7,99.0,13.8,59,5,8,2010,59,C
8,19.0,20.1,61,5,9,2010,61,PS
9,194.0,8.6,69,5,10,2010,69,S


In [43]:
weather_data_2010['Temp C'] = pd.to_numeric(arg = weather_data_2010['Temp C'], errors='coerce')
weather_data_2010['Month']  = pd.to_numeric(arg = weather_data_2010['Month'], errors='coerce')

In [34]:
weather_data_2010.head(50)

Unnamed: 0,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,190.0,7.4,67.0,5.0,1,2010,67,S
1,118.0,8.0,72.0,5.0,2,2010,72,C
2,149.0,12.6,74.0,5.0,3,2010,74,PS
3,313.0,11.5,62.0,5.0,4,2010,62,S
4,197.0,14.3,56.0,5.0,5,2010,56,S
5,197.0,14.9,66.0,5.0,6,2010,66,C
6,299.0,8.6,65.0,5.0,7,2010,65,PS
7,99.0,13.8,59.0,5.0,8,2010,59,C
8,19.0,20.1,61.0,5.0,9,2010,61,PS
9,194.0,8.6,69.0,5.0,10,2010,69,S


In [35]:
weather_data_2010.dtypes

Solar.R    float64
Wind       float64
Temp C     float64
Month      float64
Day          int64
Year         int64
Temp         int64
Weather     object
dtype: object

In [32]:
weather_data_2010['Temp C'].mean()

77.7515923566879

In [48]:
weather_data_2010['Temp C'] = weather_data_2010['Temp C'].fillna(value=77.75)
weather_data_2010['Month']  = weather_data_2010['Month'].fillna(value = 5).astype('int')

In [49]:
weather_data_2010.dtypes

Solar.R    float64
Wind       float64
Temp C     float64
Month        int32
Day          int64
Year         int64
Temp         int64
Weather     object
dtype: object

In [50]:
weather_data_2010

Unnamed: 0,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,190.0,7.4,67.0,5,1,2010,67,S
1,118.0,8.0,72.0,5,2,2010,72,C
2,149.0,12.6,74.0,5,3,2010,74,PS
3,313.0,11.5,62.0,5,4,2010,62,S
4,197.0,14.3,56.0,5,5,2010,56,S
...,...,...,...,...,...,...,...,...
153,190.0,7.4,67.0,5,1,2010,67,C
154,193.0,6.9,70.0,9,26,2010,70,PS
155,145.0,13.2,77.0,9,27,2010,77,S
156,191.0,14.3,75.0,9,28,2010,75,S


In [51]:
del weather_data_2010['Temp C']

In [52]:
weather_data_2010.dtypes

Solar.R    float64
Wind       float64
Month        int32
Day          int64
Year         int64
Temp         int64
Weather     object
dtype: object

In [53]:
weather_data_2010.isna().sum()

Solar.R    0
Wind       0
Month      0
Day        0
Year       0
Temp       0
Weather    0
dtype: int64

In [54]:
del weather_data_2010['Year']

In [56]:
weather_data_2010.head(50)

Unnamed: 0,Solar.R,Wind,Month,Day,Temp,Weather
0,190.0,7.4,5,1,67,S
1,118.0,8.0,5,2,72,C
2,149.0,12.6,5,3,74,PS
3,313.0,11.5,5,4,62,S
4,197.0,14.3,5,5,56,S
5,197.0,14.9,5,6,66,C
6,299.0,8.6,5,7,65,PS
7,99.0,13.8,5,8,59,C
8,19.0,20.1,5,9,61,PS
9,194.0,8.6,5,10,69,S


### 4.2 Data Transformation
* If data is CONTINUOUS = Standard Scaler, MinMaxScaler, Robust Scaler
* If data is DISCRETE   = Label Encoder, One hot Encoder

In [60]:
weather_data_2010_2 = weather_data_2010.copy()
weather_data_2010_2

Unnamed: 0,Solar.R,Wind,Month,Day,Temp,Weather
0,190.0,7.4,5,1,67,S
1,118.0,8.0,5,2,72,C
2,149.0,12.6,5,3,74,PS
3,313.0,11.5,5,4,62,S
4,197.0,14.3,5,5,56,S
...,...,...,...,...,...,...
153,190.0,7.4,5,1,67,C
154,193.0,6.9,9,26,70,PS
155,145.0,13.2,9,27,77,S
156,191.0,14.3,9,28,75,S


#### Label Encoding Technique

It can be performed by using a **library support - sklearn** or **creating a user-defined function**.

In [65]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
weather_data_2010_2['Encoded_Weather'] = le.fit_transform(weather_data_2010_2['Weather'])
weather_data_2010_2.head(20)

Unnamed: 0,Solar.R,Wind,Month,Day,Temp,Weather,Encoded_Weather
0,190.0,7.4,5,1,67,S,2
1,118.0,8.0,5,2,72,C,0
2,149.0,12.6,5,3,74,PS,1
3,313.0,11.5,5,4,62,S,2
4,197.0,14.3,5,5,56,S,2
5,197.0,14.9,5,6,66,C,0
6,299.0,8.6,5,7,65,PS,1
7,99.0,13.8,5,8,59,C,0
8,19.0,20.1,5,9,61,PS,1
9,194.0,8.6,5,10,69,S,2


In [66]:
del weather_data_2010_2['Weather']

In [67]:
weather_data_2010_2

Unnamed: 0,Solar.R,Wind,Month,Day,Temp,Encoded_Weather
0,190.0,7.4,5,1,67,2
1,118.0,8.0,5,2,72,0
2,149.0,12.6,5,3,74,1
3,313.0,11.5,5,4,62,2
4,197.0,14.3,5,5,56,2
...,...,...,...,...,...,...
153,190.0,7.4,5,1,67,0
154,193.0,6.9,9,26,70,1
155,145.0,13.2,9,27,77,2
156,191.0,14.3,9,28,75,2


In [68]:
weather_data_2010_2.dtypes

Solar.R            float64
Wind               float64
Month                int32
Day                  int64
Temp                 int64
Encoded_Weather      int32
dtype: object

#### OneHotEncoding Technique

Can be achieved using any of these 2 libraries:

**1. sklearn - OneHotEncoder**

**2. Pandas  - get_dummies()**

### Pandas - OHE

In [71]:
weather_data_2010_3 = weather_data_2010.copy()
weather_data_2010_3

Unnamed: 0,Solar.R,Wind,Month,Day,Temp,Weather
0,190.0,7.4,5,1,67,S
1,118.0,8.0,5,2,72,C
2,149.0,12.6,5,3,74,PS
3,313.0,11.5,5,4,62,S
4,197.0,14.3,5,5,56,S
...,...,...,...,...,...,...
153,190.0,7.4,5,1,67,C
154,193.0,6.9,9,26,70,PS
155,145.0,13.2,9,27,77,S
156,191.0,14.3,9,28,75,S


In [74]:
weather_data_2010_3 = pd.get_dummies(data = weather_data_2010_3,columns=['Weather'])
weather_data_2010_3

Unnamed: 0,Solar.R,Wind,Month,Day,Temp,Weather_C,Weather_PS,Weather_S
0,190.0,7.4,5,1,67,0,0,1
1,118.0,8.0,5,2,72,1,0,0
2,149.0,12.6,5,3,74,0,1,0
3,313.0,11.5,5,4,62,0,0,1
4,197.0,14.3,5,5,56,0,0,1
...,...,...,...,...,...,...,...,...
153,190.0,7.4,5,1,67,1,0,0
154,193.0,6.9,9,26,70,0,1,0
155,145.0,13.2,9,27,77,0,0,1
156,191.0,14.3,9,28,75,0,0,1


### Using sklearn's OHE

In [78]:
weather_data_2010_4 = weather_data_2010.copy()
weather_data_2010_4

Unnamed: 0,Solar.R,Wind,Month,Day,Temp,Weather
0,190.0,7.4,5,1,67,S
1,118.0,8.0,5,2,72,C
2,149.0,12.6,5,3,74,PS
3,313.0,11.5,5,4,62,S
4,197.0,14.3,5,5,56,S
...,...,...,...,...,...,...
153,190.0,7.4,5,1,67,C
154,193.0,6.9,9,26,70,PS
155,145.0,13.2,9,27,77,S
156,191.0,14.3,9,28,75,S


In [79]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
weather_data_2010_4['Encoded_Weather'] = ohe.fit_transform(weather_data_2010_4['Weather'])
weather_data_2010_4.head(20)

ValueError: Expected 2D array, got 1D array instead:
array=['S' 'C' 'PS' 'S' 'S' 'C' 'PS' 'C' 'PS' 'S' 'C' 'PS' 'S' 'S' 'C' 'S' 'S'
 'C' 'PS' 'S' 'S' 'C' 'PS' 'C' 'PS' 'C' 'PS' 'S' 'S' 'C' 'PS' 'S' 'C' 'PS'
 'C' 'PS' 'C' 'PS' 'S' 'S' 'S' 'C' 'PS' 'S' 'S' 'C' 'PS' 'C' 'PS' 'S' 'S'
 'S' 'C' 'PS' 'S' 'C' 'PS' 'C' 'PS' 'S' 'S' 'S' 'C' 'PS' 'S' 'S' 'C' 'C'
 'PS' 'C' 'PS' 'S' 'S' 'S' 'C' 'PS' 'S' 'S' 'C' 'PS' 'C' 'S' 'S' 'C' 'PS'
 'PS' 'C' 'S' 'S' 'C' 'PS' 'C' 'PS' 'S' 'PS' 'S' 'S' 'C' 'C' 'PS' 'C' 'PS'
 'S' 'S' 'S' 'C' 'C' 'C' 'PS' 'C' 'PS' 'S' 'S' 'C' 'PS' 'C' 'PS' 'S' 'S'
 'S' 'S' 'S' 'C' 'C' 'PS' 'C' 'PS' 'S' 'S' 'S' 'C' 'PS' 'C' 'PS' 'S' 'S'
 'PS' 'PS' 'S' 'PS' 'S' 'C' 'PS' 'PS' 'S' 'S' 'C' 'PS' 'C' 'PS' 'S' 'PS'
 'S' 'C' 'PS' 'S' 'S' 'C'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.