## 1 - SetUp Environment

In [1]:
import numpy as np
import pandas as pd
import pickle

from google.colab import drive
# Mount Google Drive at /content/drive; the pickle checkpoints read and written below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


## 2 - Load Dataframe

Now we load the dataframe that we saved at the end of the preprocessing section.

In [2]:
# Checkpoint produced by the preprocessing notebook (categories already mapped to integer codes).
mapped_pickle_path = '/content/drive/MyDrive/Python/Regression/Assets/df(3.mapped).pickle'

# NOTE: unpickling can execute arbitrary code, so only load files you created yourself.
with open(mapped_pickle_path, 'rb') as fh:
    df = pickle.load(fh)

df.head(3)

Unnamed: 0,Year,Month,Week Day,Duration,Cost,Team Member,Height,Frequency,Signal Strength,Antenna Type,Orientation,Power Supply,Zone
0,2019,3,0,241.0,516773.0,12.0,24.0,0,3,0,0,0,0
1,2019,10,2,608.0,954888.0,22.0,42.0,0,3,0,1,1,1
2,2019,6,5,772.0,932640.0,14.0,43.0,4,3,1,2,1,1


## 3 - One-Hot Encoding Categorical Variables

In this section we first list all categorical variables, to get a clear view of the one-hot encoding process:
*   Year
*   Month
*   Week Day
*   Frequency <i>(we don't need to convert it to dummies, because it is ordinal)</i>
*   Signal Strength
*   Antenna Type
*   Orientation
*   Power Supply
*   Zone

For each variable, we follow this process:<br>
1 - convert it to dummy columns<br>
2 - rename the columns<br>
3 - remove the first column (to prevent multicollinearity)

In [3]:
def make_dummy(x, data=None):
    """Return one indicator (dummy) column per distinct value of column ``x``.

    Parameters
    ----------
    x : str
        Name of the categorical column to encode.
    data : pandas.DataFrame, optional
        Frame to read ``x`` from. Defaults to the notebook-level ``df``, so
        existing calls such as ``make_dummy('Year')`` keep working.

    Returns
    -------
    pandas.DataFrame
        One 0/1 column per distinct value, labelled with the raw values.
        No level is dropped here (``drop_first=False``): the calling cells
        rename the columns first and then drop the benchmark level by its
        readable name.
    """
    if data is None:
        data = df  # fall back to the dataframe loaded earlier in the notebook

    # pandas >= 2.0 returns booleans by default; pin the dtype so the dummies
    # are 0/1 integers (uint8, the pre-2.0 default) on every pandas version.
    dummy = pd.get_dummies(data[x], drop_first=False, dtype='uint8')

    return dummy

### 3.1 - Year

In [4]:
# Distinct years present in the data, in order of first appearance (not sorted);
# the dummy columns in the next cell are labelled in ascending year order instead.
df['Year'].unique()

array([2019, 2021, 2018, 2017, 2020])

In [21]:
# step 1: one indicator column per distinct year
Year_dummy = make_dummy('Year')

# step 2: readable labels, assigned by position -- this assumes the dummy
# columns come back in ascending year order
year_labels = ['Year 2017', 'Year 2018', 'Year 2019', 'Year 2020', 'Year 2021']
Year_dummy.columns = year_labels

# step 3: 2017 is the benchmark year, so its column is removed
Year_dummy = Year_dummy.drop(columns='Year 2017')
Year_dummy.head(2)

Unnamed: 0,Year 2018,Year 2019,Year 2020,Year 2021
0,0,1,0,0
1,0,1,0,0


### 3.2 - Month

In [6]:
# Distinct month codes in ascending order; the next cell maps them positionally to month names.
sorted(df['Month'].unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [7]:
#step 1
# One indicator column per month code.
Month_dummy = make_dummy('Month')

#step 2
# Labels are assigned by position, so they must follow the ascending month codes (1-12).
col_name = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
Month_dummy.columns = col_name

#step extra
# Collapse the twelve month indicators into four quarter indicators. A row
# belongs to a quarter when any of that quarter's three month columns is set,
# which is exactly what the row-wise max over those columns yields.
quarter_months = {
    'Quarter One': ['January', 'February', 'March'],
    'Quarter Two': ['April', 'May', 'June'],
    'Quarter Three': ['July', 'August', 'September'],
    'Quarter Four': ['October', 'November', 'December'],
}

Q = pd.DataFrame({
    quarter: Month_dummy[months].max(axis = 1)
    for quarter, months in quarter_months.items()
})

#step 3
# Quarter One is the benchmark level, so its column is removed.
Quarter_dummy = Q.drop(['Quarter One'], axis = 1)
Quarter_dummy.head(2)

Unnamed: 0,Quarter Two,Quarter Three,Quarter Four
0,0,0,0
1,0,0,1


### 3.3 - Week Day

In [8]:
# Distinct week-day codes in ascending order; the next cell maps them positionally to day names.
sorted(df['Week Day'].unique())

[0, 1, 2, 3, 4, 5, 6]

In [9]:
# step 1: one indicator column per week-day code
WeekDay_dummy = make_dummy('Week Day')

# step 2: day names assigned by position -- this assumes the dummy columns
# come back in ascending code order (0 .. 6)
weekday_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
WeekDay_dummy.columns = weekday_labels

# step 3: Monday is the benchmark day, so its column is removed
WeekDay_dummy = WeekDay_dummy.drop(columns='Monday')
WeekDay_dummy.head(2)

Unnamed: 0,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,0,0,0,0,0,0
1,0,1,0,0,0,0


### 3.4 - Signal Strength

In [10]:
# Distinct signal-strength codes in ascending order; the next cell labels one dummy column per code.
sorted(df['Signal Strength'].unique())

[0, 1, 2, 3, 4, 5]

In [11]:
# step 1: one indicator column per signal-strength code
Signal_Strength_dummy = make_dummy('Signal Strength')

# step 2: labels assigned by position -- this assumes the dummy columns come
# back in ascending code order (0 .. 5)
signal_labels = ['Signal 0', 'Signal 1', 'Signal 2', 'Signal 3', 'Signal 4', 'Signal 5']
Signal_Strength_dummy.columns = signal_labels

# step 3: 'Signal 0' is the benchmark level, so its column is removed
Signal_Strength_dummy = Signal_Strength_dummy.drop(columns='Signal 0')
Signal_Strength_dummy.head(2)

Unnamed: 0,Signal 1,Signal 2,Signal 3,Signal 4,Signal 5
0,0,0,1,0,0
1,0,0,1,0,0


### 3.5 - Antenna Type

In [12]:
# Distinct antenna-type codes in ascending order; the next cell maps them positionally to antenna names.
sorted(df['Antenna Type'].unique())

[0, 1, 2, 3, 4, 5]

In [13]:
# step 1: one indicator column per antenna-type code
Antenna_Type_dummy = make_dummy('Antenna Type')

# step 2: antenna names assigned by position -- this assumes the dummy columns
# come back in ascending code order (0 .. 5)
antenna_labels = ['Dielectric.Antenna', 'PCB.Antenna', 'Wire.Antenna', 'Aperture.Antenna', 'Reflector.Antenna', 'Array.Antenna']
Antenna_Type_dummy.columns = antenna_labels

# step 3: the dielectric antenna is the benchmark type, so its column is removed
Antenna_Type_dummy = Antenna_Type_dummy.drop(columns='Dielectric.Antenna')
Antenna_Type_dummy.head(2)

Unnamed: 0,PCB.Antenna,Wire.Antenna,Aperture.Antenna,Reflector.Antenna,Array.Antenna
0,0,0,0,0,0
1,0,0,0,0,0


### 3.6 - Orientation

In [14]:
# Distinct orientation codes in ascending order; the next cell maps them positionally to orientation names.
sorted(df['Orientation'].unique())

[0, 1, 2, 3]

In [15]:
# step 1: one indicator column per orientation code
Orientation_dummy = make_dummy('Orientation')

# step 2: orientation names assigned by position -- this assumes the dummy
# columns come back in ascending code order (0 .. 3)
orientation_labels = ['Omni-directional_Orien', 'Circular_Orien', 'Horizontal_Orien', 'Vertical_Orien']
Orientation_dummy.columns = orientation_labels

# step 3: omni-directional is the benchmark orientation, so its column is removed
Orientation_dummy = Orientation_dummy.drop(columns='Omni-directional_Orien')
Orientation_dummy.head(2)

Unnamed: 0,Circular_Orien,Horizontal_Orien,Vertical_Orien
0,0,0,0
1,1,0,0


### 3.7 - Power Supply

In [16]:
# Distinct power-supply codes in ascending order; the next cell maps them positionally to supply names.
sorted(df['Power Supply'].unique())

[0, 1, 2, 3, 4]

In [17]:
# step 1: one indicator column per power-supply code
Power_Supply_dummy = make_dummy('Power Supply')

# step 2: supply names assigned by position -- this assumes the dummy columns
# come back in ascending code order (0 .. 4)
power_labels = ['Solar-powered', 'Active-powered', 'Battery-powered', 'PoE-powered', 'Passive-powered']
Power_Supply_dummy.columns = power_labels

# step 3: solar power is the benchmark supply, so its column is removed
Power_Supply_dummy = Power_Supply_dummy.drop(columns='Solar-powered')
Power_Supply_dummy.head(2)

Unnamed: 0,Active-powered,Battery-powered,PoE-powered,Passive-powered
0,0,0,0,0
1,1,0,0,0


### 3.8 - Zone

In [18]:
# Distinct zone codes in ascending order; the next cell maps them positionally to zone names.
sorted(df['Zone'].unique())

[0, 1, 2, 3, 4]

In [19]:
# step 1: one indicator column per zone code
Zone_dummy = make_dummy('Zone')

# step 2: zone names assigned by position -- this assumes the dummy columns
# come back in ascending code order (0 .. 4)
zone_labels = ['North_Zone', 'Center_Zone', 'West_Zone', 'South_Zone', 'East_Zone']
Zone_dummy.columns = zone_labels

# step 3: the north zone is the benchmark zone, so its column is removed
Zone_dummy = Zone_dummy.drop(columns='North_Zone')
Zone_dummy.head(2)

Unnamed: 0,Center_Zone,West_Zone,South_Zone,East_Zone
0,0,0,0,0
1,1,0,0,0


## 4 - Summary of Benchmark

Let's review the benchmark of each dummy variable. The benchmarks are the categories that we removed to prevent multicollinearity.

*   <b>Antenna Type</b>: <i> Dielectric (0) </i>
*   <b>Quarter</b>: <i> Quarter One - January (1), February (2), March (3) </i>
*   <b>Orientation</b>: <i> Omni-directional (0) </i>
*   <b>Power Supply</b>: <i> Solar-powered (0)  </i>
*   <b>Signal Strength</b>: <i> 0 </i>
*   <b>Week Day</b>: <i> Monday (0) </i>
*   <b>Year</b>: <i> Year 2017 (2017)  </i>
*   <b>Zone</b>: <i> North (0)  </i>

## 5 - Encoded Dataset

In this section we aggregate all dummy variables and combine them with the numerical variables to create the encoded dataset.

In [24]:
# Columns carried over unchanged: the numerical predictors plus the ordinal
# Frequency code. They are selected by name rather than by position, so the
# selection stays correct if the column order changes and this cell can be
# re-run safely after `df` has been rebound below (positional selection on
# the already-encoded frame would silently pick the wrong columns).
numeric_cols = ['Duration', 'Cost', 'Team Member', 'Height', 'Frequency']
df_without_dummy = df[numeric_cols]

# Put every dummy block side by side; each one was derived from `df`, so they
# all share its row index.
df_dummy = pd.concat([
    Year_dummy, 
    Quarter_dummy, 
    WeekDay_dummy, 
    Signal_Strength_dummy,
    Antenna_Type_dummy,
    Orientation_dummy,
    Power_Supply_dummy,
    Zone_dummy],
    axis = 1)

# NOTE: this rebinds `df` to the encoded frame. The dummy cells above read the
# original categorical columns from `df`, so re-running them after this cell
# requires reloading the mapped dataframe first.
df = pd.concat([df_without_dummy, df_dummy] , axis = 1)

In [25]:
# Show the encoded dataframe (long/wide frames are rendered truncated).
df

Unnamed: 0,Duration,Cost,Team Member,Height,Frequency,Year 2018,Year 2019,Year 2020,Year 2021,Quarter Two,...,Horizontal_Orien,Vertical_Orien,Active-powered,Battery-powered,PoE-powered,Passive-powered,Center_Zone,West_Zone,South_Zone,East_Zone
0,241.0,516773.0,12.0,24.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,608.0,954888.0,22.0,42.0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,772.0,932640.0,14.0,43.0,4,0,1,0,0,1,...,1,0,1,0,0,0,1,0,0,0
3,312.0,680455.0,19.0,23.0,1,0,1,0,0,0,...,0,1,1,0,0,0,0,1,0,0
4,123.0,128590.0,28.0,50.0,3,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,397.0,639447.0,30.0,49.0,6,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
295,1250.0,1611551.0,27.0,44.0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
296,478.0,885945.0,24.0,21.0,3,0,0,0,1,0,...,0,0,0,0,0,1,0,1,0,0
297,502.0,1100133.0,15.0,21.0,3,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0


## Check Point

In [26]:
# Checkpoint: persist the encoded dataframe for the next notebook.
encoded_pickle_path = '/content/drive/MyDrive/Python/Regression/Assets/df(3.encoded).pickle'

with open(encoded_pickle_path, 'wb') as fh:
    pickle.dump(df, fh)