# One-Hot Encoding

## Why Bother with One-Hot Encoding?

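One-hot encoding is useful for feeding categorical data into machine-learning algorithms: most algorithms expect numeric input, and integers are computationally less expensive than strings. Giving each category its own 0/1 column also avoids implying an order among the categories, which a single integer code per category would do.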

In [2]:
# Import pandas and print its version so the environment behind the outputs is recorded.
import pandas as pd
print(pd.__version__)

2.0.3


In [3]:
# Load only the columns used in this notebook from the shuttle disengagement workbook.
# NOTE(review): parse_dates=True is kept exactly as in the original call, but the
# dtypes reported right after loading list 'Incident Datetime' as object, so the
# column is converted explicitly with pd.to_datetime in a later cell.
disengagements = pd.read_excel(
    "../data/cassi-autonomous-shuttle/autonomous_shuttle_disengagement.xlsx",
    usecols=[
        "Incident Datetime",
        "Location",
        "Weather",
        "Vehicle Speed in Miles per Hour",
        "Initiated by",
        "Cause",
    ],
    parse_dates=True,
)
disengagements

Unnamed: 0,Incident Datetime,Location,Weather,Vehicle Speed in Miles per Hour,Initiated by,Cause
0,2023-03-07T10:00:00-05:00,"35.7849964, -78.8268094",Sunny;,2,Operator,Fault Code/Error Code
1,2023-03-07T14:00:00-05:00,"35.7847312, -78.8245051",Sunny;,5,Operator,Station Blocked
2,2023-03-07T14:30:00-05:00,"35.7824658, -78.8244159",Sunny;,5,Operator,Station Blocked
3,2023-03-07T15:15:00-05:00,"35.7824658, -78.8244159",Sunny;,4,Operator,Station Blocked
4,2023-03-08T10:00:00-05:00,"35.7852558, -78.8273737",Sunny;,2,Operator,Shuttle Manually Deviated from Approved Path
...,...,...,...,...,...,...
174,2023-06-01T16:00:00-04:00,"35.783456, -78.821639",Sunny;,5,Operator,Signal Loss
175,2023-06-02T10:32:00-04:00,"35.7819145, -78.8235603",Sunny;,4,Operator,Station Blocked
176,2023-06-02T10:35:00-04:00,"35.7813188, -78.8256601",Sunny;,3,Operator,Station Blocked
177,2023-06-02T10:44:00-04:00,"35.7847325, -78.824496",Sunny;,4,Operator,Obstacle Detection


In [4]:
# Inspect the column dtypes as loaded, before any explicit conversion.
disengagements.dtypes

Incident Datetime                  object
Location                           object
Weather                            object
Vehicle Speed in Miles per Hour     int64
Initiated by                       object
Cause                              object
dtype: object

In [5]:
# Parse the timestamp strings into timezone-aware datetimes expressed in UTC.
disengagements['Incident Datetime'] = pd.to_datetime(
    disengagements['Incident Datetime'],
    utc=True,
)

# Store the two label columns as pandas categoricals.
for categorical_column in ('Initiated by', 'Cause'):
    disengagements[categorical_column] = disengagements[categorical_column].astype('category')

# Confirm the new dtypes.
disengagements.dtypes

Incident Datetime                  datetime64[ns, UTC]
Location                                        object
Weather                                         object
Vehicle Speed in Miles per Hour                  int64
Initiated by                                  category
Cause                                         category
dtype: object

In [6]:
# Offset subtracted from the ISO week number to obtain the pilot week
# (ISO week 10 maps to pilot week 1 in the displayed result).
ISO_WEEK_TO_PILOT_WEEK_OFFSET = 9

disengagements = disengagements.assign(
    # ISO calendar week of each incident, taken from the UTC timestamp column.
    week_of_year=disengagements['Incident Datetime'].dt.isocalendar().week,
    # assign() evaluates keyword arguments in order and passes the frame being
    # built to callables, so the week_of_year column created just above can be
    # read from the argument instead of recomputing it from the outer variable.
    week_of_pilot=lambda frame: frame['week_of_year'] - ISO_WEEK_TO_PILOT_WEEK_OFFSET,
)
disengagements

Unnamed: 0,Incident Datetime,Location,Weather,Vehicle Speed in Miles per Hour,Initiated by,Cause,week_of_year,week_of_pilot
0,2023-03-07 15:00:00+00:00,"35.7849964, -78.8268094",Sunny;,2,Operator,Fault Code/Error Code,10,1
1,2023-03-07 19:00:00+00:00,"35.7847312, -78.8245051",Sunny;,5,Operator,Station Blocked,10,1
2,2023-03-07 19:30:00+00:00,"35.7824658, -78.8244159",Sunny;,5,Operator,Station Blocked,10,1
3,2023-03-07 20:15:00+00:00,"35.7824658, -78.8244159",Sunny;,4,Operator,Station Blocked,10,1
4,2023-03-08 15:00:00+00:00,"35.7852558, -78.8273737",Sunny;,2,Operator,Shuttle Manually Deviated from Approved Path,10,1
...,...,...,...,...,...,...,...,...
174,2023-06-01 20:00:00+00:00,"35.783456, -78.821639",Sunny;,5,Operator,Signal Loss,22,13
175,2023-06-02 14:32:00+00:00,"35.7819145, -78.8235603",Sunny;,4,Operator,Station Blocked,22,13
176,2023-06-02 14:35:00+00:00,"35.7813188, -78.8256601",Sunny;,3,Operator,Station Blocked,22,13
177,2023-06-02 14:44:00+00:00,"35.7847325, -78.824496",Sunny;,4,Operator,Obstacle Detection,22,13


In [7]:
# Show the 'Cause' column; after the earlier astype('category') it displays as a categorical Series.
disengagements['Cause']

0                             Fault Code/Error Code
1                                   Station Blocked
2                                   Station Blocked
3                                   Station Blocked
4      Shuttle Manually Deviated from Approved Path
                           ...                     
174                                     Signal Loss
175                                 Station Blocked
176                                 Station Blocked
177                              Obstacle Detection
178                         Signalized Intersection
Name: Cause, Length: 179, dtype: category
Categories (9, object): ['Fault Code/Error Code', 'Obstacle Detection', 'Other Road Users', 'Priority Zone', ..., 'Signal Loss', 'Signalized Intersection', 'Station Blocked', 'Vegetation']

In [8]:
# List the distinct category labels of 'Cause' through the .cat accessor.
disengagements['Cause'].cat.categories

Index(['Fault Code/Error Code', 'Obstacle Detection', 'Other Road Users',
       'Priority Zone', 'Shuttle Manually Deviated from Approved Path',
       'Signal Loss', 'Signalized Intersection', 'Station Blocked',
       'Vegetation'],
      dtype='object')

In [9]:
# Build a separate frame whose row index is the incident timestamp;
# set_index returns a new frame, so `disengagements` keeps its integer index.
disengagements_datetime_is_index = disengagements.set_index(
    keys='Incident Datetime',
)
disengagements_datetime_is_index

Unnamed: 0_level_0,Location,Weather,Vehicle Speed in Miles per Hour,Initiated by,Cause,week_of_year,week_of_pilot
Incident Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-03-07 15:00:00+00:00,"35.7849964, -78.8268094",Sunny;,2,Operator,Fault Code/Error Code,10,1
2023-03-07 19:00:00+00:00,"35.7847312, -78.8245051",Sunny;,5,Operator,Station Blocked,10,1
2023-03-07 19:30:00+00:00,"35.7824658, -78.8244159",Sunny;,5,Operator,Station Blocked,10,1
2023-03-07 20:15:00+00:00,"35.7824658, -78.8244159",Sunny;,4,Operator,Station Blocked,10,1
2023-03-08 15:00:00+00:00,"35.7852558, -78.8273737",Sunny;,2,Operator,Shuttle Manually Deviated from Approved Path,10,1
...,...,...,...,...,...,...,...
2023-06-01 20:00:00+00:00,"35.783456, -78.821639",Sunny;,5,Operator,Signal Loss,22,13
2023-06-02 14:32:00+00:00,"35.7819145, -78.8235603",Sunny;,4,Operator,Station Blocked,22,13
2023-06-02 14:35:00+00:00,"35.7813188, -78.8256601",Sunny;,3,Operator,Station Blocked,22,13
2023-06-02 14:44:00+00:00,"35.7847325, -78.824496",Sunny;,4,Operator,Obstacle Detection,22,13


In [10]:
# Re-express the UTC timestamps in the index as US/Eastern local time.
eastern_index = disengagements_datetime_is_index.index.tz_convert(tz='US/Eastern')
disengagements_datetime_is_index.index = eastern_index
disengagements_datetime_is_index

Unnamed: 0_level_0,Location,Weather,Vehicle Speed in Miles per Hour,Initiated by,Cause,week_of_year,week_of_pilot
Incident Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-03-07 10:00:00-05:00,"35.7849964, -78.8268094",Sunny;,2,Operator,Fault Code/Error Code,10,1
2023-03-07 14:00:00-05:00,"35.7847312, -78.8245051",Sunny;,5,Operator,Station Blocked,10,1
2023-03-07 14:30:00-05:00,"35.7824658, -78.8244159",Sunny;,5,Operator,Station Blocked,10,1
2023-03-07 15:15:00-05:00,"35.7824658, -78.8244159",Sunny;,4,Operator,Station Blocked,10,1
2023-03-08 10:00:00-05:00,"35.7852558, -78.8273737",Sunny;,2,Operator,Shuttle Manually Deviated from Approved Path,10,1
...,...,...,...,...,...,...,...
2023-06-01 16:00:00-04:00,"35.783456, -78.821639",Sunny;,5,Operator,Signal Loss,22,13
2023-06-02 10:32:00-04:00,"35.7819145, -78.8235603",Sunny;,4,Operator,Station Blocked,22,13
2023-06-02 10:35:00-04:00,"35.7813188, -78.8256601",Sunny;,3,Operator,Station Blocked,22,13
2023-06-02 10:44:00-04:00,"35.7847325, -78.824496",Sunny;,4,Operator,Obstacle Detection,22,13


In [11]:
# Check dtypes of the datetime-indexed frame (the timestamp is now the index, so it is not listed as a column).
disengagements_datetime_is_index.dtypes

Location                             object
Weather                              object
Vehicle Speed in Miles per Hour       int64
Initiated by                       category
Cause                              category
week_of_year                         UInt32
week_of_pilot                        UInt32
dtype: object

In [12]:
# 'Weather' holds ';'-delimited labels (e.g. "Sunny;"), so split on ';' and
# emit one 0/1 indicator column per distinct label.
weather_labels = disengagements_datetime_is_index['Weather']
one_hot = weather_labels.str.get_dummies(sep=';')
one_hot

Unnamed: 0_level_0,Cloudy,Partly Cloudy,Rain,Sunny,Windy
Incident Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-03-07 10:00:00-05:00,0,0,0,1,0
2023-03-07 14:00:00-05:00,0,0,0,1,0
2023-03-07 14:30:00-05:00,0,0,0,1,0
2023-03-07 15:15:00-05:00,0,0,0,1,0
2023-03-08 10:00:00-05:00,0,0,0,1,0
...,...,...,...,...,...
2023-06-01 16:00:00-04:00,0,0,0,1,0
2023-06-02 10:32:00-04:00,0,0,0,1,0
2023-06-02 10:35:00-04:00,0,0,0,1,0
2023-06-02 10:44:00-04:00,0,0,0,1,0


In [13]:
# Prefix every weather indicator column with 'Weather_' so its origin stays
# clear once it is combined with other columns.
# Labels that already carry the prefix are left untouched, so re-running this
# cell does not produce 'Weather_Weather_...' column names.
one_hot = one_hot.rename(
    columns=lambda label: label if label.startswith('Weather_') else 'Weather_' + label
)
one_hot

Unnamed: 0_level_0,Weather_Cloudy,Weather_Partly Cloudy,Weather_Rain,Weather_Sunny,Weather_Windy
Incident Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-03-07 10:00:00-05:00,0,0,0,1,0
2023-03-07 14:00:00-05:00,0,0,0,1,0
2023-03-07 14:30:00-05:00,0,0,0,1,0
2023-03-07 15:15:00-05:00,0,0,0,1,0
2023-03-08 10:00:00-05:00,0,0,0,1,0
...,...,...,...,...,...
2023-06-01 16:00:00-04:00,0,0,0,1,0
2023-06-02 10:32:00-04:00,0,0,0,1,0
2023-06-02 10:35:00-04:00,0,0,0,1,0
2023-06-02 10:44:00-04:00,0,0,0,1,0


In [14]:
# One 0/1 indicator column per 'Cause' label, each prefixed with 'Cause_'.
cause_indicators = disengagements_datetime_is_index['Cause'].str.get_dummies()
one_hot_cause = cause_indicators.add_prefix('Cause_')
one_hot_cause

Unnamed: 0_level_0,Cause_Fault Code/Error Code,Cause_Obstacle Detection,Cause_Other Road Users,Cause_Priority Zone,Cause_Shuttle Manually Deviated from Approved Path,Cause_Signal Loss,Cause_Signalized Intersection,Cause_Station Blocked,Cause_Vegetation
Incident Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-03-07 10:00:00-05:00,1,0,0,0,0,0,0,0,0
2023-03-07 14:00:00-05:00,0,0,0,0,0,0,0,1,0
2023-03-07 14:30:00-05:00,0,0,0,0,0,0,0,1,0
2023-03-07 15:15:00-05:00,0,0,0,0,0,0,0,1,0
2023-03-08 10:00:00-05:00,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
2023-06-01 16:00:00-04:00,0,0,0,0,0,1,0,0,0
2023-06-02 10:32:00-04:00,0,0,0,0,0,0,0,1,0
2023-06-02 10:35:00-04:00,0,0,0,0,0,0,0,1,0
2023-06-02 10:44:00-04:00,0,1,0,0,0,0,0,0,0


In [15]:
# Remove the original text columns from the datetime-indexed frame.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
disengagements_datetime_is_index = disengagements_datetime_is_index.drop(
    columns=['Weather', 'Initiated by', 'Cause'],
    errors='ignore',
)

# Place the remaining columns and both indicator frames side by side;
# all three share the same datetime index.
cassi_data_one_hot_encoded = pd.concat(
    [disengagements_datetime_is_index, one_hot, one_hot_cause],
    axis=1,
)
cassi_data_one_hot_encoded

Unnamed: 0_level_0,Location,Vehicle Speed in Miles per Hour,week_of_year,week_of_pilot,Weather_Cloudy,Weather_Partly Cloudy,Weather_Rain,Weather_Sunny,Weather_Windy,Cause_Fault Code/Error Code,Cause_Obstacle Detection,Cause_Other Road Users,Cause_Priority Zone,Cause_Shuttle Manually Deviated from Approved Path,Cause_Signal Loss,Cause_Signalized Intersection,Cause_Station Blocked,Cause_Vegetation
Incident Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2023-03-07 10:00:00-05:00,"35.7849964, -78.8268094",2,10,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0
2023-03-07 14:00:00-05:00,"35.7847312, -78.8245051",5,10,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2023-03-07 14:30:00-05:00,"35.7824658, -78.8244159",5,10,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2023-03-07 15:15:00-05:00,"35.7824658, -78.8244159",4,10,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2023-03-08 10:00:00-05:00,"35.7852558, -78.8273737",2,10,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-01 16:00:00-04:00,"35.783456, -78.821639",5,22,13,0,0,0,1,0,0,0,0,0,0,1,0,0,0
2023-06-02 10:32:00-04:00,"35.7819145, -78.8235603",4,22,13,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2023-06-02 10:35:00-04:00,"35.7813188, -78.8256601",3,22,13,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2023-06-02 10:44:00-04:00,"35.7847325, -78.824496",4,22,13,0,0,0,1,0,0,1,0,0,0,0,0,0,0


In [16]:
# Convert the timestamp index of the combined frame from US/Eastern back to UTC.
cassi_data_one_hot_encoded.index = cassi_data_one_hot_encoded.index.tz_convert('UTC')

In [17]:
# Display the combined one-hot-encoded frame; its index was converted to UTC in the previous cell.
cassi_data_one_hot_encoded

Unnamed: 0_level_0,Location,Vehicle Speed in Miles per Hour,week_of_year,week_of_pilot,Weather_Cloudy,Weather_Partly Cloudy,Weather_Rain,Weather_Sunny,Weather_Windy,Cause_Fault Code/Error Code,Cause_Obstacle Detection,Cause_Other Road Users,Cause_Priority Zone,Cause_Shuttle Manually Deviated from Approved Path,Cause_Signal Loss,Cause_Signalized Intersection,Cause_Station Blocked,Cause_Vegetation
Incident Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2023-03-07 15:00:00+00:00,"35.7849964, -78.8268094",2,10,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0
2023-03-07 19:00:00+00:00,"35.7847312, -78.8245051",5,10,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2023-03-07 19:30:00+00:00,"35.7824658, -78.8244159",5,10,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2023-03-07 20:15:00+00:00,"35.7824658, -78.8244159",4,10,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2023-03-08 15:00:00+00:00,"35.7852558, -78.8273737",2,10,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-01 20:00:00+00:00,"35.783456, -78.821639",5,22,13,0,0,0,1,0,0,0,0,0,0,1,0,0,0
2023-06-02 14:32:00+00:00,"35.7819145, -78.8235603",4,22,13,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2023-06-02 14:35:00+00:00,"35.7813188, -78.8256601",3,22,13,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2023-06-02 14:44:00+00:00,"35.7847325, -78.824496",4,22,13,0,0,0,1,0,0,1,0,0,0,0,0,0,0
