## Feature Engineering
The preliminary goal is to:
- consider each feature one by one
- identify features to keep or drop
- relate missing values to feature (why)

In [2]:
import pandas as pd
import numpy as np

In [4]:
vehicles = pd.read_csv("vehicles.sample.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


- [`Vehicles`](https://data.cityofchicago.org/Transportation/Traffic-Crashes-Vehicles/68nd-jvt3)

> *\"Each mode of transportation involved in a crash is a **“unit”** and get one entry here. Each vehicle, each pedestrian, each motorcyclist, and each bicyclist **is considered an independent unit** that can have a trajectory separate from the other units.\"*

> *\"Vehicle information can be linked back to Crash data using the **CRASH_RECORD_ID** field.\"* 

> *\"Data for occupants who do not make up an independent unit, typically **drivers and passengers**, are available in the **People** table.\"*

> *\"Since this dataset is a combination of vehicles, pedestrians, and pedal cyclists not all columns are applicable to each record. Look at the **Unit Type** field to determine what additional data may be available for that record.\"*

In [5]:
vehicles.head()

Unnamed: 0,CRASH_UNIT_ID,CRASH_RECORD_ID,RD_NO,CRASH_DATE,UNIT_NO,UNIT_TYPE,NUM_PASSENGERS,VEHICLE_ID,CMRC_VEH_I,MAKE,...,TRAILER1_LENGTH,TRAILER2_LENGTH,TOTAL_VEHICLE_LENGTH,AXLE_CNT,VEHICLE_CONFIG,CARGO_BODY_TYPE,LOAD_TYPE,HAZMAT_OUT_OF_SERVICE_I,MCS_OUT_OF_SERVICE_I,HAZMAT_CLASS
0,228894,2524b06fc6634bd349fc1d7f132c8362529473768f6ac2...,JA453603,10/01/2017 02:20:00 AM,1,DRIVER,,224185.0,,DODGE,...,,,,,,,,,,
1,228895,2524b06fc6634bd349fc1d7f132c8362529473768f6ac2...,JA453603,10/01/2017 02:20:00 AM,2,PARKED,,224187.0,,GENERAL MOTORS CORP.,...,,,,,,,,,,
2,228896,2524b06fc6634bd349fc1d7f132c8362529473768f6ac2...,JA453603,10/01/2017 02:20:00 AM,3,PARKED,,224191.0,,SATURN,...,,,,,,,,,,
3,228905,f7210aec9ac43a7b36c30b49033adebcb3060b96580341...,JA453640,10/01/2017 02:41:00 AM,1,DRIVER,,224199.0,,FORD,...,,,,,,,,,,
4,228906,f7210aec9ac43a7b36c30b49033adebcb3060b96580341...,JA453640,10/01/2017 02:41:00 AM,2,DRIVERLESS,,224203.0,,CHEVROLET,...,,,,,,,,,,


In [6]:
vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75673 entries, 0 to 75672
Data columns (total 72 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CRASH_UNIT_ID             75673 non-null  int64  
 1   CRASH_RECORD_ID           75673 non-null  object 
 2   RD_NO                     67544 non-null  object 
 3   CRASH_DATE                75673 non-null  object 
 4   UNIT_NO                   75673 non-null  int64  
 5   UNIT_TYPE                 75561 non-null  object 
 6   NUM_PASSENGERS            11633 non-null  float64
 7   VEHICLE_ID                73804 non-null  float64
 8   CMRC_VEH_I                1514 non-null   object 
 9   MAKE                      73804 non-null  object 
 10  MODEL                     73787 non-null  object 
 11  LIC_PLATE_STATE           67639 non-null  object 
 12  VEHICLE_YEAR              61945 non-null  float64
 13  VEHICLE_DEFECT            73804 non-null  object 
 14  VEHICL

In [7]:
print('Column names are not specificied as',
      len(vehicles.columns[vehicles.isnull().any()]), 'out of',
      len(vehicles.columns), 'columns have missing values.')
print('')
print('\033[1m', sum(vehicles.isnull().mean().round(2) > 0.5),
      'of them have missing values more than 50%!', '\033[0m')
#print(vehicles.columns[vehicles.isnull().mean().round(2) > 0.5].tolist())

Column names are not specificied as 68 out of 72 columns have missing values.

[1m 54 of them have missing values more than 50%! [0m


In [8]:
path_vehicles = 'vehicles.sample.csv'

categoricals_crashes = dict.fromkeys(
    [5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 35],
    'category')
categoricals_vehicles = dict.fromkeys([5, 9, 10, 11, 13, 14, 15, 16, 17, 38],
                                      'category')
categoricals_people = dict.fromkeys(
    [7, 8, 10, 12, 13, 14, 15, 17, 21, 22, 23, 27], 'category')

vehicles = pd.read_csv(path_vehicles,
                       parse_dates=["CRASH_DATE"],
                       low_memory=False,
                       dtype=categoricals_vehicles)

In [10]:
for col in vehicles.columns:
    if vehicles[col].dtype.name == 'category':
        print('\033[1m', '\nColumn Name:', col, '\033[0m')
        print(vehicles[col].value_counts())

[1m 
Column Name: UNIT_TYPE [0m
DRIVER                 63122
PARKED                  9995
PEDESTRIAN              1025
DRIVERLESS               719
BICYCLE                  593
NON-MOTOR VEHICLE         79
NON-CONTACT VEHICLE       16
DISABLED VEHICLE           9
EQUESTRIAN                 3
Name: UNIT_TYPE, dtype: int64
[1m 
Column Name: MAKE [0m
CHEVROLET                             8580
FORD                                  7442
UNKNOWN                               7083
TOYOTA MOTOR COMPANY, LTD.            6401
NISSAN                                6313
                                      ... 
KWICK KIT CEMENT MIXER                   1
KUT-KWICK CORP. BRUNSWICK, GEORGIA       1
KON KWEST MANUFACTURING                  1
KING TRAILER CO., INC.                   1
"(ASSEMBLED FROM PARTS, HOMEMADE)"       1
Name: MAKE, Length: 326, dtype: int64
[1m 
Column Name: MODEL [0m
UNKNOWN                                       12919
OTHER (EXPLAIN IN NARRATIVE)                   2772
C

In [9]:
vehicles['MODEL'].describe()

count       73787
unique       1256
top       UNKNOWN
freq        12919
Name: MODEL, dtype: object

In [11]:
print(vehicles['MODEL'].value_counts())

UNKNOWN                                       12919
OTHER (EXPLAIN IN NARRATIVE)                   2772
CAMRY                                          2349
COROLLA                                        1492
CIVIC                                          1368
                                              ...  
OUTBACK SPORT                                     1
OTTAWA TRUCK DIV., GULF & WESTERN MFG. CO.        1
OTTAWA TRUCK DIV., (GULF WESTERN MFG. CO.)        1
Neon                                              1
"(ASSEMBLED FROM PARTS, HOMEMADE)"                1
Name: MODEL, Length: 1256, dtype: int64


In [12]:
print(vehicles['LIC_PLATE_STATE'].value_counts())

IL    62106
IN     1349
XX     1341
WI      409
MI      284
TX      209
FL      203
OH      172
AZ      166
MO      123
IA      115
MN      114
GA      107
TN      102
CA       92
OK       63
PA       58
NC       49
KY       49
NY       47
CO       46
MS       43
VA       37
LA       31
NJ       30
NB       29
MD       27
KS       27
AR       23
WA       21
MA       21
AL       20
SC       19
ME       13
OR       12
CT       10
UT        9
SD        8
MT        7
ND        7
NV        7
ID        6
WV        6
WY        4
NH        4
VT        4
AK        4
HI        2
NM        2
RI        2
Name: LIC_PLATE_STATE, dtype: int64


#Grouping of Levels

In [32]:
#Group License Plates either into "IL" or "OTHER"

def LIC_PLATE_STATE(x):
    if ("IL" in x):
            return "IL"
    else:
        return "OTHER"   
vehicles["LIC_PLATE_NEW"] = vehicles["LIC_PLATE_STATE"].apply(lambda x: LIC_PLATE_STATE(x))
print(vehicles['LIC_PLATE_NEW'].value_counts())

IL       62106
OTHER     5533
Name: LIC_PLATE_NEW, dtype: int64


In [33]:
print(vehicles['VEHICLE_TYPE'].value_counts())

PASSENGER                                 46978
SPORT UTILITY VEHICLE (SUV)                9460
UNKNOWN/NA                                 6715
VAN/MINI-VAN                               3676
PICKUP                                     2263
TRUCK - SINGLE UNIT                        1498
BUS OVER 15 PASS.                           884
OTHER                                       814
TRACTOR W/ SEMI-TRAILER                     774
MOTORCYCLE (OVER 150CC)                     185
BUS UP TO 15 PASS.                          146
SINGLE UNIT TRUCK WITH TRAILER              113
TRACTOR W/O SEMI-TRAILER                    110
OTHER VEHICLE WITH TRAILER                  106
MOTOR DRIVEN CYCLE                           24
AUTOCYCLE                                    24
MOPED OR MOTORIZED BICYCLE                   13
ALL-TERRAIN VEHICLE (ATV)                     9
FARM EQUIPMENT                                7
3-WHEELED MOTORCYCLE (2 REAR WHEELS)          3
RECREATIONAL OFF-HIGHWAY VEHICLE (ROV)  

In [42]:
#Group License Plates either into "PASSENGER" or "OTHER"

def VEHICLE_TYPE(x):
    if ("PASSENGER" in x):
        if ("SPORT UTILITY VEHICLE (SUV)" in x):
            if ("VAN/MINI-VAN" in x):
                if ("PICKUP" in x):
                    return "PASSENGER"
    else:
        return "OTHER"   
vehicles["VEHICLE_TYPE_NEW"] = vehicles["VEHICLE_TYPE"].apply(lambda x: VEHICLE_TYPE(x))

print(vehicles['VEHICLE_TYPE_NEW'].value_counts())

OTHER    26826
Name: VEHICLE_TYPE_NEW, dtype: int64


In [46]:
#Group Vehicle Defect into "Yes" or "No"

def VEHICLE_DEFECT(x):
    if ("UNKNOWN" in x):
        if ("NONE" in x):
            return "No"
    else:
        return "Yes"   
vehicles["VEHICLE_DEFECT_NEW"] = vehicles["VEHICLE_DEFECT"].apply(lambda x: VEHICLE_DEFECT(x))

print(vehicles['VEHICLE_DEFECT_NEW'].value_counts())

Yes    43021
Name: VEHICLE_DEFECT_NEW, dtype: int64


In [51]:
#Group Vehicle Defect into "New" or "Old"

def VEHICLE_YEAR(x):
    if x>2010:
        return "New"
    else:
        return "Old"   
vehicles["VEHICLE_YEAR_NEW"] = vehicles["VEHICLE_YEAR"].apply(lambda x: VEHICLE_YEAR(x))

print(vehicles['VEHICLE_YEAR_NEW'].value_counts())

Old    40539
New    35134
Name: VEHICLE_YEAR_NEW, dtype: int64


In [76]:
print(vehicles['VEHICLE_CONFIG'].value_counts())

TRACTOR/SEMI-TRAILER                   289
SINGLE UNIT TRUCK, 2 AXLES, 6 TIRES    129
BUS                                    116
TRUCK/TRACTOR                           57
TRUCK/TRAILER                           40
UNKNOWN HEAVY TRUCK                     35
SINGLE UNIT TRUCK, 3 OR MORE AXLES      34
TRACTOR/DOUBLES                          4
Name: VEHICLE_CONFIG, dtype: int64


In [75]:
#Remove MAKE and MODEL Features

#vehicles.drop('MAKE', inplace=True, axis=1)
#vehicles.drop('MODEL', inplace=True, axis=1)

#Columns to remove
#MAKE
#MODEL
#TOWED_BY
#TOWED_TO
#all AREAS
#FIRST_CONTACT_POINT
#CMV_ID (commercial motor vehicle ID)
#USDOT_NO (US Dept Transport Number)
#CCMC_NO
#ILCC_NO (IL commerce commisision)
#GVWR (gross vehicle weight rating)
#CARRIER_NAME (towing company)
#CARRIER_STATE
#CARRIER_CITY
#HAZMAT_PLACARDS_I
#HAZMAT_NAME
#UN_NO
#HAZMAT_PRESENT_I
#HAZMAT_REPORT_I
#HAZMAT_REPORT_NO
#MCS_REPORT_I (motor carrier safety)
#MSC_REPORT_NO
#HAZMAT_VIO_CAUSE_CRASH_I
#MCS_VIO_CAUSE_CRASH_I
#IDOT_PERMIT_NO (IL Dept Transport Permit Number)
#WIDE_LOAD_I
#AXLE_CNT
#TOTAL_VEHICLE_LENGTH


KeyError: "['MAKE'] not found in axis"

In [None]:
#FEATURES TO COMBINE/ENGINEER


In [4]:
crashes = pd.read_csv("crashes.sample2020.csv")

In [5]:
crashes.head()

Unnamed: 0,CRASH_RECORD_ID,RD_NO,CRASH_DATE_EST_I,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,...,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION
0,0211e1f766f3940dfa87375661d25b716655e908c320cc...,JC301403,,06/11/2019 08:40:00 AM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,...,0.0,0.0,3.0,0.0,8,3,6,41.794779,-87.623828,POINT (-87.623828038036 41.794778764028)
1,02e2ed3606a50dda185f5e97c57a45552087d6fbea1c4b...,JB256393,,05/09/2018 11:30:00 AM,25,NO CONTROLS,NO CONTROLS,RAIN,DAYLIGHT,ANGLE,...,0.0,0.0,2.0,0.0,11,4,5,41.72129,-87.62851,POINT (-87.628509593966 41.72128957001)
2,03def753c76d0105940f82e9eaac6f1d87683b7a574c20...,JB246843,,05/02/2018 12:50:00 PM,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,OTHER OBJECT,...,1.0,0.0,0.0,0.0,12,4,5,41.809781,-87.594213,POINT (-87.594212812011 41.809781151018)
3,14fc616db83000e28d672601062a56d76583c0889a764b...,JB372787,,07/31/2018 11:25:00 AM,35,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,...,0.0,0.0,2.0,0.0,11,3,7,41.741804,-87.740954,POINT (-87.740953581987 41.741803598989)
4,1a9af7862e8471daf453354e6aae4561d0ff4f2f1bfb30...,JB490362,,10/25/2018 10:50:00 PM,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",FIXED OBJECT,...,0.0,0.0,1.0,0.0,22,5,10,41.946644,-87.686894,POINT (-87.686894106014 41.946643528978)


In [8]:
crashes.shape

(33408, 49)

In [6]:
print(crashes['HIT_AND_RUN_I'].value_counts())

Y    9272
N     422
Name: HIT_AND_RUN_I, dtype: int64
