### Import of all libraries

In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from itertools import combinations
import datetime
import squarify

### Read the CSV file and show basic info about the data

In [2]:
# Show every column when a frame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Load the raw per-vehicle collision records.
# VEHICLE_MODEL is read as text rather than left to dtype inference
# (presumably because the column mixes numeric-looking and free-text values -- TODO confirm).
df = pd.read_csv(
    'Motor_Vehicle_Collisions_Vehicles.csv',
    dtype={'VEHICLE_MODEL': 'str'},
)

In [3]:
# Size of the freshly loaded frame as (rows, columns).
df.shape

(4205372, 25)

In [4]:
# Column dtypes of the raw frame; show_counts=True asks for the non-null count
# of every column to be printed as well.
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4205372 entries, 0 to 4205371
Data columns (total 25 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   UNIQUE_ID                    4205372 non-null  int64  
 1   COLLISION_ID                 4205372 non-null  int64  
 2   CRASH_DATE                   4205372 non-null  object 
 3   CRASH_TIME                   4205372 non-null  object 
 4   VEHICLE_ID                   4205372 non-null  object 
 5   STATE_REGISTRATION           3896465 non-null  object 
 6   VEHICLE_TYPE                 3965836 non-null  object 
 7   VEHICLE_MAKE                 2319792 non-null  object 
 8   VEHICLE_MODEL                51380 non-null    object 
 9   VEHICLE_YEAR                 2299693 non-null  float64
 10  TRAVEL_DIRECTION             2535949 non-null  object 
 11  VEHICLE_OCCUPANTS            2420009 non-null  float64
 12  DRIVER_SEX                   1977187 non-n

# Data Cleaning Phase

### Drop useless columns

In [5]:
# Columns that are removed from the frame before any cleaning is done.
columns_to_drop = [
    'TRAVEL_DIRECTION',
    'DRIVER_LICENSE_JURISDICTION',
    'PUBLIC_PROPERTY_DAMAGE_TYPE',
    'VEHICLE_MAKE',
    'VEHICLE_MODEL',
    'VEHICLE_YEAR',
    'VEHICLE_ID',
]

# In-place removal: `df` keeps its identity but loses the listed columns.
df.drop(labels=columns_to_drop, axis='columns', inplace=True)

In [6]:
# Preview the first rows of the frame after the column drop.
df.head()

Unnamed: 0,UNIQUE_ID,COLLISION_ID,CRASH_DATE,CRASH_TIME,STATE_REGISTRATION,VEHICLE_TYPE,VEHICLE_OCCUPANTS,DRIVER_SEX,DRIVER_LICENSE_STATUS,PRE_CRASH,POINT_OF_IMPACT,VEHICLE_DAMAGE,VEHICLE_DAMAGE_1,VEHICLE_DAMAGE_2,VEHICLE_DAMAGE_3,PUBLIC_PROPERTY_DAMAGE,CONTRIBUTING_FACTOR_1,CONTRIBUTING_FACTOR_2
0,10385780,100201,09/07/2012,9:03,NY,PASSENGER VEHICLE,,,,,,,,,,,Unspecified,
1,19140702,4213082,09/23/2019,8:15,NY,Station Wagon/Sport Utility Vehicle,1.0,M,Licensed,Going Straight Ahead,Left Front Bumper,Left Front Quarter Panel,,,,N,Driver Inattention/Distraction,Unspecified
2,14887647,3307608,10/02/2015,17:18,NY,TAXI,,,,Going Straight Ahead,,,,,,,Driver Inattention/Distraction,
3,14889754,3308693,10/04/2015,20:34,NY,PASSENGER VEHICLE,,,,Parked,,,,,,,Unspecified,
4,14400270,297666,04/25/2013,21:15,NY,PASSENGER VEHICLE,,,,,,,,,,,Other Vehicular,


### Handle missing values and normalize category codes

In [7]:
# Placeholder written wherever a value is missing or carries no information.
u = "Unspecified"

# Parse with explicit formats instead of letting pandas guess per value:
# CRASH_DATE is MM/DD/YYYY and CRASH_TIME is H:MM on a 24-hour clock
# (the hour is not zero-padded, which %H accepts). An explicit format is
# considerably faster on millions of rows and fails loudly on a malformed
# value instead of silently mis-parsing it.
df['CRASH_DATE'] = pd.to_datetime(df['CRASH_DATE'], format='%m/%d/%Y')
df['CRASH_TIME'] = pd.to_datetime(df['CRASH_TIME'], format='%H:%M').dt.time

df['STATE_REGISTRATION'] = df['STATE_REGISTRATION'].fillna(u)
# Upper-case so that spelling variants differing only in case collapse together.
df['VEHICLE_TYPE'] = df['VEHICLE_TYPE'].fillna(u).str.upper()

df['VEHICLE_OCCUPANTS'] = (
    df['VEHICLE_OCCUPANTS']
    .fillna(0)
    .astype(int)
    .clip(lower=0, upper=100)  # Values 0 and 100 are considered to be Unspecified
)

# Single-letter driver sex codes -> readable labels.
sex_mapping = {
    'F': 'Female',
    'M': 'Male',
    'U': u,
}
df['DRIVER_SEX'] = df['DRIVER_SEX'].fillna(u).replace(sex_mapping)
df['DRIVER_LICENSE_STATUS'] = df['DRIVER_LICENSE_STATUS'].fillna(u)

# Shared normalisation for the remaining categorical columns: uninformative
# codes collapse into the placeholder and Y/N flags are spelled out.
mapping = {
    'Other*': u,
    'Other': u,
    'N': 'No',
    'Y': 'Yes',
    '1': u,
    '80': u,
}

coded_columns = [
    'PRE_CRASH',
    'POINT_OF_IMPACT',
    'VEHICLE_DAMAGE',
    'VEHICLE_DAMAGE_1',
    'VEHICLE_DAMAGE_2',
    'VEHICLE_DAMAGE_3',
    'PUBLIC_PROPERTY_DAMAGE',
    'CONTRIBUTING_FACTOR_1',
    'CONTRIBUTING_FACTOR_2',
]
for column in coded_columns:
    # Fill first so that missing values also end up as the placeholder.
    df[column] = df[column].fillna(u).replace(mapping)

In [8]:
# CRASH_DATE must be datetime64 for the .dt accessors below; the conversion
# is a no-op when the column has already been parsed.
df['CRASH_DATE'] = pd.to_datetime(df['CRASH_DATE'])
df['day'] = df['CRASH_DATE'].dt.day
df['month'] = df['CRASH_DATE'].dt.month
df['year'] = df['CRASH_DATE'].dt.year

# Keep crashes from 2013 through 2023 only (2012 and earlier, and 2024
# onwards, are excluded), ordered chronologically with a fresh 0..n-1 index.
# The filter, sort and re-index are chained without inplace=True: calling
# inplace methods on a boolean-filtered slice triggers SettingWithCopyWarning.
in_study_period = (df['year'] > 2012) & (df['year'] < 2024)
df = (
    df[in_study_period]
    .sort_values(by='CRASH_DATE')
    .reset_index(drop=True)
)


In [9]:
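# df['COLLISION_ID'].nunique()  # number of distinct collisions; the column is still named COLLISION_ID here (it is renamed to CollisionID further down)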

In [10]:
# Preview the first rows after cleaning, year filtering and sorting by CRASH_DATE.
df.head()

Unnamed: 0,UNIQUE_ID,COLLISION_ID,CRASH_DATE,CRASH_TIME,STATE_REGISTRATION,VEHICLE_TYPE,VEHICLE_OCCUPANTS,DRIVER_SEX,DRIVER_LICENSE_STATUS,PRE_CRASH,POINT_OF_IMPACT,VEHICLE_DAMAGE,VEHICLE_DAMAGE_1,VEHICLE_DAMAGE_2,VEHICLE_DAMAGE_3,PUBLIC_PROPERTY_DAMAGE,CONTRIBUTING_FACTOR_1,CONTRIBUTING_FACTOR_2,day,month,year
0,10223579,92832,2013-01-01,07:25:00,NY,PASSENGER VEHICLE,0,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,1,1,2013
1,11805157,171291,2013-01-01,12:51:00,NY,PASSENGER VEHICLE,0,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,1,1,2013
2,11400882,148896,2013-01-01,17:04:00,NJ,UNKNOWN,0,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,1,1,2013
3,14201289,286687,2013-01-01,18:13:00,NY,PASSENGER VEHICLE,0,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,1,1,2013
4,13083694,234642,2013-01-01,18:15:00,NY,SPORT UTILITY / STATION WAGON,0,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,1,1,2013


In [11]:
# Row count and column dtypes after cleaning and filtering.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3927379 entries, 0 to 3927378
Data columns (total 21 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   UNIQUE_ID               int64         
 1   COLLISION_ID            int64         
 2   CRASH_DATE              datetime64[ns]
 3   CRASH_TIME              object        
 4   STATE_REGISTRATION      object        
 5   VEHICLE_TYPE            object        
 6   VEHICLE_OCCUPANTS       int32         
 7   DRIVER_SEX              object        
 8   DRIVER_LICENSE_STATUS   object        
 9   PRE_CRASH               object        
 10  POINT_OF_IMPACT         object        
 11  VEHICLE_DAMAGE          object        
 12  VEHICLE_DAMAGE_1        object        
 13  VEHICLE_DAMAGE_2        object        
 14  VEHICLE_DAMAGE_3        object        
 15  PUBLIC_PROPERTY_DAMAGE  object        
 16  CONTRIBUTING_FACTOR_1   object        
 17  CONTRIBUTING_FACTOR_2   object        
 18  da

In [12]:
# Number of missing values per column.
df.isna().sum()

UNIQUE_ID                 0
COLLISION_ID              0
CRASH_DATE                0
CRASH_TIME                0
STATE_REGISTRATION        0
VEHICLE_TYPE              0
VEHICLE_OCCUPANTS         0
DRIVER_SEX                0
DRIVER_LICENSE_STATUS     0
PRE_CRASH                 0
POINT_OF_IMPACT           0
VEHICLE_DAMAGE            0
VEHICLE_DAMAGE_1          0
VEHICLE_DAMAGE_2          0
VEHICLE_DAMAGE_3          0
PUBLIC_PROPERTY_DAMAGE    0
CONTRIBUTING_FACTOR_1     0
CONTRIBUTING_FACTOR_2     0
day                       0
month                     0
year                      0
dtype: int64

### Categorize vehicle types

In [13]:
# Exact-match aliases for VEHICLE_TYPE: raw (upper-cased) spelling -> category.
# Values that are not listed here are left untouched by a lookup in this table.
# Every key appears exactly once; a repeated key in a dict literal is silently
# overwritten by its last occurrence, which hides copy-paste mistakes.
vehicle_type_mapping = {
    # Ambulance, including common misspellings and truncations
    'AMBULANCE': 'AMBULANCE',
    'AMBUL': 'AMBULANCE',
    'AMBU': 'AMBULANCE',
    'AMBULENCE': 'AMBULANCE',
    'AMBULETTE': 'AMBULANCE',
    'AMBULACE': 'AMBULANCE',
    'AMBULANE': 'AMBULANCE',
    'AMBALANCE': 'AMBULANCE',
    'AMBUKANCE': 'AMBULANCE',
    'AMBULANVE': 'AMBULANCE',
    'ALMBULANCE': 'AMBULANCE',
    # Labels too generic to be used as a category
    'PASSENGER VEHICLE': 'UNSPECIFIED',
    'OTHER': 'UNSPECIFIED',
    'UNKNOWN': 'UNSPECIFIED',
    # Sedans
    '4 DR SEDAN': 'SEDAN',
    '2 DR SEDAN': 'SEDAN',
    '4SEDN': 'SEDAN',
    # SUVs / station wagons
    'CAR/SUV': 'SUV',
    'STATION WAGON/SPORT UTILITY VEHICLE': 'SUV',
    'SPORT UTILITY / STATION WAGON': 'SUV',
    # Bicycles, e-bikes and e-scooters
    'PEDAL BIKE': 'BICYCLE',
    'E-BIKE': 'BICYCLE',
    'E BIKE W P': 'BICYCLE',
    'E BIKE': 'BICYCLE',
    'E BIKE UNI': 'BICYCLE',
    'EBIKE': 'BICYCLE',
    'E-SCOOTER': 'BICYCLE',
    'E SCOOTER': 'BICYCLE',
    'ESCOOTER S': 'BICYCLE',
    'BIKE': 'BICYCLE',
    # Motorcycles, motor scooters and other scooters
    'MINIBIKE': 'MOTORCYCLE',
    'DIRT BIKE': 'MOTORCYCLE',
    'MOTORBIKE': 'MOTORCYCLE',
    'MOTORSCOOTER': 'MOTORCYCLE',
    'MOTOR SCOO': 'MOTORCYCLE',
    'E MOTORCYC': 'MOTORCYCLE',
    'MOTORSCOOT': 'MOTORCYCLE',
    'ELE MOTORC': 'MOTORCYCLE',
    'MOTOR': 'MOTORCYCLE',
    'SCOOTER': 'MOTORCYCLE',
    '50CC SCOOT': 'MOTORCYCLE',
    'GAS SCOOTE': 'MOTORCYCLE',
    'SCOOTER GA': 'MOTORCYCLE',
    'KICK SCOOT': 'MOTORCYCLE',
    'PUSH SCOOT': 'MOTORCYCLE',
    'SCOOT': 'MOTORCYCLE',
    # Taxis
    'YELLOW TAX': 'TAXI',
    'YELLOW CAB': 'TAXI',
    # Trucks and commercial vehicles
    'CHASSIS CAB': 'TRUCK',
    'LARGE COM VEH(6 OR MORE TIRES)': 'TRUCK',
    'SMALL COM VEH(4 TIRES)': 'TRUCK',
    'FLAT BED': 'TRUCK',
    'FLAT RACK': 'TRUCK',
    'DUMP': 'TRUCK',
    # Pickups
    'PK': 'PICKUP',
}

In [14]:
string = 'VEHICLE_TYPE'

# (substring, category) rules, applied in this order to the current value.
# Earlier rules win: a value containing both 'PICK' and 'TRU' becomes 'PICKUP'.
substring_rules = [
    ('PICK', 'PICKUP'),
    ('TRU', 'TRUCK'),
    ('FIRE', 'TRUCK'),
    ('GARBA', 'TRUCK'),
    ('BUS', 'BUS'),
    ('VAN', 'VAN'),
]

vehicle_types = df[string]

# Exact aliases are resolved BEFORE the substring rules. Otherwise the 'VAN'
# rule swallows the misspelling 'AMBULANVE' and its alias entry (-> 'AMBULANCE')
# can never match. No other alias key contains one of the rule substrings, so
# every other value is categorised the same whichever step runs first.
vehicle_types = vehicle_types.map(vehicle_type_mapping).fillna(vehicle_types)

# Vectorised substring matching instead of one Python-level lambda per row.
for token, category in substring_rules:
    hits = vehicle_types.str.upper().str.contains(token, regex=False, na=False)
    vehicle_types = vehicle_types.mask(hits, category)

vehicle_types = vehicle_types.str.title()

# Keep only the 12 most frequent categories; everything else is 'Unspecified'.
allowed_values = list(vehicle_types.value_counts().head(12).index)
df[string] = vehicle_types.where(vehicle_types.isin(allowed_values), 'Unspecified')

In [15]:
# Re-check row count and dtypes after the VEHICLE_TYPE values were consolidated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3927379 entries, 0 to 3927378
Data columns (total 21 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   UNIQUE_ID               int64         
 1   COLLISION_ID            int64         
 2   CRASH_DATE              datetime64[ns]
 3   CRASH_TIME              object        
 4   STATE_REGISTRATION      object        
 5   VEHICLE_TYPE            object        
 6   VEHICLE_OCCUPANTS       int32         
 7   DRIVER_SEX              object        
 8   DRIVER_LICENSE_STATUS   object        
 9   PRE_CRASH               object        
 10  POINT_OF_IMPACT         object        
 11  VEHICLE_DAMAGE          object        
 12  VEHICLE_DAMAGE_1        object        
 13  VEHICLE_DAMAGE_2        object        
 14  VEHICLE_DAMAGE_3        object        
 15  PUBLIC_PROPERTY_DAMAGE  object        
 16  CONTRIBUTING_FACTOR_1   object        
 17  CONTRIBUTING_FACTOR_2   object        
 18  da

### Rename columns for readability

In [16]:
# Original UPPER_SNAKE_CASE column name -> CamelCase name used from here on.
# Labels that are absent from the frame are skipped by rename(); that applies
# to VEHICLE_ID, which was removed together with the other dropped columns.
column_renames = {
    'UNIQUE_ID': 'UniqueID',
    'COLLISION_ID': 'CollisionID',
    'CRASH_DATE': 'CrashDate',
    'CRASH_TIME': 'CrashTime',
    'VEHICLE_ID': 'VehicleID',
    'STATE_REGISTRATION': 'StateRegistration',
    'VEHICLE_TYPE': 'VehicleType',
    'VEHICLE_OCCUPANTS': 'VehicleOccupants',
    'DRIVER_SEX': 'DriverSex',
    'DRIVER_LICENSE_STATUS': 'DriverLicenseStatus',
    'PRE_CRASH': 'PreCrashAction',
    'POINT_OF_IMPACT': 'PointOfImpact',
    'VEHICLE_DAMAGE': 'VehicleDamage',
    'VEHICLE_DAMAGE_1': 'VehicleDamage1',
    'VEHICLE_DAMAGE_2': 'VehicleDamage2',
    'VEHICLE_DAMAGE_3': 'VehicleDamage3',
    'PUBLIC_PROPERTY_DAMAGE': 'PublicPropertyDamage',
    'CONTRIBUTING_FACTOR_1': 'ContributingFactor1',
    'CONTRIBUTING_FACTOR_2': 'ContributingFactor2',
}

# In-place rename; the derived day / month / year columns keep their names.
df.rename(columns=column_renames, inplace=True)

In [17]:
# Preview the first rows of the frame under the new column names.
df.head()

Unnamed: 0,UniqueID,CollisionID,CrashDate,CrashTime,StateRegistration,VehicleType,VehicleOccupants,DriverSex,DriverLicenseStatus,PreCrashAction,PointOfImpact,VehicleDamage,VehicleDamage1,VehicleDamage2,VehicleDamage3,PublicPropertyDamage,ContributingFactor1,ContributingFactor2,day,month,year
0,10223579,92832,2013-01-01,07:25:00,NY,Unspecified,0,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,1,1,2013
1,11805157,171291,2013-01-01,12:51:00,NY,Unspecified,0,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,1,1,2013
2,11400882,148896,2013-01-01,17:04:00,NJ,Unspecified,0,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,1,1,2013
3,14201289,286687,2013-01-01,18:13:00,NY,Unspecified,0,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,1,1,2013
4,13083694,234642,2013-01-01,18:15:00,NY,Suv,0,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,1,1,2013


## Save to file

In [18]:
#df.to_csv('Vehicles_FINAL.csv', sep=',', index=False, encoding='utf-8')