In [1]:
import pandas as pd
df = pd.read_csv("Crash_Reporting_-_Drivers_Data.csv")

  df = pd.read_csv("Crash_Reporting_-_Drivers_Data.csv")


In [2]:
# Removing unknown records from Driver at Fault
df = df[df["Driver At Fault"]!="Unknown"]

In [3]:
df["Surface Condition"].fillna("Unknown", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Surface Condition"].fillna("Unknown", inplace=True)


In [4]:
#converting everything into lower case 
df=df.applymap(lambda x: str(x).lower() if isinstance(x,str) else x)

  df=df.applymap(lambda x: str(x).lower() if isinstance(x,str) else x)


In [5]:
# converting to datetime object from string
import datetime
df["Crash Date/Time"] = df["Crash Date/Time"].apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%Y %I:%M:%S %p"))

In [6]:
df = df.sort_values(by='Crash Date/Time')

In [7]:
# remove null values from this column 156 na found
df = df[df["Vehicle First Impact Location"].notna()]

# combining categories
df["Vehicle First Impact Location"] = df["Vehicle First Impact Location"].str.replace("roof top", "top", regex=True)
df["Vehicle First Impact Location"] = df["Vehicle First Impact Location"].str.replace(" ", "", regex=True)

df["Vehicle Movement"] = df["Vehicle Movement"].str.replace("making u-turn", "making u turn", regex=True)
df["Vehicle Movement"] = df["Vehicle Movement"].str.replace("parking", "parked", regex=True)

df["Driver Distracted By"] = df["Driver Distracted By"].str.replace("other distraction", "other action (looking away from task, etc.)", regex=True)
df["Driver Distracted By"] = df["Driver Distracted By"].str.replace("other cellular phone related", "talking or listening to cellular phone", regex=True)
df["Driver Distracted By"] = df["Driver Distracted By"].str.replace("adjusting audio and or climate controls", "using other device controls integral to vehicle", regex=True)

df["Route Type"] = df["Route Type"].str.replace('-', 'unknown', regex=True)
df["Off-Road Description"] = df["Off-Road Description"].str.replace('-', 'unknown', regex=True)

In [8]:
# drop irrelavat or redundant columns

df.drop(columns=['Circumstance', 'Driverless Vehicle', 'Vehicle Year', 'Vehicle Make', 'Vehicle Model','Location','Related Non-Motorist', 'Non-Motorist Substance Abuse','Report Number', 'Local Case Number'], inplace=True)

In [9]:
df.columns

Index(['Agency Name', 'ACRS Report Type', 'Crash Date/Time', 'Route Type',
       'Road Name', 'Cross-Street Name', 'Off-Road Description',
       'Municipality', 'Collision Type', 'Weather', 'Surface Condition',
       'Light', 'Traffic Control', 'Driver Substance Abuse', 'Person ID',
       'Driver At Fault', 'Injury Severity', 'Driver Distracted By',
       'Drivers License State', 'Vehicle ID', 'Vehicle Damage Extent',
       'Vehicle First Impact Location', 'Vehicle Body Type',
       'Vehicle Movement', 'Vehicle Going Dir', 'Speed Limit',
       'Parked Vehicle', 'Latitude', 'Longitude'],
      dtype='object')

In [10]:
df["Driver At Fault"].value_counts()

Driver At Fault
yes    96882
no     84854
Name: count, dtype: int64

In [11]:
df["Crash Date/Time"][0]

Timestamp('2021-05-27 19:40:00')

In [12]:
# Function to determine the time period
def get_time_period(time):
    if time.hour >= 20 or time.hour < 6:  # 8 PM to 5:59 AM
        return 'Night'
    elif 17 <= time.hour < 20:  # 5 PM to 7:59 PM
        return 'Evening'
    elif 12 <= time.hour < 17:  # 12 PM to 4:59 PM
        return 'Afternoon'
    else:  # 6 AM to 11:59 AM
        return 'Day'

# Apply the function to the 'Timestamp' column
df['Time_Period'] = df['Crash Date/Time'].apply(get_time_period)



In [13]:
crosstab_5 = pd.crosstab(df["Time_Period"], df["Driver At Fault"])
crosstab_5["Y_Percentage"] = crosstab_5["yes"]/(crosstab_5["yes"]+crosstab_5["no"])
crosstab_5.sort_values(by="yes", ascending=False)

Driver At Fault,no,yes,Y_Percentage
Time_Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afternoon,29033,31423,0.519766
Day,24306,27215,0.528231
Night,13789,19893,0.590612
Evening,17726,18351,0.508662


In [14]:
df['Hour_Class'] = df['Crash Date/Time'].dt.hour

In [15]:
crosstab_5 = pd.crosstab(df["Hour_Class"], df["Driver At Fault"])
crosstab_5["Y_Percentage"] = crosstab_5["yes"]/(crosstab_5["yes"]+crosstab_5["no"])
crosstab_5.sort_values(by="yes", ascending=False)

Driver At Fault,no,yes,Y_Percentage
Hour_Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17,7254,7341,0.50298
16,7065,7207,0.504975
15,6831,7163,0.511862
18,6194,6367,0.506886
14,5363,5948,0.52586
8,5341,5751,0.518482
13,4970,5566,0.528284
12,4804,5539,0.535531
9,4671,5184,0.526027
11,4161,4817,0.536534


In [16]:
df["Traffic Control"].value_counts()

Traffic Control
no controls                                                                 73817
traffic signal                                                              56939
stop sign                                                                   12976
traffic control signal                                                       4688
flashing traffic signal                                                      2100
other                                                                        1992
yield sign                                                                   1838
flashing traffic control signal                                               289
person                                                                        275
unknown                                                                       216
lane use control signal                                                       123
pedestrian crossing sign                                                       88


In [17]:
crosstab_5 = pd.crosstab(df["Traffic Control"], df["Driver At Fault"])
crosstab_5["Y_Percentage"] = crosstab_5["yes"]/(crosstab_5["yes"]+crosstab_5["no"])
crosstab_5.sort_values(by="yes", ascending=False)

Driver At Fault,no,yes,Y_Percentage
Traffic Control,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no controls,32459,41358,0.560277
traffic signal,29016,27923,0.490402
stop sign,6179,6797,0.523813
traffic control signal,3292,1396,0.297782
flashing traffic signal,971,1129,0.537619
other,875,1117,0.560743
yield sign,892,946,0.51469
person,121,154,0.56
unknown,79,137,0.634259
warning sign,45,107,0.703947


In [18]:
df.columns

Index(['Agency Name', 'ACRS Report Type', 'Crash Date/Time', 'Route Type',
       'Road Name', 'Cross-Street Name', 'Off-Road Description',
       'Municipality', 'Collision Type', 'Weather', 'Surface Condition',
       'Light', 'Traffic Control', 'Driver Substance Abuse', 'Person ID',
       'Driver At Fault', 'Injury Severity', 'Driver Distracted By',
       'Drivers License State', 'Vehicle ID', 'Vehicle Damage Extent',
       'Vehicle First Impact Location', 'Vehicle Body Type',
       'Vehicle Movement', 'Vehicle Going Dir', 'Speed Limit',
       'Parked Vehicle', 'Latitude', 'Longitude', 'Time_Period', 'Hour_Class'],
      dtype='object')

In [19]:
df["Driver Substance Abuse"].value_counts()

Driver Substance Abuse
none detected                                          119228
not suspect of alcohol use, not suspect of drug use     12422
unknown                                                 11527
alcohol present                                          4066
unknown, unknown                                         1513
alcohol contributed                                      1432
suspect of alcohol use, not suspect of drug use           369
illegal drug present                                      258
medication present                                        116
illegal drug contributed                                  102
combined substance present                                 90
medication contributed                                     63
other                                                      59
combination contributed                                    47
suspect of alcohol use, unknown                            44
unknown, not suspect of drug use               

In [20]:
df["Driver Substance Abuse"] = df["Driver Substance Abuse"].str.replace("not suspect of alcohol use, not suspect of drug use", "none detected", regex=True)
df["Driver Substance Abuse"] = df["Driver Substance Abuse"].str.replace("unknown, unknown", "unknown", regex=True)
df["Driver Substance Abuse"] = df["Driver Substance Abuse"].str.replace("not suspect of alcohol use, suspect of drug use", "suspect of drug use", regex=True)
df["Driver Substance Abuse"] = df["Driver Substance Abuse"].str.replace("not suspect of alcohol use, unknown", "unknown", regex=True)
df["Driver Substance Abuse"] = df["Driver Substance Abuse"].str.replace("suspect of alcohol use, not suspect of drug use", "suspect of alcohol use", regex=True)
df["Driver Substance Abuse"] = df["Driver Substance Abuse"].str.replace("alcohol contributed", "alcohol present", regex=True)
df["Driver Substance Abuse"] = df["Driver Substance Abuse"].str.replace("illegal drug contributed", "illegal drug present", regex=True)
df["Driver Substance Abuse"] = df["Driver Substance Abuse"].str.replace("combination contributed", "combined substance present", regex=True)
df["Driver Substance Abuse"] = df["Driver Substance Abuse"].str.replace("suspect of alcohol use, unknown", "suspect of alcohol use", regex=True)
df["Driver Substance Abuse"] = df["Driver Substance Abuse"].str.replace("suspect of alcohol use, suspect of drug use", "combined substance present", regex=True)
df["Driver Substance Abuse"] = df["Driver Substance Abuse"].str.replace("not suspect of alcohol use, unknown", "unknown", regex=True)
df["Driver Substance Abuse"] = df["Driver Substance Abuse"].str.replace("unknown, suspect of drug use", "suspect of drug use", regex=True)
df["Driver Substance Abuse"] = df["Driver Substance Abuse"].str.replace("other", "unknown", regex=True)
df["Driver Substance Abuse"] = df["Driver Substance Abuse"].str.replace("unknown, not suspect of drug use", "none detected", regex=True)
df["Driver Substance Abuse"] = df["Driver Substance Abuse"].str.replace("medication contributed", "medication present", regex=True)



In [21]:
df["Driver Substance Abuse"].value_counts()

Driver Substance Abuse
none detected                 131682
unknown                        13124
alcohol present                 5498
suspect of alcohol use           413
illegal drug present             360
medication present               179
combined substance present       163
suspect of drug use               20
Name: count, dtype: int64

In [22]:
crosstab_5 = pd.crosstab(df["Driver Substance Abuse"], df["Driver At Fault"])
crosstab_5["Y_Percentage"] = crosstab_5["yes"]/(crosstab_5["yes"]+crosstab_5["no"])
crosstab_5.sort_values(by="yes", ascending=False)

Driver At Fault,no,yes,Y_Percentage
Driver Substance Abuse,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
none detected,67870,63812,0.484592
unknown,1921,11203,0.853627
alcohol present,113,5385,0.979447
illegal drug present,12,348,0.966667
suspect of alcohol use,140,273,0.661017
medication present,18,161,0.899441
combined substance present,9,154,0.944785
suspect of drug use,6,14,0.7


In [23]:
df.columns

Index(['Agency Name', 'ACRS Report Type', 'Crash Date/Time', 'Route Type',
       'Road Name', 'Cross-Street Name', 'Off-Road Description',
       'Municipality', 'Collision Type', 'Weather', 'Surface Condition',
       'Light', 'Traffic Control', 'Driver Substance Abuse', 'Person ID',
       'Driver At Fault', 'Injury Severity', 'Driver Distracted By',
       'Drivers License State', 'Vehicle ID', 'Vehicle Damage Extent',
       'Vehicle First Impact Location', 'Vehicle Body Type',
       'Vehicle Movement', 'Vehicle Going Dir', 'Speed Limit',
       'Parked Vehicle', 'Latitude', 'Longitude', 'Time_Period', 'Hour_Class'],
      dtype='object')

In [24]:
df.drop(columns=['Agency Name', 'ACRS Report Type', 'Crash Date/Time', 'Route Type',
       'Road Name', 'Cross-Street Name', 'Off-Road Description',
       'Municipality','Weather', 'Surface Condition',
       'Light', 'Traffic Control','Person ID','Drivers License State', 'Vehicle ID', 'Vehicle Damage Extent','Vehicle Body Type','Vehicle Going Dir', 'Speed Limit',
       'Parked Vehicle', 'Latitude', 'Longitude', 'Time_Period', 'Hour_Class'], inplace=True)

In [25]:
df.columns

Index(['Collision Type', 'Driver Substance Abuse', 'Driver At Fault',
       'Injury Severity', 'Driver Distracted By',
       'Vehicle First Impact Location', 'Vehicle Movement'],
      dtype='object')

In [26]:
df["Vehicle Movement"].isna().sum()

1389

In [27]:
df.dropna(subset=["Vehicle Movement"],inplace=True)

In [28]:
list(df["Vehicle Movement"].unique())

['stopped in traffic lane',
 'slowing or stopping',
 'making left turn',
 'moving constant speed',
 'starting from parked',
 'parked',
 'making right turn',
 'accelerating',
 'unknown',
 'starting from lane',
 'backing',
 'changing lanes',
 'entering traffic lane',
 'skidding',
 'other',
 'making u turn',
 'negotiating a curve',
 'passing',
 'leaving traffic lane',
 'right turn on red',
 'driverless moving veh.',
 'overtaking/passing',
 'turning left',
 'stopped in traffic']

In [29]:
df.columns

Index(['Collision Type', 'Driver Substance Abuse', 'Driver At Fault',
       'Injury Severity', 'Driver Distracted By',
       'Vehicle First Impact Location', 'Vehicle Movement'],
      dtype='object')

In [30]:
df['Driver Substance Abuse'].isna().sum()
df['Driver Substance Abuse'].fillna("unknown", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Driver Substance Abuse'].fillna("unknown", inplace=True)


In [31]:
df['Driver Substance Abuse'].isna().sum()

0

In [32]:
df.to_csv("EDA_3.csv")

In [158]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.naive_bayes import CategoricalNB

X  = np.array(df[['Collision Type', 'Driver Substance Abuse', 
       'Injury Severity', 'Driver Distracted By',
       'Vehicle First Impact Location', 'Vehicle Movement']]
)

Y = np.array(df['Driver At Fault'])

#ColumnTransformer takes 3 values 
"""1. name 
2. Transform or function to implement
3. where to implement that function"""

preprocessor = ColumnTransformer([('Collision Type', OneHotEncoder(), [0]),
                        ('Driver Substance Abuse', OneHotEncoder(), [1]),
                        ('Injury Severity', OneHotEncoder(), [2]),
                        ('Driver Distracted By', OneHotEncoder(), [3]),
                        ('Vehicle First Impact Location', OneHotEncoder(), [4]),
                        ('Vehicle Movement', OneHotEncoder(), [5])])



In [169]:
X[0]

array(['same dir rear end', 'unknown', 'no apparent injury',
       'not distracted', 'sixoclock', 'stopped in traffic lane'],
      dtype=object)

In [168]:
OneHotEncoder(X[0][0])

TypeError: OneHotEncoder.__init__() takes 1 positional argument but 2 were given

In [159]:
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', CategoricalNB())
])

In [160]:
encoder = OneHotEncoder()
Y = Y.reshape(-1, 1)
Y = encoder.fit_transform(Y).toarray()

In [170]:
X = X.toarray()

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

In [161]:
Y

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]])

In [164]:
X

array([['same dir rear end', 'unknown', 'no apparent injury',
        'not distracted', 'sixoclock', 'stopped in traffic lane'],
       ['same dir rear end', 'alcohol present', 'no apparent injury',
        'other action (looking away from task, etc.)', 'twelveoclock',
        'slowing or stopping'],
       ['same dir rear end', 'none detected', 'no apparent injury',
        'not distracted', 'sixoclock', 'making left turn'],
       ...,
       ['single vehicle', 'unknown', 'possible injury', 'unknown',
        'underside', 'moving constant speed'],
       ['rear to side', 'none detected', 'no apparent injury',
        'not distracted', 'elevenoclock', 'moving constant speed'],
       ['rear to side', 'none detected', 'no apparent injury',
        'not distracted', 'oneoclock', 'accelerating']], dtype=object)

In [162]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42)

In [163]:
pipeline.fit(X_train, y_train)

TypeError: Sparse data was passed for X, but dense data is required. Use '.toarray()' to convert to a dense numpy array.