In [297]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [288]:
df = pd.read_csv("dataset_traffic_accident_prediction1.csv")
df.head()

Unnamed: 0,Weather,Road_Type,Time_of_Day,Traffic_Density,Speed_Limit,Number_of_Vehicles,Driver_Alcohol,Accident_Severity,Road_Condition,Vehicle_Type,Driver_Age,Driver_Experience,Road_Light_Condition,Accident
0,Rainy,City Road,Morning,1.0,100.0,5.0,0.0,,Wet,Car,51.0,48.0,Artificial Light,0.0
1,Clear,Rural Road,Night,,120.0,3.0,0.0,Moderate,Wet,Truck,49.0,43.0,Artificial Light,0.0
2,Rainy,Highway,Evening,1.0,60.0,4.0,0.0,Low,Icy,Car,54.0,52.0,Artificial Light,0.0
3,Clear,City Road,Afternoon,2.0,60.0,3.0,0.0,Low,Under Construction,Bus,34.0,31.0,Daylight,0.0
4,Rainy,Highway,Morning,1.0,195.0,11.0,0.0,Low,Dry,Car,62.0,55.0,Artificial Light,1.0


# Pre-processing the data for the classifier

In [289]:
columns = list(df.columns)
columns_dict = {}
for column in columns:
    columns_dict[column] = df[column].unique()

print(columns_dict)

{'Weather': array(['Rainy', 'Clear', 'Foggy', 'Stormy', nan, 'Snowy'], dtype=object), 'Road_Type': array(['City Road', 'Rural Road', 'Highway', nan, 'Mountain Road'],
      dtype=object), 'Time_of_Day': array(['Morning', 'Night', 'Evening', 'Afternoon', nan], dtype=object), 'Traffic_Density': array([ 1., nan,  2.,  0.]), 'Speed_Limit': array([100., 120.,  60., 195.,  30.,  nan,  50.,  80., 200., 206., 178.,
       208., 213., 190., 196., 188., 194., 189., 193., 185., 199., 192.,
       198., 212., 181.]), 'Number_of_Vehicles': array([ 5.,  3.,  4., 11.,  2.,  1., nan, 12., 13., 14., 10.]), 'Driver_Alcohol': array([ 0.,  1., nan]), 'Accident_Severity': array([nan, 'Moderate', 'Low', 'High'], dtype=object), 'Road_Condition': array(['Wet', 'Icy', 'Under Construction', 'Dry', nan], dtype=object), 'Vehicle_Type': array(['Car', 'Truck', 'Bus', 'Motorcycle', nan], dtype=object), 'Driver_Age': array([51., 49., 54., 34., 62., 27., 29., 38., 50., 33., 47., 25., 67.,
       61., 46., 48., 53., 35

In [290]:
fill_values = {
    'Traffic_Density' : 0.0, 
    'Speed Limit' : df['Speed_Limit'].mean(),
    'Number_of_Vehicles' : df['Number_of_Vehicles'].mode(),
    'Driver_Alcohol' : 0.0,
    'Driver_Age' : df['Driver_Age'].median(),
    'Driver_Experience' : df['Driver_Experience'].median()
}
df.fillna(value=fill_values, inplace=True)

In [291]:
df.dropna(inplace=True)
df.shape

(503, 14)

In [292]:
df.shape

(503, 14)

In [293]:
priority_mapping = {'Low': 0, 'Moderate': 1, 'High': 2}
df['Accident_Severity'] = df['Accident_Severity'].map(priority_mapping)

time_of_day_mapping = {'Morning': 0, 'Night': 3, 'Evening': 2, 'Afternoon': 1}
df['Time_of_Day'] = df['Time_of_Day'].map(time_of_day_mapping)

road_light_mapping = {'Artificial Light': 1, 'Daylight': 0, 'No Light': 2}
df['Road_Light_Condition'] = df['Road_Light_Condition'].map(road_light_mapping)

ohe_road_type = OneHotEncoder()
road_type_encoder = ohe_road_type.fit_transform(df[['Road_Type']])
road_type_encoder_df = pd.DataFrame(road_type_encoder.toarray(), columns=ohe_road_type.get_feature_names_out(['Road_Type']), index=df.index)
df = pd.concat([df.drop("Road_Type", axis=1), road_type_encoder_df], axis=1)

ohe_weather = OneHotEncoder()
weather_encoder = ohe_weather.fit_transform(df[['Weather']])
weather_encoder_df = pd.DataFrame(weather_encoder.toarray(), columns=ohe_weather.get_feature_names_out(['Weather']), index=df.index)
df = pd.concat([df.drop("Weather", axis=1), weather_encoder_df], axis=1)

ohe_road_cond = OneHotEncoder()
road_cond_encoder = ohe_road_cond.fit_transform(df[['Road_Condition']])
road_cond_encoder_df = pd.DataFrame(road_cond_encoder.toarray(), columns=ohe_road_cond.get_feature_names_out(['Road_Condition']), index=df.index)
df = pd.concat([df.drop("Road_Condition", axis=1), road_cond_encoder_df], axis=1)

ohe_vehicle = OneHotEncoder()
vehicle_encoder = ohe_vehicle.fit_transform(df[["Vehicle_Type"]])
vehicle_encoder_df = pd.DataFrame(vehicle_encoder.toarray(), columns=ohe_vehicle.get_feature_names_out(['Vehicle_Type']), index=df.index)
df = pd.concat([df.drop("Vehicle_Type", axis=1), vehicle_encoder_df], axis=1)

df['Speed_Limit'] = pd.qcut(df['Speed_Limit'], q=3, labels=['Low', 'Moderate', 'High'])
df['Speed_Limit'] = df['Speed_Limit'].map(priority_mapping)

df['Driver_Age'] = pd.qcut(df['Driver_Age'], q=3, labels=['Low', 'Moderate', 'High'])
df['Driver_Age'] = df['Driver_Age'].map(priority_mapping)

df['Driver_Experience'] = pd.qcut(df['Driver_Experience'], q=3, labels=['Low', 'Moderate', 'High'])
df['Driver_Experience'] = df['Driver_Experience'].map(priority_mapping)

df['Number_of_Vehicles'] = pd.qcut(df['Number_of_Vehicles'], q=3, labels=['Low', 'Moderate', 'High'])
df['Number_of_Vehicles'] = df['Number_of_Vehicles'].map(priority_mapping)

df.head()

Unnamed: 0,Time_of_Day,Traffic_Density,Speed_Limit,Number_of_Vehicles,Driver_Alcohol,Accident_Severity,Driver_Age,Driver_Experience,Road_Light_Condition,Accident,...,Weather_Snowy,Weather_Stormy,Road_Condition_Dry,Road_Condition_Icy,Road_Condition_Under Construction,Road_Condition_Wet,Vehicle_Type_Bus,Vehicle_Type_Car,Vehicle_Type_Motorcycle,Vehicle_Type_Truck
1,3,0.0,2,1,0.0,1,1,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,2,1.0,0,1,0.0,0,2,2,1,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1,2.0,0,1,0.0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0,1.0,2,2,0.0,0,2,2,1,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,1,0.0,0,1,0.0,0,0,0,0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [294]:
dt = DecisionTreeClassifier(criterion='gini', random_state=42)

## Prepare X and y
X = df.drop(['Accident'], axis=1)
y = df['Accident']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)   # Split the data

print(X.shape)
print(y.shape)

(503, 26)
(503,)


In [295]:
dt.fit(X_train, y_train)

In [298]:
y_pred = dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred)

print(f"Decision Tree Accuracy: {accuracy_dt:.2f}")
print('Classification report for Decision Tree: ')
print(classification_report(y_test, y_pred))

Decision Tree Accuracy: 0.56
Classification report for Decision Tree: 
              precision    recall  f1-score   support

         0.0       0.68      0.60      0.64        65
         1.0       0.41      0.50      0.45        36

    accuracy                           0.56       101
   macro avg       0.55      0.55      0.54       101
weighted avg       0.59      0.56      0.57       101



In [307]:
dt_reg = DecisionTreeClassifier(
    max_depth=5,              # Limit the depth of the tree
    min_samples_split=10,     # Minimum samples required to split a node
    min_samples_leaf=5,       # Minimum samples required at a leaf node
    max_features='sqrt',      
    ccp_alpha=0.01,  
    random_state=42
)

dt_reg.fit(X_train, y_train)

y_pred = dt_reg.predict(X_test)
accuracy_dt_reg = accuracy_score(y_test, y_pred)

print(f"Decision Tree Accuracy: {accuracy_dt_reg:.2f}")
print('Classification report for Decision Tree: ')
print(classification_report(y_test, y_pred))

Decision Tree Accuracy: 0.64
Classification report for Decision Tree: 
              precision    recall  f1-score   support

         0.0       0.64      1.00      0.78        65
         1.0       0.00      0.00      0.00        36

    accuracy                           0.64       101
   macro avg       0.32      0.50      0.39       101
weighted avg       0.41      0.64      0.50       101



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
