In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sdv.metadata import SingleTableMetadata

In [4]:
data=pd.read_csv("Data Sheet - Sheet1.csv")
data.describe()

Unnamed: 0,Vehicle_Speed,Crash_Time,Age,Number_of_Lanes,Lane_Width,Speed_Limit
count,300.0,300.0,300.0,300.0,300.0,300.0
mean,64.366667,11.69,48.663333,2.0,3.272374,74.746667
std,31.951974,6.740327,18.432104,0.825999,0.143053,26.857903
min,10.0,0.0,18.0,1.0,3.001781,30.0
25%,37.0,5.75,31.0,1.0,3.150568,51.0
50%,60.5,12.0,51.0,2.0,3.28562,75.0
75%,94.0,17.0,65.0,3.0,3.394545,97.25
max,120.0,23.0,80.0,3.0,3.497986,120.0


In [5]:
metadata = SingleTableMetadata()

metadata.add_column(column_name='Vehicle_Speed', sdtype="numerical")
metadata.add_column(column_name='Crash_Time', sdtype="categorical")
metadata.add_column(column_name="Age", sdtype="numerical")
metadata.add_column(column_name="Gender", sdtype="categorical")
metadata.add_column(column_name='Vehicle_Type', sdtype="categorical")
metadata.add_column(column_name="Number_of_Lanes", sdtype="numerical")
metadata.add_column(column_name="Lane_Width", sdtype="numerical", )
metadata.add_column(column_name="Road_Type", sdtype="categorical")
metadata.add_column(column_name="Alcohol_Consumption", sdtype="categorical")
metadata.add_column(column_name="Crash_Type", sdtype="categorical")
metadata.add_column(column_name="Seatbelt_Usage", sdtype="categorical")
metadata.add_column(column_name="Speed_Limit", sdtype="numerical")
metadata.add_column(column_name="Road_Surface_Condition", sdtype="categorical")
metadata.add_column(column_name='Crash_Severity', sdtype="categorical")



In [6]:
from sdv.single_table import CTGANSynthesizer

model = CTGANSynthesizer(metadata)

model.fit(data)

synthetic_data = model.sample(700)


print("\nSynthetic Data Summary:")
print(synthetic_data.describe())

synthetic_data.to_csv("synthetic_data.csv", index=False)

print("\nSynthetic data has been saved to 'synthetic_data.csv'.")
# synthetic_data=pd.read_csv('synthetic_data.csv')

In [7]:
data.columns

Index(['Crash_Severity', 'Vehicle_Speed', 'Crash_Time', 'Age', 'Gender',
       'Vehicle_Type', 'Number_of_Lanes', 'Lane_Width', 'Road_Type',
       'Alcohol_Consumption', 'Crash_Type', 'Seatbelt_Usage', 'Speed_Limit',
       'Road_Surface_Condition'],
      dtype='object')

In [13]:
data = pd.concat([data, synthetic_data])

In [14]:
#feature engineering

data['Speed_Ratio']=data['Vehicle_Speed']/data['Speed_Limit']
data['total_lane']=data['Number_of_Lanes']*data['Lane_Width']

data['Speeding_Indicator'] = data['Vehicle_Speed'] > data['Speed_Limit']
data['Alcohol_Crash'] = data['Alcohol_Consumption'] + '-' + data['Crash_Type']

data['Speed_Category'] = pd.cut(data['Vehicle_Speed'], bins=[0, 30, 70, 100, 120], labels=[0, 1,2, 3])
data['gen']=(data['Gender']=='Female')
data['Relative_Speed'] = data['Vehicle_Speed'] - data['Speed_Limit']
data['Time_of_Day'] = pd.cut(data['Crash_Time'],
                           bins=[-1, 6, 12, 18, 24],
                           labels=[3, 2, 0, 1])
data['AgeCategory'] = pd.cut(data['Age'],
                           bins=[-float('inf'), 12, 19, 48, 64, float('inf')],
                           labels=[10,6,2, 4, 5])

data['Avg_Speed_by_Road_Type'] = data.groupby('Road_Type')['Vehicle_Speed'].transform('mean')
data['Crash_Count_By_Vehicle_Type'] = data.groupby('Vehicle_Type')['Vehicle_Type'].transform('count')
data['Road_Surface_Risk_Score'] = data['Road_Surface_Condition'].replace({'Dry': 1, 'Wet': 2, 'Icy': 3})

data['Vehicle_Type_Frequency'] = data['Vehicle_Type'].map(data['Vehicle_Type'].value_counts())
data['Crash_Type_Target_Encoding'] = data.groupby('Crash_Type')['Crash_Type'].transform('count')

data['Rush_Hour_Indicator'] = data['Crash_Time'].apply(lambda x: 7 <= int(x) <= 9 or 17 <= int(x) <= 19)

data['Road_Safety'] = (data['Seatbelt_Usage'] == 'Yes') & (data['Alcohol_Consumption'] == 'No')
data['Rural_Unsafe_Roads'] = (data['Road_Type'] == 'Rural') & (data['Road_Surface_Condition'] != 'Dry')

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state=42)
data['Driver_Cluster'] = kmeans.fit_predict(data[['Vehicle_Speed', 'Age', 'Speed_Limit']])

data['Crash_Risk_Score'] = (data['Vehicle_Speed'] > 50).astype(int) + \
                         (data['Alcohol_Consumption'] == 'Yes').astype(int) + \
                         (data['Road_Surface_Condition'].map({'Dry': 0, 'Wet': 1, 'Icy': 2}))

data['Driver_Risk_Factor'] = (data['AgeCategory'] == 2).astype(int) + \
                           (data['Gender'] == 'Male').astype(int) + \
                           (data['Seatbelt_Usage'] == 'No').astype(int)




In [15]:
data.columns

Index(['Crash_Severity', 'Vehicle_Speed', 'Crash_Time', 'Age', 'Gender',
       'Vehicle_Type', 'Number_of_Lanes', 'Lane_Width', 'Road_Type',
       'Alcohol_Consumption', 'Crash_Type', 'Seatbelt_Usage', 'Speed_Limit',
       'Road_Surface_Condition', 'Speed_Ratio', 'total_lane',
       'Speeding_Indicator', 'Alcohol_Crash', 'Speed_Category', 'gen',
       'Relative_Speed', 'Time_of_Day', 'AgeCategory',
       'Avg_Speed_by_Road_Type', 'Crash_Count_By_Vehicle_Type',
       'Road_Surface_Risk_Score', 'Vehicle_Type_Frequency',
       'Crash_Type_Target_Encoding', 'Rush_Hour_Indicator', 'Road_Safety',
       'Rural_Unsafe_Roads', 'Driver_Cluster', 'Crash_Risk_Score',
       'Driver_Risk_Factor'],
      dtype='object')

In [16]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

x = data.drop(columns=['Crash_Severity'])
y = data['Crash_Severity']

# Use a Random Forest Classifier
model = RandomForestClassifier(random_state=123)
categorical_to_encode = ['Alcohol_Crash','Speeding_Indicator','Gender','Crash_Type','AgeCategory','Crash_Time', 'Vehicle_Type', 'Road_Type', 'Alcohol_Consumption','Seatbelt_Usage','Road_Surface_Condition']  # Specify columns to encode

# One-Hot Encoding for selected columns
X_encoded = pd.get_dummies(x[categorical_to_encode], drop_first=True)

# Combine with non-encoded features
non_encoded_features = x.drop(columns=categorical_to_encode)
X_transformed = pd.concat([non_encoded_features, X_encoded], axis=1)

# Transform the data
# Recursive Feature Elimination
rfe = RFE(model, n_features_to_select=15)  # Keep top 10 features
rfe.fit(X_transformed, y)

# Selected features
selected_features = X_transformed.columns[rfe.support_]
print("Selected Features:", selected_features)
print(X_transformed.columns)


Selected Features: Index(['Vehicle_Speed', 'Age', 'Lane_Width', 'Speed_Limit', 'Speed_Ratio',
       'total_lane', 'Relative_Speed', 'Time_of_Day',
       'Crash_Count_By_Vehicle_Type', 'Road_Surface_Risk_Score',
       'Vehicle_Type_Frequency', 'Driver_Cluster', 'Crash_Risk_Score',
       'Driver_Risk_Factor', 'Crash_Time'],
      dtype='object')
Index(['Vehicle_Speed', 'Age', 'Number_of_Lanes', 'Lane_Width', 'Speed_Limit',
       'Speed_Ratio', 'total_lane', 'Speed_Category', 'gen', 'Relative_Speed',
       'Time_of_Day', 'Avg_Speed_by_Road_Type', 'Crash_Count_By_Vehicle_Type',
       'Road_Surface_Risk_Score', 'Vehicle_Type_Frequency',
       'Crash_Type_Target_Encoding', 'Rush_Hour_Indicator', 'Road_Safety',
       'Rural_Unsafe_Roads', 'Driver_Cluster', 'Crash_Risk_Score',
       'Driver_Risk_Factor', 'Speeding_Indicator', 'Crash_Time',
       'Alcohol_Crash_No-Rear-end', 'Alcohol_Crash_Yes-Head-on',
       'Alcohol_Crash_Yes-Rear-end', 'Gender_Male', 'Crash_Type_Rear-end',
      

In [17]:
x = data[['Vehicle_Speed', 'Age', 'Lane_Width', 'Speed_Limit', 'Speed_Ratio',
       'total_lane', 'Relative_Speed', 'Crash_Risk_Score',
       'Driver_Risk_Factor', 'Crash_Time']]
y = data['Crash_Severity']


In [18]:
x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.33, random_state=123)
model1=RandomForestClassifier(n_estimators=100, n_jobs=-1 , random_state=123, max_depth=15, max_features=6)
model2=GradientBoostingClassifier(n_estimators=200,  random_state=123, learning_rate=0.09)
model3 = CatBoostClassifier(iterations=400, verbose=50, learning_rate=0.05)
model1.fit(x_train, y_train)
print(model1.score(x_test, y_test))
print(model1.score(x_train, y_train))
model2.fit(x_train, y_train)
print(model2.score(x_test, y_test))
print(model2.score(x_train, y_train))
model3.fit(x_train, y_train)
print(model3.score(x_test, y_test))
print(model3.score(x_train, y_train))

0.6599326599326599
1.0
0.6363636363636364
0.9917081260364843
0:	learn: 1.0938293	total: 177ms	remaining: 1m 10s
50:	learn: 0.9038520	total: 290ms	remaining: 1.98s
100:	learn: 0.7795353	total: 401ms	remaining: 1.19s
150:	learn: 0.6783586	total: 515ms	remaining: 849ms
200:	learn: 0.5966236	total: 627ms	remaining: 621ms
250:	learn: 0.5193932	total: 785ms	remaining: 466ms
300:	learn: 0.4550401	total: 912ms	remaining: 300ms
350:	learn: 0.4061156	total: 1.04s	remaining: 146ms
399:	learn: 0.3629213	total: 1.17s	remaining: 0us
0.622895622895623
1.0


In [19]:
importances = model1.feature_importances_

# Get Feature Names

feature_names=['Vehicle_Speed', 'Age', 'Lane_Width', 'Speed_Limit', 'Speed_Ratio',
       'total_lane', 'Relative_Speed', 'Crash_Risk_Score',
       'Driver_Risk_Factor', 'Crash_Time']
# Create a DataFrame for easier interpretation
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Gini Importance': importances
}).sort_values(by='Gini Importance', ascending=False)
# Display Results
print(importance_df)

              Feature  Gini Importance
5          total_lane         0.130733
3         Speed_Limit         0.122806
9          Crash_Time         0.119422
2          Lane_Width         0.111068
1                 Age         0.106972
0       Vehicle_Speed         0.106829
4         Speed_Ratio         0.102193
6      Relative_Speed         0.092633
7    Crash_Risk_Score         0.054616
8  Driver_Risk_Factor         0.052728


In [20]:
# Create the XGBoost Classifier
model4 = XGBClassifier(
    objective="multi:softmax",  # Use softmax for multi-class classification
    num_class=3,               # Number of classes in the dataset
    max_depth=4,               # Maximum depth of a tree
    learning_rate=0.09,         # Learning rate
    n_estimators=200,          # Number of boosting rounds
    random_state=123
)
mapping = {'Minor injury': 0, 'Major injury': 1, 'Fatal crash': 2}

# Apply the mapping
y_test_mapped = y_test.map(mapping)
y_train_mapped = y_train.map(mapping)
# Train the model
model4.fit(x_train, y_train_mapped)
print(model4.score(x_test, y_test_mapped))
print(model4.score(x_train, y_train_mapped))

0.6464646464646465
0.9983416252072969


In [21]:
type(y_test)

pandas.core.series.Series

In [22]:
y_pred1=model1.predict(x_test)
y_pred2=model2.predict(x_test)
y_pred3=model3.predict(x_test)
y_pred4=model4.predict(x_test)
y_pred3=y_pred3[:,0]
mapping = {0:'Minor injury',1: 'Major injury',2: 'Fatal crash'}
# Apply the mapping
y_pred4 = pd.Series(y_pred4)

y_pred4 = y_pred4.map(mapping)

In [29]:
predictions = np.array([y_pred1, y_pred3, y_pred2, y_pred4])
predictions[:, 0]
print(predictions.shape[1])

297


In [30]:
from collections import Counter
y_pred_f=[0]*predictions.shape[1]
for i in range(0,predictions.shape[1]):
  vote_counts = Counter(predictions[:,i])
  y_pred_f[i]=vote_counts.most_common(1)[0][0]

In [31]:
print(accuracy_score(y_test, y_pred_f))

0.6430976430976431
