# Import packages and mount Drive

In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
import seaborn as sns
import random
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from geopy.distance import geodesic
from scipy.spatial import KDTree
import joblib

In [2]:
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Import data

In [3]:
# Read in Boston car accident data
filtered_file_path = '/content/drive/My Drive/Boston_US_Accidents_March23.csv'

df_MA_accidents = pd.read_csv(filtered_file_path)

df_MA_accidents = df_MA_accidents.drop_duplicates()
# Crash set to 1 if accident occured; non-accident rows will have Crash set to 0
df_MA_accidents["Crash"] = 1

# Find and remove or impute null values

In [4]:
null_counts = df_MA_accidents.isnull().sum()

important_columns = [
    'ID', 'Severity', 'Start_Time', 'Start_Lat', 'Start_Lng',
       'Distance(mi)','Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight', 'Crash']

filtered_null_counts = null_counts[important_columns]

display(filtered_null_counts)

ID                          0
Severity                    0
Start_Time                  0
Start_Lat                   0
Start_Lng                   0
Distance(mi)                0
Temperature(F)              0
Wind_Chill(F)             934
Humidity(%)                 0
Pressure(in)                1
Visibility(mi)              0
Wind_Direction              0
Wind_Speed(mph)            32
Precipitation(in)        1342
Weather_Condition           2
Amenity                     0
Bump                        0
Crossing                    0
Give_Way                    0
Junction                    0
No_Exit                     0
Railway                     0
Roundabout                  0
Station                     0
Stop                        0
Traffic_Calming             0
Traffic_Signal              0
Turning_Loop                0
Sunrise_Sunset              0
Civil_Twilight              0
Nautical_Twilight           0
Astronomical_Twilight       0
Crash                       0
dtype: int

In [5]:
columns_to_drop_nulls = ['ID', 'Severity', 'Start_Time', 'Start_Lat', 'Start_Lng', 'Distance(mi)','Temperature(F)', 'Humidity(%)', 'Pressure(in)','Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)','Weather_Condition', 'Amenity', 'Bump', 'Crossing','Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station','Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop','Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight','Astronomical_Twilight', 'Crash']

df_MA_accidents.dropna(subset=columns_to_drop_nulls, inplace=True)

In [6]:
# assume windchill is same as temp if null
df_MA_accidents['Wind_Chill(F)'] = df_MA_accidents['Wind_Chill(F)'].fillna(df_MA_accidents['Temperature(F)'])

# assume rain not present if null
df_MA_accidents['Precipitation(in)'] = df_MA_accidents['Precipitation(in)'].fillna(0)

In [7]:
# Set random coordinates based on Boston's range
def generate_random_Boston_coordinates(num_samples):
    min_lat, max_lat = 42.2279, 42.4008
    min_lon, max_lon = -71.1912, -70.9860
    latitudes = np.random.uniform(min_lat, max_lat, num_samples)
    longitudes = np.random.uniform(min_lon, max_lon, num_samples)
    return latitudes, longitudes

# Generate non-accident data
num_samples = 2000
start_date = datetime(2016, 3, 23)
end_date = datetime(2023, 2, 28)
date_range = (end_date - start_date).days
non_accidents_MA = {
    'ID': [f'nc_{i+1}' for i in range(num_samples)],
    'Severity': [0] * num_samples,
    'Start_Time': [start_date + timedelta(days=random.randint(0, date_range)) for _ in range(num_samples)],
    'Start_Lat': [],
    'Start_Lng': [],
    'Distance(mi)': [0] * num_samples,
    'Description': ['No crash'] * num_samples,
    # Columns below are initialized with placeholder; updated with merging
    'Temperature(F)': [np.NaN] * num_samples,
    'Wind_Chill(F)': [np.NaN] * num_samples,
    'Humidity(%)': [np.NaN] * num_samples,
    'Pressure(in)': [np.NaN] * num_samples,
    'Visibility(mi)': [np.NaN] * num_samples,
    'Wind_Speed(mph)': [np.NaN] * num_samples,
    'Precipitation(in)': [np.NaN] * num_samples,
    'Weather_Condition': ['Placeholder'] * num_samples,
    'Amenity': [False] * num_samples,
    'Bump': [False] * num_samples,
    'Crossing': [False] * num_samples,
    'Give_Way': [False] * num_samples,
    'Junction': [False] * num_samples,
    'No_Exit': [False] * num_samples,
    'Railway': [False] * num_samples,
    'Roundabout': [False] * num_samples,
    'Station': [False] * num_samples,
    'Stop': [False] * num_samples,
    'Traffic_Calming': [False] * num_samples,
    'Traffic_Signal': [False] * num_samples,
    'Turning_Loop': [False] * num_samples,
}

latitudes, longitudes = generate_random_Boston_coordinates(num_samples)
non_accidents_MA['Start_Lat'] = latitudes
non_accidents_MA['Start_Lng'] = longitudes

# Set Crash to 0 where accident did not occur
non_accidents_MA["Crash"] = 0
df_non_accidents_MA = pd.DataFrame(non_accidents_MA)

# T/F column imputation for non-accident rows

In [8]:
# Round the coordinates to 3 decimal places; original 6 decimals too precise
df_MA_accidents['Start_Lat_Rounded'] = df_MA_accidents['Start_Lat'].round(3)
df_MA_accidents['Start_Lng_Rounded'] = df_MA_accidents['Start_Lng'].round(3)
df_non_accidents_MA['Start_Lat_Rounded'] = df_non_accidents_MA['Start_Lat'].round(3)
df_non_accidents_MA['Start_Lng_Rounded'] = df_non_accidents_MA['Start_Lng'].round(3)

# List of True/False columns
true_false_columns = ['Amenity','Bump','Crossing','Give_Way','Junction','No_Exit','Railway','Roundabout','Station','Stop','Traffic_Calming','Traffic_Signal','Turning_Loop']

# Merge datasets using coordinates
merged_df = pd.merge(df_non_accidents_MA, df_MA_accidents[['Start_Lat_Rounded', 'Start_Lng_Rounded'] + true_false_columns],
                     on=['Start_Lat_Rounded', 'Start_Lng_Rounded'],
                     how='left',
                     suffixes=('', '_acc'))

# Update True/False columns in artificial dataset
for col in true_false_columns:
    merged_df[col] = merged_df[col] | merged_df[col + '_acc']

df_non_accidents_MA = merged_df.drop(columns=['Start_Lat_Rounded', 'Start_Lng_Rounded'] + [col + '_acc' for col in true_false_columns])

# Merge accident and non-accident data

In [9]:
# drop unneccessary columns from accidents data
df_MA_accidents.drop(columns=['End_Time','End_Lat','End_Lng','Source','Street','City','County','State','Zipcode','Country','Timezone','Airport_Code',
                              'Weather_Timestamp','Wind_Direction','Sunrise_Sunset','Civil_Twilight','Nautical_Twilight','Astronomical_Twilight'], inplace=True)

df_all_instances_MA = pd.concat([df_MA_accidents, df_non_accidents_MA], ignore_index=True)

In [10]:
# drop any columns where location, date and time are the same, as this would make the non-accident row invalid
duplicate_mask = df_all_instances_MA.duplicated(subset=['Start_Lat','Start_Lng','Start_Time'], keep=False)

df_all_instances_MA = df_all_instances_MA[~duplicate_mask]

# Numerical weather-related imputation for non-accident rows

In [11]:
# Forward fill categorical colum
df_all_instances_MA['Weather_Condition'] = df_all_instances_MA['Weather_Condition'].ffill()

In [12]:
# Ensure Start_Time is in datetime format
df_all_instances_MA['Start_Time'] = pd.to_datetime(df_all_instances_MA['Start_Time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

# Sort DataFrame by Start_Time
df_all_instances_MA.sort_values(by='Start_Time', inplace=True)

# Create function to impute data with closest location and start_time
def fill_missing_with_closest(df, time_col, value_col):
    for i in range(len(df)):
        if pd.isna(df.iloc[i][value_col]):
            # Determine closest previous non-null value
            previous_index = df.iloc[:i][value_col].last_valid_index()
            previous_value = df.at[previous_index, value_col] if previous_index is not None else np.nan
            previous_time = df.at[previous_index, time_col] if previous_index is not None else pd.NaT

            # Determine closest next non-null value
            next_index = df.iloc[i+1:][value_col].first_valid_index()
            next_value = df.at[next_index, value_col] if next_index is not None else np.nan
            next_time = df.at[next_index, time_col] if next_index is not None else pd.NaT

            # Compare distances and select nearest neighbor
            if pd.isna(previous_value):
                df.at[df.index[i], value_col] = next_value
            elif pd.isna(next_value):
                df.at[df.index[i], value_col] = previous_value
            else:
                previous_time_diff = abs(df.iloc[i][time_col] - previous_time) if previous_time is not pd.NaT else pd.Timedelta.max
                next_time_diff = abs(df.iloc[i][time_col] - next_time) if next_time is not pd.NaT else pd.Timedelta.max
                df.at[df.index[i], value_col] = previous_value if previous_time_diff <= next_time_diff else next_value
    return df

In [13]:
# Run function on numeric columns
df_all_instances_MA = fill_missing_with_closest(df_all_instances_MA, 'Start_Time', 'Temperature(F)')
df_all_instances_MA = fill_missing_with_closest(df_all_instances_MA, 'Start_Time', 'Wind_Chill(F)')
df_all_instances_MA = fill_missing_with_closest(df_all_instances_MA, 'Start_Time', 'Humidity(%)')
df_all_instances_MA = fill_missing_with_closest(df_all_instances_MA, 'Start_Time', 'Pressure(in)')
df_all_instances_MA = fill_missing_with_closest(df_all_instances_MA, 'Start_Time', 'Visibility(mi)')
df_all_instances_MA = fill_missing_with_closest(df_all_instances_MA, 'Start_Time', 'Wind_Speed(mph)')
df_all_instances_MA = fill_missing_with_closest(df_all_instances_MA, 'Start_Time', 'Precipitation(in)')

# Pull in IMPACT Boston data from MA gov to pull road surface conditions

In [14]:
# import road surface data
road_surface_MA = '/content/drive/My Drive/MA_road_surface_conditions.xlsx'
df_road_surface_conditions_MA = pd.read_excel(road_surface_MA)

In [15]:
df_road_surface_conditions_MA['Crash Date'] = df_road_surface_conditions_MA['Crash Date'].astype(str)
df_road_surface_conditions_MA['Crash Time'] = df_road_surface_conditions_MA['Crash Time'].astype(str)

# Remove rows without Crash Date and Crash Time 'Crash Date' or 'Crash Time'
df_road_surface_conditions_MA = df_road_surface_conditions_MA[df_road_surface_conditions_MA['Crash Date'] != 'nan']
df_road_surface_conditions_MA = df_road_surface_conditions_MA[df_road_surface_conditions_MA['Crash Time'] != 'nan']

# Concatenate date and time and ensure proper format
df_road_surface_conditions_MA['Crash_DateTime'] = pd.to_datetime(df_road_surface_conditions_MA['Crash Date'] + ' ' + df_road_surface_conditions_MA['Crash Time'], errors='coerce')

# Remove rows with null locations and times
df_road_surface_conditions_MA = df_road_surface_conditions_MA.dropna(subset=['Crash_DateTime'])

df_all_instances_MA['Start_Lat_Rounded'] = df_all_instances_MA['Start_Lat'].round(3)
df_all_instances_MA['Start_Lng_Rounded'] = df_all_instances_MA['Start_Lng'].round(3)
df_road_surface_conditions_MA['Latitude_Rounded'] = df_road_surface_conditions_MA['Latitude'].round(3)
df_road_surface_conditions_MA['Longitude_Rounded'] = df_road_surface_conditions_MA['Longitude'].round(3)

# Remove rows with invalid latitude, longitude, or datetime based on overlap
df_all_instances_MA = df_all_instances_MA.dropna(subset=['Start_Lat_Rounded', 'Start_Lng_Rounded', 'Start_Time'])
df_road_surface_conditions_MA = df_road_surface_conditions_MA.dropna(subset=['Latitude_Rounded', 'Longitude_Rounded', 'Crash_DateTime'])

# Build a KDTree for spatial lookup
coords_road = df_road_surface_conditions_MA[['Latitude_Rounded', 'Longitude_Rounded']].values
tree = KDTree(coords_road)

# Prepare time data for broadcasting
time_road = df_road_surface_conditions_MA['Crash_DateTime'].values.astype('datetime64[s]')
time_all = df_all_instances_MA['Start_Time'].values.astype('datetime64[s]')

# Find the nearest spatial neighbors
coords_all = df_all_instances_MA[['Start_Lat_Rounded', 'Start_Lng_Rounded']].values
distances, indices = tree.query(coords_all)

# Calculate temporal distances
temporal_distances = np.abs(time_all[:, None] - time_road[indices]).astype('timedelta64[s]').astype(int)

# Combine distances
combined_distances = distances + temporal_distances

# Select the closest matches
closest_indices = combined_distances.argmin(axis=1)

# Add the closest road surface condition to df_all_instances_MA
df_all_instances_MA['Road_Surface_Condition'] = df_road_surface_conditions_MA.iloc[closest_indices]['Road Surface Condition'].values

df_all_instances_MA.drop(columns=['Start_Lat_Rounded', 'Start_Lng_Rounded'], inplace=True)

  df_road_surface_conditions_MA['Crash_DateTime'] = pd.to_datetime(df_road_surface_conditions_MA['Crash Date'] + ' ' + df_road_surface_conditions_MA['Crash Time'], errors='coerce')


In [16]:
# Only take in most select most common categories to eliminate outliers
df_all_instances_MA = df_all_instances_MA[df_all_instances_MA['Road_Surface_Condition'].isin(['Wet','Dry','Ice','Snow'])]

# Encode Categorical Variables

In [17]:
# Categorical columns With multiple options

label_encoder = LabelEncoder()

df_all_instances_MA['Weather_Condition'] = label_encoder.fit_transform(df_all_instances_MA['Weather_Condition'])
df_all_instances_MA['Road_Surface_Condition'] = label_encoder.fit_transform(df_all_instances_MA['Road_Surface_Condition'])


# T/F columns

df_all_instances_MA['Amenity'] = df_all_instances_MA['Amenity'].astype(int)
df_all_instances_MA['Bump'] = df_all_instances_MA['Bump'].astype(int)
df_all_instances_MA['Crossing'] = df_all_instances_MA['Crossing'].astype(int)
df_all_instances_MA['Give_Way'] = df_all_instances_MA['Give_Way'].astype(int)
df_all_instances_MA['Junction'] = df_all_instances_MA['Junction'].astype(int)
df_all_instances_MA['No_Exit'] = df_all_instances_MA['No_Exit'].astype(int)
df_all_instances_MA['Railway'] = df_all_instances_MA['Railway'].astype(int)
df_all_instances_MA['Roundabout'] = df_all_instances_MA['Roundabout'].astype(int)
df_all_instances_MA['Station'] = df_all_instances_MA['Station'].astype(int)
df_all_instances_MA['Stop'] = df_all_instances_MA['Stop'].astype(int)
df_all_instances_MA['Traffic_Calming'] = df_all_instances_MA['Traffic_Calming'].astype(int)
df_all_instances_MA['Traffic_Signal'] = df_all_instances_MA['Traffic_Signal'].astype(int)
df_all_instances_MA['Turning_Loop'] = df_all_instances_MA['Turning_Loop'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all_instances_MA['Weather_Condition'] = label_encoder.fit_transform(df_all_instances_MA['Weather_Condition'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all_instances_MA['Road_Surface_Condition'] = label_encoder.fit_transform(df_all_instances_MA['Road_Surface_Condition'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

# Extract date time information

In [18]:
df_all_instances_MA['Start_Time'] = pd.to_datetime(df_all_instances_MA['Start_Time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

df_all_instances_MA['Year_Start_Time'] = df_all_instances_MA['Start_Time'].dt.year
df_all_instances_MA['Month_Start_Time'] = df_all_instances_MA['Start_Time'].dt.month
df_all_instances_MA['Day_Start_Time'] = df_all_instances_MA['Start_Time'].dt.day
df_all_instances_MA['Hour_Start_Time'] = df_all_instances_MA['Start_Time'].dt.hour
df_all_instances_MA['Minute_Start_Time'] = df_all_instances_MA['Start_Time'].dt.minute
df_all_instances_MA['Second_Start_Time'] = df_all_instances_MA['Start_Time'].dt.second
df_all_instances_MA['DayOfWeek_Start_Time'] = df_all_instances_MA['Start_Time'].dt.dayofweek

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all_instances_MA['Start_Time'] = pd.to_datetime(df_all_instances_MA['Start_Time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all_instances_MA['Year_Start_Time'] = df_all_instances_MA['Start_Time'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

# Build initial regression models

In [19]:
# drop unneccessary columns
df_all_instances_MA.drop(columns=['ID','Start_Time','Description','Crash'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all_instances_MA.drop(columns=['ID','Start_Time','Description','Crash'], inplace=True)


In [20]:
# logistic regression
y = df_all_instances_MA['Severity']
df_all_instances_MA.drop(columns=['Severity'], inplace=True)
x = df_all_instances_MA
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=14)

logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

coefficients = pd.DataFrame({'Feature': x.columns, 'Coefficient': logreg.coef_[0]})
print(coefficients)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all_instances_MA.drop(columns=['Severity'], inplace=True)


Accuracy: 0.786042944785276
Accuracy: 0.786042944785276
Confusion Matrix:
[[398   0   0   0   0]
 [  0   0  27   0   0]
 [  1   0 627   0   0]
 [  2   0 245   0   0]
 [  0   0   4   0   0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       398
           1       0.00      0.00      0.00        27
           2       0.69      1.00      0.82       628
           3       0.00      0.00      0.00       247
           4       0.00      0.00      0.00         4

    accuracy                           0.79      1304
   macro avg       0.34      0.40      0.36      1304
weighted avg       0.64      0.79      0.70      1304

                   Feature  Coefficient
0                Start_Lat    -0.000384
1                Start_Lng    -0.000302
2             Distance(mi)    -0.001611
3           Temperature(F)    -0.001196
4            Wind_Chill(F)    -0.000785
5              Humidity(%)    -0.005283
6             Pres

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# multiple linear regression
linear_reg = LinearRegression()

linear_reg.fit(x_train, y_train)

y_pred = linear_reg.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

coefficients = pd.DataFrame({'Feature': x.columns, 'Coefficient': linear_reg.coef_})
print("Coefficients:\n", coefficients)
print("Intercept:", linear_reg.intercept_)

Mean Squared Error: 0.359311907664662
R^2 Score: 0.7133574617812052
Coefficients:
                    Feature   Coefficient
0                Start_Lat  3.710987e+00
1                Start_Lng  1.829879e+00
2             Distance(mi)  1.290018e-01
3           Temperature(F)  8.381691e-03
4            Wind_Chill(F) -8.192793e-03
5              Humidity(%)  3.454368e-03
6             Pressure(in)  8.622581e-02
7           Visibility(mi)  1.206123e-02
8          Wind_Speed(mph) -7.721972e-04
9        Precipitation(in)  4.530440e-01
10       Weather_Condition -1.244375e-02
11                 Amenity -5.998241e-02
12                    Bump -7.498700e-04
13                Crossing -1.125172e-01
14                Give_Way -3.321535e-02
15                Junction  1.635439e-01
16                 No_Exit  1.708626e-01
17                 Railway  8.615075e-02
18              Roundabout -6.661338e-16
19                 Station -8.765948e-02
20                    Stop -9.618532e-02
21         Traf

In [22]:
# Random forest regression
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)

rf_reg.fit(x_train, y_train)

y_pred = rf_reg.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

feature_importances = pd.DataFrame({'Feature': x.columns, 'Importance': rf_reg.feature_importances_})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print("Feature Importances:\n", feature_importances)

Mean Squared Error: 0.08325268404907975
R^2 Score: 0.9335848321185416
Feature Importances:
                    Feature  Importance
28         Hour_Start_Time    0.861326
1                Start_Lng    0.026132
0                Start_Lat    0.020295
13                Crossing    0.015375
25         Year_Start_Time    0.011016
2             Distance(mi)    0.007555
29       Minute_Start_Time    0.006276
30       Second_Start_Time    0.005707
6             Pressure(in)    0.005624
5              Humidity(%)    0.005225
26        Month_Start_Time    0.005175
27          Day_Start_Time    0.004794
8          Wind_Speed(mph)    0.004268
10       Weather_Condition    0.003461
4            Wind_Chill(F)    0.003437
3           Temperature(F)    0.003412
31    DayOfWeek_Start_Time    0.003325
11                 Amenity    0.001768
7           Visibility(mi)    0.000919
19                 Station    0.000879
9        Precipitation(in)    0.000852
24  Road_Surface_Condition    0.000766
15         

# Final random forest regression model after feature selection

In [23]:
x = df_all_instances_MA[['Road_Surface_Condition','Start_Lat','Start_Lng','Hour_Start_Time','Crossing','Second_Start_Time','Minute_Start_Time','Month_Start_Time','Day_Start_Time',
                         'Amenity','Station','Traffic_Signal','Railway','Give_Way','Junction','Stop']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=14)

rf_reg = RandomForestRegressor(n_estimators=100, random_state=14)

rf_reg.fit(x_train, y_train)

y_pred = rf_reg.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 0.10129363496932517
R^2 Score: 0.9191925900209429


In [24]:
# Define the path to save final model in Google Drive
model_path = '/content/drive/My Drive/random_forest_accident_likelihood_model.pkl'

# Save model
joblib.dump(rf_reg, model_path)

['/content/drive/My Drive/random_forest_accident_likelihood_model.pkl']