In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [11]:
# Colab-only: expose Google Drive under /content/drive so the dataset can be read from it
from google.colab import drive

drive.mount('/content/drive')

# Load the data into the working DataFrame used by the rest of the notebook
df = pd.read_csv(filepath_or_buffer='/content/drive/My Drive/final_data.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Preview the first five rows of the loaded dataset
df.head(n=5)

Unnamed: 0,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in),...,Turning_Loop,Road_Surface_Condition,Year_Start_Time,Month_Start_Time,Day_Start_Time,Hour_Start_Time,Minute_Start_Time,Second_Start_Time,DayOfWeek_Start_Time,Severity
0,42.3783,-71.139821,0.0,41.0,32.8,93.0,29.82,4.0,16.1,0.04,...,0,0,2016,3,23,0,0,0,2,0
1,42.283485,-70.988096,0.0,41.0,32.8,93.0,29.82,4.0,16.1,0.04,...,0,0,2016,3,27,0,0,0,6,0
2,42.252249,-71.134358,0.0,41.0,32.8,93.0,29.82,4.0,16.1,0.04,...,0,3,2016,3,28,0,0,0,0,0
3,42.35277,-71.05516,0.026,41.0,32.8,93.0,29.82,4.0,16.1,0.04,...,0,3,2016,3,28,10,9,39,0,2
4,42.322209,-71.089809,0.0,70.0,70.0,41.0,29.86,10.0,28.8,0.0,...,0,0,2016,3,31,0,0,0,3,0


In [4]:
pip install geopy



In [5]:
# Define coordinate ranges for major cities in Massachusetts.
# Each value is a bounding box in the order (lat_min, lat_max, lon_min, lon_max),
# which is the order assign_city unpacks it in.
# NOTE(review): several boxes overlap (for example Boston's box overlaps Cambridge's),
# and assign_city returns the first match in insertion order, so the order of the
# entries below affects which city a point is labelled with.
city_ranges = {
    'Boston': (42.2279, 42.3975, -71.1912, -70.9228),
    'Cambridge': (42.3584, 42.4045, -71.1667, -71.0639),
    'Worcester': (42.2370, 42.3136, -71.8745, -71.7533),
    'Springfield': (42.0759, 42.1276, -72.6181, -72.4862),
    'Lowell': (42.6054, 42.6664, -71.3796, -71.2713),
    'Brockton': (42.0336, 42.1095, -71.0720, -70.9644),
    'Quincy': (42.2179, 42.2851, -71.0520, -70.9515),
    'Lynn': (42.4396, 42.4993, -70.9920, -70.9109),
    'New Bedford': (41.6149, 41.6859, -70.9607, -70.8830),
    'Fall River': (41.6690, 41.7359, -71.1867, -71.1225),
    'Newton': (42.283, 42.367, -71.258, -71.172),
    'Somerville': (42.373, 42.408, -71.125, -71.075),
    'Framingham': (42.270, 42.340, -71.460, -71.380),
    'Waltham': (42.348, 42.420, -71.272, -71.200),
    'Haverhill': (42.736, 42.815, -71.145, -70.970),
    'Malden': (42.408, 42.450, -71.090, -71.020),
    'Medford': (42.400, 42.460, -71.150, -71.050),
    'Taunton': (41.870, 41.950, -71.150, -71.030),
    'Chicopee': (42.140, 42.210, -72.660, -72.520),
    'Weymouth': (42.160, 42.240, -70.950, -70.850),
    'Revere': (42.380, 42.440, -71.020, -70.940),
    'Peabody': (42.510, 42.570, -70.970, -70.870),
    'Methuen': (42.700, 42.750, -71.230, -71.130),
    'Barnstable': (41.630, 41.710, -70.360, -70.220),
    'Pittsfield': (42.430, 42.470, -73.300, -73.210),
    'Attleboro': (41.930, 41.990, -71.330, -71.250),
    'Arlington': (42.400, 42.440, -71.190, -71.120),
    'Everett': (42.390, 42.430, -71.080, -71.020),
    'Salem': (42.510, 42.530, -70.920, -70.860),
    'Beverly': (42.540, 42.590, -70.930, -70.840),
    'Chelsea': (42.380, 42.400, -71.040, -71.020)
}

def assign_city(lat, lon, ranges=None):
    """Return the name of the first bounding box that contains the point.

    Parameters
    ----------
    lat, lon : float
        Coordinates of the point to label.
    ranges : dict, optional
        Mapping of city name -> (lat_min, lat_max, lon_min, lon_max).
        Defaults to the module-level ``city_ranges``.

    Returns
    -------
    str
        The matching city name, or ``'Other'`` when no box contains the point
        (this includes NaN coordinates, since every comparison with NaN is false).

    Notes
    -----
    Boxes are tested in the mapping's iteration order and both bounds are
    inclusive, so when boxes overlap the earliest entry wins.
    """
    if ranges is None:
        # Looked up at call time so later edits to the global table are honoured.
        ranges = city_ranges
    for city, (lat_min, lat_max, lon_min, lon_max) in ranges.items():
        if lat_min <= lat <= lat_max and lon_min <= lon <= lon_max:
            return city
    return 'Other'

# Label every row with the first city bounding box that contains its start coordinates.
# Iterating the two coordinate columns directly yields the same labels as
# df.apply(..., axis=1) without constructing a Series object for every row.
df['City'] = [
    assign_city(lat, lon)
    for lat, lon in zip(df['Start_Lat'], df['Start_Lng'])
]

# Print the first few rows to verify
print(df[['Start_Lat', 'Start_Lng', 'City']].head())

# Print a summary of city assignments
print(df['City'].value_counts())

# Preview the updated DataFrame (nothing is written to disk in this cell)
df.head()

   Start_Lat  Start_Lng    City
0  42.378300 -71.139821  Boston
1  42.283485 -70.988096  Boston
2  42.252249 -71.134358  Boston
3  42.352770 -71.055160  Boston
4  42.322209 -71.089809  Boston
City
Boston       6466
Cambridge      22
Revere          9
Everett         5
Arlington       1
Medford         1
Name: count, dtype: int64


Unnamed: 0,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in),...,Road_Surface_Condition,Year_Start_Time,Month_Start_Time,Day_Start_Time,Hour_Start_Time,Minute_Start_Time,Second_Start_Time,DayOfWeek_Start_Time,Severity,City
0,42.3783,-71.139821,0.0,41.0,32.8,93.0,29.82,4.0,16.1,0.04,...,0,2016,3,23,0,0,0,2,0,Boston
1,42.283485,-70.988096,0.0,41.0,32.8,93.0,29.82,4.0,16.1,0.04,...,0,2016,3,27,0,0,0,6,0,Boston
2,42.252249,-71.134358,0.0,41.0,32.8,93.0,29.82,4.0,16.1,0.04,...,3,2016,3,28,0,0,0,0,0,Boston
3,42.35277,-71.05516,0.026,41.0,32.8,93.0,29.82,4.0,16.1,0.04,...,3,2016,3,28,10,9,39,0,2,Boston
4,42.322209,-71.089809,0.0,70.0,70.0,41.0,29.86,10.0,28.8,0.0,...,0,2016,3,31,0,0,0,3,0,Boston


In [7]:
# Show every row that was labelled as Revere
df.loc[df['City'] == 'Revere']

Unnamed: 0,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in),...,Road_Surface_Condition,Year_Start_Time,Month_Start_Time,Day_Start_Time,Hour_Start_Time,Minute_Start_Time,Second_Start_Time,DayOfWeek_Start_Time,Severity,City
401,42.399419,-70.989129,0.0,66.0,66.0,90.0,29.81,10.0,12.7,0.0,...,3,2016,10,19,0,0,0,2,0,Revere
869,42.397755,-71.001463,0.0,19.9,4.3,33.0,30.48,10.0,19.6,0.0,...,3,2017,3,23,0,0,0,3,0,Revere
989,42.397953,-70.991402,0.0,59.0,59.0,44.0,29.95,10.0,9.2,0.0,...,0,2017,6,1,0,0,0,3,0,Revere
2648,42.39939,-71.005129,0.0,69.0,69.0,45.0,29.83,10.0,20.0,0.0,...,3,2019,6,15,0,0,0,5,0,Revere
3189,42.398043,-71.012547,0.0,22.0,8.0,37.0,30.23,10.0,17.0,0.0,...,3,2019,12,20,0,0,0,4,0,Revere
5439,42.399671,-70.996437,0.0,57.0,57.0,83.0,30.28,10.0,0.0,0.0,...,0,2021,10,7,0,0,0,3,0,Revere
5496,42.399507,-71.002557,0.0,74.0,74.0,59.0,29.89,10.0,12.0,0.0,...,0,2021,10,15,0,0,0,4,0,Revere
6166,42.398194,-71.002756,0.0,76.0,76.0,40.0,29.86,10.0,12.0,0.0,...,3,2022,6,14,0,0,0,1,0,Revere
6474,42.399482,-71.003757,0.0,45.0,38.0,68.0,29.39,10.0,16.0,0.0,...,0,2023,1,29,0,0,0,6,0,Revere


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from datetime import datetime

# Timestamp the prediction grid is generated for
date_string = "7/30/2024 13:40:05"

# Parse once, then pull the calendar and clock fields out together
dt = datetime.strptime(date_string, "%m/%d/%Y %H:%M:%S")
Day_Start_Time, Month_Start_Time, Year_Start_Time = dt.day, dt.month, dt.year
Hour_Start_Time, Minute_Start_Time, Second_Start_Time = dt.hour, dt.minute, dt.second

# Echo the extracted components
print(f"Day: {Day_Start_Time}")
print(f"Month: {Month_Start_Time}")
print(f"Year: {Year_Start_Time}")
print(f"Hour: {Hour_Start_Time}")
print(f"Minute: {Minute_Start_Time}")
print(f"Second: {Second_Start_Time}")

# Remaining scenario inputs passed to predict_for_city
road_surface_condition = 0
city = 'Revere'

# Train one classifier per road-feature column for a city and predict over a coordinate grid
def predict_for_city(df, city, road_surface_condition, Day_Start_Time, Month_Start_Time,
                     Hour_Start_Time, Minute_Start_Time, Second_Start_Time):
    """Predict road-feature flags on a 0.001-degree grid covering ``city``.

    For each target column a RandomForest pipeline is fitted on the rows of
    ``df`` whose ``City`` equals ``city``, using ``Start_Lat``/``Start_Lng``
    (scaled) and ``City``/``Road_Surface_Condition`` (one-hot encoded) as
    features. The fitted models are then evaluated on every grid point inside
    the city's bounding box taken from the module-level ``city_ranges``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``City``, the four feature columns and the target columns
        listed in ``target_columns`` below.
    city : str
        City to filter on; must also be a key of ``city_ranges``.
    road_surface_condition
        Value placed in the ``Road_Surface_Condition`` feature for every grid point.
    Day_Start_Time, Month_Start_Time, Hour_Start_Time, Minute_Start_Time, Second_Start_Time
        Copied unchanged into the output; they are not used as model features here.

    Returns
    -------
    pandas.DataFrame
        One row per grid point (latitude-major order) with the coordinates, the
        time fields, ``Road_Surface_Condition`` and one 0/1 column per target.

    Raises
    ------
    ValueError
        If ``df`` has no rows for ``city``, if ``city`` has no entry in
        ``city_ranges``, or if the bounding box yields an empty grid.
    KeyError
        If a required feature or target column is missing from ``df``.
    """
    # Filter for city
    filtered_df = df[df['City'] == city].copy()
    if filtered_df.empty:
        raise ValueError(f"No data found for city: {city}")

    # Fail before the (expensive) training step instead of hitting an
    # unbound lat_min/lat_max afterwards.
    if city not in city_ranges:
        raise ValueError(f"City not present in Massachusetts: {city}")
    lat_min, lat_max, lng_min, lng_max = city_ranges[city]

    print(f"Filtered DataFrame for {city}:")
    print(filtered_df.head())

    # Define features and target columns based on available columns
    numeric_features = ['Start_Lat', 'Start_Lng']
    categorical_features = ['City', 'Road_Surface_Condition']
    target_columns = ['Amenity','Crossing', 'Station','Traffic_Signal','Railway','Give_Way','Junction','Stop']
    feature_columns = numeric_features + categorical_features

    print("Numeric features:", numeric_features)
    print("Categorical features:", categorical_features)
    print("Target columns:", target_columns)

    missing = [col for col in feature_columns + target_columns if col not in filtered_df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Prepare the data
    X = filtered_df[feature_columns]

    def build_pipeline():
        # A fresh preprocessor per pipeline, so no fitted state is shared between targets.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numeric_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
            ])
        return Pipeline([
            ('preprocessor', preprocessor),
            # Fixed seed keeps the fitted forests reproducible between runs.
            ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
        ])

    # Train models for each target column
    models = {}
    for target in target_columns:
        clf = build_pipeline()
        clf.fit(X, filtered_df[target])
        models[target] = clf

    # Define the grid of latitudes and longitudes for the city (upper bounds excluded by arange)
    latitudes = np.round(np.arange(lat_min, lat_max, 0.001), 3)
    longitudes = np.round(np.arange(lng_min, lng_max, 0.001), 3)
    if latitudes.size == 0 or longitudes.size == 0:
        raise ValueError(f"Empty coordinate grid for city: {city}")

    # Latitude-major ordering: every longitude is visited for the first latitude,
    # then for the second, and so on.
    grid = pd.DataFrame({
        'Start_Lat': np.repeat(latitudes, longitudes.size),
        'Start_Lng': np.tile(longitudes, latitudes.size),
        'City': city,
        'Road_Surface_Condition': road_surface_condition,
    })
    grid_features = grid[feature_columns]

    # Create the final DataFrame in the output column order
    final_df = pd.DataFrame({
        'Start_Lat': grid['Start_Lat'],
        'Start_Lng': grid['Start_Lng'],
        'Day_Start_Time': Day_Start_Time,
        'Month_Start_Time': Month_Start_Time,
        'Hour_Start_Time': Hour_Start_Time,
        'Minute_Start_Time': Minute_Start_Time,
        'Second_Start_Time': Second_Start_Time,
        'Road_Surface_Condition': road_surface_condition,
    })

    # One vectorized predict per target instead of one call per grid point;
    # predictions are reduced to truthiness and stored as 1/0.
    for target in target_columns:
        predicted = np.asarray(models[target].predict(grid_features))
        final_df[target] = predicted.astype(bool).astype(int)

    return final_df

# Build the prediction grid for the selected city and scenario inputs
result_df = predict_for_city(
    df=df,
    city=city,
    road_surface_condition=road_surface_condition,
    Day_Start_Time=Day_Start_Time,
    Month_Start_Time=Month_Start_Time,
    Hour_Start_Time=Hour_Start_Time,
    Minute_Start_Time=Minute_Start_Time,
    Second_Start_Time=Second_Start_Time,
)
print(result_df.head())


Day: 30
Month: 7
Year: 2024
Hour: 13
Minute: 40
Second: 5
Filtered DataFrame for Revere:
      Start_Lat  Start_Lng  Distance(mi)  Temperature(F)  Wind_Chill(F)  \
401   42.399419 -70.989129           0.0            66.0           66.0   
869   42.397755 -71.001463           0.0            19.9            4.3   
989   42.397953 -70.991402           0.0            59.0           59.0   
2648  42.399390 -71.005129           0.0            69.0           69.0   
3189  42.398043 -71.012547           0.0            22.0            8.0   

      Humidity(%)  Pressure(in)  Visibility(mi)  Wind_Speed(mph)  \
401          90.0         29.81            10.0             12.7   
869          33.0         30.48            10.0             19.6   
989          44.0         29.95            10.0              9.2   
2648         45.0         29.83            10.0             20.0   
3189         37.0         30.23            10.0             17.0   

      Precipitation(in)  ...  Road_Surface_Conditio

In [9]:
# Column order for the frame that is later passed to the loaded model.
# NOTE(review): presumably this matches the feature order the saved model was trained on — confirm.
new_order = [
    'Road_Surface_Condition',
    'Start_Lat', 'Start_Lng',
    'Hour_Start_Time',
    'Crossing',
    'Second_Start_Time', 'Minute_Start_Time', 'Month_Start_Time', 'Day_Start_Time',
    'Amenity', 'Station', 'Traffic_Signal', 'Railway', 'Give_Way', 'Junction', 'Stop',
]

# Reorder the DataFrame columns
result_df = result_df.reindex(new_order, axis='columns')

# Display the first few rows to verify the new order
print(result_df.head())


   Road_Surface_Condition  Start_Lat  Start_Lng  Hour_Start_Time  Crossing  \
0                       0      42.38    -71.020               13         0   
1                       0      42.38    -71.019               13         0   
2                       0      42.38    -71.018               13         0   
3                       0      42.38    -71.017               13         0   
4                       0      42.38    -71.016               13         0   

   Second_Start_Time  Minute_Start_Time  Month_Start_Time  Day_Start_Time  \
0                  5                 40                 7              30   
1                  5                 40                 7              30   
2                  5                 40                 7              30   
3                  5                 40                 7              30   
4                  5                 40                 7              30   

   Amenity  Station  Traffic_Signal  Railway  Give_Way  Junction  St

In [12]:
import joblib

# Load the previously trained model from Drive.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only load trusted files.
with open('/content/drive/My Drive/random_forest_accident_likelihood_model.pkl', mode='rb') as model_file:
    loaded_model = joblib.load(model_file)


In [13]:
from math import ceil

# Score every grid point with the loaded model
predictions = loaded_model.predict(result_df)

# Attach the predictions as a new 'Severity' column on a copy, leaving result_df untouched
pred_df = result_df.assign(Severity=predictions)

# Display the first few rows to verify the new column
print(pred_df.head())

   Road_Surface_Condition  Start_Lat  Start_Lng  Hour_Start_Time  Crossing  \
0                       0      42.38    -71.020               13         0   
1                       0      42.38    -71.019               13         0   
2                       0      42.38    -71.018               13         0   
3                       0      42.38    -71.017               13         0   
4                       0      42.38    -71.016               13         0   

   Second_Start_Time  Minute_Start_Time  Month_Start_Time  Day_Start_Time  \
0                  5                 40                 7              30   
1                  5                 40                 7              30   
2                  5                 40                 7              30   
3                  5                 40                 7              30   
4                  5                 40                 7              30   

   Amenity  Station  Traffic_Signal  Railway  Give_Way  Junction  St

In [14]:
# Persist the scored grid to Drive; the row index is written as the first column (pandas default)
pred_df.to_csv('/content/drive/MyDrive/predictions.csv', index=True)