In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

# Load dataset
file_path = "/Users/rsoedarnadi/Documents/GitHub/Datathon-Dataverse/Excel Files/Traffic_Data_Department_Total.csv"
df = pd.read_csv(file_path)

2025-03-10 22:40:36.297616: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from tensorflow.keras.layers import LSTM, Bidirectional, Dropout
from sklearn.metrics import mean_absolute_percentage_error
import joblib
from tensorflow.keras.models import load_model
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
# Encode categorical variables
label_encoder = LabelEncoder()
df["Department"] = label_encoder.fit_transform(df["Department"])

# Normalize numerical features
scaler = MinMaxScaler()
df[["Accident_Count"]] = scaler.fit_transform(df[["Accident_Count"]])

# Sort by department and year
df = df.sort_values(by=["Department", "Year"]).reset_index(drop=True)

# Define sequence length
SEQ_LENGTH = 10
X, y = [], []

departments = df["Department"].unique()
for dept in departments:
    dept_data = df[df["Department"] == dept].reset_index(drop=True)
    for i in range(len(dept_data) - SEQ_LENGTH):
        X.append(dept_data.iloc[i:i+SEQ_LENGTH][["Accident_Count"]].values)
        y.append(dept_data.iloc[i+SEQ_LENGTH]["Accident_Count"])

X, y = np.array(X), np.array(y)

In [5]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)


model = Sequential([Bidirectional(LSTM(100, activation='relu', return_sequences=True), input_shape=(SEQ_LENGTH, 1)),
    Dropout(0.2),
    Bidirectional(LSTM(100, activation='relu')),
    Dropout(0.3),
    Dense(1)
])

model.compile(optimizer=Adam(learning_rate=0.01), loss='mae')

# Train the model
model.fit(X_train, y_train, epochs=60, batch_size=16, validation_data=(X_test, y_test))

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<keras.callbacks.History at 0x7f8e3e01cb20>

In [6]:
X_test

array([[[0.32766825],
        [0.89734874],
        [0.91298436],
        [0.90890551],
        [0.7675051 ],
        [0.88987084],
        [0.92318151],
        [0.47382733],
        [0.64649898],
        [0.92318151]],

       [[0.89734874],
        [0.91298436],
        [0.90890551],
        [0.7675051 ],
        [0.88987084],
        [0.92318151],
        [0.47382733],
        [0.64649898],
        [0.92318151],
        [0.73555404]],

       [[0.91298436],
        [0.90890551],
        [0.7675051 ],
        [0.88987084],
        [0.92318151],
        [0.47382733],
        [0.64649898],
        [0.92318151],
        [0.73555404],
        [0.760707  ]],

       [[0.90890551],
        [0.7675051 ],
        [0.88987084],
        [0.92318151],
        [0.47382733],
        [0.64649898],
        [0.92318151],
        [0.73555404],
        [0.760707  ],
        [0.77022434]],

       [[0.4085656 ],
        [0.36165874],
        [0.22229776],
        [0.59755269],
        [0.73351462],
  

In [9]:
# Predict
y_pred = model.predict(X_test)

# Convert predictions back to original scale
y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()
y_pred_original = scaler.inverse_transform(y_pred).flatten()

print(f"MAPE: {mean_absolute_percentage_error(y_test_original,y_pred_original)}")

MAPE: 0.03745787058961402


In [10]:
# Forecast for 2024-2026
future_years = [2024, 2025, 2026]
forecast_results = {}

for dept in departments:
    dept_data = df[df["Department"] == dept].reset_index(drop=True)
    last_sequence = dept_data.iloc[-SEQ_LENGTH:][["Accident_Count"]].values.reshape(1, SEQ_LENGTH, 1)
    predictions = []
    
    for year in future_years:
        pred = model.predict(last_sequence)[0, 0]
        predictions.append(pred)
        last_sequence = np.roll(last_sequence, -1)
        last_sequence[0, -1, 0] = pred
    
    # Convert predictions back to original scale
    predictions_original = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
    forecast_results[dept] = dict(zip(future_years, predictions_original))

# Print future predictions
for dept, predictions in forecast_results.items():
    # Transform the encoded department value back to the original name
    original_dept = label_encoder.inverse_transform([dept])[0]
    print(f"Department {original_dept}:")
    for year, pred in predictions.items():
        print(f"  Year {year}: Predicted Accidents {pred:.2f}")

Department Al Maamora:
  Year 2024: Predicted Accidents 1293.90
  Year 2025: Predicted Accidents 1289.85
  Year 2026: Predicted Accidents 1285.83
Department Al Rayyan:
  Year 2024: Predicted Accidents 1280.42
  Year 2025: Predicted Accidents 1280.20
  Year 2026: Predicted Accidents 1279.26
Department Al Shammal:
  Year 2024: Predicted Accidents 1272.50
  Year 2025: Predicted Accidents 1270.66
  Year 2026: Predicted Accidents 1269.12
Department Al Thumama (Al Mattar):
  Year 2024: Predicted Accidents 1251.67
  Year 2025: Predicted Accidents 1258.48
  Year 2026: Predicted Accidents 1262.06
Department Dukhan:
  Year 2024: Predicted Accidents 187.80
  Year 2025: Predicted Accidents 180.49
  Year 2026: Predicted Accidents 174.98
Department Industerid area:
  Year 2024: Predicted Accidents 883.20
  Year 2025: Predicted Accidents 879.53
  Year 2026: Predicted Accidents 880.27
Department Madinatt khalifah:
  Year 2024: Predicted Accidents 1279.96
  Year 2025: Predicted Accidents 1279.40
  Year

In [78]:
# Create an empty list to store the results
results = []
# Loop through forecast results and compile the data
for dept, predictions in forecast_results.items():
    original_dept = label_encoder.inverse_transform([dept])[0]
    for year, pred in predictions.items():
        results.append({
            "Department": original_dept,
            "Year": year,
            "Accident Count": pred
        })

# Create a DataFrame from the results
df_results = pd.DataFrame(results)

# Sort the DataFrame by year
df_results = df_results.sort_values(by="Year")

# Reset the index
df_results = df_results.reset_index(drop=True)

# Display the DataFrame
print(df_results)

                Department  Year  Accident Count
0               Al Maamora  2024     1287.551758
1        Madinatt khalifah  2024     1272.090088
2                   South   2024     1254.616333
3                Al Rayyan  2024     1272.573608
4                   Dukhan  2024      241.565079
5               Al Shammal  2024     1264.872681
6          Industerid area  2024      672.217712
7   Al Thumama (Al Mattar)  2024     1237.489136
8                   Dukhan  2025      236.500839
9                   South   2025     1250.442383
10       Madinatt khalifah  2025     1270.533569
11              Al Shammal  2025     1261.274536
12               Al Rayyan  2025     1270.730469
13              Al Maamora  2025     1281.941895
14  Al Thumama (Al Mattar)  2025     1240.850342
15         Industerid area  2025      560.402100
16       Madinatt khalifah  2026     1265.881348
17  Al Thumama (Al Mattar)  2026     1244.165283
18                  Dukhan  2026      233.462418
19              Al S

In [79]:
df_results.to_csv("./Traffic Dept Predictions.csv")

In [14]:
model.save('/Users/rsoedarnadi/Downloads/traffic.h5')

In [11]:
joblib.dump(scaler,"/Users/rsoedarnadi/Desktop/DataVerse/scaler.pkl")
joblib.dump(label_encoder,"/Users/rsoedarnadi/Desktop/DataVerse/encoder.pkl")

['/Users/rsoedarnadi/Desktop/DataVerse/encoder.pkl']

In [12]:
# Recreate the exact same model, including its weights and the optimizer
new_model = load_model('/Users/rsoedarnadi/Desktop/DataVerse/traffic.h5')

# Show the model architecture
new_model.summary()

Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_22 (Bidirecti  (None, 10, 200)          81600     
 onal)                                                           
                                                                 
 dropout_6 (Dropout)         (None, 10, 200)           0         
                                                                 
 bidirectional_23 (Bidirecti  (None, 200)              240800    
 onal)                                                           
                                                                 
 dropout_7 (Dropout)         (None, 200)               0         
                                                                 
 dense_25 (Dense)            (None, 1)                 201       
                                                                 
Total params: 322,601
Trainable params: 322,601
Non-t

In [23]:
import numpy as np
import pandas as pd
import joblib
from tensorflow.keras.models import load_model

# Load the saved pipeline components
new_label_encoder = joblib.load("/Users/rsoedarnadi/Desktop/DataVerse/encoder.pkl")
new_scaler = joblib.load("/Users/rsoedarnadi/Desktop/DataVerse/scaler.pkl")

# Define sequence length
SEQ_LENGTH = 10

# Load dataset for reference
file_path = "/Users/rsoedarnadi/Desktop/DataVerse/Traffic_Data_Department_Total.csv"
df = pd.read_csv(file_path)

In [24]:
# Ensure department names are consistent
df["Department"] = df["Department"].astype(str).str.strip()

In [25]:
# Function to make predictions
def predict_accidents(department, year_to_forecast):
    # Verify department exists
    if department not in df["Department"].unique():
        raise ValueError(f"Department '{department}' not found. Available departments: {df['Department'].unique()}")
    
    encoded_dept = new_label_encoder.transform([department])[0]
    dept_data = df[df["Department"] == department].reset_index(drop=True)
    
    if len(dept_data) < SEQ_LENGTH:
        raise ValueError("Not enough data for this department to make a prediction")
    
    # Get last SEQ_LENGTH accident counts and normalize
    last_sequence = dept_data.iloc[-SEQ_LENGTH:][["Accident_Count"]]
    last_sequence = pd.DataFrame(last_sequence, columns=["Accident_Count"])
    last_sequence = new_scaler.transform(last_sequence).reshape(1, SEQ_LENGTH, 1)
    
    predictions = []
    for _ in range(year_to_forecast - dept_data["Year"].max()):
        pred = new_model.predict(last_sequence)[0, 0]
        predictions.append(pred)
        last_sequence = np.roll(last_sequence, -1)
        last_sequence[0, -1, 0] = pred
    
    # Convert predictions back to original scale
    predictions_original = new_scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
    
    return predictions_original



In [26]:
# Example usage
department = "Dukhan"  # Change this to an actual department from your dataset
year_to_forecast = 2026

try:
    predictions = predict_accidents(department, year_to_forecast)
    print(f"Predicted accident counts for {department} in {year_to_forecast}: {predictions[-1]:.2f}")
except ValueError as e:
    print(f"Error: {e}")


Error: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.
