In [8]:
import pandas as pd
import numpy as np

In [9]:
df = pd.read_csv('/kaggle/input/congestion-data/us_congestion_2016_2022_sample_2m.csv')

In [10]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, accuracy_score

In [11]:
df = df.drop(["Description", "Street", "City", "County", "State", "Country", "ZipCode", "LocalTimeZone",
               "WeatherStation_AirportCode", "WeatherTimeStamp","ID"], axis=1, errors="ignore")

In [12]:
df_original = pd.read_csv('/kaggle/input/congestion-data/us_congestion_2016_2022_sample_2m.csv')

In [14]:
encoding_maps = {}

In [15]:
categorical_cols = ["Weather_Event", "Weather_Conditions", "WindDir","Congestion_Speed"]
for col in categorical_cols:
    encoder = LabelEncoder()
    encoder.fit(df_original[col].astype(str))  # Fit on the original unencoded data
    encoding_maps[col] = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))  # Store mapping

In [16]:
categorical_cols = ["Weather_Event", "Weather_Conditions", "WindDir","Congestion_Speed"]
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

In [17]:
weather_conditions_map = encoding_maps["Weather_Conditions"]
print(weather_conditions_map)

{'Blowing Dust': 0, 'Blowing Dust / Windy': 1, 'Blowing Sand': 2, 'Blowing Snow': 3, 'Blowing Snow / Windy': 4, 'Blowing Snow Nearby': 5, 'Blowing Snow Nearby / Windy': 6, 'Clear': 7, 'Cloudy': 8, 'Cloudy / Windy': 9, 'Drifting Snow': 10, 'Drizzle': 11, 'Drizzle / Windy': 12, 'Drizzle and Fog': 13, 'Fair': 14, 'Fair / Windy': 15, 'Fog': 16, 'Fog / Windy': 17, 'Freezing Drizzle': 18, 'Freezing Rain': 19, 'Freezing Rain / Windy': 20, 'Funnel Cloud': 21, 'Hail': 22, 'Haze': 23, 'Haze / Windy': 24, 'Heavy Blowing Snow': 25, 'Heavy Drizzle': 26, 'Heavy Freezing Drizzle': 27, 'Heavy Freezing Rain': 28, 'Heavy Ice Pellets': 29, 'Heavy Rain': 30, 'Heavy Rain / Windy': 31, 'Heavy Rain Shower': 32, 'Heavy Rain Showers': 33, 'Heavy Sleet': 34, 'Heavy Sleet / Windy': 35, 'Heavy Sleet and Thunder': 36, 'Heavy Snow': 37, 'Heavy Snow / Windy': 38, 'Heavy Snow Showers': 39, 'Heavy Snow with Thunder': 40, 'Heavy T-Storm': 41, 'Heavy T-Storm / Windy': 42, 'Heavy Thunderstorm': 43, 'Heavy Thunderstorms a

In [18]:
df["StartTime"] = pd.to_datetime(df["StartTime"], utc=True, errors="coerce")
df["EndTime"] = pd.to_datetime(df["EndTime"], utc=True, errors="coerce")

In [19]:
df = df.dropna(subset=["StartTime", "EndTime"])

In [20]:
df["StartTime"] = (df["StartTime"] - df["StartTime"].min()).dt.total_seconds()
df["EndTime"] = (df["EndTime"] - df["EndTime"].min()).dt.total_seconds()

In [21]:
X = df.drop(["DelayFromTypicalTraffic(mins)", "Congestion_Speed"], axis=1)
y_regression = df["DelayFromTypicalTraffic(mins)"]  # Regression target
y_classification = df["Congestion_Speed"]  # Binary Classification (0 = Fast, 1 = Moderate, 2 = Slow)

In [22]:
X_train, X_test, y_reg_train, y_reg_test, y_cls_train, y_cls_test = train_test_split(
    X, y_regression, y_classification, test_size=0.2, random_state=42
)

In [23]:
model = xgb.XGBRegressor(objective="reg:squarederror", enable_categorical=True)
clf = xgb.XGBClassifier(objective="binary:logistic", enable_categorical=True)

In [24]:
model.fit(X_train, y_reg_train)
clf.fit(X_train, y_cls_train)

In [25]:
y_reg_pred = model.predict(X_test)
y_cls_pred = clf.predict(X_test)

In [26]:
mae = mean_absolute_error(y_reg_test, y_reg_pred)
accuracy = accuracy_score(y_cls_test, y_cls_pred)

print(f"Mean Absolute Error (MAE) for Delay Prediction: {mae}")
print(f"Accuracy for Congestion Prediction: {accuracy}")

Mean Absolute Error (MAE) for Delay Prediction: 0.7487718644020829
Accuracy for Congestion Prediction: 0.7246844628612037


In [27]:
from sklearn.metrics import classification_report
print(classification_report(y_cls_test, y_cls_pred))

              precision    recall  f1-score   support

           0       0.80      0.74      0.77    143084
           1       0.67      0.74      0.70    159343
           2       0.71      0.68      0.70     97209

    accuracy                           0.72    399636
   macro avg       0.73      0.72      0.72    399636
weighted avg       0.73      0.72      0.73    399636



In [28]:
importance = model.feature_importances_
for feature, score in zip(X_train.columns, importance):
    print(f"{feature}: {score:.4f}")

Severity: 0.0288
Start_Lat: 0.0275
Start_Lng: 0.0342
StartTime: 0.0393
EndTime: 0.0543
Distance(mi): 0.0496
DelayFromFreeFlowSpeed(mins): 0.5441
Temperature(F): 0.0516
WindChill(F): 0.0228
Humidity(%): 0.0216
Pressure(in): 0.0230
Visibility(mi): 0.0209
WindDir: 0.0156
WindSpeed(mph): 0.0116
Precipitation(in): 0.0163
Weather_Event: 0.0257
Weather_Conditions: 0.0131


In [29]:
import optuna

In [31]:
def objective(trial):
    params = {
        "objective": "reg:squarederror",  # Regression for DelayFromTypicalTraffic
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }

    # Train model with current parameters
    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_reg_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_reg_test, y_pred)

    return mae  # Optuna minimizes this

In [32]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)

[I 2025-03-03 15:33:06,277] A new study created in memory with name: no-name-97d5e6a9-eddd-40ed-b8ba-d14e62c95662
[I 2025-03-03 15:34:14,749] Trial 0 finished with value: 0.7884443106836089 and parameters: {'n_estimators': 900, 'max_depth': 5, 'learning_rate': 0.01842132618982219, 'subsample': 0.9468920920110832, 'colsample_bytree': 0.7268331130027608, 'reg_alpha': 0.14439331954332535, 'reg_lambda': 0.8836715108124366, 'min_child_weight': 2}. Best is trial 0 with value: 0.7884443106836089.
[I 2025-03-03 15:35:41,392] Trial 1 finished with value: 0.7081139886267813 and parameters: {'n_estimators': 500, 'max_depth': 11, 'learning_rate': 0.21171462370255129, 'subsample': 0.6082622759734102, 'colsample_bytree': 0.68637915938959, 'reg_alpha': 0.28026193728143656, 'reg_lambda': 0.419512342645055, 'min_child_weight': 6}. Best is trial 1 with value: 0.7081139886267813.
[I 2025-03-03 15:36:49,504] Trial 2 finished with value: 0.7243212898066591 and parameters: {'n_estimators': 700, 'max_depth':

In [33]:
best_params = study.best_params
print("Best parameters:", best_params)

Best parameters: {'n_estimators': 900, 'max_depth': 12, 'learning_rate': 0.05343119005856983, 'subsample': 0.897285760757781, 'colsample_bytree': 0.9980782272439057, 'reg_alpha': 0.018441364690612083, 'reg_lambda': 0.19361190668945039, 'min_child_weight': 6}


In [34]:
best_model = xgb.XGBRegressor(**best_params)
best_model.fit(X_train, y_reg_train)

In [35]:
y_pred_final = best_model.predict(X_test)
mae_final = mean_absolute_error(y_reg_test, y_pred_final)

In [36]:
print(f"Final Mean Absolute Error (MAE): {mae_final}")

Final Mean Absolute Error (MAE): 0.653894843878839


In [37]:
def objective_cls(trial):
    params = {
        "objective": "multi:softmax",  # Multi-class classification
        "num_class": 3,  # 3 congestion levels
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }

    clf = xgb.XGBClassifier(**params)
    clf.fit(X_train, y_cls_train)

    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_cls_test, y_pred)

    return 1 - acc  # Optuna minimizes, so we minimize (1 - accuracy)

In [38]:
study_cls = optuna.create_study(direction="minimize")
study_cls.optimize(objective_cls, n_trials=30)

[I 2025-03-03 16:36:42,533] A new study created in memory with name: no-name-f3994644-e5a3-4f3a-b808-a4d59c092dd9
[I 2025-03-03 16:40:58,724] Trial 0 finished with value: 0.27717723128046523 and parameters: {'n_estimators': 300, 'max_depth': 11, 'learning_rate': 0.013362835407451522, 'subsample': 0.6307269062478414, 'colsample_bytree': 0.9337203525709021, 'reg_alpha': 0.1890617755080245, 'reg_lambda': 0.6600169324590445, 'min_child_weight': 6}. Best is trial 0 with value: 0.27717723128046523.
[I 2025-03-03 16:41:54,221] Trial 1 finished with value: 0.2623512396280615 and parameters: {'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.24423659508830714, 'subsample': 0.6071485076343227, 'colsample_bytree': 0.7128083864813551, 'reg_alpha': 0.9571865408113802, 'reg_lambda': 0.010450057714587513, 'min_child_weight': 3}. Best is trial 1 with value: 0.2623512396280615.
[I 2025-03-03 16:47:08,867] Trial 2 finished with value: 0.2516139687115275 and parameters: {'n_estimators': 500, 'max_d

In [39]:
best_params_cls = study_cls.best_params
print("Best parameters for classification:", best_params_cls)

Best parameters for classification: {'n_estimators': 800, 'max_depth': 10, 'learning_rate': 0.09219203992461676, 'subsample': 0.9440038119184523, 'colsample_bytree': 0.6556610437910383, 'reg_alpha': 0.5921488001440582, 'reg_lambda': 0.27768888744961756, 'min_child_weight': 1}


In [40]:
best_clf = xgb.XGBClassifier(**best_params_cls)
best_clf.fit(X_train, y_cls_train)

In [41]:
y_pred_cls = best_clf.predict(X_test)
acc_final = accuracy_score(y_cls_test, y_pred_cls)

In [42]:
print(f"Final Classification Accuracy: {acc_final}")

Final Classification Accuracy: 0.7544390395259686


In [None]:
!pip install transformers accelerate torch sentencepiece --quiet

In [None]:
!pip install -U bitsandbytes

In [None]:
from huggingface_hub import login
login()

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

In [None]:
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    load_in_8bit=True,  # Quantized model to save RAM
    device_map="auto"  # Uses GPU efficiently
)

In [None]:
llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
weather_conditions_map_rev = {v: k for k, v in weather_conditions_map.items()}  # Reverse mapping
weather_event_map_rev = {v: k for k, v in weather_event_map.items()}  # Reverse mapping for events

In [None]:
def generate_explanation(features, delay_pred, congestion_pred):
    # Correct decoding by using the reversed dictionary
    weather_condition = weather_conditions_map_rev.get(features["Weather_Conditions"], "Unknown")
    weather_event = weather_event_map_rev.get(features["Weather_Event"], "Unknown")

    prompt = f"""
    Given the following traffic data:
    - Weather Conditions: {weather_condition} (Event: {weather_event})
    - Temperature: {features['Temperature(F)']}°F
    - Wind Speed: {features['WindSpeed(mph)']} mph
    - Wind Direction: {features['WindDir']}
    - Visibility: {features['Visibility(mi)']} miles
    - Precipitation: {features['Precipitation(in)']} inches

    The model has predicted:
    - Delay: {delay_pred:.2f} minutes
    - Congestion Level: {"Fast" if congestion_pred == 0 else "Moderate" if congestion_pred == 1 else "Slow"}
    
    Explain why the delay and congestion level might be as predicted. Keep the explanation concise and suitable for traffic management personnel.
    """

    response = llm_pipeline(prompt, max_new_tokens=200)[0]["generated_text"]
    return response

In [None]:
sample_idx = 18
sample_features = df.iloc[sample_idx].to_dict()
sample_delay_pred = y_reg_pred[sample_idx]
sample_congestion_pred = y_cls_pred[sample_idx]

In [None]:
explanation = generate_explanation(sample_features, sample_delay_pred, sample_congestion_pred)
print(explanation)