In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the congestion CSV from the attached Kaggle input directory into the working frame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/congestion-data/us_congestion_2016_2022_sample_2m.csv'
)

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, accuracy_score

In [None]:
# Columns excluded from modelling. Any name that is not present in the frame is
# skipped silently because errors="ignore".
columns_to_discard = [
    "Description",
    "Street",
    "City",
    "County",
    "State",
    "Country",
    "ZipCode",
    "LocalTimeZone",
    "WeatherStation_AirportCode",
    "WeatherTimeStamp",
    "ID",
]
df = df.drop(columns=columns_to_discard, errors="ignore")

In [None]:
# Second, untouched read of the same CSV. It is used below as the source of the
# raw (unencoded) category labels when the label encoders are fitted.
df_original = pd.read_csv('/kaggle/input/congestion-data/us_congestion_2016_2022_sample_2m.csv')

In [None]:
# Registry of per-column {label: code} lookups; filled in by the encoding cell.
encoding_maps = dict()

In [None]:
# Label-encode the categorical columns.
# For every column we (1) fit an encoder on the raw labels from df_original,
# (2) remember the {label: code} mapping so predictions can be decoded later, and
# (3) write the integer codes into df, so the model receives numeric features and
#     integer class labels instead of raw strings.
categorical_cols = ["Weather_Event", "Weather_Conditions", "WindDir", "Congestion_Speed"]
for col in categorical_cols:
    encoder = LabelEncoder()
    # astype(str) turns missing values into the literal label "nan", so they get a code too.
    raw_labels = df_original[col].astype(str)
    encoder.fit(raw_labels)  # Fit on the original unencoded data

    # classes_ is sorted, and the code of classes_[i] is i; store plain Python ints.
    encoding_maps[col] = {label: int(code) for code, label in enumerate(encoder.classes_)}

    # Always encode from the raw labels (aligned on df's index) rather than from df
    # itself, so re-running this cell does not try to encode already-encoded values.
    # Assumes df's index is a subset of df_original's (both are read from the same CSV).
    df[col] = encoder.transform(raw_labels.loc[df.index])

In [None]:
# Pull out the {label: code} lookup for the Weather_Conditions column and print it
# for inspection; it is reversed later to decode values for the LLM prompt.
weather_conditions_map = encoding_maps["Weather_Conditions"]
print(weather_conditions_map)

In [None]:
# Parse both timestamp columns as UTC datetimes; values that cannot be parsed
# become NaT because errors="coerce".
for time_col in ("StartTime", "EndTime"):
    df[time_col] = pd.to_datetime(df[time_col], utc=True, errors="coerce")

In [None]:
# Discard rows where either timestamp is missing (e.g. failed to parse above).
df = df.dropna(subset=["StartTime", "EndTime"], how="any")

In [None]:
# Turn each timestamp column into a numeric feature: seconds elapsed since that
# column's own earliest value (StartTime and EndTime use separate origins).
start_elapsed = df["StartTime"] - df["StartTime"].min()
end_elapsed = df["EndTime"] - df["EndTime"].min()
df["StartTime"] = start_elapsed.dt.total_seconds()
df["EndTime"] = end_elapsed.dt.total_seconds()

In [None]:
# Feature matrix: every remaining column except the two prediction targets.
X = df.drop(columns=["DelayFromTypicalTraffic(mins)", "Congestion_Speed"])
# Regression target: the delay-from-typical-traffic column.
y_regression = df.loc[:, "DelayFromTypicalTraffic(mins)"]
# Classification target with three classes (0 = Fast, 1 = Moderate, 2 = Slow).
y_classification = df.loc[:, "Congestion_Speed"]

In [None]:
# Hold out 20% of the rows. Features and both targets go through the same call
# (fixed seed 42) so the train/test rows stay aligned across all three.
(
    X_train, X_test,
    y_reg_train, y_reg_test,
    y_cls_train, y_cls_test,
) = train_test_split(X, y_regression, y_classification, test_size=0.2, random_state=42)

In [None]:
# Regressor for the delay target (squared-error loss).
model = xgb.XGBRegressor(objective="reg:squarederror", enable_categorical=True)
# Classifier for the congestion target. The target has three classes
# (Fast / Moderate / Slow), so declare a multiclass objective rather than
# "binary:logistic", which only describes a two-class problem.
clf = xgb.XGBClassifier(objective="multi:softprob", enable_categorical=True)

In [None]:
# Train the delay regressor and the congestion classifier on the same training rows.
model.fit(X=X_train, y=y_reg_train)
clf.fit(X=X_train, y=y_cls_train)

In [None]:
# Score the held-out rows; position i in each array corresponds to X_test.iloc[i].
y_reg_pred = model.predict(X=X_test)
y_cls_pred = clf.predict(X=X_test)

In [None]:
# Headline metrics on the held-out rows: MAE for the delay regressor and
# plain accuracy for the congestion classifier.
mae = mean_absolute_error(y_true=y_reg_test, y_pred=y_reg_pred)
accuracy = accuracy_score(y_true=y_cls_test, y_pred=y_cls_pred)

print(
    f"Mean Absolute Error (MAE) for Delay Prediction: {mae}",
    f"Accuracy for Congestion Prediction: {accuracy}",
    sep="\n",
)

In [None]:
from sklearn.metrics import classification_report
# Per-class breakdown of the congestion classifier on the held-out rows.
cls_report = classification_report(y_true=y_cls_test, y_pred=y_cls_pred)
print(cls_report)

In [None]:
# Show the delay regressor's importance score for every training feature,
# in column order, formatted to four decimals.
importance = model.feature_importances_
for position, feature in enumerate(X_train.columns):
    print(f"{feature}: {importance[position]:.4f}")

In [None]:
# Install the LLM stack with the %pip magic so the packages land in the
# environment of the running kernel (a "!pip" subshell may target another one).
%pip install --quiet transformers accelerate torch sentencepiece

In [None]:
# bitsandbytes is needed for 8-bit loading of the model below; %pip installs it
# into the running kernel's environment.
%pip install -U bitsandbytes

In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub. Called without arguments, so no token
# is hardcoded in the notebook.
login()

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

In [None]:
# Identifier of the checkpoint passed to from_pretrained() for both the
# tokenizer and the causal language model.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

In [None]:
# Imported here because this is the only cell that needs it.
from transformers import BitsAndBytesConfig

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Request 8-bit quantization through an explicit config object; passing
# load_in_8bit=... directly to from_pretrained() is the deprecated spelling.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# NOTE: this rebinds `model`, which earlier in the notebook refers to the XGBoost
# regressor. Cells that use the regressor must run before this one.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    quantization_config=quantization_config,  # Quantized model to save RAM
    device_map="auto",  # Let the library place the weights on the available devices
)

In [None]:
# Bundle the loaded model and tokenizer into a text-generation pipeline.
llm_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Reverse lookups ({code: label}) used to decode encoded weather values for the prompt.
weather_conditions_map_rev = {code: label for label, code in weather_conditions_map.items()}

# weather_event_map was never defined anywhere above, so take it from the same
# registry that holds the Weather_Conditions mapping.
weather_event_map = encoding_maps["Weather_Event"]
weather_event_map_rev = {code: label for label, code in weather_event_map.items()}

In [None]:
def generate_explanation(features, delay_pred, congestion_pred, max_new_tokens=200):
    """Ask the LLM pipeline to explain one traffic prediction.

    Args:
        features: Mapping of feature name to value for a single row. Must contain
            "Weather_Conditions", "Weather_Event", "WindDir", "Temperature(F)",
            "WindSpeed(mph)", "Visibility(mi)" and "Precipitation(in)".
        delay_pred: Predicted delay; rendered in the prompt with two decimals.
        congestion_pred: Predicted congestion class. 0 is rendered as "Fast",
            1 as "Moderate" and any other value as "Slow".
        max_new_tokens: Generation budget forwarded to the pipeline (default 200).

    Returns:
        The "generated_text" field of the first result returned by ``llm_pipeline``.
        NOTE(review): with default pipeline settings this presumably still
        includes the prompt itself - confirm before showing it to end users.
    """
    # Decode the encoded weather values back to their labels.
    weather_condition = weather_conditions_map_rev.get(features["Weather_Conditions"], "Unknown")
    weather_event = weather_event_map_rev.get(features["Weather_Event"], "Unknown")

    # WindDir also has a {label: code} mapping, so decode it the same way instead of
    # showing a bare code. If the value is not a known code (for example it is
    # already a label), it is passed through unchanged.
    wind_dir_map_rev = {code: label for label, code in encoding_maps.get("WindDir", {}).items()}
    wind_direction = wind_dir_map_rev.get(features["WindDir"], features["WindDir"])

    if congestion_pred == 0:
        congestion_label = "Fast"
    elif congestion_pred == 1:
        congestion_label = "Moderate"
    else:
        congestion_label = "Slow"

    prompt = f"""
    Given the following traffic data:
    - Weather Conditions: {weather_condition} (Event: {weather_event})
    - Temperature: {features['Temperature(F)']}°F
    - Wind Speed: {features['WindSpeed(mph)']} mph
    - Wind Direction: {wind_direction}
    - Visibility: {features['Visibility(mi)']} miles
    - Precipitation: {features['Precipitation(in)']} inches

    The model has predicted:
    - Delay: {delay_pred:.2f} minutes
    - Congestion Level: {congestion_label}
    
    Explain why the delay and congestion level might be as predicted. Keep the explanation concise and suitable for traffic management personnel.
    """

    response = llm_pipeline(prompt, max_new_tokens=max_new_tokens)[0]["generated_text"]
    return response

In [None]:
# Pick one held-out example to explain.
sample_idx = 18
# The prediction arrays were produced from X_test, so position i in them belongs
# to X_test.iloc[i]. Taking the features from df.iloc[i] would pair the
# predictions with an unrelated row of the full dataset.
sample_features = X_test.iloc[sample_idx].to_dict()
sample_delay_pred = y_reg_pred[sample_idx]
sample_congestion_pred = y_cls_pred[sample_idx]

In [None]:
# Generate and show the natural-language explanation for the selected sample.
explanation = generate_explanation(
    features=sample_features,
    delay_pred=sample_delay_pred,
    congestion_pred=sample_congestion_pred,
)
print(explanation)