In [1]:
# ============================================================
# Risk Score Prediction from Scene Descriptions
# ============================================================

!pip install -q scikit-learn pandas numpy

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# ===============================
# STEP 1: Load + Clean Dataset
# ===============================
DATA_PATH = "risk.json"

df = pd.read_json(DATA_PATH)


def _find_column(columns, keyword, avoid=None):
    """Return the first column label whose lowercased name contains *keyword*.

    Args:
        columns: Iterable of column labels (labels are stringified before
            matching, so non-string labels do not crash the lookup).
        keyword: Lowercase substring that the column name must contain.
        avoid: Optional lowercase substring. Matching columns that do NOT
            contain it are preferred; if every match contains it, the first
            match is returned anyway.

    Returns:
        The matching column label, exactly as it appears in *columns*.

    Raises:
        KeyError: If no column name contains *keyword*. The message lists the
            available columns, which is far easier to debug than the bare
            StopIteration an exhausted ``next(...)`` would raise.
    """
    matches = [col for col in columns if keyword in str(col).lower()]
    if not matches:
        raise KeyError(
            f"No column containing {keyword!r} found in {DATA_PATH}; "
            f"available columns: {list(columns)}"
        )
    if avoid is not None:
        preferred = [col for col in matches if avoid not in str(col).lower()]
        if preferred:
            return preferred[0]
    return matches[0]


# Detect the description + next-scene columns automatically.
# The description lookup prefers a column without "next" in its name so that
# e.g. "next_scene_description" is not picked for BOTH roles when it happens
# to come first in the file.
desc_col = _find_column(df.columns, "description", avoid="next")
next_col = _find_column(df.columns, "next")

# Combine both text fields into one model input (missing values become "")
df["text"] = df[desc_col].fillna("") + " " + df[next_col].fillna("")

# Drop rows with a missing risk_score, then rows whose combined text is blank
df = df.dropna(subset=["risk_score"])
df = df[df["text"].str.strip() != ""]

print(f" Loaded {len(df)} valid samples")

# ===============================
# STEP 2: Train-Test Split
# ===============================
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
features = df["text"]
targets = df["risk_score"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

# ===============================
# STEP 3: Text Feature Extraction (TF-IDF)
# ===============================
# TF-IDF over unigrams and bigrams, with English stop words removed and the
# vocabulary capped at 8000 terms.
tfidf_options = {
    "stop_words": "english",
    "ngram_range": (1, 2),
    "max_features": 8000,
}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary is learned from the training split only; the test split is
# projected onto that same vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

n_features = X_train_tfidf.shape[1]
print(f"TF-IDF features created: {n_features}")

# ===============================
# STEP 4: Train a Regression Model
# ===============================
# Random-forest regressor: 300 trees, depth limited to 20, fixed seed for
# reproducibility, trained on all available cores (n_jobs=-1).
forest_options = {
    "n_estimators": 300,
    "max_depth": 20,
    "n_jobs": -1,
    "random_state": 42,
}
model = RandomForestRegressor(**forest_options)
model.fit(X_train_tfidf, y_train)

# ===============================
# STEP 5: Evaluate Model
# ===============================
# Score the held-out split and report MAE and R² to three decimals.
y_pred = model.predict(X_test_tfidf)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(
    "\n Model Performance:",
    f"Mean Absolute Error: {mae:.3f}",
    f"R² Score: {r2:.3f}",
    sep="\n",
)

import pickle

# Persist the fitted vectorizer and model next to the notebook so they can be
# reloaded later without retraining.
for artifact_path, artifact in (
    ("tfidf_vectorizer.pkl", vectorizer),
    ("risk_model.pkl", model),
):
    with open(artifact_path, "wb") as fh:
        pickle.dump(artifact, fh)


# ===============================
# STEP 6: Predict Risk for New Scene
# ===============================
def predict_risk(description, next_scene):
    """Predict the risk of a scene from its two text fields.

    The two texts are joined with a single space (a falsy value such as
    ``None`` or ``""`` is treated as an empty string), vectorized with the
    module-level ``vectorizer`` and scored with the module-level ``model``.

    Args:
        description: Text describing the current scene, or ``None``.
        next_scene: Text describing what happens next, or ``None``.

    Returns:
        A ``(risk, risk_percent)`` tuple. ``risk`` is the raw model output
        (assumed to be on a 0–10 scale; it is not clipped). ``risk_percent``
        is ``risk / 10 * 100`` clipped to the range [0, 100].
    """
    text_parts = [description or "", next_scene or ""]
    features = vectorizer.transform([" ".join(text_parts)])

    # A single document goes in, so the prediction array has one entry.
    predictions = model.predict(features)
    score = predictions[0]

    percent = np.clip(score / 10 * 100, 0, 100)
    return score, percent

# Smoke test: score one hand-written scene and print both the raw score and
# its percentage form.
desc = "A man wearing a helmet and jeans is falling forward off an electric scooter onto a sidewalk, catching himself with his hands"
next_scene = "He lands hard on his back, rolling onto his side. His helmet flies off, and he lies there, panting, for a moment before sitting up and trying to get his bearings.."
pred_score, pred_percent = predict_risk(description=desc, next_scene=next_scene)

print(
    "\n Example Prediction:",
    f"Predicted Risk Score: {pred_score:.2f} / 10",
    f"Predicted Risk Percentage: {pred_percent:.1f}%",
    sep="\n",
)


 Loaded 271 valid samples
TF-IDF features created: 3585

 Model Performance:
Mean Absolute Error: 1.677
R² Score: 0.238

 Example Prediction:
Predicted Risk Score: 6.00 / 10
Predicted Risk Percentage: 60.0%
