In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import joblib
import numpy as np

# Load processed dataset (adjust path if needed)
df = pd.read_csv("../data/HR_processed_data.csv")

# Drop rows with missing target
df = df.dropna(subset=["HR Incidence"])

# Create binary classification target (tune threshold if needed)
df["HR_Risk_High"] = (df["HR Incidence"] > 0.2).astype(int)

# Select categorical and numeric features
categorical = ["Orchard", "Variety"]
numeric = [
    "Days over 40C", "Days over 35C", "Total Amount of rain ml since 1 Jan",
    "No of rain days since 1 Jan", "Amt rain in last event<5days from assess",
    "Total rain Jan mm", "Total rain Feb mm"
]

# Encode categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
X_cat = encoder.fit_transform(df[categorical])

# Combine with numeric features
X = np.hstack([X_cat, df[numeric].values])
y = df["HR_Risk_High"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Train Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Save model and encoder
joblib.dump(rf, "hr_risk_model.pkl")
joblib.dump(encoder, "encoder.pkl")

print("✅ Random Forest model and encoder saved successfully.")


✅ Random Forest model and encoder saved successfully.
