In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os

In [2]:
# --- Configuration ---
# Input: CSV file holding the forest-fires dataset (downloaded from UCI).
TRAINING_DATA_FILE = 'forestfires.csv'
# Output: folder that receives the serialized model artifacts.
MODEL_DIR = 'model'

In [3]:
# --- 1. Load the Dataset ---
# Reads the CSV named by TRAINING_DATA_FILE into the DataFrame `df`.
print(f"Loading data from '{TRAINING_DATA_FILE}'...")
try:
    df = pd.read_csv(TRAINING_DATA_FILE)
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() ends the whole session (losing
    # all state) and, with no argument, reports exit status 0, so an
    # unattended run would look successful. Raising stops execution here with
    # a clear message while preserving the original traceback.
    raise FileNotFoundError(
        f"The file '{TRAINING_DATA_FILE}' was not found. "
        "Please make sure it's in the correct directory."
    ) from err


Loading data from 'forestfires.csv'...


In [4]:
# --- 2. Prepare the Data for Training ---
# Builds the binary target, selects the feature columns, splits into train/test
# partitions, and standardizes the features.
#
# NOTE: this cell rebinds `df` and drops its 'area' column, so re-running it
# without first re-running the load cell raises a KeyError on df['area'].
print("Preparing data for training...")

# The original target is 'area' (a number). Convert it into a binary
# classification label: 1 when area > 0, otherwise 0.
df['fire'] = (df['area'] > 0).astype(int)

# Drop the text-based columns and the original numeric target.
df = df.drop(columns=['month', 'day', 'area'])

# 'X' holds the features; 'y' holds the new binary label.
X = df.drop('fire', axis=1)
y = df['fire']

# IMPORTANT: keep the ordered list of feature names; it is saved alongside the
# model so that consumers can build inputs with the same column order.
feature_names = list(X.columns)

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/variance used for preprocessing (data leakage),
# and the saved scaler would carry statistics from rows meant to be unseen.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training partition only, then apply the same
# transformation to the held-out partition.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so the name `X_scaled` remains available: the full feature matrix
# transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)

Preparing data for training...


In [5]:
# --- 3. Train the Random Forest Model ---
print("Training the Random Forest model...")
# Hyperparameters gathered in one place; n_jobs=-1 asks for all CPU cores.
forest_params = {
    "n_estimators": 100,
    "random_state": 42,
    "n_jobs": -1,
}
# fit() returns the estimator itself, so `model` is the trained classifier.
model = RandomForestClassifier(**forest_params).fit(X_train, y_train)
print("Model training complete.")

Training the Random Forest model...
Model training complete.


In [6]:
# --- 4. Evaluate the Model's Performance ---
print("\n--- Model Evaluation ---")
# Predict labels for the held-out partition.
y_pred = model.predict(X_test)

# Per-class precision / recall / F1; zero_division=0 reports 0 instead of
# warning when a class receives no predictions.
report_text = classification_report(y_test, y_pred, zero_division=0)
print(report_text)

# Confusion matrix of true labels (y_test) against predictions (y_pred).
print("Confusion Matrix:")
confusion = confusion_matrix(y_test, y_pred)
print(confusion)
print("----------------------\n")



--- Model Evaluation ---
              precision    recall  f1-score   support

           0       0.60      0.58      0.59        50
           1       0.62      0.65      0.64        54

    accuracy                           0.62       104
   macro avg       0.61      0.61      0.61       104
weighted avg       0.61      0.62      0.61       104

Confusion Matrix:
[[29 21]
 [19 35]]
----------------------



In [7]:
# --- 5. Save the Final Model, Scaler, and Feature List ---
# Writes three joblib files into MODEL_DIR: the trained classifier, the fitted
# scaler, and the ordered list of feature names.

# Create the output directory if needed. exist_ok=True makes this a single,
# idempotent call (no separate existence check that could race with another
# process) and raises FileExistsError if the path exists as a regular file.
os.makedirs(MODEL_DIR, exist_ok=True)

print("Saving model, scaler, and feature names...")
joblib.dump(model, os.path.join(MODEL_DIR, 'fire_prediction_model.joblib'))
joblib.dump(scaler, os.path.join(MODEL_DIR, 'feature_scaler.joblib'))
joblib.dump(feature_names, os.path.join(MODEL_DIR, 'feature_names.joblib'))

print(f"✅ Success! All model files have been saved to the '{MODEL_DIR}' directory.")

Saving model, scaler, and feature names...
✅ Success! All model files have been saved to the 'model' directory.
