In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the training table from disk.
data = pd.read_csv('../data/train.csv')

# 'FWI' is the regression target; every other column is a predictor.
y_train = data.loc[:, 'FWI'].copy()
X_train = data.drop(columns='FWI')

# Hold out 20% of the rows as a validation set. The fixed seed keeps the
# split identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Sanity check: feature and target row counts must agree within each split.
print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_val shape: {X_val.shape}, y_val shape: {y_val.shape}')

X_train shape: (319, 10), y_train shape: (319,)
X_val shape: (80, 10), y_val shape: (80,)


In [2]:
# Partition the training columns by dtype so each group can be routed to its
# own preprocessing branch: numeric columns vs. object-dtype columns.
numeric_frame = X_train.select_dtypes(include=[np.number])
object_frame = X_train.select_dtypes(include='object')

num_features = list(numeric_frame.columns)
cat_features = list(object_frame.columns)

print(f'Numerical features: {num_features}')
print(f'Categorical features: {cat_features}')

Numerical features: ['Temperature', 'RH', 'Ws', 'Rain', 'FFMC', 'DMC', 'DC', 'ISI', 'Classes', 'Region']
Categorical features: []


In [3]:
# Build the preprocessing + model pipeline, fit it on the training split,
# and report RMSE on the validation split.
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

# Numerical branch: fill missing values with the column mean, then standardize.
num_pipeline = Pipeline(
    steps=[
        ('num_imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]
)

# Categorical branch: fill missing values with the mode, then integer-encode.
cat_pipeline = Pipeline(
    steps=[
        ('cat_imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder())
    ]
)

# Columns are selected by name, so each branch only sees its own feature list.
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features)
    ]
)

# Preprocessing and estimator live in one Pipeline so the exact same
# transformations are applied at fit time and at predict time.
# random_state is fixed: the forest's bootstrap sampling and feature
# selection are random, and without a seed the fitted model (and the RMSE
# printed below) would differ on every run.
model_pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessing_pipeline),
        ('model', RandomForestRegressor(random_state=42))
    ]
)

# Pipeline.fit returns the pipeline itself, so `model` and `model_pipeline`
# refer to the same fitted object.
model = model_pipeline.fit(X_train, y_train)
y_pred = model.predict(X_val)

rmse = root_mean_squared_error(y_val, y_pred)
print(f'RMSE: {rmse}')

# Last expression of the cell: renders the fitted pipeline in the notebook.
model_pipeline

RMSE: 3.3255133306754323


In [4]:
# Save the fitted pipeline (preprocessing + estimator) as a single artifact.
import os

import joblib

model_path = '../models/model_pipeline.pkl'

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Last expression of the cell: joblib.dump returns the list of written files.
joblib.dump(model, model_path)

['../models/model_pipeline.pkl']

In [5]:
# Reload the persisted pipeline and score it on the test file.
# Everything this cell uses is imported or loaded here, so it can run on a
# fresh kernel as long as the model file and test CSV exist.
import joblib
import pandas as pd
from sklearn.metrics import root_mean_squared_error

# NOTE: joblib.load unpickles the file; only load artifacts you trust.
model = joblib.load('../models/model_pipeline.pkl')

# Read the test table and split it the same way as the training data:
# 'FWI' is the target, every other column is a predictor.
data = pd.read_csv('../data/test.csv')
y_test = data.loc[:, 'FWI'].copy()
X_test = data.drop(columns='FWI')

# Score the reloaded pipeline on the test rows.
y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)

print(f'RMSE: {rmse}')

RMSE: 3.184175703380705


In [12]:
import joblib
import pandas as pd

# Single-row example input for a smoke-test prediction.
# Only the columns the pipeline was trained on are listed. A 'BUI' entry used
# to be included here, but 'BUI' is not among the training features printed
# earlier in this notebook; the pipeline's name-based ColumnTransformer would
# drop it silently, so it never influenced the prediction.
sample_input = {
    'FFMC': [86.2],
    'DMC': [26.2],
    'DC': [94.3],
    'ISI': [5.1],
    'Temperature': [8.2],
    'RH': [51],
    'Rain': [0.0],
    'Ws': [2.2],
    'Region': [0],
    'Classes': [0]
}

# Each key maps to a one-element list, so this yields a one-row DataFrame.
sample_input_df = pd.DataFrame(sample_input)

# Cast every column to float so the frame has a uniform numeric dtype.
sample_input_df = sample_input_df.astype(float)

# Load the persisted pipeline.
# NOTE: joblib.load unpickles the file; only load artifacts you trust.
model = joblib.load('../models/model_pipeline.pkl')

# predict returns one value per input row; there is exactly one row here.
result = model.predict(sample_input_df)

print(f'FWI: {result[0]}')


FWI: 7.778
