In [20]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

# 1. Pre-Processing

In [11]:
# Directory holding the fire-event CSV. The default is the original author's
# local path; set the FIRE_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    "FIRE_DATA_DIR",
    "/Users/hannahwurzel/Desktop/dl-firefighters/DL-firefighters/data",
))

# Load the full table once; later cells read from `df` and never modify it.
df = pd.read_csv(DATA_DIR / "data660k.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,start_date,fire_ID,lat,lon,size,perimeter,start_DOY,end_date,end_DOY,duration,expansion,fire_line,speed,direction,direction_s,landcover,landcover_s,tile_ID,geometry
0,2566038,2005-07-02,584383,-6.7438,22.3535,0.21,1.85,183,2005-07-02,183,1,0.21,0.46,0.46,0,none,8,Woody savannas,h20v09,"POLYGON ((22.35037406370953 -6.74166666610823,..."
1,3258696,2005-12-06,250743,8.8188,3.2881,0.21,1.85,340,2005-12-06,340,1,0.21,0.46,0.46,0,none,9,Savannas,h18v08,POLYGON ((3.2846822924633345 8.820833332498871...
2,5600233,2008-07-27,640420,-15.0104,27.6267,0.86,3.7,209,2008-07-30,212,4,0.21,0.46,0.5,1,north,9,Savannas,h20v10,POLYGON ((27.620844048341272 -15.0041666653687...
3,3644868,2006-09-10,637240,-16.2812,24.7243,1.29,4.63,253,2006-09-10,253,1,1.29,2.78,0.85,0,none,8,Woody savannas,h20v10,POLYGON ((24.720269461331405 -16.2791666652545...
4,7274504,2010-09-24,396369,-12.9688,18.096,1.71,6.48,267,2010-09-25,268,2,0.86,1.85,0.86,4,southeast,8,Woody savannas,h19v10,POLYGON ((18.090459431003897 -12.9666666655510...


In [23]:
# Inspect the columns: show every column label of the loaded frame so the
# feature and target names referenced in later cells can be checked against it.
df.columns

Index(['Unnamed: 0', 'start_date', 'fire_ID', 'lat', 'lon', 'size',
       'perimeter', 'start_DOY', 'end_date', 'end_DOY', 'duration',
       'expansion', 'fire_line', 'speed', 'direction', 'direction_s',
       'landcover', 'landcover_s', 'tile_ID', 'geometry'],
      dtype='object')

In [22]:
# Check the number of rows and columns.
# Left as the cell's last expression so the notebook displays the
# (n_rows, n_columns) tuple directly instead of routing it through print().
df.shape

(662530, 20)


In [21]:
# Check for missing values: count of null entries per column.
# Left as the cell's last expression so the notebook displays the resulting
# Series directly instead of routing it through print().
df.isnull().sum()

Unnamed: 0     0
start_date     0
fire_ID        0
lat            0
lon            0
size           0
perimeter      0
start_DOY      0
end_date       0
end_DOY        0
duration       0
expansion      0
fire_line      0
speed          0
direction      0
direction_s    0
landcover      0
landcover_s    0
tile_ID        0
geometry       0
dtype: int64


# 2. Feature Selection

In [15]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
import numpy as np

# ElasticNet-based feature selection.
# Requires `df` from the data-loading cell (the saved run reports shape (662530, 20)).

# Target is `expansion`; `geometry` (polygon text) is excluded from the
# candidate predictors.
X = df.drop(columns=['expansion', 'geometry'])
y = df['expansion']

# Column groups for preprocessing. The variable names are kept as-is, but note
# what actually happens to each group below:
#   * nominal_categorical_features -> one-hot encoded
#   * ordinal_categorical_features -> treated as numeric and standardized
# Any column of X in neither list (e.g. 'Unnamed: 0') is dropped by the
# ColumnTransformer (remainder='drop').
# NOTE(review): start_date / end_date are one-hot encoded, so every distinct
#   value becomes its own indicator column - confirm this is intended.
# NOTE(review): fire_ID looks like an identifier rather than a measurement, yet
#   it is scaled as a numeric predictor - confirm it belongs here.
nominal_categorical_features = ['start_date', 'end_date', 'direction_s', 'landcover_s', 'tile_ID']
ordinal_categorical_features = ['fire_ID', 'lat', 'lon', 'size', 'start_DOY', 'end_DOY', 'duration', 'perimeter', 'fire_line', 'speed', 'direction', 'landcover']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Per-group transformers. handle_unknown='ignore' lets the encoder accept
# categories at transform time that were not seen during fit.
ordinal_transformer = StandardScaler()
nominal_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', ordinal_transformer, ordinal_categorical_features),
        ('cat', nominal_transformer, nominal_categorical_features),
    ],
    remainder='drop',  # explicit: unlisted columns are discarded
)

# ElasticNet hyper-parameters.
alpha = 0.1  # regularization strength (decrease the value to reduce regularization)
l1_ratio = 0.5  # balance between L1 and L2 penalties (0.5 = equal mix, 1.0 = LASSO)
elastic_net_model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)

# Preprocessing and model are fitted together, on the training split only.
elastic_net_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('elasticnet', elastic_net_model)])
elastic_net_pipeline.fit(X_train, y_train)

# Read names and coefficients from the steps owned by the fitted pipeline
# rather than from the outer variables, so this does not depend on Pipeline
# fitting its steps in place.
fitted_preprocessor = elastic_net_pipeline.named_steps['preprocessor']
fitted_elastic_net = elastic_net_pipeline.named_steps['elasticnet']
feature_names = fitted_preprocessor.get_feature_names_out()
feature_importances = fitted_elastic_net.coef_

# Rank features by absolute coefficient, largest first.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': np.abs(feature_importances)})
    .sort_values(by='Importance', ascending=False)
)

# Coefficients that the penalty shrank to exactly zero are treated as unselected.
non_zero_features_df = feature_importance_df[feature_importance_df['Importance'] != 0]

non_zero_features_df


          Feature  Importance
8  num__fire_line    1.163352
3       num__size    0.413359
9      num__speed    0.044186
6   num__duration    0.035140


# 3. Modeling

#### Model Training using Tree-based feature importance results 

In [37]:
def regression_metrics(y_true, y_pred):
    """Return the tuple (MAE, MSE, RMSE, R²) for one set of predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
        r2_score(y_true, y_pred),
    )


def print_metrics(header, mae, mse, rmse, r2):
    """Print `header` followed by the four metrics, one per line."""
    print(header)
    print("MAE:", mae)
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("R²:", r2)


# Predictors shared by all three models, and the regression target.
FEATURE_COLUMNS = ["fire_line", "speed", "size", "perimeter"]
X = df[FEATURE_COLUMNS]
y = df["expansion"]

# Splitting the data into training and testing sets (80:20 ratio)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Min-max normalize the selected numeric features only (the frame also holds
# string columns, on which the arithmetic would fail). The minimum and range
# come from the training split and are re-used for the test split so no
# test-set information reaches the models.
feature_min = X_train_raw.min()
feature_range = (X_train_raw.max() - feature_min).replace(0, 1)  # constant column: avoid 0/0
X_train = (X_train_raw - feature_min) / feature_range
X_test = (X_test_raw - feature_min) / feature_range

# SVR Model
# NOTE(review): scikit-learn documents SVR's fit time as more than quadratic in
#   the number of samples and points to LinearSVR for large datasets. The saved
#   run reports 662,530 rows, so this fit may be very slow - confirm it is
#   feasible or train on a subsample.
svr_model = SVR(kernel="linear")
svr_model.fit(X_train, y_train)

# Linear Regression
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train, y_train)

# Random Forest Regressor (seeded for reproducibility)
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Predicting on the training set
svr_train_predictions = svr_model.predict(X_train)
linear_reg_train_predictions = linear_reg_model.predict(X_train)
rf_train_predictions = rf_model.predict(X_train)

# Training-set metrics for each model
svr_mae_train, svr_mse_train, svr_rmse_train, svr_r2_train = regression_metrics(
    y_train, svr_train_predictions)
linear_reg_mae_train, linear_reg_mse_train, linear_reg_rmse_train, linear_reg_r2_train = regression_metrics(
    y_train, linear_reg_train_predictions)
rf_mae_train, rf_mse_train, rf_rmse_train, rf_r2_train = regression_metrics(
    y_train, rf_train_predictions)

print_metrics("SVR - Training Set",
              svr_mae_train, svr_mse_train, svr_rmse_train, svr_r2_train)
print_metrics("Linear Regression - Training Set",
              linear_reg_mae_train, linear_reg_mse_train, linear_reg_rmse_train, linear_reg_r2_train)
print_metrics("\nRandom Forest Regressor - Training Set",
              rf_mae_train, rf_mse_train, rf_rmse_train, rf_r2_train)

# Predicting on the test set for all models
svr_test_predictions = svr_model.predict(X_test)
linear_reg_test_predictions = linear_reg_model.predict(X_test)
rf_test_predictions = rf_model.predict(X_test)

# Test-set metrics for each model (these names are read by the chart cells)
svr_mae_test, svr_mse_test, svr_rmse_test, svr_r2_test = regression_metrics(
    y_test, svr_test_predictions)
linear_reg_mae_test, linear_reg_mse_test, linear_reg_rmse_test, linear_reg_r2_test = regression_metrics(
    y_test, linear_reg_test_predictions)
rf_mae_test, rf_mse_test, rf_rmse_test, rf_r2_test = regression_metrics(
    y_test, rf_test_predictions)

print_metrics("\nSVR - Test Set",
              svr_mae_test, svr_mse_test, svr_rmse_test, svr_r2_test)
print_metrics("\nLinear Regression - Test Set",
              linear_reg_mae_test, linear_reg_mse_test, linear_reg_rmse_test, linear_reg_r2_test)
print_metrics("\nRandom Forest Regressor - Test Set",
              rf_mae_test, rf_mse_test, rf_rmse_test, rf_r2_test)

# 4. Charts

In [None]:
# Bar plot for model metrics
# One row per model, one column per test-set metric.
metrics_df = pd.DataFrame({
    'Model': ['SVR', 'Linear Regression', 'Random Forest Regressor'],
    'MAE (Test Set)': [svr_mae_test, linear_reg_mae_test, rf_mae_test],
    'MSE (Test Set)': [svr_mse_test, linear_reg_mse_test, rf_mse_test],
    'RMSE (Test Set)': [svr_rmse_test, linear_reg_rmse_test, rf_rmse_test],
    'R² (Test Set)': [svr_r2_test, linear_reg_r2_test, rf_r2_test]
})

# Create the axes first and hand them to pandas. Without `ax=`, DataFrame.plot
# opens its own figure, which leaves the 10x6 figure blank and its size unused.
fig, ax = plt.subplots(figsize=(10, 6))
metrics_df.plot(x='Model', kind='bar', ax=ax, rot=45)
ax.set_title('Model Performance Comparison - Test Set')
ax.set_ylabel('Value')
ax.set_xlabel('Model')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Model Comparison Scatter Plot
# Actual vs. predicted target values on the test set, one point cloud per model.
fig, ax = plt.subplots(figsize=(8, 6))

# Drawn in this order so each model keeps the same default colour and legend slot.
predictions_by_model = [
    ('SVR', svr_test_predictions),
    ('Linear Regression', linear_reg_test_predictions),
    ('Random Forest Regressor', rf_test_predictions),
]
for model_label, predicted in predictions_by_model:
    ax.scatter(y_test, predicted, alpha=0.5, label=model_label)

ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Model Comparison - Actual vs Predicted (Test Set)")
ax.legend()
plt.show()
