# Forest Fire Size Prediction

### Importing all the required libraries.

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import category_encoders as ce

### Reading the CSV files

In [3]:
df = pd.read_csv("data/FW_Veg_Rem_Combined.csv")
df.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,fire_name,fire_size,fire_size_class,stat_cause_descr,latitude,longitude,state,disc_clean_date,...,Wind_cont,Hum_pre_30,Hum_pre_15,Hum_pre_7,Hum_cont,Prec_pre_30,Prec_pre_15,Prec_pre_7,Prec_cont,remoteness
0,0,0,,10.0,C,Missing/Undefined,18.105072,-66.753044,PR,2/11/2007,...,3.250413,78.21659,76.79375,76.381579,78.72437,0.0,0.0,0.0,0.0,0.017923
1,1,1,,3.0,B,Arson,35.03833,-87.61,TN,12/11/2006,...,2.12232,70.84,65.858911,55.505882,81.682678,59.8,8.4,0.0,86.8,0.184355
2,2,2,,60.0,C,Arson,34.9478,-88.7225,MS,2/29/2004,...,3.36905,75.531629,75.868613,76.812834,65.0638,168.8,42.2,18.1,124.5,0.194544
3,3,3,WNA 1,1.0,B,Debris Burning,39.6414,-119.3083,NV,6/6/2005,...,0.0,44.778429,37.140811,35.353846,0.0,10.4,7.2,0.0,0.0,0.487447
4,4,4,,2.0,B,Miscellaneous,30.7006,-90.5914,LA,9/22/1999,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.214633


### Removing the redundant and unnecessary columns

In [4]:
df = df.drop(['Unnamed: 0.1', 'Unnamed: 0', 'fire_name', 'state', 'cont_clean_date',
         'discovery_month', 'disc_date_final', 'cont_date_final', 'putout_time', 'disc_pre_year', 'disc_pre_month',
         'wstation_usaf', 'dstation_m', 'wstation_wban', 'fire_mag', 'weather_file'],axis=1)

#### Removed columns with null values, redundant columns like fire_mag, fire_size and date variables

In [5]:
df.head(5)

Unnamed: 0,fire_size,fire_size_class,stat_cause_descr,latitude,longitude,disc_clean_date,disc_date_pre,wstation_byear,wstation_eyear,Vegetation,...,Wind_cont,Hum_pre_30,Hum_pre_15,Hum_pre_7,Hum_cont,Prec_pre_30,Prec_pre_15,Prec_pre_7,Prec_cont,remoteness
0,10.0,C,Missing/Undefined,18.105072,-66.753044,2/11/2007,1/12/2007,1945,2018,12,...,3.250413,78.21659,76.79375,76.381579,78.72437,0.0,0.0,0.0,0.0,0.017923
1,3.0,B,Arson,35.03833,-87.61,12/11/2006,11/11/2006,1978,2020,15,...,2.12232,70.84,65.858911,55.505882,81.682678,59.8,8.4,0.0,86.8,0.184355
2,60.0,C,Arson,34.9478,-88.7225,2/29/2004,1/30/2004,1978,2020,16,...,3.36905,75.531629,75.868613,76.812834,65.0638,168.8,42.2,18.1,124.5,0.194544
3,1.0,B,Debris Burning,39.6414,-119.3083,6/6/2005,5/7/2005,1942,2020,0,...,0.0,44.778429,37.140811,35.353846,0.0,10.4,7.2,0.0,0.0,0.487447
4,2.0,B,Miscellaneous,30.7006,-90.5914,9/22/1999,8/23/1999,1987,2016,12,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.214633


#### As the target class is unbalanced we decided to club the smaller group of classes as 1.
#### Clubbing (C,D,E,F,G) as (1) class and A,B as (0) class.
#### 0 idicates small fire <25 Acres and 1 represents a widespread fire >25Acres.

In [6]:
class_mapping = {'A': 0, 'B': 0, 'C':1, 'D':1, 'E':1, 'F':1, 'G':1}
df = df.replace(class_mapping)

In [7]:
df.fire_size_class.value_counts()

fire_size_class
0    36522
1    18845
Name: count, dtype: int64

In [8]:
(df.fire_size_class.value_counts()/df.shape[0])*100

fire_size_class
0    65.96348
1    34.03652
Name: count, dtype: float64

### Extracting the date and month and removing the redundant columns.

In [9]:
# Extract day, month, year from discovery clean date
df['disc_clean_date'] = pd.to_datetime(df['disc_clean_date'])

df['disc_month'] = df['disc_clean_date'].dt.month

# Drop the columns which are not required
df = df.drop(['disc_clean_date', 'disc_date_pre', \
              'wstation_byear', 'wstation_eyear'],axis=1)

In [10]:
df= df.drop(["fire_size"], axis = 1)

In [11]:
df['Vegetation'] = df['Vegetation'].astype(object)

### Applying MinMaxScaler to the weather variables

In [12]:
from sklearn.preprocessing import MinMaxScaler

trans = MinMaxScaler()
df.iloc[:, 5:21] = trans.fit_transform(df.iloc[:, 5:21])

In [13]:
X = df.drop('fire_size_class',axis=1)
y = df['fire_size_class']

In [14]:
X.columns

Index(['stat_cause_descr', 'latitude', 'longitude', 'Vegetation',
       'Temp_pre_30', 'Temp_pre_15', 'Temp_pre_7', 'Temp_cont', 'Wind_pre_30',
       'Wind_pre_15', 'Wind_pre_7', 'Wind_cont', 'Hum_pre_30', 'Hum_pre_15',
       'Hum_pre_7', 'Hum_cont', 'Prec_pre_30', 'Prec_pre_15', 'Prec_pre_7',
       'Prec_cont', 'remoteness', 'disc_month'],
      dtype='object')

### Defining the function for target encoding

In [15]:
def target_encode_multiclass(X,y): #X,y are pandas df and series
    y=y.astype(str)   #convert to string to onehot encode
    enc=ce.OneHotEncoder().fit(y)
    y_onehot=enc.transform(y)
    class_names=y_onehot.columns  #names of onehot encoded columns
    X_obj=X.select_dtypes('object') #separate categorical columns
    X=X.select_dtypes(exclude='object')
    for class_ in class_names:
        enc=ce.TargetEncoder()
        enc.fit(X_obj,y_onehot[class_]) #convert all categorical
        temp=enc.transform(X_obj)       #columns for class_
        temp.columns=[str(x)+'_'+str(class_) for x in temp.columns]
        X=pd.concat([X,temp],axis=1)    #add to original dataset

    return X

In [16]:
X = target_encode_multiclass(X,y)

In [17]:
from sklearn.metrics import roc_curve, accuracy_score
from sklearn.metrics import auc, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

### Random Forest

In [18]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

num_folds = 5

# Set up k-fold cross-validation
stratified_kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

In [19]:
RF = RandomForestClassifier()

# Lists to store metrics for each fold
RF_precision_list = []
RF_recall_list = []
RF_f1_list = []

for train_index, val_index in stratified_kf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
    RF.fit(X_train_fold, y_train_fold)
    RF_pred = RF.predict(X_val_fold)

    # Calculate metrics for each fold
    classification_report_fold = classification_report(y_val_fold, RF_pred, output_dict=True)
    
    RF_precision_list.append(classification_report_fold['weighted avg']['precision'])
    RF_recall_list.append(classification_report_fold['weighted avg']['recall'])
    RF_f1_list.append(classification_report_fold['weighted avg']['f1-score'])

# Calculate mean metrics across all folds
mean_RF_precision = np.mean(RF_precision_list)
mean_RF_recall = np.mean(RF_recall_list)
mean_RF_f1 = np.mean(RF_f1_list)

# Print or use the mean metrics as needed
print(f'Mean Precision for RF: {mean_RF_precision}')
print(f'Mean Recall for RF: {mean_RF_recall}')
print(f'Mean F1-Score for RF: {mean_RF_f1}')

Mean Precision for RF: 0.7446038540182546
Mean Recall for RF: 0.750276560013198
Mean F1-Score for RF: 0.7315982271038048


In [20]:
import pickle

Pkl_Filename = "RandomForest_ForestFire.pkl"

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(RF, file)