In [1]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import pickle
import time

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,RandomForestRegressor
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
with open('pickles/df_description.pkl', 'rb') as f:
    df_description = pickle.load(f)
    
with open('pickles/df_model.pkl', 'rb') as f:
    df_model = pickle.load(f)

In [3]:
df_description

Unnamed: 0,SOURCE_REPORTING_UNIT_NAME,FIRE_YEAR,DISCOVERY_DOY,FIRE_SIZE,LATITUDE,LONGITUDE,STATE,duration,STAT_CAUSE_DESCR
0,Plumas National Forest,2005,33,0.10,40.036944,-121.005833,CA,0.0,Miscellaneous
1,Eldorado National Forest,2004,133,0.25,38.933056,-120.404444,CA,0.0,Lightning
2,Eldorado National Forest,2004,152,0.10,38.984167,-120.735556,CA,0.0,Debris Burning
3,Eldorado National Forest,2004,180,0.10,38.559167,-119.913333,CA,5.0,Lightning
4,Eldorado National Forest,2004,180,0.10,38.559167,-119.933056,CA,5.0,Lightning
...,...,...,...,...,...,...,...,...,...
1880456,Tehama-Glenn Unit,2015,165,2.22,40.019907,-122.391398,CA,0.0,Missing/Undefined
1880457,Shasta-Trinity Unit,2015,273,1.00,40.588583,-123.069617,CA,1.0,Arson
1880458,Humboldt-Del Norte Unit,2015,213,4.00,40.244833,-123.544167,CA,5.0,Lightning
1880459,Sonoma-Lake Napa Unit,2015,148,0.50,38.415608,-122.660044,CA,0.0,Miscellaneous


### Dummifying Columns

In [4]:
categorical = ['STAT_CAUSE_DESCR',
               'STATE','SOURCE_SYSTEM_TYPE']
numeric = ['DISCOVERY_DOY','duration']

In [5]:
df_dum = pd.get_dummies(df_model[categorical],drop_first= True)
df_num = df_model[numeric]

In [6]:
df = pd.concat([df_num,df_dum],axis=1)
df['FIRE_SIZE_CLASS'] = df_model['FIRE_SIZE_CLASS']

### Random Sampling

Because our dataset is so large, we'll obtain a representative subset from our dataset to help us tune our parameters. This will help us discover the optimal hyperparameters that we can then apply to our full Random Forest model.

In [7]:
df_sample = df.sample(frac = .1, random_state = 2023)
df_sample.shape

(98893, 68)

#### Train Test Split with Sample X & y

In [8]:
X = df_sample.drop('FIRE_SIZE_CLASS',axis=1)
y = df_sample['FIRE_SIZE_CLASS']

In [9]:
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=.2,random_state=2023)

#### Random Forest with Sample

In [10]:
model_rf_sample = RandomForestClassifier(n_jobs=-1)

In [11]:
model_rf_sample.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [12]:
params_rf = {
    'n_estimators': [320,340,360],
    'max_depth': [25,26,27,28],
    'min_samples_split': [58, 60, 62],
    'min_samples_leaf': [2, 4, 6],
}

In [13]:
gs = GridSearchCV(model_rf_sample,params_rf,cv=5)
gs.fit(X_train,y_train)

In [14]:
best_params_rf = gs.best_params_
best_params_rf

{'max_depth': 27,
 'min_samples_leaf': 2,
 'min_samples_split': 58,
 'n_estimators': 340}

From:

params_rf = {
    'n_estimators': [320,340,360],
    'max_depth': [25,26,27,28],
    'min_samples_split': [58, 60, 62],
    'min_samples_leaf': [2, 4, 6],
}

Result:

{'max_depth': 27,
 'min_samples_leaf': 2,
 'min_samples_split': 58,
 'n_estimators': 340}

In [15]:
model_rf_sample.set_params(**best_params_rf)

In [16]:
model_rf_sample.fit(X_train,y_train)

In [17]:
y_pred_rf = model_rf_sample.predict(X_test)
score_rf = model_rf_sample.score(X_test,y_test)
score_rf

0.6230850902472319

### Model Evaluation (Random Forest)

We'll evaluate several models

In [18]:
X = df
y = df_model['FIRE_SIZE_CLASS']

In [19]:
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=.2,random_state=2023)

#### Baseline Model

In [20]:
null_model = y.value_counts(normalize= True)
null_model

B    0.434500
A    0.426586
C    0.104777
D    0.015896
E    0.009117
F    0.005867
G    0.003257
Name: FIRE_SIZE_CLASS, dtype: float64

### Random Forest

In [21]:
model_rf = RandomForestClassifier()

In [22]:
model_rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [23]:
# params_rf = {
#     'n_estimators': 100,
#     'max_depth': None,
#     'min_samples_split': 2,
#     'n_jobs': -1
# }

In [24]:
#gs = GridSearchCV(model_rf,params_rf,cv=5)

In [25]:
#gs.fit(X_train,y_train)

In [26]:
best_params_rf = params_rf
best_params_rf

{'n_estimators': [320, 340, 360],
 'max_depth': [25, 26, 27, 28],
 'min_samples_split': [58, 60, 62],
 'min_samples_leaf': [2, 4, 6]}

In [27]:
model_rf.set_params(**best_params_rf)

In [28]:
model_rf.fit(X_train,y_train)

InvalidParameterError: The 'max_depth' parameter of RandomForestClassifier must be an int in the range [1, inf) or None. Got [25, 26, 27, 28] instead.

In [None]:
y_pred_rf = model_rf.predict(X_test)
score_rf = model_rf.score(X_test,y_test)
score_rf

# accuracy_rf = model_rf.score(X_test, y_test)
# precision_rf = precision_score(y_test_rf, y_pred)
# recall_rf = recall_score(y_test, y_pred_rf)
# f1_rf = f1_score(y_test, y_pred_rf)

In [None]:
ConfusionMatrixDisplay.from_estimator(model_rf,X_test,y_test, cmap = 'Reds',normalize= 'true');
plt.title("Confusion Matrix");