In [36]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import pickle
import time

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,RandomForestRegressor
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [37]:
# Load the two pickled objects this notebook works from.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('pickles/df_description.pkl', 'rb') as description_file:
    df_description = pickle.load(description_file)

with open('pickles/df_model.pkl', 'rb') as model_file:
    df_model = pickle.load(model_file)

In [None]:
# NOTE(review): `data` is only assigned inside the API loop in the next cell,
# so this inspection cell raises NameError on a fresh kernel (Restart & Run All).
data['properties']['layers']

In [None]:
# Query the soil API once per coordinate pair and collect the mean clay / sand /
# silt values for the 0-5cm depth in long format (one row per coordinate and soil type).
# NOTE(review): `url` and `df_api` are not defined in any visible cell; they must be
# defined before this cell for the notebook to survive Restart & Run All.
soil_columns = ['LATITUDE', 'LONGITUDE', 'soil_type', 'value']
soil_properties = ['clay', 'sand', 'silt']
headers = {'accept': 'application/json'}

# Collect plain dicts and build the DataFrame once at the end: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
soil_records = []

# Iterate over coordinates
for index, row in df_api.iterrows():
    lat = row['LATITUDE']
    lon = row['LONGITUDE']

    # Make API request
    params = {
        'lon': lon,
        'lat': lat,
        'property': soil_properties,
        'depth': ['0-5cm'],
        'value': ['mean']
    }

    response = requests.get(url, params=params, headers=headers)
    # Surface HTTP failures here rather than as a confusing KeyError / JSON error below.
    response.raise_for_status()
    data = response.json()

    # Extract results for clay, sand, and silt
    for layer in data['properties']['layers']:
        name = layer['name']
        if name in soil_properties:
            for depth in layer['depths']:
                value = depth['values']['mean']
                soil_records.append(
                    {'LATITUDE': lat, 'LONGITUDE': lon, 'soil_type': name, 'value': value}
                )

# Passing `columns` keeps the expected schema even when no records were collected.
df_soil = pd.DataFrame(soil_records, columns=soil_columns)


In [None]:
# Reshape the long soil table to one row per coordinate pair with a column per soil type.
soil_wide = df_soil.pivot_table(
    index=['LATITUDE', 'LONGITUDE'],
    columns='soil_type',
    values='value',
).reset_index()
df_pivot = soil_wide[['LATITUDE', 'LONGITUDE', 'clay', 'sand', 'silt']]


In [None]:
# Outer-join the soil columns onto the modelling frame by coordinate pair.
merged_df = df_model.merge(
    df_pivot,
    how='outer',
    on=['LATITUDE', 'LONGITUDE'],
)


### Dummifying Columns

In [None]:
# Columns to one-hot encode.
categorical = ['STAT_CAUSE_DESCR', 'STATE', 'SOURCE_SYSTEM_TYPE']
# Columns used as-is.
numeric = ['DISCOVERY_DOY', 'duration']

In [None]:
# Numeric features are taken unchanged; categorical ones are one-hot encoded,
# dropping the first level of each column.
df_num = df_model[numeric]
df_dum = pd.get_dummies(df_model[categorical], drop_first=True)

In [None]:
# Combine numeric and dummy features side by side, then attach the target column.
df = pd.concat([df_num, df_dum], axis='columns').assign(
    FIRE_SIZE_CLASS=df_model['FIRE_SIZE_CLASS']
)

### Random Sampling

Because our dataset is so large, we'll draw a random 10% sample and use it to tune hyperparameters. The optimal hyperparameters found on the sample can then be applied to our full Random Forest model.

In [None]:
# Seeded 10% random sample of the modelling frame, used only for tuning.
df_sample = df.sample(
    frac=0.1,
    random_state=2023,
)
df_sample.shape

#### Train Test Split with Sample X & y

In [None]:
# Target is the fire size class; every other column of the sample is a feature.
y = df_sample['FIRE_SIZE_CLASS']
X = df_sample.drop(columns='FIRE_SIZE_CLASS')

In [None]:
# Hold out 20% of the sample for testing (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2023,
)

#### Random Forest with Sample

In [None]:
# Random forest used for hyperparameter tuning on the sample.
# random_state is fixed (same seed as the sampling and split cells) so the
# grid-search results are reproducible across kernel restarts.
model_rf_sample = RandomForestClassifier(n_jobs=-1, random_state=2023)

In [None]:
# Show the estimator's current parameter settings before tuning.
model_rf_sample.get_params()

In [None]:
# Candidate values searched for each random-forest hyperparameter.
params_rf = dict(
    n_estimators=[320, 340, 360],
    max_depth=[25, 26, 27, 28],
    min_samples_split=[58, 60, 62],
    min_samples_leaf=[2, 4, 6],
)

In [None]:
# 5-fold cross-validated search over the grid, fit on the sample's training split.
gs = GridSearchCV(
    estimator=model_rf_sample,
    param_grid=params_rf,
    cv=5,
)
gs.fit(X_train, y_train)

In [None]:
# Keep the best parameter combination found by the grid search and display it.
best_params_rf = gs.best_params_
best_params_rf

Grid searched:

params_rf = {
    'n_estimators': [320,340,360],
    'max_depth': [25,26,27,28],
    'min_samples_split': [58, 60, 62],
    'min_samples_leaf': [2, 4, 6],
}

Best parameters found:

{'max_depth': 27,
 'min_samples_leaf': 2,
 'min_samples_split': 58,
 'n_estimators': 340}

In [None]:
# Apply the parameters stored in best_params_rf to the sample model.
model_rf_sample.set_params(**best_params_rf)

In [None]:
# Refit the sample model on the sample's training split.
model_rf_sample.fit(X_train, y_train)

In [None]:
# Score the sample model on the held-out split and keep its predictions.
score_rf = model_rf_sample.score(X_test, y_test)
y_pred_rf = model_rf_sample.predict(X_test)
score_rf

### Model Evaluation (Random Forest)

We'll evaluate several models on the full dataset, starting with a baseline and a Random Forest.

In [None]:
# Build features and target from the full dataset.
# `df` carries the FIRE_SIZE_CLASS target as a column, so it must be dropped
# from X -- otherwise the model is trained on the very label it predicts.
X = df.drop(columns='FIRE_SIZE_CLASS')
y = df['FIRE_SIZE_CLASS']

In [None]:
# Hold out 20% of the full dataset for testing (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2023,
)

#### Baseline Model

In [None]:
# Baseline: the share of each class in the target.
null_model = y.value_counts(normalize=True)
null_model

#### Random Forest

In [None]:
# Random forest for the full dataset.
# n_jobs=-1 matches the sample model and uses all cores; random_state is fixed
# (same seed as the sampling and split cells) so the fit is reproducible.
model_rf = RandomForestClassifier(n_jobs=-1, random_state=2023)

In [None]:
# Show the estimator's current parameter settings.
model_rf.get_params()

In [None]:
# NOTE(review): this cell is disabled, so `params_rf` still holds the value assigned
# in the sample-tuning section (a grid of candidate lists, not single values).
# params_rf = {
#     'n_estimators': 100,
#     'max_depth': None,
#     'min_samples_split': 2,
#     'n_jobs': -1
# }

In [None]:
#gs = GridSearchCV(model_rf,params_rf,cv=5)

In [None]:
#gs.fit(X_train,y_train)

In [None]:
# Reuse the hyperparameters tuned on the sample for the full model.
# `params_rf` is the search grid (every value is a list of candidates), which
# cannot be passed to set_params/fit; the tuned values live in gs.best_params_.
best_params_rf = gs.best_params_
best_params_rf

In [None]:
# Apply the parameters stored in best_params_rf to the full-data model.
model_rf.set_params(**best_params_rf)

In [None]:
# Fit the model on the full dataset's training split.
model_rf.fit(X_train, y_train)

In [None]:
# Score the full model on the held-out split and keep its predictions.
score_rf = model_rf.score(X_test, y_test)
y_pred_rf = model_rf.predict(X_test)
score_rf

# accuracy_rf = model_rf.score(X_test, y_test)
# precision_rf = precision_score(y_test_rf, y_pred)
# recall_rf = recall_score(y_test, y_pred_rf)
# f1_rf = f1_score(y_test, y_pred_rf)

In [None]:
# Row-normalised confusion matrix for the full model on the held-out split.
fig, ax = plt.subplots()
ConfusionMatrixDisplay.from_estimator(
    model_rf,
    X_test,
    y_test,
    cmap='Reds',
    normalize='true',
    ax=ax,
)
ax.set_title("Confusion Matrix");