In [40]:
import pandas as pd
from sklearn import linear_model
from sklearn import tree
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import numpy as np
import statsmodels.api as sm
import sklearn.metrics as metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV

#### [Cortez and Morais, 2007] P. Cortez and A. Morais. A Data Mining Approach to Predict Forest Fires using Meteorological Data. In J. Neves, M. F. Santos and J. Machado Eds., New Trends in Artificial Intelligence, Proceedings of the 13th EPIA 2007 - Portuguese Conference on Artificial Intelligence, December, Guimarães, Portugal, pp. 512-523, 2007. APPIA, ISBN-13 978-989-95618-0-9.

In [41]:
df = pd.read_csv('/Users/ryanc/Desktop/forestfires.csv')
df['area_t'] = np.log(df['area'] + 1)
df = df.join(pd.get_dummies(df['month']))
df = df.join(pd.get_dummies(df['day']))

In [42]:
df_s = df.iloc[:, [0,1]]
df_t = df.iloc[:, 14:33]
df_fwi = df.iloc[:, 4:8]
df_m = df.iloc[:, 8:12]
df_st = df_s.join(df_t)
df_stfwi = df_st.join(df_fwi)
df_stm = df_st.join(df_m)
feature_list = [df_stfwi, df_stm, df_fwi, df_m]
features_names = ['STFWI', 'STM', 'FWI', 'M']

In [50]:
def regression_results(y_true, y_pred, model):

    # Regression metrics
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred) 
    mse = metrics.mean_squared_error(y_true, y_pred) 
    median_absolute_error = metrics.median_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    
    print('explained_variance: ', round(explained_variance, 4))    
    print('r2: ', round(r2, 4))
    print('MAE: ', round(mean_absolute_error, 4))
    print('MSE: ', round(mse, 4))
    print('RMSE: ', round(np.sqrt(mse), 4))

In [45]:
reg = linear_model.LinearRegression()
dt = tree.DecisionTreeRegressor()
svm_func = svm.SVR()
rf = RandomForestRegressor()
nn = MLPRegressor()

In [46]:
for idx, features in enumerate(feature_list):
    print('Linear Regression: ' + features_names[idx])
    X_train, X_test, y_train, y_test = train_test_split(features, df['area_t'], test_size=0.3, random_state = 0 )
    model = reg.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred = (np.exp(y_pred))-1
    y_pred[y_pred < 0] = 0
    y_true = y_test
    regression_results(y_true, y_pred, 'reg')
    print('\n')
    y_pred = model.predict(features)
    y_pred = (np.exp(y_pred))-1
    y_pred[y_pred < 0] = 0
    y_true = df['area']
    regression_results(y_true, y_pred, 'reg')
    print('\n')

Linear Regression: STFWI
explained_variance:  -0.7638
r2:  -1.0608
MAE:  1.6921
MSE:  4.6465
RMSE:  2.1556


explained_variance:  0.0024
r2:  -0.0262
MAE:  12.7956
MSE:  4150.1836
RMSE:  64.4219


Linear Regression: STM
explained_variance:  -0.6183
r2:  -0.9271
MAE:  1.6416
MSE:  4.3453
RMSE:  2.0845


explained_variance:  0.0021
r2:  -0.0266
MAE:  12.7996
MSE:  4151.6442
RMSE:  64.4333


Linear Regression: FWI
explained_variance:  0.0078
r2:  -0.1315
MAE:  1.399
MSE:  2.5513
RMSE:  1.5973


explained_variance:  0.0001
r2:  -0.0299
MAE:  12.9386
MSE:  4164.9944
RMSE:  64.5368


Linear Regression: M
explained_variance:  -0.0346
r2:  -0.1764
MAE:  1.4135
MSE:  2.6526
RMSE:  1.6287


explained_variance:  -0.0002
r2:  -0.0301
MAE:  12.9253
MSE:  4165.8529
RMSE:  64.5434




In [48]:
kf = KFold(n_splits = 10, shuffle = True, random_state = 0)
df_target = df['area_t'].round().astype(int)

for idx, features in enumerate(feature_list):
    print('Decision Tree: ' + features_names[idx])
    X_train, X_test, y_train, y_test = train_test_split(features, df_target, test_size=0.3, random_state = 0 )
    parameters = {'max_leaf_nodes': list(range(2, 10)), 'min_samples_split': list(range(2, 10))}
    grid = GridSearchCV(dt, parameters, cv=kf, scoring = 'neg_mean_squared_error')
    grid.fit(X_train, y_train)
    model = grid.best_estimator_
    y_pred = model.predict(X_test)
    y_pred = (np.exp(y_pred))-1
    y_pred[y_pred < 0] = 0
    y_true = y_test
    regression_results(y_true, y_pred, 'dt')
    print('\n')
    y_pred = model.predict(features)
    y_pred = (np.exp(y_pred))-1
    y_pred[y_pred < 0] = 0
    y_true = df['area']
    regression_results(y_true, y_pred, 'dt')
    print('\n')

Decision Tree: STFWI
explained_variance:  -0.1028
r2:  -0.2192
MAE:  1.4122
MSE:  2.7452
RMSE:  1.6569


explained_variance:  -0.0002
r2:  -0.0301
MAE:  12.802
MSE:  4166.1244
RMSE:  64.5455


Decision Tree: STM
explained_variance:  -0.2412
r2:  -0.3598
MAE:  1.4392
MSE:  3.0618
RMSE:  1.7498


explained_variance:  -0.0004
r2:  -0.0303
MAE:  12.8123
MSE:  4166.5941
RMSE:  64.5492


Decision Tree: FWI
explained_variance:  -0.0629
r2:  -0.2239
MAE:  1.4444
MSE:  2.7558
RMSE:  1.6601


explained_variance:  -0.0002
r2:  -0.0303
MAE:  12.9175
MSE:  4166.7345
RMSE:  64.5502


Decision Tree: M
explained_variance:  -0.2412
r2:  -0.3598
MAE:  1.4392
MSE:  3.0618
RMSE:  1.7498


explained_variance:  -0.0004
r2:  -0.0303
MAE:  12.8123
MSE:  4166.5941
RMSE:  64.5492




In [49]:
kf = KFold(n_splits = 10, shuffle = True, random_state = 0)
df_target = df['area_t'].round().astype(int)

for idx, features in enumerate(feature_list):
    print('SVM: ' + features_names[idx])
    features = preprocessing.scale(features)
    X_train, X_test, y_train, y_test = train_test_split(features, df_target, test_size=0.3, random_state = 0 )
    parameters = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001, .00001], 'kernel': ['rbf']} 
    grid = GridSearchCV(svm_func, parameters, cv=kf, scoring = 'neg_mean_squared_error')
    grid.fit(X_train, y_train)
    model = grid.best_estimator_
    y_pred = model.predict(X_test)
    y_pred = (np.exp(y_pred))-1
    y_pred[y_pred < 0] = 0
    y_true = y_test
    regression_results(y_true, y_pred, 'svm')
    print('\n')
    y_pred = model.predict(features)
    y_pred = (np.exp(y_pred))-1
    y_pred[y_pred < 0] = 0
    y_true = df['area']
    regression_results(y_true, y_pred, 'svm')
    print('\n')

SVM: STFWI
explained_variance:  -0.5285
r2:  -0.5318
MAE:  1.3988
MSE:  3.4491
RMSE:  1.8572


explained_variance:  0.007
r2:  -0.0246
MAE:  11.9087
MSE:  4143.7624
RMSE:  64.3721


SVM: STM
explained_variance:  -0.245
r2:  -0.2463
MAE:  1.3181
MSE:  2.8063
RMSE:  1.6752


explained_variance:  0.0072
r2:  -0.0242
MAE:  11.8028
MSE:  4142.2328
RMSE:  64.3602


SVM: FWI
explained_variance:  -0.5999
r2:  -0.6378
MAE:  1.4136
MSE:  3.6877
RMSE:  1.9203


explained_variance:  -0.001
r2:  -0.0363
MAE:  12.6287
MSE:  4191.2103
RMSE:  64.7396


SVM: M
explained_variance:  -0.6254
r2:  -0.6825
MAE:  1.4216
MSE:  3.7883
RMSE:  1.9464


explained_variance:  0.0017
r2:  -0.0333
MAE:  12.5614
MSE:  4178.8643
RMSE:  64.6441




The random forest and neural network code blocks are in markdown to avoid running them incidentally.  

kf = KFold(n_splits = 10, shuffle = True, random_state = 0)
df_target = df['area_t'].round().astype(int)

#identify which parameters and ranges to include in the grid search
#technique sourced from https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

print(rf.get_params())

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_features, train_labels)

rf_random.best_params_

#use a range around the best parameters as the parameter grid in the grid search

for idx, features in enumerate(feature_list):
    print('RF: ' + features_names[idx])
    X_train, X_test, y_train, y_test = train_test_split(features, df_target, test_size=0.3, random_state = 0 )
    parameters = {}
    grid = GridSearchCV(rf, parameters, cv=kf, scoring = 'neg_mean_absolute_error')
    grid.fit(X_train, y_train)
    model = grid.best_estimator_
    y_pred = model.predict(X_test)
    y_pred = (np.exp(y_pred))-1
    y_pred[y_pred < 0] = 0
    y_true = y_test
    regression_results(y_true, y_pred, 'reg')
    print('\n')
    y_pred = model.predict(features)
    y_pred = (np.exp(y_pred))-1
    y_pred[y_pred < 0] = 0
    y_true = df['area']
    regression_results(y_true, y_pred, 'reg')
    print('\n')

kf = KFold(n_splits = 10, shuffle = True, random_state = 0)
df_target = df['area_t'].round().astype(int)

#identify which parameters and ranges to include in the grid search
#technique sourced from https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

print(nn.get_params())

#replicate the random forest example for identifying the range of best parameters to search for using the neural network parameters


for idx, features in enumerate(feature_list):
    print('NN: ' + features_names[idx])
    features = preprocessing.scale(features)
    X_train, X_test, y_train, y_test = train_test_split(features, df_target, test_size=0.3, random_state = 0 )
    parameters = {}
    grid = GridSearchCV(nn, parameters, cv=kf, scoring = 'neg_mean_absolute_error')
    grid.fit(X_train, y_train)
    model = grid.best_estimator_
    y_pred = model.predict(X_test)
    y_pred = (np.exp(y_pred))-1
    y_pred[y_pred < 0] = 0
    y_true = y_test
    regression_results(y_true, y_pred, 'reg')
    print('\n')
    y_pred = model.predict(features)
    y_pred = (np.exp(y_pred))-1
    y_pred[y_pred < 0] = 0
    y_true = df['area']
    regression_results(y_true, y_pred, 'reg')
    print('\n')