In [36]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std

from patsy import dmatrices

from sklearn import svm, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neural_network import MLPRegressor, MLPClassifier

import imblearn

from scipy import stats
import kaleido

import warnings
warnings.filterwarnings(action='ignore')

In [37]:
import os

if not os.path.exists("images"):
    os.mkdir("images")

In [30]:
f = '/Users/shenchingfeng/Documents/1112/機器學習/Homework 1/forestfires.csv'
df = pd.read_csv(f)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    object 
 3   day     517 non-null    object 
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(3), object(2)
memory usage: 52.6+ KB


In [31]:
df['scale'] = df['area'].apply(lambda x: 'no fire' if (x == 0) else
                                              'small fire' if (0 < x < 6) else
                                              'large fire'
                                    )

In [39]:
df['scale'].value_counts()

no fire       247
large fire    139
small fire    131
Name: scale, dtype: int64

In [15]:
df['scale'].value_counts(normalize = True).round(4)*100

no fire       47.78
large fire    26.89
small fire    25.34
Name: scale, dtype: float64

In [75]:
# count of each value
value_counts = df['scale'].value_counts()

# convert value_counts to a DataFrame
value_counts_df = value_counts.to_frame().reset_index()

# rename columns
value_counts_df.columns = ['scale', 'count']

# add percentage column
value_counts_df['percentage'] = (value_counts_df['count'] / len(df)) * 100

# round percentage to 2 decimal places
value_counts_df['percentage'] = value_counts_df['percentage'].round(2)

# sort by scale in the order 'no fire', 'small fire', 'large fire'
scale_order = ['no fire', 'small fire', 'large fire']
value_counts_df['scale'] = pd.Categorical(value_counts_df['scale'], categories=scale_order, ordered=True)
value_counts_df = value_counts_df.sort_values('scale')

# display the resulting DataFrame
value_counts_df

Unnamed: 0,scale,count,percentage
0,no fire,247,47.78
2,small fire,131,25.34
1,large fire,139,26.89


# Classification

In [298]:
X = d.drop(['area', 'scale'], axis=1)
y = d['scale']
X = pd.get_dummies(X, ['month', 'day'])
scale_order = ['no fire', 'small fire', 'large fire']
month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
d['scale'] = pd.Categorical(d['scale'], categories = scale_order, ordered = True)
d['month'] = pd.Categorical(d['month'], categories = month_order, ordered = True)
result = d.groupby(['month', 'scale'])['scale'].count().unstack('scale').fillna(0)
result = result.apply(lambda x: x/x.sum()*100, axis = 1).round(2)
result = result.reindex(month_order)
result = result[['no fire', 'small fire', 'large fire']]
result = pd.DataFrame(result)
result.T

month,jan,feb,mar,apr,may,jun,jul,aug,sep,oct,nov,dec
scale,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
no fire,100.0,50.0,64.81,55.56,50.0,52.94,43.75,46.2,43.6,66.67,100.0,0.0
small fire,0.0,20.0,12.96,22.22,0.0,29.41,28.12,30.43,26.74,6.67,0.0,11.11
large fire,0.0,30.0,22.22,22.22,50.0,17.65,28.12,23.37,29.65,26.67,0.0,88.89


# Upsampling

### All data

In [105]:
d = df.copy()
d_up = d.copy()

for m in d['month'].unique():
    if((m != 'aug') & (m != 'sep')):
        temp = d[d['month'] == m].sample(200, replace = True)
        if d is d_up:
            d_up = pd.concat([d, temp], axis = 0)
        else:
            d_up = pd.concat([d_up, temp], axis = 0)

X = d_up.drop(['area', 'scale'], axis=1)
y = d_up['scale']
X = pd.get_dummies(X, ['month', 'day'])
scale_order = ['no fire', 'small fire', 'large fire']
month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
d_up['scale'] = pd.Categorical(d_up['scale'], categories = scale_order, ordered = True)
d_up['month'] = pd.Categorical(d_up['month'], categories = month_order, ordered = True)
result = d_up.groupby(['month', 'scale'])['scale'].count().unstack('scale').fillna(0)
result = result.apply(lambda x: x/x.sum()*100, axis = 1).round(2)
result = result.reindex(month_order)
result = result[['no fire', 'small fire', 'large fire']]
result = pd.DataFrame(result)

x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, shuffle = True, random_state = 42)

print(x_train.shape, x_test.shape)


(1761, 29) (756, 29)


### Stepwise selection

In [132]:
d = df.copy()
d_up = d.copy()

for m in d['month'].unique():
    if((m != 'aug') & (m != 'sep')):
        temp = d[d['month'] == m].sample(200, replace = True)
        if d is d_up:
            d_up = pd.concat([d, temp], axis = 0)
        else:
            d_up = pd.concat([d_up, temp], axis = 0)

X = d_up.drop(['area', 'scale'], axis=1)
y = d_up['scale']
X = pd.get_dummies(X, ['month', 'day'])
X = X[['X', 'FFMC', 'DMC', 'ISI', 'temp', 'wind', 'month_dec', 'month_nov', 'day_fri']]
scale_order = ['no fire', 'small fire', 'large fire']
# month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
d_up['scale'] = pd.Categorical(d_up['scale'], categories = scale_order, ordered = True)
# d_up['month'] = pd.Categorical(d_up['month'], categories = month_order, ordered = True)
# result = d_up.groupby(['month', 'scale'])['scale'].count().unstack('scale').fillna(0)
# result = result.apply(lambda x: x/x.sum()*100, axis = 1).round(2)
# result = result.reindex(month_order)
# result = result[['no fire', 'small fire', 'large fire']]
# result = pd.DataFrame(result)

x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, shuffle = True, random_state = 42)
print(x_train.shape, x_test.shape)

(1761, 9) (756, 9)


### STFWI

In [116]:
d = df.copy()
d_up = d.copy()

for m in d['month'].unique():
    if((m != 'aug') & (m != 'sep')):
        temp = d[d['month'] == m].sample(200, replace = True)
        if d is d_up:
            d_up = pd.concat([d, temp], axis = 0)
        else:
            d_up = pd.concat([d_up, temp], axis = 0)

X = d_up.drop(['area', 'scale', 'rain', 'wind', 'temp', 'RH'], axis=1)
y = d_up['scale']
X = pd.get_dummies(X, ['month', 'day'])
scale_order = ['no fire', 'small fire', 'large fire']
month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
d_up['scale'] = pd.Categorical(d_up['scale'], categories = scale_order, ordered = True)
d_up['month'] = pd.Categorical(d_up['month'], categories = month_order, ordered = True)
result = d_up.groupby(['month', 'scale'])['scale'].count().unstack('scale').fillna(0)
result = result.apply(lambda x: x/x.sum()*100, axis = 1).round(2)
result = result.reindex(month_order)
result = result[['no fire', 'small fire', 'large fire']]
result = pd.DataFrame(result)

x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, shuffle = True, random_state = 42)

print(x_train.shape, x_test.shape)

(1761, 25) (756, 25)


### STM

In [119]:
d = df.copy()
d_up = d.copy()

for m in d['month'].unique():
    if((m != 'aug') & (m != 'sep')):
        temp = d[d['month'] == m].sample(200, replace = True)
        if d is d_up:
            d_up = pd.concat([d, temp], axis = 0)
        else:
            d_up = pd.concat([d_up, temp], axis = 0)

X = d_up.drop(['area', 'scale', 'FFMC', 'DMC', 'DC', 'ISI'], axis=1)
y = d_up['scale']
X = pd.get_dummies(X, ['month', 'day'])
scale_order = ['no fire', 'small fire', 'large fire']
month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
d_up['scale'] = pd.Categorical(d_up['scale'], categories = scale_order, ordered = True)
d_up['month'] = pd.Categorical(d_up['month'], categories = month_order, ordered = True)
result = d_up.groupby(['month', 'scale'])['scale'].count().unstack('scale').fillna(0)
result = result.apply(lambda x: x/x.sum()*100, axis = 1).round(2)
result = result.reindex(month_order)
result = result[['no fire', 'small fire', 'large fire']]
result = pd.DataFrame(result)

x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, shuffle = True, random_state = 42)

print(x_train.shape, x_test.shape)

(1761, 25) (756, 25)


### FWI

In [122]:
d = df.copy()
d_up = d.copy()

for m in d['month'].unique():
    if((m != 'aug') & (m != 'sep')):
        temp = d[d['month'] == m].sample(200, replace = True)
        if d is d_up:
            d_up = pd.concat([d, temp], axis = 0)
        else:
            d_up = pd.concat([d_up, temp], axis = 0)

X = d_up.drop(['area', 'scale', 'rain', 'temp', 'RH', 'wind', 'month', 'day', 'X', 'Y'], axis=1)
y = d_up['scale']
# X = pd.get_dummies(X, ['month', 'day'])
scale_order = ['no fire', 'small fire', 'large fire']
# month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
d_up['scale'] = pd.Categorical(d_up['scale'], categories = scale_order, ordered = True)
# d_up['month'] = pd.Categorical(d_up['month'], categories = month_order, ordered = True)
# result = d_up.groupby(['month', 'scale'])['scale'].count().unstack('scale').fillna(0)
# result = result.apply(lambda x: x/x.sum()*100, axis = 1).round(2)
# result = result.reindex(month_order)
# result = result[['no fire', 'small fire', 'large fire']]
# result = pd.DataFrame(result)

x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, shuffle = True, random_state = 42)

print(x_train.shape, x_test.shape)
# x_train

(1761, 4) (756, 4)


### M

In [125]:
d = df.copy()
d_up = d.copy()

for m in d['month'].unique():
    if((m != 'aug') & (m != 'sep')):
        temp = d[d['month'] == m].sample(200, replace = True)
        if d is d_up:
            d_up = pd.concat([d, temp], axis = 0)
        else:
            d_up = pd.concat([d_up, temp], axis = 0)

X = d_up.drop(['area', 'scale', 'FFMC', 'DMC', 'DC', 'ISI', 'month', 'day', 'X', 'Y'], axis=1)
y = d_up['scale']
# X = pd.get_dummies(X, ['month', 'day'])
scale_order = ['no fire', 'small fire', 'large fire']
# month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
d_up['scale'] = pd.Categorical(d_up['scale'], categories = scale_order, ordered = True)
# d_up['month'] = pd.Categorical(d_up['month'], categories = month_order, ordered = True)
# result = d_up.groupby(['month', 'scale'])['scale'].count().unstack('scale').fillna(0)
# result = result.apply(lambda x: x/x.sum()*100, axis = 1).round(2)
# result = result.reindex(month_order)
# result = result[['no fire', 'small fire', 'large fire']]
# result = pd.DataFrame(result)

x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, shuffle = True, random_state = 42)

print(x_train.shape, x_test.shape)

(1761, 4) (756, 4)


## 1. RandomOverSimpler

In [76]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(sampling_strategy = 'auto')
x_train_ros, y_train_ros = ros.fit_resample(x_train, y_train)
print(x_train_ros.shape, x_test.shape)

(3078, 9) (756, 9)


In [21]:
train_proportions = y_train_ros.value_counts(normalize = True)
print(train_proportions)

no fire       0.333333
large fire    0.333333
small fire    0.333333
Name: scale, dtype: float64


## LogisticRegression

In [604]:
clf = LogisticRegression(max_iter = int(1e7) ,solver = 'liblinear', l1_ratio = 0.75)
clf.fit(x_train_ros, y_train_ros)

y_pred = clf.predict(x_test)

acu = metrics.accuracy_score(y_test, y_pred)
pre_class, r_class, f_class, sup = metrics.precision_recall_fscore_support(y_test, y_pred, average = 'weighted')


print("Acurracy: ", round(acu, 4)*100, '%')
print('Precision:', round(pre_class, 4)*100, '%')
print('Recall:', round(r_class, 4)*100, '%')
print('f1:', round(f_class, 4)*100, '%')

Acurracy:  44.31 %
Precision: 50.239999999999995 %
Recall: 44.31 %
f1: 46.14 %


In [605]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import plotly.graph_objects as go

# predict probabilities for the test set
y_pred_proba = clf.predict_proba(x_test)

# binarize the true labels
y_test_bin = label_binarize(y_test, classes = clf.classes_)

# calculate fpr, tpr and threshold values for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(clf.classes_)):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(), y_pred_proba.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# plot ROC curve for each class and micro-average ROC curve
fig = go.Figure()
class_names = clf.classes_
for i in range(len(class_names)):
    fig.add_trace(go.Scatter(x = fpr[i], y = tpr[i],
                             mode = 'lines',
                             line = dict(width=2),
                             name = 'ROC curve of class {} (AUC = {:.2f})'.format(class_names[i], roc_auc[i])))
fig.add_trace(go.Scatter(x = [0, 1], y = [0, 1],
                         mode = 'lines',
                         line = dict(color = 'navy', width = 2, dash = 'dash'),
                         showlegend = False))
fig.add_annotation(x = 0.5, y = -0.2,
                   text = "Micro-average ROC curve (AUC = {0:.2f})"
                   "\nComputed using all classes as positive samples"
                   "\n({1}-way classification)".format(roc_auc["micro"], len(class_names)),
                   showarrow = False,
                   xref = 'paper', yref = 'paper', align = 'center')

fig.update_layout(title={
                'text': 'M / ROC Curve / ROS',
                'x':0.5,
                'font':{'size' : 32}
                },
                  xaxis_title = {
                'text': 'False Positive Rate',
                'font':{'size' : 20}
                },
                  yaxis_title = {
                'text': 'True Positive Rate',
                'font':{'size' : 20}
                },
                  legend=dict(x = 0.1, y = 0.9),
                  width = 1200,
                  height = 800
                  )
fig.show()

## SVC

In [139]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(sampling_strategy = 'minority')
x_train_ros, y_train_ros = ros.fit_resample(x_train, y_train)

clf = svm.SVC(C = 1, kernel = 'rbf', gamma = 'auto', max_iter = int(5e4), cache_size = 1000)
clf.fit(x_train_ros, y_train_ros)

y_pred = clf.predict(x_test)

acu = metrics.accuracy_score(y_test, y_pred)
pre_class, r_class, f_class, sup = metrics.precision_recall_fscore_support(y_test, y_pred, average = 'weighted')

print("Acurracy: ", round(acu, 4)*100, '%')
print('Precision:', round(pre_class, 4)*100, '%')
print('Recall:', round(r_class, 4)*100, '%')
print('f1:', round(f_class, 4)*100, '%')


labels = ['no fire', 'small fire', 'large fire']
cm = metrics.confusion_matrix(y_test, y_pred, labels=labels)
cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] # normalize confusion matrix
cm_norm
import plotly.graph_objects as go

fig = go.Figure(
    data=go.Heatmap(
        z=cm_norm,
        x=labels,
        y=labels,
        colorscale=[
            [0, 'white'],
            # [0.5, 'white'],
            [1, 'red']
        ],
        zmin=0,
        zmax=1
    ),
    layout=go.Layout(
        title={
            'text': 'Confusion matrix / Minority',
            'font': {'size': 32},
            'x': 0.5
        },
        yaxis_title={
            'font': {'size': 20},
            'text': 'Actual'
        },
        xaxis_title={
            'font': {'size': 20},
            'text': 'Predicted'
        },
        width=600,
        height=600
    )
)

fig.update_xaxes(tickfont=dict(size = 16))
fig.update_yaxes(tickfont=dict(size = 16))


fig.show()

Acurracy:  88.49000000000001 %
Precision: 88.67 %
Recall: 88.49000000000001 %
f1: 88.29 %


In [263]:
from sklearn.model_selection import cross_val_score

# define the number of times to run the model
n_runs = 10

# create an empty list to store the accuracy values
accuracy_list = []

for i in range(n_runs):
    # perform train-test split

    x_train, x_test, y_train, y_test = train_test_split(X, y, shuffle = True)
    
    # preprocess the data
    cat_col = ['month_apr', 'month_aug', 'month_dec', 'month_feb', 'month_jan',
       'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov',
       'month_oct', 'month_sep', 'day_fri', 'day_mon', 'day_sat', 'day_sun',
       'day_thu', 'day_tue', 'day_wed']
    num_col = ['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain']  
    x_train_num = x_train.loc[:, num_col]
    x_train_cat = x_train.loc[:, cat_col]
    x_test_num = x_test.loc[:, num_col]
    x_test_cat = x_test.loc[:, cat_col]

    scl = StandardScaler()
    scl.fit(x_train_num)

    x_train_num = scl.transform(x_train_num)
    x_test_num = scl.transform(x_test_num)

    x_train = np.concatenate([x_train_num, np.array(x_train_cat)], axis=1)
    x_test = np.concatenate([x_test_num, np.array(x_test_cat)], axis=1)

    ros = RandomOverSampler(sampling_strategy = 'not majority')
    x_train_ros, y_train_ros = ros.fit_resample(x_train, y_train)   
    
    # fit the model
    clf = svm.SVC(C=1, kernel='rbf', gamma='auto', max_iter=int(5e4), cache_size=1000)
    clf.fit(x_train_ros, y_train_ros)

    # make predictions on the test set
    y_pred = clf.predict(x_test)

    # calculate the accuracy and append to the list
    accuracy = metrics.accuracy_score(y_test, y_pred)
    accuracy_list.append(accuracy)
    print(f"Accuracy of {i+1}:", round(accuracy, 4)*100, '%')

# print the average accuracy
print("Average Accuracy over {} runs: {}".format(n_runs, round(np.mean(accuracy_list), 2)))

Accuracy of 1: 36.15 %
Accuracy of 2: 39.23 %
Accuracy of 3: 42.309999999999995 %
Accuracy of 4: 33.85 %
Accuracy of 5: 36.919999999999995 %
Accuracy of 6: 40.77 %
Accuracy of 7: 43.08 %
Accuracy of 8: 33.08 %
Accuracy of 9: 42.309999999999995 %
Accuracy of 10: 35.38 %
Average Accuracy over 10 runs: 0.38


## 2. Upsampling by SMOTE

In [82]:
from imblearn.over_sampling import SMOTE

smt = SMOTE(sampling_strategy = 'auto')
x_train_smt, y_train_smt = smt.fit_resample(x_train, y_train)
print(x_train_smt.shape, x_test.shape)

(2964, 9) (756, 9)


In [607]:
train_proportions = y_train_smt.value_counts(normalize = True)
print(train_proportions)

no fire       0.333333
large fire    0.333333
small fire    0.333333
Name: scale, dtype: float64


## Logistic Regression

In [608]:
clf = LogisticRegression(max_iter = int(1e7) ,solver = 'liblinear', l1_ratio = 0.75)

clf.fit(x_train_smt, y_train_smt)

y_pred = clf.predict(x_test)

acu = metrics.accuracy_score(y_test, y_pred)
pre_class, r_class, f_class, sup = metrics.precision_recall_fscore_support(y_test, y_pred, average = 'weighted')

print("Acurracy: ", round(acu, 4)*100, '%')
print('Precision:', round(pre_class, 4)*100, '%')
print('Recall:', round(r_class, 4)*100, '%')
print('f1:', round(f_class, 4)*100, '%')

Acurracy:  40.21 %
Precision: 48.17 %
Recall: 40.21 %
f1: 42.07 %


In [600]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import plotly.graph_objects as go

# predict probabilities for the test set
y_pred_proba = clf.predict_proba(x_test)

# binarize the true labels
y_test_bin = label_binarize(y_test, classes = clf.classes_)

# calculate fpr, tpr and threshold values for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(clf.classes_)):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(), y_pred_proba.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# plot ROC curve for each class and micro-average ROC curve
fig = go.Figure()
class_names = clf.classes_
for i in range(len(class_names)):
    fig.add_trace(go.Scatter(x = fpr[i], y = tpr[i],
                             mode = 'lines',
                             line = dict(width=2),
                             name = 'ROC curve of class {} (AUC = {:.2f})'.format(class_names[i], roc_auc[i])))
fig.add_trace(go.Scatter(x = [0, 1], y = [0, 1],
                         mode = 'lines',
                         line = dict(color = 'navy', width = 2, dash = 'dash'),
                         showlegend = False))
fig.add_annotation(x = 0.5, y = -0.2,
                   text = "Micro-average ROC curve (AUC = {0:.2f})"
                   "\nComputed using all classes as positive samples"
                   "\n({1}-way classification)".format(roc_auc["micro"], len(class_names)),
                   showarrow = False,
                   xref = 'paper', yref = 'paper', align = 'center')

fig.update_layout(title={
                'text': 'M/ ROC Curve / SMOTE',
                'x':0.5,
                'font':{'size' : 32}
                },
                  xaxis_title = {
                'text': 'False Positive Rate',
                'font':{'size' : 20}
                },
                  yaxis_title = {
                'text': 'True Positive Rate',
                'font':{'size' : 20}
                },
                  legend=dict(x = 0.1, y = 0.9),
                  width = 1200,
                  height = 800
                  )
fig.show()

In [54]:
from sklearn.model_selection import cross_val_score

# define the number of times to run the model
n_runs = 10

# create an empty list to store the accuracy values
accuracy_list = []

for i in range(n_runs):
    # perform train-test split
    x_train, x_test, y_train, y_test = train_test_split(X, y, shuffle = True)
    
    # preprocess the data
    cat_col = ['month_apr', 'month_aug', 'month_dec', 'month_feb', 'month_jan',
       'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov',
       'month_oct', 'month_sep', 'day_fri', 'day_mon', 'day_sat', 'day_sun',
       'day_thu', 'day_tue', 'day_wed']
    num_col = ['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain']  
    x_train_num = x_train.loc[:, num_col]
    x_train_cat = x_train.loc[:, cat_col]
    x_test_num = x_test.loc[:, num_col]
    x_test_cat = x_test.loc[:, cat_col]

    scl = StandardScaler()
    scl.fit(x_train_num)

    x_train_num = scl.transform(x_train_num)
    x_test_num = scl.transform(x_test_num)

    x_train = np.concatenate([x_train_num, np.array(x_train_cat)], axis=1)
    x_test = np.concatenate([x_test_num, np.array(x_test_cat)], axis=1)
    
    # fit the model
    clf = svm.SVC(C=1, kernel='rbf', gamma='auto', max_iter=int(5e4), cache_size=1000)
    clf.fit(x_train, y_train)

    # make predictions on the test set
    y_pred = clf.predict(x_test)

    # calculate the accuracy and append to the list
    accuracy = metrics.accuracy_score(y_test, y_pred)
    accuracy_list.append(accuracy)
    print(f"Accuracy of {i+1}:", round(accuracy, 2))

# print the average accuracy
print("Average Accuracy over {} runs: {}".format(n_runs, round(np.mean(accuracy_list), 2)))

Accuracy of 1: 0.78
Accuracy of 2: 0.78
Accuracy of 3: 0.81
Accuracy of 4: 0.78
Accuracy of 5: 0.8
Accuracy of 6: 0.8
Accuracy of 7: 0.79
Accuracy of 8: 0.83
Accuracy of 9: 0.82
Accuracy of 10: 0.77
Average Accuracy over 10 runs: 0.8


## SVC

In [127]:
labels = ['no fire', 'small fire', 'large fire']
cm = metrics.confusion_matrix(y_test, y_pred, labels=labels)
cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] # normalize confusion matrix
cm_norm

import plotly.graph_objects as go

fig = go.Figure(
    data=go.Heatmap(
        z=cm_norm,
        x=labels,
        y=labels,
        colorscale=[
            [0, 'white'],
            # [0.5, 'white'],
            [1, 'red']
        ],
        zmin=0,
        zmax=1
    ),
    layout=go.Layout(
        title={
            'text': 'M / Confusion matrix / SMOTE',
            'font': {'size': 32},
            'x': 0.5
        },
        yaxis_title={
            'font': {'size': 20},
            'text': 'Actual'
        },
        xaxis_title={
            'font': {'size': 20},
            'text': 'Predicted'
        },
        width=600,
        height=600
    )
)

fig.update_xaxes(tickfont=dict(size = 16))
fig.update_yaxes(tickfont=dict(size = 16))

fig.show()

Acurracy:  0.8849
Precision: 0.8846
Recall: 0.8849
F1: 0.8843
