In [58]:
import pandas as pd
import numpy as np
import seaborn as sns

In [59]:
import warnings
warnings.filterwarnings(action='ignore')

In [60]:
# Read the dataset from the working directory and preview the first rows.
DATA_PATH = 'Algerian Forest Fire_final.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,day,month,Temperature,RH,Ws,Rain,FFMC,DC,ISI,BUI,Classes_mode,FWI_mean
0,1,6,29,57,18,0.0,65.7,7.6,1.3,3.4,not fire,0.5
1,2,6,29,61,13,1.3,64.4,7.6,1.0,3.9,not fire,0.4
2,5,6,27,77,16,0.0,64.8,14.2,1.2,3.9,not fire,0.5
3,6,6,31,67,14,0.0,82.6,22.2,3.1,7.0,fire,2.5
4,7,6,33,54,13,0.0,88.2,30.5,6.4,10.9,fire,7.2


In [61]:
# Predictors: every column except the class label.
x = df.drop(columns='Classes_mode')
# Target: 1 where the label is exactly 'fire', 0 for any other value.
y = df['Classes_mode'].eq('fire').astype('int64')
# Show the class balance.
y.value_counts()

1    127
0     97
Name: Classes_mode, dtype: int64

In [62]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [63]:
# (rows, columns) of the training and test feature matrices, in that order.
tuple(frame.shape for frame in (x_train, x_test))

((150, 11), (74, 11))

In [99]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (accuracy_score, f1_score, confusion_matrix, make_scorer, roc_curve, 
                             auc, average_precision_score, roc_auc_score,classification_report)

# Estimators
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# Imported under an alias: later in this cell the name `SVC` is bound to a
# GridSearchCV object, which would otherwise clobber the estimator class.
from sklearn.svm import SVC as SVClassifier

# Logistic regression on standardised features. The 'liblinear' solver is used
# because it supports both the 'l1' and 'l2' penalties; the default 'lbfgs'
# solver rejects 'l1', so any grid candidate using it would fail to fit.
pipe_lr = Pipeline([('scaler', StandardScaler()),
                    ('clf', LogisticRegression(solver='liblinear'))])

# k-NN is distance based, so features are standardised first.
pipe_knn = Pipeline([('scaler', StandardScaler()), ('clf', KNeighborsClassifier())])

# Support vector classifier on standardised features.
pipe_svc = Pipeline([('scaler', StandardScaler()), ('clf', SVClassifier())])

# Random forest without scaling. A fixed random_state makes the bootstrap
# sampling and feature selection (and therefore the scores) reproducible.
pipe_rf = Pipeline([('clf', RandomForestClassifier(random_state=42))])

# Parameters
param_range = [1,2,3,4,5,6,7,8,9,10]
param_range_fl = [1.0, 0.5]
# An integer min_samples_split must be at least 2, so the shared 1..10 range
# cannot be reused for it: the value 1 makes every such candidate fail to fit.
param_range_split = [2,3,4,5,6,7,8,9,10]
# -1 asks joblib to use all available CPU cores.
jobs = -1

# GridSearch cross validation
## parameters
# NOTE(review): the 'l1' penalty only fits when the LogisticRegression solver
# supports it (e.g. 'liblinear' or 'saga') -- verify the solver set on pipe_lr.
grid_params_lr = [{'clf__penalty': ['l1', 'l2'],'clf__C': param_range_fl}]

grid_params_knn = [{'clf__n_neighbors' : param_range}]

grid_params_svc = [{'clf__kernel': ['linear', 'rbf'], 'clf__C': param_range}]

grid_params_rf = [{'clf__criterion': ['gini', 'entropy'],
                   'clf__max_depth': param_range,
                   'clf__min_samples_split': param_range_split}]

## grid search models
# Settings shared by all four searches: 10-fold cross-validation scored on
# accuracy, with train-fold scores recorded and progress messages enabled.
search_kwargs = dict(cv=10, verbose=1, return_train_score=True,
                     scoring='accuracy', n_jobs=jobs)

LR = GridSearchCV(pipe_lr, param_grid=grid_params_lr, **search_kwargs)

KNN = GridSearchCV(pipe_knn, param_grid=grid_params_knn, **search_kwargs)

# NOTE: from here on the name `SVC` refers to this GridSearchCV object,
# not to the sklearn.svm.SVC estimator class.
SVC = GridSearchCV(pipe_svc, param_grid=grid_params_svc, **search_kwargs)

RF = GridSearchCV(pipe_rf, param_grid=grid_params_rf, **search_kwargs)

# Searches in the order they will be fitted.
grids = [LR, KNN, SVC, RF]

# Position in `grids` -> human-readable estimator name.
grid_dict = dict(enumerate([
    'Logistic Regression',
    'K Nearest Neighbor',
    'Support Vector Classifier',
    'Random Forest',
]))


# Fit the grid search objects
print('Performing model optimizations...')
best_acc = 0.0   # highest mean cross-validated accuracy seen so far
best_clf = 0     # index (into grids / grid_dict) of the search that achieved it
best_gs = None   # the fitted GridSearchCV object that achieved it

for idx, gs in enumerate(grids):
    print(f'\nEstimator: {grid_dict[idx]}')
    gs.fit(x_train, y_train)
    print(f'Best params are : {gs.best_params_}')
    # best_score_ is the mean cross-validated score of the best candidate,
    # i.e. accuracy on held-out folds -- not accuracy on the training data.
    print(f'Best cross-validation accuracy: {gs.best_score_}')
    # Predict on test data with best params
    y_pred = gs.predict(x_test)
    # Test data accuracy of model with best params
    print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
    confusion_mat = confusion_matrix(y_test, y_pred)
    print("Confusion matrix: \n",confusion_mat)
    print(classification_report(y_test, y_pred))
    # Track the winner by cross-validation score rather than test score, so
    # the test set is not used for model selection. A NaN score never wins.
    if gs.best_score_ > best_acc:
        best_acc, best_clf, best_gs = gs.best_score_, idx, gs

if best_gs is not None:
    print(f'\nEstimator with best cross-validation accuracy: {grid_dict[best_clf]} ({best_acc:.3f})')

Performing model optimizations...

Estimator: Logistic Regression
Fitting 10 folds for each of 4 candidates, totalling 40 fits
Best params are : {'clf__C': 1.0, 'clf__penalty': 'l2'}
Best training accuracy: 0.9400000000000001
Test set accuracy score for best params: 0.986 
Confusion matrix: 
 [[34  0]
 [ 1 39]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        34
           1       1.00      0.97      0.99        40

    accuracy                           0.99        74
   macro avg       0.99      0.99      0.99        74
weighted avg       0.99      0.99      0.99        74


Estimator: K Nearest Neighbor
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best params are : {'clf__n_neighbors': 3}
Best training accuracy: 0.9333333333333333
Test set accuracy score for best params: 0.797 
Confusion matrix: 
 [[24 10]
 [ 5 35]]
              precision    recall  f1-score   support

           0       0.83      0.71      

In [None]:
# The original call referenced an undefined name (`grid`) and the helper
# `plot_confusion_matrix`, which is never imported in this notebook and was
# removed in scikit-learn 1.2. Draw one confusion matrix per fitted search.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

for idx, gs in enumerate(grids):
    # Fix the label order explicitly so the matrix rows/columns match the tick labels.
    cm = confusion_matrix(y_test, gs.predict(x_test), labels=gs.classes_)
    disp = ConfusionMatrixDisplay(cm, display_labels=gs.classes_)
    disp.plot(values_format='d')
    disp.ax_.set_title(grid_dict[idx])