In [103]:
import pandas as pd
import numpy as np

In [104]:
# Load the cleaned Algerian forest fires dataset from the working directory
# and preview the first five rows.
df = pd.read_csv("Algerian_forest_fires_dataset_cleaned_dataset.csv")
df.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,0


In [105]:
# Collapse every label containing "not fire" to exactly "not fire"; everything else becomes "fire".
is_not_fire = df["Classes"].str.contains("not fire")
df["Classes"] = np.where(is_not_fire, "not fire", "fire")

In [106]:
# Confirm that only the two normalised labels remain.
df["Classes"].unique()

array(['not fire', 'fire'], dtype=object)

In [107]:
# Class balance expressed as a percentage of all rows.
df["Classes"].value_counts(normalize=True).mul(100)

fire        56.378601
not fire    43.621399
Name: Classes, dtype: float64

In [108]:
# List the available columns before deciding which ones to keep.
df.columns

Index(['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Rain', 'FFMC',
       'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes', 'Region'],
      dtype='object')

In [109]:
## Drop day, month and year: they are not used as predictors in this model.
# Rebinding `df` (rather than mutating it in place) keeps the operation explicit.
df = df.drop(columns=["day", "month", "year"])

## Let us first consider that the data is not imbalanced

In [110]:
# Inspect dtypes and non-null counts of the remaining columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Temperature  243 non-null    int64  
 1   RH           243 non-null    int64  
 2   Ws           243 non-null    int64  
 3   Rain         243 non-null    float64
 4   FFMC         243 non-null    float64
 5   DMC          243 non-null    float64
 6   DC           243 non-null    float64
 7   ISI          243 non-null    float64
 8   BUI          243 non-null    float64
 9   FWI          243 non-null    float64
 10  Classes      243 non-null    object 
 11  Region       243 non-null    int64  
dtypes: float64(7), int64(4), object(1)
memory usage: 22.9+ KB


In [111]:
# Every column whose dtype is not object is treated as a predictor.
independent_features = [name for name, dtype in df.dtypes.items() if dtype != 'O']

In [112]:
# Show the predictor column names selected above.
independent_features

['Temperature',
 'RH',
 'Ws',
 'Rain',
 'FFMC',
 'DMC',
 'DC',
 'ISI',
 'BUI',
 'FWI',
 'Region']

In [113]:
# Object-dtype columns are the candidate target columns.
dependent_feature = [features for features in df.columns if df[features].dtype == 'O']

# Fail fast when the target is missing: silently skipping the assignment would
# leave `y` undefined (or stale from an earlier run) for every later cell.
if "Classes" not in dependent_feature:
    raise ValueError("Expected an object-dtype 'Classes' column to use as the target")
y = df["Classes"]

In [114]:
# Feature matrix: all rows, predictor columns only.
X = df.loc[:, independent_features]

In [115]:
# Print the feature matrix dimensions, then display the frame itself.
print(X.shape)
X

(243, 11)


Unnamed: 0,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Region
0,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,0
1,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,0
2,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,0
3,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,0
4,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,0
...,...,...,...,...,...,...,...,...,...,...,...
238,30,65,14,0.0,85.4,16.0,44.5,4.5,16.9,6.5,1
239,28,87,15,4.4,41.1,6.5,8.0,0.1,6.2,0.0,1
240,27,87,29,0.5,45.9,3.5,7.9,0.4,3.4,0.2,1
241,24,54,18,0.1,79.7,4.3,15.2,1.7,5.1,0.7,1


In [116]:
# Print the target vector length, then display the labels.
print(y.shape)
y

(243,)


0      not fire
1      not fire
2      not fire
3      not fire
4      not fire
         ...   
238        fire
239    not fire
240    not fire
241    not fire
242    not fire
Name: Classes, Length: 243, dtype: object

In [117]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [118]:
# Standardise the features.
# NOTE(review): the scaler is fitted on every row of X, including rows that
# may later be used for testing.
scalar = StandardScaler()
scalar.fit(X)
X_scaled = scalar.transform(X)
X_scaled

array([[-0.87065469, -0.34067323,  0.8932769 , ..., -0.93601155,
        -0.88015863, -0.99589321],
       [-0.87065469, -0.07035945, -0.88887652, ..., -0.90079808,
        -0.8936262 , -0.99589321],
       [-1.69925426,  1.34878787,  2.31899963, ..., -0.98531039,
        -0.93402893, -0.99589321],
       ...,
       [-1.42305441,  1.68668009,  4.81401441, ..., -0.93601155,
        -0.92056135,  1.00412373],
       [-2.25165398, -0.54340856,  0.8932769 , ..., -0.81628577,
        -0.85322348,  1.00412373],
       [-2.25165398,  0.13237588, -0.17601515, ..., -0.83741385,
        -0.88015863,  1.00412373]])

In [119]:
# Splitting the train and test data
# Split the raw features first and fit the scaler on the training rows only.
# Fitting StandardScaler on the full dataset before splitting lets the test
# rows influence the mean/variance applied to the training data (leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40)
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)  # reuse the training statistics; never refit on test data

In [120]:
# Logistic regression with default hyper-parameters, fitted on the training split.
log_reg = LogisticRegression()
log_reg.fit(X=X_train, y=y_train)

In [121]:
# Predict class labels for the held-out test rows and display them.
y_pred = log_reg.predict(X_test)
y_pred

array(['fire', 'fire', 'not fire', 'fire', 'not fire', 'not fire', 'fire',
       'fire', 'fire', 'fire', 'fire', 'not fire', 'not fire', 'fire',
       'fire', 'not fire', 'fire', 'fire', 'fire', 'fire', 'fire',
       'not fire', 'fire', 'fire', 'fire', 'not fire', 'fire', 'fire',
       'not fire', 'fire', 'not fire', 'fire', 'fire', 'not fire', 'fire',
       'not fire', 'not fire', 'fire', 'not fire', 'fire', 'not fire',
       'fire', 'fire', 'not fire', 'fire', 'fire', 'fire', 'not fire',
       'fire', 'fire', 'fire', 'not fire', 'not fire', 'fire', 'fire',
       'not fire', 'not fire', 'fire', 'not fire', 'not fire', 'not fire'],
      dtype=object)

### Confusion Matrix


In [122]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [123]:
# Confusion matrix of the actual test labels against the predicted labels.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_mat

array([[36,  0],
       [ 1, 24]], dtype=int64)

In [124]:
# Rows of conf_mat are the actual classes and columns the predicted classes,
# in sorted label order ('fire', 'not fire'). 'fire' is the positive class.
# The off-diagonal cells were previously swapped: [0][1] is an actual fire
# predicted as not fire (false negative), and [1][0] is the reverse.
true_positive = conf_mat[0][0]   # actual fire,     predicted fire
false_negative = conf_mat[0][1]  # actual fire,     predicted not fire
false_positive = conf_mat[1][0]  # actual not fire, predicted fire
true_negative = conf_mat[1][1]   # actual not fire, predicted not fire

## Accuracy score

In [125]:
# Fraction of test rows whose predicted label matches the actual label.
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.9836065573770492

## Here as per the requirement we are getting more than 98% accuracy

# Considering the data as imbalanced now

In [75]:
# Re-check the class balance (percent of rows per class) before resampling.
df["Classes"].value_counts(normalize=True).mul(100)

fire        56.378601
not fire    43.621399
Name: Classes, dtype: float64

In [129]:
# Using SMOTE (Synthetic Minority Oversampling Technique) to oversample the minority class
# Importing the SMOTE class from the imblearn library

In [128]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0
Note: you may need to restart the kernel to use updated packages.


In [150]:
from imblearn.over_sampling import SMOTE
# Fixed random_state so the synthetic samples are reproducible.
sm = SMOTE(random_state=42)
# Oversample using the full feature matrix and target.
# NOTE(review): resampling before any train/test split means synthetic rows
# could end up in a test set derived from this output — confirm intent.
X_resampled, y_resampled = sm.fit_resample(X, y)

In [151]:
# Row count should exceed the original 243 once the minority class is oversampled.
X_resampled.shape

(274, 11)

In [153]:
# The target length must match the number of resampled feature rows.
y_resampled.shape

(274,)

In [154]:
# Standardise the features with a fresh scaler.
# NOTE(review): this scales the original X, not the X_resampled produced by SMOTE.
scalar = StandardScaler()
scalar.fit(X)
X_scaled = scalar.transform(X)
X_scaled

array([[-0.87065469, -0.34067323,  0.8932769 , ..., -0.93601155,
        -0.88015863, -0.99589321],
       [-0.87065469, -0.07035945, -0.88887652, ..., -0.90079808,
        -0.8936262 , -0.99589321],
       [-1.69925426,  1.34878787,  2.31899963, ..., -0.98531039,
        -0.93402893, -0.99589321],
       ...,
       [-1.42305441,  1.68668009,  4.81401441, ..., -0.93601155,
        -0.92056135,  1.00412373],
       [-2.25165398, -0.54340856,  0.8932769 , ..., -0.81628577,
        -0.85322348,  1.00412373],
       [-2.25165398,  0.13237588, -0.17601515, ..., -0.83741385,
        -0.88015863,  1.00412373]])

In [156]:
## Splitting the data
# Previously this split X_scaled / y, so the SMOTE output was never used and
# the "balanced" model was trained on exactly the same rows as the first one.
# Split the original data first so the test set contains only real
# observations, oversample the training rows with SMOTE, then fit the scaler
# on the (resampled) training data and reuse it for the test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40)
X_train, y_train = sm.fit_resample(X_train, y_train)
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)  # training statistics only; never refit on test data

In [157]:
# Refit a default logistic regression on the current training split.
log_reg = LogisticRegression()
log_reg.fit(X=X_train, y=y_train)

In [158]:
# Predict class labels for the current test split and display them.
y_pred  = log_reg.predict(X_test)
y_pred

array(['fire', 'fire', 'not fire', 'fire', 'not fire', 'not fire', 'fire',
       'fire', 'fire', 'fire', 'fire', 'not fire', 'not fire', 'fire',
       'fire', 'not fire', 'fire', 'fire', 'fire', 'fire', 'fire',
       'not fire', 'fire', 'fire', 'fire', 'not fire', 'fire', 'fire',
       'not fire', 'fire', 'not fire', 'fire', 'fire', 'not fire', 'fire',
       'not fire', 'not fire', 'fire', 'not fire', 'fire', 'not fire',
       'fire', 'fire', 'not fire', 'fire', 'fire', 'fire', 'not fire',
       'fire', 'fire', 'fire', 'not fire', 'not fire', 'fire', 'fire',
       'not fire', 'not fire', 'fire', 'not fire', 'not fire', 'not fire'],
      dtype=object)

In [162]:
from sklearn.metrics import classification_report
# Per-class precision, recall, F1 and support for the test predictions.
classification_rep = classification_report(y_test,y_pred)
print(classification_rep)

              precision    recall  f1-score   support

        fire       0.97      1.00      0.99        36
    not fire       1.00      0.96      0.98        25

    accuracy                           0.98        61
   macro avg       0.99      0.98      0.98        61
weighted avg       0.98      0.98      0.98        61

