# Fire prediction by Logistic regression
## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Reading the dataset

In [2]:
df=pd.read_csv("Dataset\Dataset.csv",header=1)

In [3]:
df.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire


## Data Cleaning

In [4]:
## Checking for null values
df.isnull().sum()

day            0
month          1
year           1
Temperature    1
 RH            1
 Ws            1
Rain           1
FFMC           1
DMC            1
DC             1
ISI            1
BUI            1
FWI            1
Classes        2
dtype: int64

In [5]:
df[df.isnull().any(axis=1)]

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
122,Sidi-Bel Abbes Region Dataset,,,,,,,,,,,,,
167,14,7.0,2012.0,37.0,37.0,18.0,0.2,88.9,12.9,14.6 9,12.5,10.4,fire,


In [6]:
## Droping the null values
df=df.dropna().reset_index(drop=True)

In [7]:
df.isnull().sum()

day            0
month          0
year           0
Temperature    0
 RH            0
 Ws            0
Rain           0
FFMC           0
DMC            0
DC             0
ISI            0
BUI            0
FWI            0
Classes        0
dtype: int64

In [8]:
## Combining the two region dataset with new column "Region"
df.loc[:122,"Region"]=int(0)
df.loc[122:,"Region"]=int(1)

In [9]:
df.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,0.0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,0.0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,0.0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,0.0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,0.0


In [10]:
df.iloc[122]

day                    day
month                month
year                  year
Temperature    Temperature
 RH                     RH
 Ws                     Ws
Rain                 Rain 
FFMC                  FFMC
DMC                    DMC
DC                      DC
ISI                    ISI
BUI                    BUI
FWI                    FWI
Classes          Classes  
Region                 1.0
Name: 122, dtype: object

In [11]:
df=df.drop(122).reset_index(drop=True)

In [12]:
## Deleting the spaces in the column name
df.columns=df.columns.str.strip()

In [13]:
## Deleting the spaces in the dataset of dependent variable
df["Classes"]=df["Classes"].str.strip()

In [14]:
## Counting the values of dependent variable
df["Classes"].value_counts()

Classes
fire        137
not fire    106
Name: count, dtype: int64

In [15]:
df.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,0.0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,0.0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,0.0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,0.0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,0.0


In [16]:
## Dropping the unwanted columns
df.drop(["day",'month','year'],axis=1,inplace=True)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Temperature  243 non-null    object 
 1   RH           243 non-null    object 
 2   Ws           243 non-null    object 
 3   Rain         243 non-null    object 
 4   FFMC         243 non-null    object 
 5   DMC          243 non-null    object 
 6   DC           243 non-null    object 
 7   ISI          243 non-null    object 
 8   BUI          243 non-null    object 
 9   FWI          243 non-null    object 
 10  Classes      243 non-null    object 
 11  Region       243 non-null    float64
dtypes: float64(1), object(11)
memory usage: 22.9+ KB


In [18]:
## Changing the datatype of the dataset 
df[['Temperature','RH','Ws','Region']]=df[['Temperature','RH','Ws','Region']].astype(int)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Temperature  243 non-null    int64 
 1   RH           243 non-null    int64 
 2   Ws           243 non-null    int64 
 3   Rain         243 non-null    object
 4   FFMC         243 non-null    object
 5   DMC          243 non-null    object
 6   DC           243 non-null    object
 7   ISI          243 non-null    object
 8   BUI          243 non-null    object
 9   FWI          243 non-null    object
 10  Classes      243 non-null    object
 11  Region       243 non-null    int64 
dtypes: int64(4), object(8)
memory usage: 22.9+ KB


In [20]:
## Changing the datatypes of the columns
object=[feature for feature in df.columns if df[feature].dtypes=='O']
for i in object:
    if i!='Classes':
        df[i]=df[i].astype(float)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Temperature  243 non-null    int64  
 1   RH           243 non-null    int64  
 2   Ws           243 non-null    int64  
 3   Rain         243 non-null    float64
 4   FFMC         243 non-null    float64
 5   DMC          243 non-null    float64
 6   DC           243 non-null    float64
 7   ISI          243 non-null    float64
 8   BUI          243 non-null    float64
 9   FWI          243 non-null    float64
 10  Classes      243 non-null    object 
 11  Region       243 non-null    int64  
dtypes: float64(7), int64(4), object(1)
memory usage: 22.9+ KB


In [22]:
df.tail()

Unnamed: 0,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
238,30,65,14,0.0,85.4,16.0,44.5,4.5,16.9,6.5,fire,1
239,28,87,15,4.4,41.1,6.5,8.0,0.1,6.2,0.0,not fire,1
240,27,87,29,0.5,45.9,3.5,7.9,0.4,3.4,0.2,not fire,1
241,24,54,18,0.1,79.7,4.3,15.2,1.7,5.1,0.7,not fire,1
242,24,64,15,0.2,67.3,3.8,16.5,1.2,4.8,0.5,not fire,1


## Seperation of Dependent and independent variables

In [23]:
X=df.drop('Classes',axis=1)
Y=df['Classes']

In [24]:
X.head()

Unnamed: 0,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Region
0,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,0
1,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,0
2,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,0
3,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,0
4,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,0


In [25]:
Y.head()

0    not fire
1    not fire
2    not fire
3    not fire
4    not fire
Name: Classes, dtype: object

## Train Test Split

In [26]:
from sklearn.model_selection import train_test_split
X_train,X_Test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=42)

## Standardisation

In [27]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
x_train=scaler.fit_transform(X_train)
x_test=scaler.transform(X_Test)

## Logistic Regression Model Fitting

In [28]:
from sklearn.linear_model import LogisticRegression
logistic=LogisticRegression()
logistic.fit(x_train,y_train)
y_pred=logistic.predict(x_test)

## Performance Metrics

In [29]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
score=accuracy_score(y_test,y_pred)
print(score)
cm=confusion_matrix(y_test,y_pred)
print(cm)
print(classification_report(y_test,y_pred))

0.9508196721311475
[[36  1]
 [ 2 22]]
              precision    recall  f1-score   support

        fire       0.95      0.97      0.96        37
    not fire       0.96      0.92      0.94        24

    accuracy                           0.95        61
   macro avg       0.95      0.94      0.95        61
weighted avg       0.95      0.95      0.95        61



## Prediction of new data

In [30]:
logistic.predict(scaler.transform([[29,57,18,0,65.7,3.4,7.6,1.3,3.4,0.5,1]]))

array(['not fire'], dtype=object)

## Hyperparameter Tuning

### Grid Search CV

In [31]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
cv=StratifiedKFold()
grid_model=LogisticRegression()
penalty=['l1', 'l2', 'elasticnet', None]
c=[100,10,1.0,0.1,0.01]
solver=['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
params=dict(penalty=penalty,C=c,solver=solver)
grid=GridSearchCV(estimator=grid_model,param_grid=params,cv=cv,scoring="accuracy")

In [32]:
## Grid search cv model fitting
grid.fit(x_train,y_train)

In [33]:
print(grid.best_params_)
print(grid.best_score_)
y_pred_grid=grid.predict(x_test)

{'C': 10, 'penalty': 'l1', 'solver': 'saga'}
0.978078078078078


In [34]:
## Performance metrics of grid search cv
score=accuracy_score(y_test,y_pred_grid)
print(score)
cm=confusion_matrix(y_test,y_pred_grid)
print(cm)
print(classification_report(y_test,y_pred_grid))

0.9672131147540983
[[36  1]
 [ 1 23]]
              precision    recall  f1-score   support

        fire       0.97      0.97      0.97        37
    not fire       0.96      0.96      0.96        24

    accuracy                           0.97        61
   macro avg       0.97      0.97      0.97        61
weighted avg       0.97      0.97      0.97        61



In [35]:
## Prediction for new data in 
grid.predict(scaler.transform([[30,60,20,0,60.5,4.5,7.6,1.3,3.4,0.5,1]]))

array(['not fire'], dtype=object)

### Randomized Search CV

In [36]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
cv=StratifiedKFold()
random_model=LogisticRegression()
penalty=['l1', 'l2', 'elasticnet', None]
c=[100,10,1.0,0.1,0.01]
solver=['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
params=dict(penalty=penalty,C=c,solver=solver)
random=GridSearchCV(estimator=random_model,param_grid=params,cv=cv,scoring="accuracy")

In [37]:
## randomized cv model fitting
random.fit(x_train,y_train)
y_pred_random=random.predict(x_test)

In [38]:
## Performance metrics for randomized search cv
score=accuracy_score(y_test,y_pred_random)
print(score)
cm=confusion_matrix(y_test,y_pred_random)
print(cm)
print(classification_report(y_test,y_pred_random))

0.9672131147540983
[[36  1]
 [ 1 23]]
              precision    recall  f1-score   support

        fire       0.97      0.97      0.97        37
    not fire       0.96      0.96      0.96        24

    accuracy                           0.97        61
   macro avg       0.97      0.97      0.97        61
weighted avg       0.97      0.97      0.97        61



In [39]:
print(random.best_params_)
print(random.best_score_)

{'C': 10, 'penalty': 'l1', 'solver': 'saga'}
0.978078078078078


In [40]:
## Prediction for new data in randomized search cv model
random.predict(scaler.transform([[29,57,18,0,65.7,3.4,7.6,1.3,3.4,0.5,1]]))

array(['not fire'], dtype=object)