In [1]:
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing 
from IPython.display import display
from sklearn import tree
from sklearn.manifold import TSNE
from sklearn import svm
from sklearn.svm import SVC 
from sklearn import linear_model
import joblib 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 

In [55]:
# Load the DataFrame stored in a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
final_df = pd.read_pickle('pickle/firmsxn_24JAN.pkl')
# Display the row at integer position 6.
final_df.iloc[6]

LATITUDE              15.73662
LONGITUDE             77.15088
ACQ_TIME                     1
CONFIDENCE                   7
DAYNIGHT                     1
FRP                          3
STATE                       34
ACQ_YEAR                  2022
ACQ_MONTH                    1
STATE_PRCNT_FOREST       17.88
AVG_TEMP                  27.5
AVG_PREC                 87.25
Name: 31, dtype: object

In [20]:
# Print the index range, column dtypes, non-null counts and memory usage.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70 entries, 22 to 436
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   LATITUDE            70 non-null     float64
 1   LONGITUDE           70 non-null     float64
 2   ACQ_TIME            70 non-null     int64  
 3   CONFIDENCE          70 non-null     int64  
 4   DAYNIGHT            70 non-null     int64  
 5   FRP                 70 non-null     int64  
 6   STATE               70 non-null     int64  
 7   ACQ_YEAR            70 non-null     object 
 8   ACQ_MONTH           70 non-null     int64  
 9   STATE_PRCNT_FOREST  70 non-null     float64
 10  AVG_TEMP            70 non-null     float64
 11  AVG_PREC            70 non-null     float64
dtypes: float64(5), int64(6), object(1)
memory usage: 7.1+ KB


In [21]:
# Drop rows that contain missing values and keep the result.
# DataFrame.dropna() returns a new frame; without the assignment the call only
# displays a cleaned copy and leaves final_df unchanged.
final_df = final_df.dropna()
final_df

Unnamed: 0,LATITUDE,LONGITUDE,ACQ_TIME,CONFIDENCE,DAYNIGHT,FRP,STATE,ACQ_YEAR,ACQ_MONTH,STATE_PRCNT_FOREST,AVG_TEMP,AVG_PREC
22,17.20519,80.09620,1,6,1,1,33,2022,1,18.36,27.1,71.44
23,17.20362,80.10623,1,7,1,2,33,2022,1,18.36,27.1,71.44
24,17.61481,76.94758,1,7,1,2,26,2022,1,20.11,22.9,132.51
28,17.19704,78.13490,1,6,1,2,33,2022,1,18.36,27.1,71.44
29,17.20525,77.17860,1,7,1,2,26,2022,1,20.11,22.9,132.51
...,...,...,...,...,...,...,...,...,...,...,...,...
387,27.41003,72.16960,1,8,1,5,7,2022,1,4.86,25.1,38.20
390,27.40919,72.05581,1,8,1,4,7,2022,1,4.86,25.1,38.20
430,21.18018,72.97983,2,6,1,3,23,2022,1,7.57,27.0,51.90
431,20.95814,70.60186,2,8,1,3,23,2022,1,7.57,27.0,51.90


In [34]:
# Break the data down into train and test sets.
# Target: the FRP column; features: every remaining column.
y = final_df['FRP'].values
x = final_df.drop(['FRP'], axis = 1)
# A fixed random_state makes the shuffle reproducible, so the same rows land in
# the train and test sets on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [35]:
#Function for MAPE error to be used further
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Entries whose true value is exactly 0 have an undefined percentage error
    (division by zero), so they are left out of the average instead of turning
    the whole result into ``inf``/``nan``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values; must be broadcastable against ``y_true``.

    Returns
    -------
    float
        MAPE as a percentage, or ``nan`` when there is no entry with a
        non-zero true value (including empty input).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_errors = np.abs(y_true - y_pred)
    # Broadcast the denominator and the validity mask to the shape of the
    # errors so scalar / broadcast inputs keep working as before.
    denominators = np.broadcast_to(np.abs(y_true), abs_errors.shape)
    valid = denominators != 0

    if not valid.any():
        return float("nan")

    return float(np.mean(abs_errors[valid] / denominators[valid]) * 100)

In [36]:
# Display the (rows, columns) shape of the training features.
x_train.shape

(46, 11)

In [37]:
# Display the shape of the training target array.
y_train.shape

(46,)

In [38]:
# Replace infinite and missing feature values with 0, then renumber rows from 0.
x_train = (
    x_train
    .replace((np.inf, -np.inf, np.nan), 0)
    .reset_index(drop=True)
)

In [39]:
# k = 5 is an arbitrary choice for the number of neighbours, not a tuned value.
knnModel = KNeighborsClassifier(n_neighbors=5)
# Trailing semicolon keeps the cell silent, as the original assignment did.
knnModel.fit(x_train, y_train);

In [40]:
# Display the held-out feature rows.
x_test

Unnamed: 0,LATITUDE,LONGITUDE,ACQ_TIME,CONFIDENCE,DAYNIGHT,STATE,ACQ_YEAR,ACQ_MONTH,STATE_PRCNT_FOREST,AVG_TEMP,AVG_PREC
225,17.26403,73.83391,2,7,1,25,2022,1,16.5,26.4,53.7
52,22.20425,75.93801,1,7,1,22,2022,1,25.14,25.3,78.26
24,17.61481,76.94758,1,7,1,26,2022,1,20.11,22.9,132.51
211,28.96943,95.78728,2,6,1,11,2022,1,79.63,20.8,230.62
234,18.09689,73.21223,2,7,1,25,2022,1,16.5,26.4,53.7
35,15.03169,77.57025,1,7,1,34,2022,1,17.88,27.5,87.25
57,21.4005,74.79194,1,9,1,25,2022,1,16.5,26.4,53.7
28,17.19704,78.1349,1,6,1,33,2022,1,18.36,27.1,71.44
36,15.21525,75.95771,1,10,1,26,2022,1,20.11,22.9,132.51
228,17.96535,75.77852,2,8,1,25,2022,1,16.5,26.4,53.7


In [41]:
# Replace infinite and missing feature values with 0, then renumber rows from 0.
x_test = (
    x_test
    .replace((np.inf, -np.inf, np.nan), 0)
    .reset_index(drop=True)
)

In [42]:
# Accuracy of the fitted KNN model on the train and test splits.
accuracy_train = knnModel.score(x_train, y_train)
accuracy_test = knnModel.score(x_test, y_test)

# Predictions on the test split and the error metrics derived from them.
knn_predictions = knnModel.predict(x_test)
knn_MAE = mean_absolute_error(y_test, knn_predictions)
knn_MAPE = mean_absolute_percentage_error(y_test, knn_predictions)

# Report: accuracies first, then MAE and MAPE.
print('Train Data Accuracy is :',accuracy_train)
print('Test Data Accuracy is :',accuracy_test)
print('MAE value is: ', knn_MAE)
print('MAPE value is: ', knn_MAPE)

Train Data Accuracy is : 0.6304347826086957
Test Data Accuracy is : 0.3333333333333333
MAE value is:  1.4583333333333333
MAPE value is:  35.4265873015873


In [43]:
# Next model: Gaussian Naive Bayes, chosen because it needs no feature normalization either.
gnb = GaussianNB()
# Trailing semicolon keeps the cell silent, as the original assignment did.
gnb.fit(x_train, y_train);

In [44]:
# Accuracy of the fitted Gaussian Naive Bayes model on the train and test splits.
accuracy_train = gnb.score(x_train, y_train)
accuracy_test = gnb.score(x_test, y_test)

# Predictions on the test split and the error metrics derived from them.
gnb_predictions = gnb.predict(x_test)
gnb_MAE = mean_absolute_error(y_test, gnb_predictions)
gnb_MAPE = mean_absolute_percentage_error(y_test, gnb_predictions)

# Report: accuracies first, then MAE and MAPE.
print('Train Data Accuracy is :',accuracy_train)
print('Test Data Accuracy is :',accuracy_test)
print('MAE value is: ', gnb_MAE)
print('MAPE value is: ', gnb_MAPE)

Train Data Accuracy is : 0.5869565217391305
Test Data Accuracy is : 0.4583333333333333
MAE value is:  0.875
MAPE value is:  31.686507936507937


In [45]:
# First tree-based approach: a single decision tree.
# random_state is fixed because the tree breaks ties between equally good
# splits at random; without a seed the fitted tree (and its scores) can change
# from run to run.
decisionTreeModel = tree.DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

In [46]:
# Accuracy of the fitted decision tree on the train and test splits.
accuracy_train = decisionTreeModel.score(x_train, y_train)
accuracy_test = decisionTreeModel.score(x_test, y_test)

# Predictions on the test split and the error metrics derived from them.
decisionTreeModel_predictions = decisionTreeModel.predict(x_test)
decisionTreeModel_MAE = mean_absolute_error(y_test, decisionTreeModel_predictions)
decisionTreeModel_MAPE = mean_absolute_percentage_error(y_test, decisionTreeModel_predictions)

# Report: accuracies first, then MAE and MAPE.
print('Train Data Accuracy is :',accuracy_train)
print('Test Data Accuracy is :',accuracy_test)
print('MAE value is: ', decisionTreeModel_MAE)
print('MAPE value is: ', decisionTreeModel_MAPE)

Train Data Accuracy is : 1.0
Test Data Accuracy is : 0.3333333333333333
MAE value is:  1.0
MAPE value is:  29.88095238095238


In [47]:
#Second tree based approach: Random Forest
# random_state is fixed because the forest bootstraps rows and samples features
# at random; without a seed the fitted model, its scores and the pickled
# artefact differ on every run.
rf_clf = RandomForestClassifier(random_state=42).fit(x_train, y_train)

In [48]:
# Persist the fitted random forest to a .pkl file for later use; the original
# note gives the reason as model training consuming a lot of RAM.
# joblib.dump returns the list of written file names, which the cell displays.
joblib.dump(rf_clf, 'pickle/randomForestModel.pkl')

['pickle/randomForestModel.pkl']

In [49]:
# Accuracy of the fitted random forest on the train and test splits.
accuracy_train = rf_clf.score(x_train, y_train)
accuracy_test = rf_clf.score(x_test, y_test)

# Predictions on the test split and the error metrics derived from them.
rf_predictions = rf_clf.predict(x_test)
rf_MAE = mean_absolute_error(y_test, rf_predictions)
rf_MAPE = mean_absolute_percentage_error(y_test, rf_predictions)

# Report: accuracies first, then MAE and MAPE.
print('Train Data Accuracy is :',accuracy_train)
print('Test Data Accuracy is :',accuracy_test)
print('MAE value is: ', rf_MAE)
print('MAPE value is: ', rf_MAPE)

Train Data Accuracy is : 1.0
Test Data Accuracy is : 0.4166666666666667
MAE value is:  0.9166666666666666
MAPE value is:  26.96428571428571
