In [78]:
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing 
from IPython.display import display
from sklearn import tree
from sklearn.manifold import TSNE
from sklearn import svm
from sklearn.svm import SVC 
from sklearn import linear_model
import joblib 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import RandomizedSearchCV

In [79]:
#Original Dataset
conn = sqlite3.connect('fpa_archive/FPA_FOD_20170508.sqlite')

In [80]:
df = pd.read_sql_query("SELECT * FROM fires;", conn)

In [81]:
#Taking random 4500 rows of the entire datset to get their fire size class prediction, We can take any number of rows from any part of the data for getting the result
dataset = df[5000:50000:10]
dataset_func1 = dataset.drop(['FIRE_SIZE_CLASS'], axis = 1)
print(dataset_func1)

       OBJECTID  FOD_ID      FPA_ID SOURCE_SYSTEM_TYPE SOURCE_SYSTEM  \
5000       5001    5024  FS-1428631                FED   FS-FIRESTAT   
5010       5011    5034  FS-1428644                FED   FS-FIRESTAT   
5020       5021    5044  FS-1428657                FED   FS-FIRESTAT   
5030       5031    5054  FS-1428673                FED   FS-FIRESTAT   
5040       5041    5064  FS-1428687                FED   FS-FIRESTAT   
...         ...     ...         ...                ...           ...   
49950     49951   50294   FS-270902                FED   FS-FIRESTAT   
49960     49961   50305   FS-270916                FED   FS-FIRESTAT   
49970     49971   50315   FS-270950                FED   FS-FIRESTAT   
49980     49981   50325   FS-270960                FED   FS-FIRESTAT   
49990     49991   50335   FS-270970                FED   FS-FIRESTAT   

      NWCG_REPORTING_AGENCY NWCG_REPORTING_UNIT_ID  \
5000                     FS                USAZKNF   
5010                     FS

In [82]:
dataset.columns

Index(['OBJECTID', 'FOD_ID', 'FPA_ID', 'SOURCE_SYSTEM_TYPE', 'SOURCE_SYSTEM',
       'NWCG_REPORTING_AGENCY', 'NWCG_REPORTING_UNIT_ID',
       'NWCG_REPORTING_UNIT_NAME', 'SOURCE_REPORTING_UNIT',
       'SOURCE_REPORTING_UNIT_NAME', 'LOCAL_FIRE_REPORT_ID',
       'LOCAL_INCIDENT_ID', 'FIRE_CODE', 'FIRE_NAME',
       'ICS_209_INCIDENT_NUMBER', 'ICS_209_NAME', 'MTBS_ID', 'MTBS_FIRE_NAME',
       'COMPLEX_NAME', 'FIRE_YEAR', 'DISCOVERY_DATE', 'DISCOVERY_DOY',
       'DISCOVERY_TIME', 'STAT_CAUSE_CODE', 'STAT_CAUSE_DESCR', 'CONT_DATE',
       'CONT_DOY', 'CONT_TIME', 'FIRE_SIZE', 'FIRE_SIZE_CLASS', 'LATITUDE',
       'LONGITUDE', 'OWNER_CODE', 'OWNER_DESCR', 'STATE', 'COUNTY',
       'FIPS_CODE', 'FIPS_NAME', 'Shape'],
      dtype='object')

In [83]:
dataset.iloc[1].Shape

b'\x00\x01\xad\x10\x00\x00\xa0p=\n\xd7\x0b\\\xc08Z_\xa8\xec@B@\xa0p=\n\xd7\x0b\\\xc08Z_\xa8\xec@B@|\x01\x00\x00\x00\xa0p=\n\xd7\x0b\\\xc08Z_\xa8\xec@B@\xfe'

In [106]:
dfx = dataset.iloc[1:3]

In [107]:
dfx.iloc[1]

OBJECTID                                                                   5021
FOD_ID                                                                     5044
FPA_ID                                                               FS-1428657
SOURCE_SYSTEM_TYPE                                                          FED
SOURCE_SYSTEM                                                       FS-FIRESTAT
NWCG_REPORTING_AGENCY                                                        FS
NWCG_REPORTING_UNIT_ID                                                  USFLFNF
NWCG_REPORTING_UNIT_NAME                            National Forests in Florida
SOURCE_REPORTING_UNIT                                                      0805
SOURCE_REPORTING_UNIT_NAME                          National Forests in Florida
LOCAL_FIRE_REPORT_ID                                                         60
LOCAL_INCIDENT_ID                                                            15
FIRE_CODE                               

In [100]:
dfx =dfx.drop(['FIRE_SIZE_CLASS'], axis = 1)

In [101]:
dfx.shape

(2, 38)

In [102]:
def DataPrediction(data):
  test_df = pd.DataFrame() 
  for i in range(3):
    SampleModel = joblib.load('pickle/SampleModel_'+ str(i) + '.pkl')
    predictedValues = SampleModel.predict(data)
    columnName = 'predict' + str(i)
    test_df[columnName] = predictedValues

  test_finalPrediction = []
  for j in range(len(test_df)):
    row_list = test_df.iloc[j].values.tolist()
    majority_count = max(set(row_list) , key=row_list.count)
    test_finalPrediction.append(majority_count)

  test_finalPrediction = np.array(test_finalPrediction)
  return(test_finalPrediction)

In [103]:
def function1(data):
  '''This function will give the prediction for input data given'''

  print('Deleting unnecessary features......\n')
  del_features = ['OBJECTID', 'FOD_ID', 'FPA_ID', 'NWCG_REPORTING_UNIT_ID', 'NWCG_REPORTING_UNIT_NAME', 'SOURCE_REPORTING_UNIT', 'SOURCE_REPORTING_UNIT_NAME', 'LOCAL_FIRE_REPORT_ID', 'LOCAL_INCIDENT_ID', 'FIRE_CODE', 'FIRE_NAME', 'ICS_209_INCIDENT_NUMBER' , 'ICS_209_NAME', 'MTBS_FIRE_NAME', 'MTBS_ID', 'COMPLEX_NAME', 'DISCOVERY_DATE', 'STAT_CAUSE_DESCR', 'CONT_DATE', 'CONT_TIME', 'FIRE_SIZE', 'OWNER_DESCR', 'COUNTY', 'FIPS_CODE', 'FIPS_NAME', 'Shape' ]
  for i, item in enumerate(del_features):
    del data[item];
  print('Data shape is: ', data.shape, '\n')

  print('Encoding features......\n')
  label_encoder = preprocessing.LabelEncoder() 
  encode_features = ['SOURCE_SYSTEM_TYPE', 'SOURCE_SYSTEM', 'NWCG_REPORTING_AGENCY']
  for j, e_item in enumerate(encode_features):
    data[e_item] = label_encoder.fit_transform(data[e_item]) 
    data[e_item].astype('int64')

  #Manually encoding states feature
  data['STATE'] = data['STATE'].map({'AL': 0, 'AK': 1, 'AZ': 2, 'AR': 3, 'CA': 4, 'CO': 5,'CT': 6,'DE': 7,'DC': 8,'FL': 9,'GA': 10,'HI': 11,'ID': 12,'IL': 13,'IN': 14,'IA': 15,'KS': 16,'KY': 17,'LA': 18,'ME': 19,'MD': 20,'MA': 21,'MI': 22,'MN': 23,'MS': 24,'MO': 25,'MT': 26,'NE': 27,'NV': 28,'NH': 29,'NJ': 30,'NM': 31,'NY': 32,'NC': 33,'ND': 34,'OH': 35,'OK': 36,'OR': 37,'PA': 38,'PR': 39,'RI': 40,'SC': 41,'SD': 42,'TN': 43,'TX': 44,'UT': 45,'VT': 46,'VA': 47,'WA': 48,'WV': 49,'WI': 50,'WY': 51}) 
  data['STATE'].astype('int64')

  print('Performing Feature Engineering......\n')
  #Adding Feature Discovery Month
  discovery_month = [];
  for i in range(len(data)):
   key = data.iloc[i]['DISCOVERY_DOY']
   if( 1 <= key <= 31 ):
    discovery_month.append(1)
   elif ( 32 <= key <= 60 ):
      discovery_month.append(2)
   elif ( 61 <= key <= 91 ):
     discovery_month.append(3)
   elif ( 92 <= key <= 121 ):
     discovery_month.append(4)
   elif ( 122 <= key <= 152 ):
     discovery_month.append(5)
   elif ( 153 <= key <= 182 ):
     discovery_month.append(6)
   elif ( 183 <= key <= 213 ):
     discovery_month.append(7)
   elif ( 214 <= key <= 244 ):
     discovery_month.append(8)
   elif ( 245 <= key <= 274 ):
     discovery_month.append(9)
   elif ( 275 <= key <= 305 ):
     discovery_month.append(10)
   elif ( 306 <= key <= 335 ):
     discovery_month.append(11)
   elif ( 336 <= key <= 366 ):
     discovery_month.append(12)
    
  data['DISCOVERY_MONTH'] = discovery_month
  data['DISCOVERY_MONTH'].astype('int64')
  print('Data shape is: ', data.shape, '\n')

  #Delete DISCOVERY_DOY and CONT_DOY also now
  del data['DISCOVERY_DOY']
  del data['CONT_DOY']

  #Feature2 DISCOVERY_TOD
  discovery_tod = [];
  data['DISCOVERY_TIME'] = data['DISCOVERY_TIME'].replace([None],'0000')
  for i in range(len(data)):
    key = data.iloc[i]['DISCOVERY_TIME']
    if( key == '0000' ):
      discovery_tod.append(0)
    elif ( '0000' < key <= '0600' ):
      discovery_tod.append(1)
    elif ( '0600' < key <= '1200' ):
      discovery_tod.append(2)
    elif ( '1200' < key <= '1600' ):
      discovery_tod.append(3)
    elif ( '1600' < key <= '2000' ):
      discovery_tod.append(4)
    elif ( '2000' < key <= '2400' ):
      discovery_tod.append(5)

  data['DISCOVERY_TOD'] = discovery_tod
  data['DISCOVERY_TOD'].astype('int64')

  del data['DISCOVERY_TIME']

  data['LATITUDE'] = (data['LATITUDE']*10).apply(np.floor)/10
  data['LONGITUDE'] = (data['LONGITUDE']*10).apply(np.floor)/10

  #Add forest Area feature
  forest_Area = pd.read_excel('xlsx/FOREST_Area.xlsx')
  forest_Area.head()
  STATE_PRCNT_FOREST = [];
  for i in range(len(data)):
    key = data.iloc[i]['STATE'].astype('int64')
    STATE_PRCNT_FOREST.append(forest_Area['Forest_Coverage'].values[key])
  
  data['STATE_PRCNT_FOREST'] = STATE_PRCNT_FOREST
  data['STATE_PRCNT_FOREST'].astype('float64')

  #Add Avg Temp Feature
  avg_temp  = pd.read_excel('xlsx/avg_temp.xlsx')

  AVG_TEMP_LIST = [];
  for i in range(len(data)):
    state_key = data.iloc[i]['STATE'].astype('int64')
    year_key = data.iloc[i]['FIRE_YEAR'].astype('int64')
    AVG_TEMP_LIST.append(avg_temp[year_key].values[state_key])
  
  data['AVG_TEMP'] = AVG_TEMP_LIST
  data['AVG_TEMP'].astype('float64')

  #Add Avg Prec Feature
  avg_prec  = pd.read_excel('xlsx/avg_prec.xlsx')

  AVG_PREC_LIST = [];
  for i in range(len(data)):
   state_key = data.iloc[i]['STATE'].astype('int64')
   AVG_PREC_LIST.append(avg_prec['Avg_Prec'].values[state_key])

  data['AVG_PREC'] = AVG_PREC_LIST
  data['AVG_PREC'].astype('float64')
  print('Final features are: ', data.columns,'\n')
  print('EDA Completed...... \n')
  print('Predicting the fire size class......\n')

  predictions = DataPrediction(data)
  data['PREDICTED_CLASS'] = predictions
  #Simplifying the predicted class by giving area covered in each class
  predictedRange = []
  for i in range(len(data)):
    key = data.iloc[i]['PREDICTED_CLASS']
    if( key == 1 ):
      predictedRange.append('0-0.25 acres')
    elif ( key == 2 ):
      predictedRange.append('0.26-9.9 acres')
    elif ( key == 3 ):
      predictedRange.append('10.0-99.9 acres')
    elif ( key == 4 ):
      predictedRange.append('100-299 acres')
    elif ( key == 5 ):
      predictedRange.append('300-999 acres')
    elif ( key == 6 ):
      predictedRange.append('1000-5000 acres')
    else:
      predictedRange.append('5000+ acres')
    
  data['Area Range'] = predictedRange
  print(data.columns)
  print(data)

In [104]:
function1(dfx)

Deleting unnecessary features......

Data shape is:  (2, 12) 

Encoding features......

Performing Feature Engineering......

Data shape is:  (2, 13) 

Final features are:  Index(['SOURCE_SYSTEM_TYPE', 'SOURCE_SYSTEM', 'NWCG_REPORTING_AGENCY',
       'FIRE_YEAR', 'STAT_CAUSE_CODE', 'LATITUDE', 'LONGITUDE', 'OWNER_CODE',
       'STATE', 'DISCOVERY_MONTH', 'DISCOVERY_TOD', 'STATE_PRCNT_FOREST',
       'AVG_TEMP', 'AVG_PREC'],
      dtype='object') 

EDA Completed...... 

Predicting the fire size class......

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parall

Result: 2 columns namely 'PREDICTED_CLASS' and 'Area Range' are the results obtained for the given set of features in our model. 
We can change the number of input features as per our convinience.

In [8]:
#Function 2: Taking both x and y values to get the the Performance Metric values for our given data
def function2(dataset):
  '''Taking the entire dataset as input for this function and then using the labels to determine MAE and MAPE scores'''
  #Encoding y_data for computation purpose
  dataset['FIRE_SIZE_CLASS'] = dataset['FIRE_SIZE_CLASS'].map({'A': 1, 'B': 2, 'C':3, 'D':4, 'E': 5, 'F': 6,'G': 7}) 
  dataset['FIRE_SIZE_CLASS'].astype('int64')

  #Breaking down  features and label data 
  
  y_data = dataset['FIRE_SIZE_CLASS']
  x_data = dataset.drop(['FIRE_SIZE_CLASS'], axis = 1)
  
  #Predicing labels using x_data (Similar to function 1)
  del_features = ['OBJECTID', 'FOD_ID', 'FPA_ID', 'NWCG_REPORTING_UNIT_ID', 'NWCG_REPORTING_UNIT_NAME', 'SOURCE_REPORTING_UNIT', 'SOURCE_REPORTING_UNIT_NAME', 'LOCAL_FIRE_REPORT_ID', 'LOCAL_INCIDENT_ID', 'FIRE_CODE', 'FIRE_NAME', 'ICS_209_INCIDENT_NUMBER' , 'ICS_209_NAME', 'MTBS_FIRE_NAME', 'MTBS_ID', 'COMPLEX_NAME', 'DISCOVERY_DATE', 'STAT_CAUSE_DESCR', 'CONT_DATE', 'CONT_TIME', 'FIRE_SIZE', 'OWNER_DESCR', 'COUNTY', 'FIPS_CODE', 'FIPS_NAME', 'Shape' ]
  for i, item in enumerate(del_features):
    del x_data[item];


  label_encoder = preprocessing.LabelEncoder() 
  encode_features = ['SOURCE_SYSTEM_TYPE', 'SOURCE_SYSTEM', 'NWCG_REPORTING_AGENCY']
  for j, e_item in enumerate(encode_features):
    x_data[e_item] = label_encoder.fit_transform(x_data[e_item]) 
    x_data[e_item].astype('int64')

  #Manually encoding states feature
  x_data['STATE'] = x_data['STATE'].map({'AL': 0, 'AK': 1, 'AZ': 2, 'AR': 3, 'CA': 4, 'CO': 5,'CT': 6,'DE': 7,'DC': 8,'FL': 9,'GA': 10,'HI': 11,'ID': 12,'IL': 13,'IN': 14,'IA': 15,'KS': 16,'KY': 17,'LA': 18,'ME': 19,'MD': 20,'MA': 21,'MI': 22,'MN': 23,'MS': 24,'MO': 25,'MT': 26,'NE': 27,'NV': 28,'NH': 29,'NJ': 30,'NM': 31,'NY': 32,'NC': 33,'ND': 34,'OH': 35,'OK': 36,'OR': 37,'PA': 38,'PR': 39,'RI': 40,'SC': 41,'SD': 42,'TN': 43,'TX': 44,'UT': 45,'VT': 46,'VA': 47,'WA': 48,'WV': 49,'WI': 50,'WY': 51}) 
  x_data['STATE'].astype('int64')

  #Adding Feature Discovery Month
  discovery_month = [];
  for i in range(len(x_data)):
   key = x_data.iloc[i]['DISCOVERY_DOY']
   if( 1 <= key <= 31 ):
    discovery_month.append(1)
   elif ( 32 <= key <= 60 ):
      discovery_month.append(2)
   elif ( 61 <= key <= 91 ):
     discovery_month.append(3)
   elif ( 92 <= key <= 121 ):
     discovery_month.append(4)
   elif ( 122 <= key <= 152 ):
     discovery_month.append(5)
   elif ( 153 <= key <= 182 ):
     discovery_month.append(6)
   elif ( 183 <= key <= 213 ):
     discovery_month.append(7)
   elif ( 214 <= key <= 244 ):
     discovery_month.append(8)
   elif ( 245 <= key <= 274 ):
     discovery_month.append(9)
   elif ( 275 <= key <= 305 ):
     discovery_month.append(10)
   elif ( 306 <= key <= 335 ):
     discovery_month.append(11)
   elif ( 336 <= key <= 366 ):
     discovery_month.append(12)
    
  x_data['DISCOVERY_MONTH'] = discovery_month
  x_data['DISCOVERY_MONTH'].astype('int64')

  #Delete DISCOVERY_DOY and CONT_DOY also now
  del x_data['DISCOVERY_DOY']
  del x_data['CONT_DOY']

  #Feature2 DISCOVERY_TOD
  discovery_tod = [];
  x_data['DISCOVERY_TIME'] = x_data['DISCOVERY_TIME'].replace([None],'0000')
  for i in range(len(x_data)):
    key = x_data.iloc[i]['DISCOVERY_TIME']
    if( key == '0000' ):
      discovery_tod.append(0)
    elif ( '0000' < key <= '0600' ):
      discovery_tod.append(1)
    elif ( '0600' < key <= '1200' ):
      discovery_tod.append(2)
    elif ( '1200' < key <= '1600' ):
      discovery_tod.append(3)
    elif ( '1600' < key <= '2000' ):
      discovery_tod.append(4)
    elif ( '2000' < key <= '2400' ):
      discovery_tod.append(5)

  x_data['DISCOVERY_TOD'] = discovery_tod
  x_data['DISCOVERY_TOD'].astype('int64')

  del x_data['DISCOVERY_TIME']

  x_data['LATITUDE'] = (x_data['LATITUDE']*10).apply(np.floor)/10
  x_data['LONGITUDE'] = (x_data['LONGITUDE']*10).apply(np.floor)/10

  #Add forest Area feature
  forest_Area = pd.read_excel('xlsx/FOREST_Area.xlsx')
  forest_Area.head()
  STATE_PRCNT_FOREST = [];
  for i in range(len(x_data)):
    key = x_data.iloc[i]['STATE'].astype('int64')
    STATE_PRCNT_FOREST.append(forest_Area['Forest_Coverage'].values[key])
  
  x_data['STATE_PRCNT_FOREST'] = STATE_PRCNT_FOREST
  x_data['STATE_PRCNT_FOREST'].astype('float64')

  #Add Avg Temp Feature
  avg_temp  = pd.read_excel('xlsx/avg_temp.xlsx')

  AVG_TEMP_LIST = [];
  for i in range(len(x_data)):
    state_key = x_data.iloc[i]['STATE'].astype('int64')
    year_key = x_data.iloc[i]['FIRE_YEAR'].astype('int64')
    AVG_TEMP_LIST.append(avg_temp[year_key].values[state_key])
  
  x_data['AVG_TEMP'] = AVG_TEMP_LIST
  x_data['AVG_TEMP'].astype('float64')

  #Add Avg Prec Feature
  avg_prec  = pd.read_excel('xlsx/avg_prec.xlsx')

  AVG_PREC_LIST = [];
  for i in range(len(x_data)):
   state_key = x_data.iloc[i]['STATE'].astype('int64')
   AVG_PREC_LIST.append(avg_prec['Avg_Prec'].values[state_key])

  x_data['AVG_PREC'] = AVG_PREC_LIST
  x_data['AVG_PREC'].astype('float64')

  predictions = DataPrediction(x_data)

  #Got the prediction values, now computing the errors
  MAE_value = mean_absolute_error(y_data, predictions)
  print('Mean Absolute Error comes out to be: ', MAE_value, '\n')

  y_true, y_pred = np.array(y_data), np.array(predictions)
  MAPE_value = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
  print('Mean Absolute Percentage Error is: ', MAPE_value)


  



In [9]:
function2(dataset)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['FIRE_SIZE_CLASS'] = dataset['FIRE_SIZE_CLASS'].map({'A': 1, 'B': 2, 'C':3, 'D':4, 'E': 5, 'F': 6,'G': 7})


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  

Mean Absolute Percentage Error (MAPE) is the most common error for Forcasting.
In our study the MAPE value comes out to be 23.42%
which means for the remaining ~ 77% of the times, the model is predicting the right firesize class.
