# Fire Data Feature Selection

In [2]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

  (fname, cnt))


### Importing Data

The following code block read in the data and then took the Fire Size and Fire Size Classification out of the primary dataset and put it into deperate dataframe, "labels."

In [3]:
data = pd.read_csv('data/california_fires.csv')
data = data.sort_values('DISCOVERY_DATE')
labels = pd.DataFrame()
labels['FIRE_SIZE'] = data['FIRE_SIZE']
labels['FIRE_SIZE_CLASS'] = data['FIRE_SIZE_CLASS']

  interactivity=interactivity, compiler=compiler, result=result)


### Missing Values



In order to select the best features, we first found the columns with the highest amount of missing values and evaluated if they had any inherent value or would add to the model. Luckily, we felt as if the features with the highest percentages of missing values (as seen below) were all redundant and lacked importance, so felt comfortable in removing them. 

We then empirically looked at the data and removed columns that we felt would add no additional value. For example, we removed the name of the fire, the reporting agency, and other features that would do nothing but complicate the model. 

In [4]:
missing_percentage = []
for column in data.columns:
    missing_percentage.append(str(int(data[column].isna().sum()/len(data[column])*100)) + "% missing in "+ column)
missing_percentage.sort()
missing_percentage

['0% missing in DISCOVERY_DATE',
 '0% missing in DISCOVERY_DOY',
 '0% missing in FIRE_SIZE',
 '0% missing in FIRE_SIZE_CLASS',
 '0% missing in FIRE_YEAR',
 '0% missing in LATITUDE',
 '0% missing in LONGITUDE',
 '0% missing in NWCG_REPORTING_AGENCY',
 '0% missing in NWCG_REPORTING_UNIT_ID',
 '0% missing in OBJECTID',
 '0% missing in OWNER_CODE',
 '0% missing in OWNER_DESCR',
 '0% missing in SOURCE_REPORTING_UNIT',
 '0% missing in SOURCE_SYSTEM',
 '0% missing in SOURCE_SYSTEM_TYPE',
 '0% missing in STATE',
 '0% missing in STAT_CAUSE_CODE',
 '0% missing in STAT_CAUSE_DESCR',
 '0% missing in Shape',
 '0% missing in Unnamed: 0',
 '41% missing in DISCOVERY_TIME',
 '51% missing in CONT_DATE',
 '51% missing in CONT_DOY',
 '51% missing in CONT_TIME',
 '7% missing in FIRE_NAME',
 '70% missing in COUNTY',
 '70% missing in FIPS_NAME']

In [5]:
data = data.drop(columns=[ 'OBJECTID', 'SOURCE_SYSTEM_TYPE', 'Shape', 'FIRE_NAME', 'NWCG_REPORTING_AGENCY', 'NWCG_REPORTING_UNIT_ID', 'SOURCE_REPORTING_UNIT', 'STAT_CAUSE_CODE', 'FIPS_NAME', 'DISCOVERY_TIME', 'CONT_DATE', 'CONT_DOY', 'CONT_TIME', 'COUNTY', 'Unnamed: 0', 'FIRE_SIZE_CLASS', 'FIRE_SIZE']).reset_index()
data.head()

Unnamed: 0,index,SOURCE_SYSTEM,FIRE_YEAR,DISCOVERY_DATE,DISCOVERY_DOY,STAT_CAUSE_DESCR,LATITUDE,LONGITUDE,OWNER_CODE,OWNER_DESCR,STATE
0,34267,DOI-WFMI,1992,2448622.5,1,Debris Burning,33.1667,-116.6342,2.0,BIA,CA
1,135389,ST-CACDF,1992,2448622.5,1,Children,33.663889,-116.171944,14.0,MISSING/NOT SPECIFIED,CA
2,10455,FS-FIRESTAT,1992,2448622.5,1,Lightning,38.205,-120.335,13.0,STATE OR PRIVATE,CA
3,135391,ST-CACDF,1992,2448623.5,2,Children,33.678056,-116.171944,14.0,MISSING/NOT SPECIFIED,CA
4,135390,ST-CACDF,1992,2448623.5,2,Miscellaneous,33.896111,-116.99,14.0,MISSING/NOT SPECIFIED,CA


### Converting Categorical Data

In order to accurately run our feature selection model, we needed to get dummies for the categorical values, as seen below. 

In [6]:
dummies = pd.get_dummies(data)
dummies = dummies.drop(columns = 'index')
dummies.head()

Unnamed: 0,FIRE_YEAR,DISCOVERY_DATE,DISCOVERY_DOY,LATITUDE,LONGITUDE,OWNER_CODE,SOURCE_SYSTEM_DOI-WFMI,SOURCE_SYSTEM_FS-FIRESTAT,SOURCE_SYSTEM_FWS-FMIS,SOURCE_SYSTEM_IA-ICS209,SOURCE_SYSTEM_ST-CACDF,SOURCE_SYSTEM_ST-NASF,STAT_CAUSE_DESCR_Arson,STAT_CAUSE_DESCR_Campfire,STAT_CAUSE_DESCR_Children,STAT_CAUSE_DESCR_Debris Burning,STAT_CAUSE_DESCR_Equipment Use,STAT_CAUSE_DESCR_Fireworks,STAT_CAUSE_DESCR_Lightning,STAT_CAUSE_DESCR_Miscellaneous,STAT_CAUSE_DESCR_Missing/Undefined,STAT_CAUSE_DESCR_Powerline,STAT_CAUSE_DESCR_Railroad,STAT_CAUSE_DESCR_Smoking,STAT_CAUSE_DESCR_Structure,OWNER_DESCR_BIA,OWNER_DESCR_BLM,OWNER_DESCR_BOR,OWNER_DESCR_COUNTY,OWNER_DESCR_FOREIGN,OWNER_DESCR_FWS,OWNER_DESCR_MISSING/NOT SPECIFIED,OWNER_DESCR_MUNICIPAL/LOCAL,OWNER_DESCR_NPS,OWNER_DESCR_OTHER FEDERAL,OWNER_DESCR_PRIVATE,OWNER_DESCR_STATE,OWNER_DESCR_STATE OR PRIVATE,OWNER_DESCR_TRIBAL,OWNER_DESCR_UNDEFINED FEDERAL,OWNER_DESCR_USFS,STATE_CA
0,1992,2448622.5,1,33.1667,-116.6342,2.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1992,2448622.5,1,33.663889,-116.171944,14.0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,1992,2448622.5,1,38.205,-120.335,13.0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
3,1992,2448623.5,2,33.678056,-116.171944,14.0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
4,1992,2448623.5,2,33.896111,-116.99,14.0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


### Finding Best Features
Once we have cleaned data, we ran a recursive feature elimination to select the top 10 features within the dataset in regards to the labels dataframe we selected previously. 

In [7]:
from sklearn.feature_selection import SelectKBest, chi2, RFE, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from itertools import compress

In [8]:
model = RandomForestClassifier()
rfe = RFE(model, 10)
rfe.fit(dummies, labels.FIRE_SIZE_CLASS)

RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
  n_features_to_select=10, step=1, verbose=0)

Below is a list of the top ten features within the given fire features dataset.

In [9]:
list(compress(dummies.columns.values, rfe.get_support()))

['FIRE_YEAR',
 'DISCOVERY_DATE',
 'DISCOVERY_DOY',
 'LATITUDE',
 'LONGITUDE',
 'OWNER_CODE',
 'SOURCE_SYSTEM_FS-FIRESTAT',
 'SOURCE_SYSTEM_ST-CACDF',
 'STAT_CAUSE_DESCR_Equipment Use',
 'STAT_CAUSE_DESCR_Miscellaneous']

### Converting and saving the important features. 
From this list, we cross referenced the features in the climate dataset, and selected a few unique columns that we felt would add most to the model. From there, we sectioned off the dataset, and founds dummies for our final dataset to use in the model.

In [10]:
dummy_data = data[['FIRE_YEAR',
 'DISCOVERY_DATE',
 'DISCOVERY_DOY',
 'OWNER_CODE',
 'STAT_CAUSE_DESCR',
 'OWNER_DESCR']]
dummies = pd.get_dummies(dummy_data)
dummies.head()

Unnamed: 0,FIRE_YEAR,DISCOVERY_DATE,DISCOVERY_DOY,OWNER_CODE,STAT_CAUSE_DESCR_Arson,STAT_CAUSE_DESCR_Campfire,STAT_CAUSE_DESCR_Children,STAT_CAUSE_DESCR_Debris Burning,STAT_CAUSE_DESCR_Equipment Use,STAT_CAUSE_DESCR_Fireworks,STAT_CAUSE_DESCR_Lightning,STAT_CAUSE_DESCR_Miscellaneous,STAT_CAUSE_DESCR_Missing/Undefined,STAT_CAUSE_DESCR_Powerline,STAT_CAUSE_DESCR_Railroad,STAT_CAUSE_DESCR_Smoking,STAT_CAUSE_DESCR_Structure,OWNER_DESCR_BIA,OWNER_DESCR_BLM,OWNER_DESCR_BOR,OWNER_DESCR_COUNTY,OWNER_DESCR_FOREIGN,OWNER_DESCR_FWS,OWNER_DESCR_MISSING/NOT SPECIFIED,OWNER_DESCR_MUNICIPAL/LOCAL,OWNER_DESCR_NPS,OWNER_DESCR_OTHER FEDERAL,OWNER_DESCR_PRIVATE,OWNER_DESCR_STATE,OWNER_DESCR_STATE OR PRIVATE,OWNER_DESCR_TRIBAL,OWNER_DESCR_UNDEFINED FEDERAL,OWNER_DESCR_USFS
0,1992,2448622.5,1,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1992,2448622.5,1,14.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,1992,2448622.5,1,13.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,1992,2448623.5,2,14.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,1992,2448623.5,2,14.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


Finally, we converted the new dataframe to a CSV to be interfaced and referenced in the model file.

In [11]:
dummies.to_csv('dummies_data.csv')