<a href="https://colab.research.google.com/github/prodramp/wildfire/blob/main/ml/ca_wildfire_ml_lightgbm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import lightgbm as lgb
from sklearn import metrics
import pandas as pd
import plotly.express as px

In [None]:
# Optional: upload your own dataset from your machine (Google Colab only).
# `google.colab` exists only inside the Colab runtime, so the import is guarded:
# on any other Jupyter kernel this cell becomes a no-op instead of aborting a
# "Run All" with ModuleNotFoundError.
try:
    from google.colab import files
except ImportError:
    uploaded = {}  # nothing can be uploaded outside Colab
    print("google.colab is not available; skipping the upload step.")
else:
    uploaded = files.upload()  # opens the browser file picker and returns the uploaded files

In [2]:
# List the contents of the current working directory (shell command).
!ls

sample_data


In [9]:
# Locations of the zipped CSV splits (train / validation / test); all three
# live under the same directory of the prodramp/wildfire GitHub repository.
DATA_BASE_URL = "https://github.com/prodramp/wildfire/raw/main/california-data"

train_source = f"{DATA_BASE_URL}/ca_fire_train.csv.zip"
valid_source = f"{DATA_BASE_URL}/ca_fire_valid.csv.zip"
test_source = f"{DATA_BASE_URL}/ca_fire_test.csv.zip"

In [10]:
# Download and parse the three splits in train / valid / test order.
# pandas infers the zip compression from the ".zip" suffix of each URL.
train, valid, test = (
    pd.read_csv(source)
    for source in (train_source, valid_source, test_source)
)

In [11]:
# (rows, columns) of the training split.
train.shape

(1071252, 12)

In [12]:
# (rows, columns) of the validation split.
valid.shape

(117936, 12)

In [13]:
# (rows, columns) of the test split.
test.shape

(14742, 12)

In [14]:
# Per-year count of non-null values in every column of the training split;
# shows which years the split covers and how many rows each contributes.
train.groupby('year').count()

Unnamed: 0_level_0,latitude,longitude,month,fire_count,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2001,9828,9828,9828,9828,9828,9828,9828,9828,9828,9828,9828
2002,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2003,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2004,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2005,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2006,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2007,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2008,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2009,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2010,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968


In [15]:
# Per-year count of non-null values in every column of the validation split.
valid.groupby('year').count()

Unnamed: 0_level_0,latitude,longitude,month,fire_count,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2021,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968


In [16]:
# Per-year count of non-null values in every column of the test split.
test.groupby('year').count()

Unnamed: 0_level_0,latitude,longitude,month,fire_count,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022,14742,14742,14742,14742,14742,14742,14742,14742,14742,14742,14742


In [17]:
# Columns fed to the model. The remaining columns of the frames ('year',
# 'fire_count' and 'fire') are deliberately not listed here.
features = [
    # position and calendar month
    'latitude',
    'longitude',
    'month',
    # fire-history columns -- NOTE(review): exact definitions are not visible
    # in this notebook; confirm against the data-preparation code.
    'fire_cnt_before',
    'fire_before',
    'fire_cnt_last_year',
    'fire_last_year',
    'fire_cnt_last_year_same_month',
    'fire_last_year_same_month',
]

In [18]:
# All column names available in the training frame.
train.columns

Index(['latitude', 'longitude', 'year', 'month', 'fire_count', 'fire',
       'fire_cnt_before', 'fire_before', 'fire_cnt_last_year',
       'fire_last_year', 'fire_cnt_last_year_same_month',
       'fire_last_year_same_month'],
      dtype='object')

In [19]:
# Wrap the selected feature columns and the `fire` label in LightGBM Datasets.
train_data = lgb.Dataset(train[features], label=train['fire'])
# The validation Dataset names the training Dataset as its `reference`, which
# is how LightGBM's documentation says validation data should be declared so
# that it is aligned with the training data.
valid_data = lgb.Dataset(valid[features], label=valid['fire'], reference=train_data)


In [20]:
# NOTE(review): this cell rebuilds the same two Datasets as the cell directly
# above it; one of the two cells is redundant and can be deleted.
train_data = lgb.Dataset(train[features], label=train['fire'])
# The validation Dataset names the training Dataset as its `reference`, which
# is how LightGBM's documentation says validation data should be declared so
# that it is aligned with the training data.
valid_data = lgb.Dataset(valid[features], label=valid['fire'], reference=train_data)


In [21]:
# Booster configuration: binary objective evaluated with AUC, trees limited
# to 10 leaves and a depth of 8.
parameters = dict(
    num_leaves=10,
    max_depth=8,
    objective='binary',
    metric='auc',
)

# Number of boosting rounds requested from lgb.train().
num_round = 500

In [22]:
# Train the booster, monitoring AUC on the validation Dataset.
#
# The `early_stopping_rounds=` / `verbose_eval=` keyword arguments of
# lgb.train() were deprecated in LightGBM 3.3 and removed in 4.0, where passing
# them raises TypeError. The same behaviour is requested through callbacks,
# which every release accepts.
# `log_evaluation` exists from 3.3 on; older releases name it `print_evaluation`.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

model = lgb.train(
    parameters,
    train_data,
    num_boost_round=num_round,
    valid_sets=[valid_data],
    callbacks=[
        # stop once the validation metric has not improved for 5 rounds
        lgb.early_stopping(stopping_rounds=5),
        # report the validation metric every 50 rounds
        log_evaluation(period=50),
    ],
)

Training until validation scores don't improve for 5 rounds.
[50]	valid_0's auc: 0.929909
[100]	valid_0's auc: 0.932862
[150]	valid_0's auc: 0.934364
[200]	valid_0's auc: 0.935244
Early stopping, best iteration is:
[198]	valid_0's auc: 0.935247


In [24]:
# Score every row of the test split with the trained booster, using the same
# feature columns it was trained on.
test_predictions = model.predict(data=test[features])

In [25]:
# Area under the ROC curve on the test split: true `fire` labels against the
# model's predicted scores. The bare name on the last line displays the value.
test_auc = metrics.roc_auc_score(
    y_true=test['fire'],
    y_score=test_predictions,
)
test_auc

0.9724192806466917

In [26]:
# ROC curve of the model on the test split.
fpr, tpr, thr = metrics.roc_curve(test['fire'], test_predictions)

# The year shown in the title is read from the test split itself (2022 in the
# published data) rather than hard-coded, so the caption always matches the
# rows that were actually scored.
test_years = ', '.join(str(year) for year in sorted(test['year'].unique()))

roc_points = pd.DataFrame({'FPR': fpr, 'TPR': tpr})
px.line(roc_points, x='FPR', y='TPR',
        title=f'Fire/hotspot model performance for {test_years}')
