In [None]:
%matplotlib inline
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point, Polygon
from geopandas import datasets, GeoDataFrame, read_file
from geopandas.tools import overlay
import shapely.speedups
import matplotlib.pyplot as plt
from datetime import datetime
from dateutil.relativedelta import relativedelta

# Switch on Shapely's optional speedups before any geometry work is done.
shapely.speedups.enable()
# Display every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)

# NOTE(review): this silences *all* warnings for the rest of the notebook,
# including pandas/NumPy deprecation and copy warnings — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.mixture import GaussianMixture
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import seaborn as sns

In [None]:
# Link to our dataset
# https://drive.google.com/drive/folders/1GG8AbXxKZD1i_Z4kmFAiEG1Z0TSkS0V3?usp=share_link

In [None]:
# Load dataset
# The shapefile location is named once so it is easy to spot and change.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
dataset_path = r'D:\Usuários\Fernando_Roriz\UC_Berkeley\Capstone\Data\Final_Dataset_Adjusted\df_final_dataset.shp'
df = gpd.read_file(dataset_path)

In [None]:
# Restore the full column names: field names were cut to 10 characters when the
# dataset was saved as a shapefile, so the truncated names are mapped back here.
column_dict = {
    # names that already fit (mapped to themselves)
    'state': 'state',
    'image_date': 'image_date',
    'year': 'year',
    'area_km': 'area_km',
    'fire_2017': 'fire_2017',
    'fire_2018': 'fire_2018',
    'fire_2019': 'fire_2019',
    'fire_2020': 'fire_2020',
    'fire_2021': 'fire_2021',
    'YYYYMMDD': 'YYYYMMDD',
    'n_fires': 'n_fires',
    # truncated names -> full names
    'any_fire_a': 'any_fire_all_years',
    'any_fire_2': 'any_fire_2017',
    'any_fire_1': 'any_fire_2018',
    'any_fire_3': 'any_fire_2019',
    'any_fire_4': 'any_fire_2020',
    'any_fire_5': 'any_fire_2021',
    'lon_centro': 'lon_centroid',
    'lat_centro': 'lat_centroid',
    'length': 'length',
    'gap_defore': 'gap_deforest_2017',
    'gap_defo_1': 'gap_deforest_2018',
    'gap_defo_2': 'gap_deforest_2019',
    'gap_defo_3': 'gap_deforest_2020',
    'gap_defo_4': 'gap_deforest_2021',
}
df = df.rename(columns=column_dict)

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
def _fire_flags_for_polygon(image_date, year, fire_dates):
    """Classify one polygon's fire dates relative to its deforestation image date.

    Parameters
    ----------
    image_date : str
        Image timestamp; only its first 10 characters (``YYYY-MM-DD``) are used.
    year : number
        Reference year; fires up to 31 December of this year count as "same year".
    fire_dates : str
        String-encoded list of dates: surrounding brackets are stripped and the
        rest is split on ``', '``. Tokens that do not parse as ``%Y%m%d`` are skipped.
        NOTE(review): quoted tokens (e.g. ``'20170801'``) would all be skipped —
        confirm the stored format is unquoted.

    Returns
    -------
    tuple of (int, int)
        ``(fire_same_year, n_fire_before)``: 1 if any fire falls strictly after the
        image date and on/before 31 December of ``year`` (else 0), and the number
        of fires strictly before the image date. A fire on the image date itself
        is counted in neither.
    """
    date_image = datetime.strptime(image_date[0:10].replace('-', ''), '%Y%m%d')
    last_date = datetime.strptime(str(int(year)) + '1231', '%Y%m%d')

    same_year = 0
    n_before = 0
    for token in fire_dates.strip('][').split(', '):
        try:
            date_fire = datetime.strptime(token, '%Y%m%d')
        except ValueError:
            # Malformed or empty token: skip it, but let any other error surface.
            continue

        if date_image < date_fire <= last_date:
            same_year = 1
        elif date_fire < date_image:
            n_before += 1
    return same_year, n_before


# Collect the per-polygon results positionally and assign each column once.
# This avoids per-row .loc writes and does not depend on df having a 0..n-1 index.
same_year_flags = []
before_counts = []
for image_date, year, n_fires, fire_dates in zip(df['image_date'], df['year'],
                                                 df['n_fires'], df['YYYYMMDD']):
    if n_fires == 0:
        # No recorded fires: nothing to parse, both outputs stay 0.
        same_year_flags.append(0)
        before_counts.append(0)
        continue

    same_year, n_before = _fire_flags_for_polygon(image_date, year, fire_dates)
    same_year_flags.append(same_year)
    before_counts.append(n_before)

df['fire_same_year'] = same_year_flags
df['fire_before'] = [int(count > 0) for count in before_counts]
df['n_fire_before'] = before_counts

# Save variables to create panel data

In [None]:
def _panel_for_year(source, year):
    """Build the modelling table for the polygons whose ``year`` equals ``year``.

    Parameters
    ----------
    source : DataFrame
        Frame holding the fire flags and polygon attributes read below.
    year : int
        Value of the ``year`` column to select; also stored as ``year_reference``.

    Returns
    -------
    pandas.DataFrame
        One row per selected polygon, keeping the row index of ``source``.
        ``fire_t`` is the target (``fire_same_year``); ``area_km_2`` and
        ``length_2`` are the squared area and length.
    """
    rows = source[source['year'] == year]
    return pd.DataFrame({
        'fire_t': rows['fire_same_year'],
        'fire_before': rows['fire_before'],
        'n_fire_before': rows['n_fire_before'],
        'area_km': rows['area_km'],
        'area_km_2': rows['area_km'] ** 2,
        'lon_centroid': rows['lon_centroid'],
        'lat_centroid': rows['lat_centroid'],
        'length': rows['length'],
        'length_2': rows['length'] ** 2,
        'year': rows['year'],
        'city_name': rows['city_name'],
        'dist_road': rows['dist_road'],
        'year_reference': year,
    })


# One table per year, under the same names the rest of the notebook expects.
df_2017 = _panel_for_year(df, 2017)
df_2018 = _panel_for_year(df, 2018)
df_2019 = _panel_for_year(df, 2019)
df_2020 = _panel_for_year(df, 2020)
df_2021 = _panel_for_year(df, 2021)



In [None]:
# Concat datasets
# Stack the five per-year tables into one panel. ignore_index is not set, so each
# row keeps the index label it had in its per-year table.
df_panel = pd.concat([df_2017,df_2018, df_2019, df_2020, df_2021])

In [None]:
# Preview the first rows of the stacked panel.
df_panel.head()

In [None]:
# Set overall parameters
# Number of neighbours passed to every KNeighborsClassifier below.
k_neighbors = 10
# Minimum predicted probability (column 1 of predict_proba) for a polygon to be
# flagged as fire.
threshold = 0.5

# Training data: 2017 / Test data: 2018

In [None]:
# Restrict to training and test years (train: 2017, test: 2018)
reference_year = df_panel['year_reference']
train_data = df_panel[reference_year.between(2017, 2017)]
test_data = df_panel[reference_year == 2018]

# Eliminate small areas based on quantile
# (quantile 0.0 is the minimum area, so this keeps every row with a known area)
train_area_floor = train_data['area_km'].quantile(0.0)
train_data = train_data[train_data['area_km'] >= train_area_floor]
test_area_floor = test_data['area_km'].quantile(0.0)
test_data = test_data[test_data['area_km'] >= test_area_floor]

# Feature columns (no 'year' here: the training set holds a single year) and target
feature_columns = ['fire_before', 'n_fire_before', 'area_km', 'area_km_2',
                   'lat_centroid', 'lon_centroid', 'length', 'dist_road']
X_train, y_train = train_data[feature_columns].values, train_data['fire_t'].values
X_test, y_test = test_data[feature_columns].values, test_data['fire_t'].values

# Standardise with statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Sanity check: shapes of the four arrays
for array in (X_train, y_train, X_test, y_test):
    print(array.shape)

# KNN - 2017 -> 2018

In [None]:
# Fit a uniform-weight, Manhattan-distance KNN on the scaled training features.
knn = KNeighborsClassifier(metric='manhattan', weights='uniform', n_neighbors=k_neighbors)
knn.fit(X_train, y_train)

# Hard class predictions for the training and test rows.
pred_train, pred_test = knn.predict(X_train), knn.predict(X_test)

In [None]:
# --- Hard predictions ---------------------------------------------------------
# A polygon is flagged as fire when column 1 of predict_proba (class 1, assuming
# the labels are 0/1) reaches `threshold`. `.astype(int)` replaces the `np.int`
# alias, which was removed in NumPy 1.24.
prob_train = knn.predict_proba(X_train)
pred_train_2 = (prob_train[:, 1] >= threshold).astype(int)

prob_test = knn.predict_proba(X_test)
pred_2 = (prob_test[:, 1] >= threshold).astype(int)

df_train = train_data.copy()
df_train['result_2'] = pred_train_2

df_test = test_data.copy()
df_test['result_2'] = pred_2

# --- Area-weighted metrics ----------------------------------------------------
# Every ratio is a share of deforested area (area_km), not a share of polygons.
for header, scored in (("- Training data ratio results:", df_train),
                       ("- Test data ratio results:", df_test)):
    print(header)
    print("")

    area = scored['area_km']
    is_fire = scored['fire_t'] == 1
    is_clear = scored['fire_t'] == 0
    flagged = scored['result_2'] == 1
    cleared = scored['result_2'] == 0

    tp_area = area[is_fire & flagged].sum()    # fire, predicted fire
    fn_area = area[is_fire & cleared].sum()    # fire, predicted no fire
    tn_area = area[is_clear & cleared].sum()   # no fire, predicted no fire
    fp_area = area[is_clear & flagged].sum()   # no fire, predicted fire
    fire_area = area[is_fire].sum()
    clear_area = area[is_clear].sum()

    print("Ratio Fire Got Right", round(tp_area / fire_area, 2))
    print("Ratio Fire Got Wrong", round(fn_area / fire_area, 2))
    print("Ratio Non Fire Got Right", round(tn_area / clear_area, 2))
    print("Ratio Non Fire Got Wrong", round(fp_area / clear_area, 2))

    false_omission = fn_area / (fn_area + tn_area)
    print('PPV:', round(tp_area / (tp_area + fp_area), 2))
    print('FOR:', round(false_omission, 2))
    # Round after subtracting so the output has no float noise (e.g. 0.9299999999999999).
    print('NPV:', round(1 - false_omission, 2))

# --- Save results to generate colormap ---------------------------------------
# (actual, predicted) -> code: 0 = true negative, 1 = true positive,
# 2 = false positive, 3 = false negative. An unexpected label raises KeyError
# rather than silently producing a list of the wrong length.
outcome_code = {(0, 0): 0, (1, 1): 1, (0, 1): 2, (1, 0): 3}
pred_list = [outcome_code[(int(actual), int(predicted))]
             for actual, predicted in zip(y_test, pred_2)]
fcst_list = [int(predicted) for predicted in pred_2]

np_pred = np.array(pred_list)
np_fcst = np.array(fcst_list)

# Select the polygons by the test set's own index so the rows stay aligned with
# the predictions even when the area filter drops rows; .copy() makes the new
# columns land on an independent frame instead of a slice of df.
df_2018_aux = df.loc[test_data.index].copy()
df_2018_aux['compare_fcst'] = np_pred
df_2018_aux['fire_fcst'] = np_fcst

def my_colormap(value):  # scalar value defined in 'column'
    """Return the map colour for an outcome code (None for any other value)."""
    return {0: "tan", 1: "blue", 2: "green", 3: "red"}.get(value)

# Plot the overall result: one colour per polygon, in row order
colormap = [my_colormap(code) for code in df_2018_aux['compare_fcst'].values]

In [None]:
#df_2018_aux.explore(color=colormap)

# Training data: 2017-2018 / Test data: 2019

In [None]:
# Restrict to training and test years (train: 2017-2018, test: 2019)
reference_year = df_panel['year_reference']
train_data = df_panel[reference_year.between(2017, 2018)]
test_data = df_panel[reference_year == 2019]

# Eliminate small areas based on quantile
# (quantile 0.0 is the minimum area, so this keeps every row with a known area)
train_area_floor = train_data['area_km'].quantile(0.0)
train_data = train_data[train_data['area_km'] >= train_area_floor]
test_area_floor = test_data['area_km'].quantile(0.0)
test_data = test_data[test_data['area_km'] >= test_area_floor]

# Feature columns (including 'year') and target
feature_columns = ['fire_before', 'n_fire_before', 'area_km', 'area_km_2',
                   'lat_centroid', 'lon_centroid', 'length', 'year', 'dist_road']
X_train, y_train = train_data[feature_columns].values, train_data['fire_t'].values
X_test, y_test = test_data[feature_columns].values, test_data['fire_t'].values

# Standardise with statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Sanity check: shapes of the four arrays
for array in (X_train, y_train, X_test, y_test):
    print(array.shape)

# KNN - 2017-2018 -> 2019

In [None]:
# Fit a uniform-weight, Manhattan-distance KNN on the scaled training features.
knn = KNeighborsClassifier(metric='manhattan', weights='uniform', n_neighbors=k_neighbors)
knn.fit(X_train, y_train)

# Hard class predictions for the training and test rows.
pred_train, pred_test = knn.predict(X_train), knn.predict(X_test)

In [None]:
# --- Hard predictions ---------------------------------------------------------
# A polygon is flagged as fire when column 1 of predict_proba (class 1, assuming
# the labels are 0/1) reaches `threshold`. `.astype(int)` replaces the `np.int`
# alias, which was removed in NumPy 1.24.
prob_train = knn.predict_proba(X_train)
pred_train_2 = (prob_train[:, 1] >= threshold).astype(int)

prob_test = knn.predict_proba(X_test)
pred_2 = (prob_test[:, 1] >= threshold).astype(int)

df_train = train_data.copy()
df_train['result_2'] = pred_train_2

df_test = test_data.copy()
df_test['result_2'] = pred_2

# --- Area-weighted metrics ----------------------------------------------------
# Every ratio is a share of deforested area (area_km), not a share of polygons.
for header, scored in (("- Training data ratio results:", df_train),
                       ("- Test data ratio results:", df_test)):
    print(header)
    print("")

    area = scored['area_km']
    is_fire = scored['fire_t'] == 1
    is_clear = scored['fire_t'] == 0
    flagged = scored['result_2'] == 1
    cleared = scored['result_2'] == 0

    tp_area = area[is_fire & flagged].sum()    # fire, predicted fire
    fn_area = area[is_fire & cleared].sum()    # fire, predicted no fire
    tn_area = area[is_clear & cleared].sum()   # no fire, predicted no fire
    fp_area = area[is_clear & flagged].sum()   # no fire, predicted fire
    fire_area = area[is_fire].sum()
    clear_area = area[is_clear].sum()

    print("Ratio Fire Got Right", round(tp_area / fire_area, 2))
    print("Ratio Fire Got Wrong", round(fn_area / fire_area, 2))
    print("Ratio Non Fire Got Right", round(tn_area / clear_area, 2))
    print("Ratio Non Fire Got Wrong", round(fp_area / clear_area, 2))

    false_omission = fn_area / (fn_area + tn_area)
    print('PPV:', round(tp_area / (tp_area + fp_area), 2))
    print('FOR:', round(false_omission, 2))
    # Round after subtracting so the output has no float noise (e.g. 0.9299999999999999).
    print('NPV:', round(1 - false_omission, 2))

# --- Save results to generate colormap ---------------------------------------
# (actual, predicted) -> code: 0 = true negative, 1 = true positive,
# 2 = false positive, 3 = false negative. An unexpected label raises KeyError
# rather than silently producing a list of the wrong length.
outcome_code = {(0, 0): 0, (1, 1): 1, (0, 1): 2, (1, 0): 3}
pred_list = [outcome_code[(int(actual), int(predicted))]
             for actual, predicted in zip(y_test, pred_2)]
fcst_list = [int(predicted) for predicted in pred_2]

np_pred = np.array(pred_list)
np_fcst = np.array(fcst_list)

# Select the polygons by the test set's own index so the rows stay aligned with
# the predictions even when the area filter drops rows; .copy() makes the new
# columns land on an independent frame instead of a slice of df.
df_2019_aux = df.loc[test_data.index].copy()
df_2019_aux['compare_fcst'] = np_pred
df_2019_aux['fire_fcst'] = np_fcst

def my_colormap(value):  # scalar value defined in 'column'
    """Return the map colour for an outcome code (None for any other value)."""
    return {0: "tan", 1: "blue", 2: "green", 3: "red"}.get(value)

# Plot the overall result: one colour per polygon, in row order
colormap = [my_colormap(code) for code in df_2019_aux['compare_fcst'].values]

In [None]:
#df_2019_aux.explore(color=colormap)

# Training data: 2017-2018-2019 / Test data: 2020

In [None]:
# Restrict to training and test years (train: 2017-2019, test: 2020)
reference_year = df_panel['year_reference']
train_data = df_panel[reference_year.between(2017, 2019)]
test_data = df_panel[reference_year == 2020]

# Eliminate small areas based on quantile
# (quantile 0.0 is the minimum area, so this keeps every row with a known area)
train_area_floor = train_data['area_km'].quantile(0.0)
train_data = train_data[train_data['area_km'] >= train_area_floor]
test_area_floor = test_data['area_km'].quantile(0.0)
test_data = test_data[test_data['area_km'] >= test_area_floor]

# Feature columns (including 'year') and target
feature_columns = ['fire_before', 'n_fire_before', 'area_km', 'area_km_2',
                   'lat_centroid', 'lon_centroid', 'length', 'year', 'dist_road']
X_train, y_train = train_data[feature_columns].values, train_data['fire_t'].values
X_test, y_test = test_data[feature_columns].values, test_data['fire_t'].values

# Standardise with statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Sanity check: shapes of the four arrays
for array in (X_train, y_train, X_test, y_test):
    print(array.shape)

# KNN: 2017-2018-2019 -> 2020

In [None]:
# Fit a uniform-weight, Manhattan-distance KNN on the scaled training features.
knn = KNeighborsClassifier(metric='manhattan', weights='uniform', n_neighbors=k_neighbors)
knn.fit(X_train, y_train)

# Hard class predictions for the training and test rows.
pred_train, pred_test = knn.predict(X_train), knn.predict(X_test)


In [None]:
# --- Hard predictions ---------------------------------------------------------
# A polygon is flagged as fire when column 1 of predict_proba (class 1, assuming
# the labels are 0/1) reaches `threshold`. `.astype(int)` replaces the `np.int`
# alias, which was removed in NumPy 1.24.
prob_train = knn.predict_proba(X_train)
pred_train_2 = (prob_train[:, 1] >= threshold).astype(int)

prob_test = knn.predict_proba(X_test)
pred_2 = (prob_test[:, 1] >= threshold).astype(int)

df_train = train_data.copy()
df_train['result_2'] = pred_train_2

df_test = test_data.copy()
df_test['result_2'] = pred_2

# --- Area-weighted metrics ----------------------------------------------------
# Every ratio is a share of deforested area (area_km), not a share of polygons.
for header, scored in (("- Training data ratio results:", df_train),
                       ("- Test data ratio results:", df_test)):
    print(header)
    print("")

    area = scored['area_km']
    is_fire = scored['fire_t'] == 1
    is_clear = scored['fire_t'] == 0
    flagged = scored['result_2'] == 1
    cleared = scored['result_2'] == 0

    tp_area = area[is_fire & flagged].sum()    # fire, predicted fire
    fn_area = area[is_fire & cleared].sum()    # fire, predicted no fire
    tn_area = area[is_clear & cleared].sum()   # no fire, predicted no fire
    fp_area = area[is_clear & flagged].sum()   # no fire, predicted fire
    fire_area = area[is_fire].sum()
    clear_area = area[is_clear].sum()

    print("Ratio Fire Got Right", round(tp_area / fire_area, 2))
    print("Ratio Fire Got Wrong", round(fn_area / fire_area, 2))
    print("Ratio Non Fire Got Right", round(tn_area / clear_area, 2))
    print("Ratio Non Fire Got Wrong", round(fp_area / clear_area, 2))

    false_omission = fn_area / (fn_area + tn_area)
    print('PPV:', round(tp_area / (tp_area + fp_area), 2))
    print('FOR:', round(false_omission, 2))
    # Round after subtracting so the output has no float noise (e.g. 0.9299999999999999).
    print('NPV:', round(1 - false_omission, 2))

# --- Save results to generate colormap ---------------------------------------
# (actual, predicted) -> code: 0 = true negative, 1 = true positive,
# 2 = false positive, 3 = false negative. An unexpected label raises KeyError
# rather than silently producing a list of the wrong length.
outcome_code = {(0, 0): 0, (1, 1): 1, (0, 1): 2, (1, 0): 3}
pred_list = [outcome_code[(int(actual), int(predicted))]
             for actual, predicted in zip(y_test, pred_2)]
fcst_list = [int(predicted) for predicted in pred_2]

np_pred = np.array(pred_list)
np_fcst = np.array(fcst_list)

# Select the polygons by the test set's own index so the rows stay aligned with
# the predictions even when the area filter drops rows; .copy() makes the new
# columns land on an independent frame instead of a slice of df.
df_2020_aux = df.loc[test_data.index].copy()
df_2020_aux['compare_fcst'] = np_pred
df_2020_aux['fire_fcst'] = np_fcst

def my_colormap(value):  # scalar value defined in 'column'
    """Return the map colour for an outcome code (None for any other value)."""
    return {0: "tan", 1: "mediumblue", 2: "mediumseagreen", 3: "red"}.get(value)

# Plot the overall result: one colour per polygon, in row order
colormap = [my_colormap(code) for code in df_2020_aux['compare_fcst'].values]

In [None]:
#df_2020_aux.explore(color=colormap)

# - Training data: 2017-2018-2019-2020 / Test data: 2021

In [None]:
# Restrict to training and test years (train: 2017-2020, test: 2021)
reference_year = df_panel['year_reference']
train_data = df_panel[reference_year.between(2017, 2020)]
test_data = df_panel[reference_year == 2021]

# Eliminate small areas based on quantile
# (quantile 0.0 is the minimum area, so this keeps every row with a known area)
train_area_floor = train_data['area_km'].quantile(0.0)
train_data = train_data[train_data['area_km'] >= train_area_floor]
test_area_floor = test_data['area_km'].quantile(0.0)
test_data = test_data[test_data['area_km'] >= test_area_floor]

# Feature columns (including 'year') and target
feature_columns = ['fire_before', 'n_fire_before', 'area_km', 'area_km_2',
                   'lat_centroid', 'lon_centroid', 'length', 'year', 'dist_road']
X_train, y_train = train_data[feature_columns].values, train_data['fire_t'].values
X_test, y_test = test_data[feature_columns].values, test_data['fire_t'].values

# Standardise with statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Sanity check: shapes of the four arrays
for array in (X_train, y_train, X_test, y_test):
    print(array.shape)

# KNN: 2017-2018-2019-2020 -> 2021


In [None]:
# Fit a uniform-weight, Manhattan-distance KNN on the scaled training features.
knn = KNeighborsClassifier(metric='manhattan', weights='uniform', n_neighbors=k_neighbors)
knn.fit(X_train, y_train)
# Hard class predictions for the training and test rows.
pred_train, pred_test = knn.predict(X_train), knn.predict(X_test)

In [None]:
# --- Hard predictions ---------------------------------------------------------
# A polygon is flagged as fire when column 1 of predict_proba (class 1, assuming
# the labels are 0/1) reaches `threshold`. `.astype(int)` replaces the `np.int`
# alias, which was removed in NumPy 1.24.
prob_train = knn.predict_proba(X_train)
pred_train_2 = (prob_train[:, 1] >= threshold).astype(int)

prob_test = knn.predict_proba(X_test)
pred_2 = (prob_test[:, 1] >= threshold).astype(int)

df_train = train_data.copy()
df_train['result_2'] = pred_train_2

df_test = test_data.copy()
df_test['result_2'] = pred_2

# --- Area-weighted metrics ----------------------------------------------------
# Every ratio is a share of deforested area (area_km), not a share of polygons.
for header, scored in (("- Training data ratio results:", df_train),
                       ("- Test data ratio results:", df_test)):
    print(header)
    print("")

    area = scored['area_km']
    is_fire = scored['fire_t'] == 1
    is_clear = scored['fire_t'] == 0
    flagged = scored['result_2'] == 1
    cleared = scored['result_2'] == 0

    tp_area = area[is_fire & flagged].sum()    # fire, predicted fire
    fn_area = area[is_fire & cleared].sum()    # fire, predicted no fire
    tn_area = area[is_clear & cleared].sum()   # no fire, predicted no fire
    fp_area = area[is_clear & flagged].sum()   # no fire, predicted fire
    fire_area = area[is_fire].sum()
    clear_area = area[is_clear].sum()

    print("Ratio Fire Got Right", round(tp_area / fire_area, 2))
    print("Ratio Fire Got Wrong", round(fn_area / fire_area, 2))
    print("Ratio Non Fire Got Right", round(tn_area / clear_area, 2))
    print("Ratio Non Fire Got Wrong", round(fp_area / clear_area, 2))

    false_omission = fn_area / (fn_area + tn_area)
    print('PPV:', round(tp_area / (tp_area + fp_area), 2))
    print('FOR:', round(false_omission, 2))
    # Round after subtracting so the output has no float noise (e.g. 0.9299999999999999).
    print('NPV:', round(1 - false_omission, 2))

# --- Save results to generate colormap ---------------------------------------
# (actual, predicted) -> code: 0 = true negative, 1 = true positive,
# 2 = false positive, 3 = false negative. An unexpected label raises KeyError
# rather than silently producing a list of the wrong length.
outcome_code = {(0, 0): 0, (1, 1): 1, (0, 1): 2, (1, 0): 3}
pred_list = [outcome_code[(int(actual), int(predicted))]
             for actual, predicted in zip(y_test, pred_2)]
fcst_list = [int(predicted) for predicted in pred_2]

np_pred = np.array(pred_list)
np_fcst = np.array(fcst_list)

# Select the polygons by the test set's own index so the rows stay aligned with
# the predictions even when the area filter drops rows; .copy() makes the new
# columns land on an independent frame instead of a slice of df.
df_2021_aux = df.loc[test_data.index].copy()
df_2021_aux['compare_fcst'] = np_pred
df_2021_aux['fire_fcst'] = np_fcst

def my_colormap(value):  # scalar value defined in 'column'
    """Return the map colour for an outcome code (None for any other value)."""
    return {0: "tan", 1: "mediumblue", 2: "mediumseagreen", 3: "red"}.get(value)

# Plot the overall result: one colour per polygon, in row order
colormap = [my_colormap(code) for code in df_2021_aux['compare_fcst'].values]

In [None]:
#df_2021_aux.explore(color=colormap)

In [None]:
# Stack the 2018-2021 test-year frames (each carrying compare_fcst / fire_fcst) into one frame.
df_test_aux = pd.concat([df_2018_aux, df_2019_aux, df_2020_aux, df_2021_aux])

In [None]:
def my_colormap(value):  # scalar value defined in 'column'
    """Map an outcome code to its map colour.

    0 -> "tan", 1 -> "blue", 2 -> "green", 3 -> "red"; any other value
    yields None.
    """
    colour_by_code = {0: "tan", 1: "blue", 2: "green", 3: "red"}
    return colour_by_code.get(value)

In [None]:
# Plot the overall result
# One colour per row of the combined test frame, in row order.
colormap = [my_colormap(code) for code in df_test_aux['compare_fcst'].values]

In [None]:
# Final Results - All test sets together:
# Total deforested area per outcome code of compare_fcst:
# 0 = true negative, 1 = true positive, 2 = false positive, 3 = false negative.
tn_area, tp_area, fp_area, fn_area = (
    df_test_aux[df_test_aux['compare_fcst'] == code]['area_km'].sum()
    for code in (0, 1, 2, 3)
)

print('Recall:', round(tp_area / (tp_area + fn_area), 2))
print('FPR:', round(fp_area / (tn_area + fp_area), 2))

print('PPV:', round(tp_area / (tp_area + fp_area), 2))
print('FOR:', round(fn_area / (fn_area + tn_area), 2))
print('NPV:', round(1 - fn_area / (fn_area + tn_area), 2))

In [None]:
# Display the combined test-year frame (polygon attributes plus compare_fcst / fire_fcst).
df_test_aux

In [None]:
# Readable labels for the columns kept in the reduced frame used for the map.
display_names = {
    "image_date": "Date of Deforestation",
    "lon_centroid": "Polygon Centroid Longitude",
    "lat_centroid": "Polygon Centroid Latitude",
    "area_km": "Area of Deforestation",
    "fire_same_year": "Actual Fire",
    "fire_fcst": "Predicted Fire",
}

# rename() returns a new frame, so df_test_aux keeps its original column names.
df_test_aux_min = df_test_aux.rename(columns=display_names)
# Keep only the relabelled columns (in the order above) plus the geometry.
df_test_aux_min = df_test_aux_min[list(display_names.values()) + ["geometry"]]

In [None]:
# Render the map of all test-year polygons, coloured per polygon by `colormap`.
# NOTE(review): this call is active (the earlier "Uncomment to show map" note was
# stale); comment it out to skip rendering.
df_test_aux_min.explore(color=colormap)

In [None]:
# Build the map of all test-year polygons and export its HTML rendering to map.html.
obj = df_test_aux.explore(color=colormap)
# IPython's display formatter returns (format_dict, metadata); the 'text/html'
# entry of format_dict is the rendered map.
data, metadata = get_ipython().display_formatter.format(obj)
# Write as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
# Windows) can fail on non-ASCII text embedded in the map.
with open('map.html', 'w', encoding='utf-8') as f:
    f.write(data['text/html'])