In [1]:
import pandas as pd
import numpy as np
import os
from datetime import timedelta

# Extract Fire data
There are 2 types of fire data
- Country labeled dataset
- Unlabeled dataset

This first part deals with the labeled dataset.

In [2]:
# Years and countries covered by the labeled VIIRS hotspot files
years = ['2016', '2017', '2018', '2019']
countries = ['Cambodia', 'Myanmar', 'Thailand', 'Lao_PDR']

# One (initially empty) container per year
fire = {year: {} for year in years}

Load all the data (all years, all countries) and transform them.

In [3]:
data = {}

for year in years:
    # Hourly skeleton covering the whole calendar year
    hourly = pd.DataFrame(
        index=pd.date_range(start=f'{year}-01-01', end=f'{year}-12-31 23:00:00', freq='H')
    )

    for country in countries:
        raw = pd.read_csv(f'./Fire hotspot/viirs-snpp_{year}_{country}.csv')

        # Build an hour-resolution timestamp by hand: drop the last two
        # characters of acq_time (presumably the minutes of an HHMM value)
        # and left-pad what remains to two digits.
        hour = raw['acq_time'].apply(str).str[:-2].str.zfill(2)
        stamps = pd.to_datetime(raw['acq_date'] + 'T' + hour, format='%Y-%m-%dT%H')

        # Original author's assumption: frp represents brightness.
        # Assumption: an hour with no fire record means no fire -> frp = 0.
        per_hour = (
            raw[['latitude', 'longitude', 'frp']]
            .assign(datetime=stamps)
            .rename(columns={
                'latitude': country + '_lat',
                'longitude': country + '_long',
                'frp': country + '_frp',
            })
            .groupby('datetime').mean()
            .resample('H').mean()
            .fillna(value=0.0)
        )
        hourly = hourly.merge(per_hour, how='left', left_index=True, right_index=True)

    # Hours outside the range covered by a country's file are also zero-filled
    data[year] = hourly.fillna(value=0.0)

Assemble them as the <u><strong>fire data for 2016 - 2019</strong></u>. And record the max, min of latitude, longitude of each country for further use.

In [7]:
# Stack the per-year frames; concatenating a dict yields a (year, timestamp) MultiIndex.
fire_all = pd.concat(data)
# Keep the timestamp of every row. `index.levels[1]` only holds the *unique*
# level values, so it is correct only by coincidence when no timestamp repeats.
fire_all.index = fire_all.index.droplevel(0)

records = {}
will_drop = []
for c in countries:
    # Hours without any detection were filled with 0.0 upstream, which would make
    # every minimum 0.0. Mask those filler values before taking the extremes.
    # NOTE: the remaining values are hourly means, so the bounds are approximate.
    lat = fire_all[c+'_lat'].replace(0.0, np.nan)
    lon = fire_all[c+'_long'].replace(0.0, np.nan)
    records[c] = {'lat':
                    {'min': lat.min(),
                     'max': lat.max()},
                  'long':
                    {'min': lon.min(),
                     'max': lon.max()}
                  }
    will_drop += [c+'_lat', c+'_long']

# Only the *_frp columns are kept for modelling
fire_all = fire_all.drop(columns=will_drop)

In [8]:
# Display the assembled table (only the per-country *_frp columns remain after the drop above)
fire_all

Unnamed: 0,Cambodia_frp,Myanmar_frp,Thailand_frp,Lao_PDR_frp
2016-01-01 00:00:00,0.0,0.000,0.000000,0.0
2016-01-01 01:00:00,0.0,0.000,0.000000,0.0
2016-01-01 02:00:00,0.0,0.000,0.000000,0.0
2016-01-01 03:00:00,0.0,0.000,0.000000,0.0
2016-01-01 04:00:00,0.0,0.000,0.000000,0.0
...,...,...,...,...
2019-12-31 19:00:00,0.0,0.612,1.628704,0.0
2019-12-31 20:00:00,0.0,0.000,0.000000,0.0
2019-12-31 21:00:00,0.0,0.000,0.000000,0.0
2019-12-31 22:00:00,0.0,0.000,0.000000,0.0


Next, we'll localize the fire data to the Thailand local timezone (UTC+7). Then we merge the fire data into the full dataset that we cleaned in `3.fillna.ipynb`.

In [None]:
provinces = ['Bangkok','Chanthaburi','Chiang Mai','Kanchanaburi','Khon Kaen','Songkhla']

# +7 hours to convert the timestamps to Thailand local time.
# The shift is applied only while the index still starts at the unshifted origin,
# so re-running this cell does not add another 7 hours.
utc_origin = pd.Timestamp(f'{years[0]}-01-01 00:00:00')
if fire_all.index.min() == utc_origin:
    fire_all.index = fire_all.index + timedelta(hours=7)

# province -> imputed training data joined with the hourly fire columns
mega = {}
for province in provinces:
    # Khon Kaen is skipped; the reason is not shown in this notebook (TODO confirm)
    if province == 'Khon Kaen' : continue
    df = pd.read_csv(f"./data/Train/{province}_imputed.csv", parse_dates=True, index_col=0)
    # Left join keeps every row of the province data; hours with no fire row become NaN
    mega[province] = df.merge(fire_all, left_index=True, right_index=True, how='left')

This is an example of *preprocessed* data.

In [None]:
# Show the merged Bangkok rows labelled '2016-3-3' (presumably the whole day, given the parsed datetime index)
mega['Bangkok'].loc['2016-3-3']

Lastly, we save the *preprocessed* dataset, ready to be fed to the model.

In [64]:
# Make sure the output folder exists; to_csv does not create missing directories.
out_dir = './data/Train/fire_integrated'
os.makedirs(out_dir, exist_ok=True)

for province in provinces:
    # Khon Kaen has no entry in `mega` (it is skipped when the data is merged)
    if province == 'Khon Kaen' : continue
    path = f'{out_dir}/{province}_fire_integrated.csv'
    # Never overwrite an existing export
    if not os.path.exists(path):
        mega[province].to_csv(path)
    else:
        print(f"{province} already")

Save all of the fire data so that it can be reused later with the test set.

In [66]:
import glob

# Write the localized fire table once; an existing file is left untouched.
localized_csv = './data/fire_2016_to_2019_localize.csv'
if glob.glob(localized_csv):
    print('already')
else:
    fire_all.to_csv(localized_csv)

# Expand Fire data
In this project, the model will be evaluated using **test data** which is the data from `2019-03-18 08:00:00` to `2020-03-18 20:00:00`. So, we need to find more fire data since the labeled data we had ends at 2019-12-31.

We have several unlabeled fire datasets, so we have to choose among them. First, we import the 2016-2019 fire data.

In [6]:
# Reload the localized 2016-2019 fire table written earlier in this notebook;
# the first CSV column is used as the index and parsed as dates.
fire_all = pd.read_csv('./data/fire_2016_to_2019_localize.csv', index_col=0, parse_dates=True)
# Preview the last rows to see where the labeled data ends
fire_all.tail()

Unnamed: 0,Cambodia_frp,Myanmar_frp,Thailand_frp,Lao_PDR_frp
2020-01-01 02:00:00,0.0,0.612,1.628704,0.0
2020-01-01 03:00:00,0.0,0.0,0.0,0.0
2020-01-01 04:00:00,0.0,0.0,0.0,0.0
2020-01-01 05:00:00,0.0,0.0,0.0,0.0
2020-01-01 06:00:00,0.0,0.0,0.0,0.0


## Lat-long info from viirs-snpp data
We record the latitude-longitude ranges from the labeled dataset for later use. We will use them to identify the country of each record in the unlabeled dataset.

In [7]:
from collections import defaultdict

# country -> one [(lat_min, lat_max), (long_min, long_max)] entry per year
lat_long = defaultdict(list)

for y in years:
    for c in countries:
        hotspots = pd.read_csv(f'./Fire hotspot/viirs-snpp_{y}_{c}.csv')
        lat, lon = hotspots['latitude'], hotspots['longitude']
        lat_long[c].append([(lat.min(), lat.max()), (lon.min(), lon.max())])

In [8]:
# country -> {'lat_min', 'lat_max', 'long_min', 'long_max'} over all loaded years
records = defaultdict(dict)
for c in countries:
    # One row per year, columns = lat_min, lat_max, long_min, long_max.
    # -1 lets the row count follow the number of years instead of hard-coding 4.
    arr = np.array(lat_long[c]).reshape((-1, 4))
    lat_min, lat_max = arr[:, 0].min(), arr[:, 1].max()
    long_min, long_max = arr[:, 2].min(), arr[:, 3].max()
    print(f'{c} : lat min = {lat_min} , lat max = {lat_max} , long min = {long_min} , long max = {long_max}')
    records[c]['lat_min'] = lat_min
    records[c]['lat_max'] = lat_max
    records[c]['long_min'] = long_min
    records[c]['long_max'] = long_max

Cambodia : lat min = 10.435731 , lat max = 14.693866 , long min = 102.314522 , long max = 107.609604
Myanmar : lat min = 10.00162 , lat max = 28.344353 , long min = 92.177818 , long max = 101.160408
Thailand : lat min = 5.695879 , lat max = 20.440176 , long min = 97.352341 , long max = 105.649544
Lao_PDR : lat min = 13.916207 , lat max = 22.483566 , long min = 100.097649 , long max = 107.655373


After inspecting the datasets, we see that these 2 datasets, `fire_archive_V1_163551.csv` and `fire_nrt_J1V-C2_163550.csv`, contain data in the date range that we're interested in.

In [9]:
fire_files = [ 'fire_archive_V1_163551.csv', 'fire_nrt_J1V-C2_163550.csv']

# One hourly-averaged frame per unlabeled file, in the same order as fire_files
data = []
for f in fire_files:
    df = pd.read_csv(f'./Fire hotspot/{f}')
    # Drop the last two characters of acq_time (presumably the minutes of an
    # HHMM value) and pad the remainder to a two-digit hour.
    df['hour'] = df['acq_time'].apply(str).str[:-2]
    df['datetime'] = pd.to_datetime(
                                df['acq_date']+'T'+df['hour'].str.zfill(2),
                                format='%Y-%m-%dT%H')

    # Average every numeric column per hour. numeric_only=True is required on
    # pandas >= 2.0, where mean() raises on string columns (e.g. 'acq_date',
    # 'hour') instead of silently dropping them.
    # NOTE(review): latitude/longitude are averaged over all detections in the
    # hour, so the resulting point may not correspond to any real fire.
    df = df.groupby(by='datetime').mean(numeric_only=True)
    data.append(df)

## Ambiguity
Identifying the country from the max/min of latitude and longitude may be ambiguous, since those ranges can overlap. We address this issue by using an external API to find the exact location of a latitude-longitude pair. We'll use the `geopy` package.

In [10]:
import geopy
from geopy.geocoders import Nominatim

def assign_country(x):
    """Return the country for a (latitude, longitude) pair.

    Parameters
    ----------
    x : sequence or pandas.Series
        First element is the latitude, second element is the longitude.

    Returns
    -------
    str or float
        The name from ``countries`` when the point lies strictly inside exactly
        one recorded bounding box in ``records``; otherwise the ``country_code``
        reported by Nominatim reverse geocoding; ``np.nan`` when the lookup
        fails or yields no result.
    """
    # Positional access. A labelled Series needs .iloc, because integer keys
    # in Series[...] are no longer treated as positions in recent pandas.
    if hasattr(x, 'iloc'):
        lati, longi = x.iloc[0], x.iloc[1]
    else:
        lati, longi = x[0], x[1]

    # If lat-long is clear: exactly one bounding box contains the point
    choice = [
        c for c in countries
        if records[c]['lat_min'] < lati < records[c]['lat_max']
        and records[c]['long_min'] < longi < records[c]['long_max']
    ]
    if len(choice) == 1:
        return choice[0]

    # If lat-long is not clear (no box or several boxes): ask the geocoder
    geolocator = Nominatim(user_agent="app")
    try:
        location = geolocator.reverse([lati, longi])
        if location is None:
            return np.nan
        return location.raw['address']['country_code']
    except Exception:
        # Best effort: network errors or a missing 'country_code' key give NaN.
        # Unlike a bare `except`, KeyboardInterrupt still stops a long run.
        return np.nan

In [11]:
# Geocoding is slow, so each frame's country assignment is cached in a CSV
# and only computed when that file does not exist yet.
cache_files = ['archive_V1_country_code.csv', 'nrt_J1V-C2_country_code.csv']

for frame, cache_file in zip(data, cache_files):
    if os.path.exists(cache_file):
        print(f'{cache_file} is already')
        continue
    # Row-wise: assign_country reads the first two values of each row
    country_code = frame.apply(assign_country, axis=1)
    country_code.to_csv(cache_file)

archive_V1_country_code.csv is already


In [81]:
# Load the cached country assignment for the archive file; the first CSV column
# becomes the index and is parsed as dates.
country_code = pd.read_csv('archive_V1_country_code.csv', parse_dates=True, index_col=0)
country_code.columns = ['country_code']

# Maps the country names returned by the bounding-box shortcut in assign_country
# to the lower-case codes returned by the geocoder.
code = {'Thailand':'th', 'Myanmar':'mm', 'Lao_PDR':'la', 'Cambodia':'kh'}
data[0]['country_code'] = country_code
# NOTE(review): replace() returns a new frame that is only displayed here;
# data[0] itself still holds the full country names.
data[0].replace(code)

Unnamed: 0_level_0,latitude,longitude,bright_ti4,scan,track,acq_time,version,bright_ti5,frp,type,country_code
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-12-01 03:00:00,34.485201,133.531911,327.673333,0.413333,0.606667,318.000000,1.0,286.786667,4.516667,2.333333,jp
2019-12-01 04:00:00,-12.467219,126.595392,346.078962,0.460555,0.481223,445.892086,1.0,302.408109,15.996351,0.084275,
2019-12-01 05:00:00,38.605851,116.062874,330.978780,0.415854,0.396585,500.292683,1.0,280.858049,3.665610,0.780488,cn
2019-12-01 06:00:00,13.668623,101.822657,336.834853,0.440730,0.411355,634.485308,1.0,299.306730,8.370957,0.098578,th
2019-12-01 08:00:00,29.008121,75.353245,334.528711,0.447692,0.419376,820.218295,1.0,297.378815,4.955676,0.054054,in
...,...,...,...,...,...,...,...,...,...,...,...
2020-05-31 17:00:00,28.132490,119.744363,304.977270,0.466962,0.565154,1709.440273,1.0,286.161536,1.903925,1.655290,cn
2020-05-31 18:00:00,34.661149,110.291332,304.787110,0.429032,0.470116,1848.614194,1.0,286.190632,1.301535,1.607742,cn
2020-05-31 19:00:00,-6.006604,105.983075,307.395000,0.490000,0.650000,1900.000000,1.0,279.890000,1.390000,2.000000,id
2020-05-31 20:00:00,34.881759,84.182679,309.277091,0.413909,0.447818,2030.068182,1.0,289.368591,1.515500,1.440909,cn


In [82]:
# Count the hourly rows per country code, after mapping country names to codes
data[0].replace(code)['country_code'].value_counts()

in    430
cn    264
th    132
au    128
id    101
mm     96
la     60
jp     43
kh     28
pk     26
ph     22
ru     20
kp     19
kr     15
kg     12
vn     11
kz      9
tw      6
bd      5
my      5
np      4
mn      3
lk      3
uz      2
tl      2
tj      1
Name: country_code, dtype: int64

In [26]:
# Collect the per-country frp series for 2020 and align them side by side.
# NOTE(review): fire_2020_c is not defined in the visible part of this notebook.
ls = [fire_2020_c[c][f'{c}_frp'] for c in countries]
fire_2020_all = pd.concat(ls, axis=1)

# Preview 2016-2019 followed by 2020. DataFrame.append was removed in
# pandas 2.0, so the rows are stacked with pd.concat instead.
pd.concat([fire_all, fire_2020_all])

Unnamed: 0,Cambodia_frp,Myanmar_frp,Thailand_frp,Lao_PDR_frp
2016-01-01 00:00:00,0.0,0.000000,0.000000,0.000000
2016-01-01 01:00:00,0.0,0.000000,0.000000,0.000000
2016-01-01 02:00:00,0.0,0.000000,0.000000,0.000000
2016-01-01 03:00:00,0.0,0.000000,0.000000,0.000000
2016-01-01 04:00:00,0.0,0.000000,0.000000,0.000000
...,...,...,...,...
2020-03-17 06:00:00,,13.759829,13.759829,
2020-03-17 18:00:00,,1.482160,,
2020-03-17 19:00:00,,1.495826,1.495826,
2020-03-18 06:00:00,,19.212768,19.212768,19.212768


In [30]:
# Full hourly timeline from the start of 2020 to the end of the test period
idx = pd.date_range('2020-01-01','2020-03-18 20:00:00',freq='H')
left = pd.DataFrame(index=idx)

# Stack the 2016-2019 and 2020 fire rows (pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0) and left-join them onto the timeline.
# NOTE(review): a timestamp present in both frames would produce duplicate rows.
fire_complete = pd.concat([fire_all, fire_2020_all])
h = left.merge(fire_complete, left_index=True, right_index=True, how='left')

In [31]:
# Inspect the last 20 hourly rows of the 2020 timeline; hours without a fire row are NaN
h.tail(20)

Unnamed: 0,Cambodia_frp,Myanmar_frp,Thailand_frp,Lao_PDR_frp
2020-03-17 05:00:00,,,,
2020-03-17 06:00:00,,13.759829,13.759829,
2020-03-17 07:00:00,,,,
2020-03-17 08:00:00,,,,
2020-03-17 09:00:00,,,,
2020-03-17 10:00:00,,,,
2020-03-17 11:00:00,,,,
2020-03-17 12:00:00,,,,
2020-03-17 13:00:00,,,,
2020-03-17 14:00:00,,,,
