In [1]:
import pandas as pd
import numpy as np
import os
from datetime import timedelta

# Extract Fire data
There are two types of fire data:
- Country-labeled dataset
- Unlabeled dataset

This first part deals with the labeled one.

In [2]:
# Calendar years covered by the country-labeled fire files (kept as strings
# because they are interpolated into file names and date strings).
years = [str(y) for y in range(2016, 2020)]
# Country names exactly as they are spelled in the fire file names.
countries = ['Cambodia', 'Myanmar', 'Thailand', 'Lao_PDR']

# One empty container per year.
fire = {year: {} for year in years}

Load all the data (all years, all countries) and transform them.

In [3]:
data = {}

for year in years:
    # Hourly skeleton for the whole calendar year; every country is
    # left-joined onto it so hours without any record still get a row.
    timeline = pd.date_range(start=f'{year}-01-01', end=f'{year}-12-31 23:00:00', freq='H')
    left = pd.DataFrame(index=timeline)

    for country in countries:
        # Load one country/year file and build an hourly timestamp by hand:
        # dropping the last two characters of acq_time leaves the hour part.
        df = pd.read_csv(f'./Fire hotspot/viirs-snpp_{year}_{country}.csv')
        df['hour'] = df['acq_time'].apply(str).str[:-2]
        stamp_text = df['acq_date'] + 'T' + df['hour'].str.zfill(2)
        df['datetime'] = pd.to_datetime(stamp_text, format='%Y-%m-%dT%H')

        # Assumption: frp is used as the fire "brightness" signal.
        # Prefix the kept columns with the country so the joins do not collide.
        df = df[['latitude', 'longitude', 'frp', 'datetime']].rename(columns={
            'frp': country + '_frp',
            'latitude': country + '_lat',
            'longitude': country + '_long',
        })

        # Assumption: no fire value in the records = no fire -> frp = 0
        df = df.groupby('datetime').mean().resample('H').mean().fillna(value=0.0)
        left = left.merge(df, how='left', left_index=True, right_index=True)

    # Hours outside a country's own record range are also treated as "no fire".
    data[year] = left.fillna(value=0.0)

Assemble them into the <u><strong>fire data for 2016 - 2019</strong></u>, and record the minimum and maximum latitude and longitude of each country for later use.

In [21]:
# 2016-2019 fire data
# pd.concat on a dict produces a (year, timestamp) MultiIndex. Drop the year
# level to keep each row's own timestamp. (`index.levels[1]` holds the unique,
# sorted level values rather than the row labels, so assigning it as the index
# only lines up by coincidence.)
fire_all = pd.concat(data).droplevel(0)

# Find max, min of latitude, longitude of each country
records = {}
will_drop = []
for c in countries:
    lat_col, long_col = c + '_lat', c + '_long'
    # Hours without a record were filled with 0.0 when loading; mask them so
    # the filler does not become the minimum latitude/longitude.
    temporary = fire_all[[lat_col, long_col]].replace(0.0, np.nan)
    records[c] = {
        'lat': {'min': temporary[lat_col].min(),
                'max': temporary[lat_col].max()},
        'long': {'min': temporary[long_col].min(),
                 'max': temporary[long_col].max()},
    }
    will_drop += [lat_col, long_col]

# Drop unused columns (only the *_frp columns are kept)
fire_all = fire_all.drop(columns=will_drop)

In [22]:
# 2016-2019 fire data
# Display the combined hourly frame (one <country>_frp column per country).
fire_all

Unnamed: 0,Cambodia_frp,Myanmar_frp,Thailand_frp,Lao_PDR_frp
2016-01-01 00:00:00,0.0,0.000,0.000000,0.0
2016-01-01 01:00:00,0.0,0.000,0.000000,0.0
2016-01-01 02:00:00,0.0,0.000,0.000000,0.0
2016-01-01 03:00:00,0.0,0.000,0.000000,0.0
2016-01-01 04:00:00,0.0,0.000,0.000000,0.0
...,...,...,...,...
2019-12-31 19:00:00,0.0,0.612,1.628704,0.0
2019-12-31 20:00:00,0.0,0.000,0.000000,0.0
2019-12-31 21:00:00,0.0,0.000,0.000000,0.0
2019-12-31 22:00:00,0.0,0.000,0.000000,0.0


Next, we'll localize fire data to Thailand local timezone (UTC+7). Then, we integrate fire data to full dataset which we cleaned in `3.fillna.ipynb`.

In [44]:
provinces = ['Bangkok','Chanthaburi','Chiang Mai','Kanchanaburi','Khon Kaen','Songkhla']

# +7 Hours to localize the time (Thailand local time, UTC+7).
# The shift mutates fire_all's index, so it is guarded: before the shift the
# index starts at midnight on 1 January of the first year. Re-running this
# cell therefore does not add another 7 hours.
utc_start = pd.Timestamp(f'{years[0]}-01-01 00:00:00')
if fire_all.index.min() == utc_start:
    fire_all.index = fire_all.index + timedelta(hours=7)

# Integrate to full,imputed data
mega = {}
for province in provinces:
    # NOTE(review): Khon Kaen is skipped; the reason is not stated in this notebook.
    if province == 'Khon Kaen':
        continue
    df = pd.read_csv(f"./data/Train/{province}_imputed.csv", parse_dates=True, index_col=0)
    # Left join keeps every timestamp of the province data.
    mega[province] = df.merge(fire_all, left_index=True, right_index=True, how='left')

NameError: name 'fire_all' is not defined

This is an example of *preprocessed* data.

In [10]:
# This is data integrated with localized fire
# Show a single day of the Bangkok frame as a quick check of the merge.
mega['Bangkok'].loc['2016-3-3']

Unnamed: 0,PM2.5,Temp(C),WindDir,Wind Speed(km/h),Cambodia_frp,Myanmar_frp,Thailand_frp,Lao_PDR_frp
2016-03-03 08:00:00,62.9,26.4,65.0,15.0,0.0,0.0,0.0,0.0
2016-03-03 09:00:00,62.9,26.4,65.0,15.0,0.0,0.0,0.0,0.0
2016-03-03 10:00:00,55.5,31.4,75.0,13.0,0.0,0.0,0.0,0.0
2016-03-03 11:00:00,55.5,31.4,75.0,13.0,0.0,0.0,0.0,0.0
2016-03-03 12:00:00,47.9,31.4,75.0,13.0,8.984035,9.31925,6.58053,7.991304
2016-03-03 13:00:00,43.6,34.1,70.0,12.0,0.0,0.0,0.0,0.0
2016-03-03 14:00:00,28.6,34.1,70.0,12.0,0.0,8.099819,9.423972,0.0
2016-03-03 15:00:00,33.6,34.1,70.0,12.0,0.0,0.0,0.0,0.0
2016-03-03 16:00:00,34.8,34.6,60.0,12.0,0.0,0.0,0.0,0.0
2016-03-03 17:00:00,31.3,34.6,60.0,12.0,0.0,0.0,0.0,0.0


Lastly, we <u>save</u> the *preprocessed* dataset so that it is ready to be fed to the model.

In [11]:
# Save imputed, fire integrated data
for province in provinces:
    if province == 'Khon Kaen':
        continue
    path = f'./data/Train/fire_integrated/{province}_fire_integrated.csv'
    # A file that is already on disk is left untouched.
    if os.path.exists(path):
        print(f"{province} already")
    else:
        mega[province].to_csv(path)

Save all of the fire data so that it can be reused later with the test set.

In [12]:
import glob

# Save 2016-2019 fire data (skipped when the file is already on disk)
if glob.glob("./data/fire_2016_to_2019_localize.csv"):
    print('already')
else:
    fire_all.to_csv('./data/fire_2016_to_2019_localize.csv')

already


# Expand Fire data
In this project, the model will be evaluated using **test data** which is the data from `2019-03-18 08:00:00` to `2020-03-18 20:00:00`. So, we need to find more fire data since the labeled data we had ends at 2019-12-31.

We have several unlabeled fire datasets, so we have to choose which ones to use. First, we import the 2016-2019 fire data.

In [3]:
# Reload the localized 2016-2019 fire table from disk; the first CSV column
# is the datetime index.
localized_2016_2019 = pd.read_csv(
    './data/fire_2016_to_2019_localize.csv',
    parse_dates=True,
    index_col=0,
)
localized_2016_2019.tail()

Unnamed: 0,Cambodia_frp,Myanmar_frp,Thailand_frp,Lao_PDR_frp
2020-01-01 02:00:00,0.0,0.612,1.628704,0.0
2020-01-01 03:00:00,0.0,0.0,0.0,0.0
2020-01-01 04:00:00,0.0,0.0,0.0,0.0
2020-01-01 05:00:00,0.0,0.0,0.0,0.0
2020-01-01 06:00:00,0.0,0.0,0.0,0.0


## Lat-long info from viirs-snpp data
We recorded the latitude-longitude from the labeled dataset. We will use it to identify the country of unlabeled dataset.

In [23]:
# Per-country min/max latitude and longitude recorded from the labeled data.
records

{'Cambodia': {'lat': {'min': 10.58011, 'max': 14.397147},
  'long': {'min': 102.421844, 'max': 107.492813}},
 'Myanmar': {'lat': {'min': 10.243567, 'max': 28.307707},
  'long': {'min': 92.191887, 'max': 101.07398}},
 'Thailand': {'lat': {'min': 5.728347, 'max': 20.3328055},
  'long': {'min': 97.46129833333333, 'max': 105.436768}},
 'Lao_PDR': {'lat': {'min': 13.922927, 'max': 22.350048},
  'long': {'min': 100.198189, 'max': 107.598183}}}

After inspecting the datasets, we see that two of them, `fire_archive_V1_163551.csv` and `fire_nrt_J1V-C2_163550.csv`, cover the date range that we're interested in.

In [4]:
fire_files = [ 'fire_archive_V1_163551.csv', 'fire_nrt_J1V-C2_163550.csv']

# Read two 2020 fire files
data = {}
for f in fire_files:
    df = pd.read_csv(f'./Fire hotspot/{f}')
    # Dropping the last two characters of acq_time leaves the hour part
    # (presumably acq_time is HHMM - confirm against the data source).
    df['hour'] = df['acq_time'].apply(str).str[:-2]
    df['datetime'] = pd.to_datetime(
                                df['acq_date']+'T'+df['hour'].str.zfill(2),
                                format='%Y-%m-%dT%H')

    # Average every record that falls in the same hour. The frame still holds
    # text columns (e.g. acq_date, hour); numeric_only=True restricts the mean
    # to numeric columns, which pandas >= 2.0 no longer does implicitly (it
    # raises a TypeError instead).
    # NOTE(review): latitude/longitude are averaged across all records of the
    # hour before a country is assigned - confirm this is intended.
    df = df.groupby(by='datetime').mean(numeric_only=True)
    data[f] = df

## Ambiguity
Identifying the country from the min/max of latitude and longitude may be ambiguous, since the ranges of different countries overlap. We address this issue by using an external API to find the exact location of a latitude-longitude pair. We'll use the `geopy` package.

In [5]:
import geopy
from geopy.geocoders import Nominatim

def assign_country(x):
    """Return the country of one (latitude, longitude) record.

    Parameters
    ----------
    x : sequence or pandas.Series
        Latitude is read from position 0 and longitude from position 1.

    Returns
    -------
    str or float
        * A key of ``records`` (e.g. ``'Thailand'``) when the point lies
          strictly inside exactly one country's latitude/longitude range.
        * Otherwise the ``country_code`` reported by Nominatim reverse
          geocoding for the point.
        * ``np.nan`` when the lookup fails or returns no country code.

    Note that the first case returns a country *name* and the second a
    country *code*; callers have to normalise the two.
    """
    # Read by position. Integer keys on a labelled Series are deprecated
    # positional access, so use .iloc when the input offers it.
    if hasattr(x, 'iloc'):
        lati, longi = x.iloc[0], x.iloc[1]
    else:
        lati, longi = x[0], x[1]

    # If lat-long is clear: exactly one country's range contains the point
    choice = [
        c for c in countries
        if records[c]['lat']['min'] < lati < records[c]['lat']['max']
        and records[c]['long']['min'] < longi < records[c]['long']['max']
    ]
    if len(choice) == 1:
        return choice[0]

    # If lat-long is not clear (no match or several): ask the geocoder
    geolocator = Nominatim(user_agent="app")
    try:
        location = geolocator.reverse([lati, longi])
        return location.raw['address']['country_code']
    except Exception:
        # Best effort: any lookup failure yields NaN. `Exception` (rather than
        # a bare except) lets KeyboardInterrupt stop a long-running apply.
        return np.nan

Finding the country code for every latitude-longitude pair took a long time, so we save the resulting codes to CSV files and reuse them.

In [6]:
# Identify country
# The result for each fire file is cached in its own CSV and is only computed
# when that CSV is missing.
cache_files = ['archive_V1_country_code.csv', 'nrt_J1V-C2_country_code.csv']
for fire_file, cache_csv in zip(fire_files, cache_files):
    if os.path.exists(cache_csv):
        print(f'{cache_csv} is already')
        continue
    # Find country code
    country_code = data[fire_file].apply(assign_country, axis=1)
    country_code.to_csv(cache_csv)

archive_V1_country_code.csv is already
nrt_J1V-C2_country_code.csv is already


Assign the country codes back to the two frames stored in `data` (one per fire file).

In [7]:
# Country name -> two-letter code, used to bring both kinds of value stored in
# the country-code files to the same form.
code = {'Thailand': 'th', 'Myanmar': 'mm', 'Lao_PDR': 'la', 'Cambodia': 'kh'}
# Cached country-code CSV for each source, keyed by the second "_"-separated
# token of the fire file name.
code_files = {'archive': 'archive_V1_country_code.csv',
              'nrt': 'nrt_J1V-C2_country_code.csv'}

focus = {}
for f in fire_files:
    # Read country code file
    source = f.split('_')[1]
    if source in code_files:
        country_code = pd.read_csv(code_files[source], index_col=0, parse_dates=True)
    country_code.columns = ['country_code']

    # Assign back, then map full country names to their code
    data[f]['country_code'] = country_code
    data[f].replace(code, inplace=True)

    # Filter out out-of-interest countries
    focus[f] = data[f][data[f]['country_code'].isin(code.values())]

Next, we'll append 2020 fire data to localized 2016-2019 data. But first, we have to localize it.

In [25]:
# Pivoting to right format: datetime rows, one frp column per country code,
# then rename the codes to the <country>_frp column names.
frp_columns = {'kh': 'Cambodia_frp',
               'la': 'Lao_PDR_frp',
               'mm': 'Myanmar_frp',
               'th': 'Thailand_frp'}
selected = (
    focus[fire_files[0]]
    .loc['2020-01-01 00:00:00':, ['country_code', 'frp']]
    .reset_index()
    .pivot(index='datetime', columns='country_code', values='frp')
    .rename(columns=frp_columns)
)
selected.head()

country_code,Cambodia_frp,Lao_PDR_frp,Myanmar_frp,Thailand_frp
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-01 06:00:00,,,,6.869011
2020-01-01 19:00:00,,,,1.861488
2020-01-02 06:00:00,,,,7.11292
2020-01-02 19:00:00,,,,1.312886
2020-01-03 06:00:00,,,,6.361829


In [29]:
# Create full time line
extended_timeline = pd.DataFrame(index=pd.date_range('2020-01-01 00:00:00', '2020-05-31 06:00:00', freq='H'))

# Join the pivoted 2020 data onto the full hourly timeline; hours without a
# record become 0.0. The result gets its own name instead of overwriting
# `selected`, so re-running this cell does not merge and shift data that was
# already shifted.
localized_2020 = extended_timeline.merge(selected, left_index=True, right_index=True, how='left').fillna(0.0)

# Localize (+7 hours, Thailand local time)
localized_2020.index = localized_2020.index + timedelta(hours=7)

localized_2020

Unnamed: 0,Cambodia_frp,Lao_PDR_frp,Myanmar_frp,Thailand_frp
2020-01-01 07:00:00,0.0,0.0,0.0,0.0
2020-01-01 08:00:00,0.0,0.0,0.0,0.0
2020-01-01 09:00:00,0.0,0.0,0.0,0.0
2020-01-01 10:00:00,0.0,0.0,0.0,0.0
2020-01-01 11:00:00,0.0,0.0,0.0,0.0
...,...,...,...,...
2020-05-31 09:00:00,0.0,0.0,0.0,0.0
2020-05-31 10:00:00,0.0,0.0,0.0,0.0
2020-05-31 11:00:00,0.0,0.0,0.0,0.0
2020-05-31 12:00:00,0.0,0.0,0.0,0.0


In [31]:
# Last rows of the 2016-2019 frame, to see where it ends before adding 2020.
localized_2016_2019.tail()

Unnamed: 0,Cambodia_frp,Myanmar_frp,Thailand_frp,Lao_PDR_frp
2020-01-01 02:00:00,0.0,0.612,1.628704,0.0
2020-01-01 03:00:00,0.0,0.0,0.0,0.0
2020-01-01 04:00:00,0.0,0.0,0.0,0.0
2020-01-01 05:00:00,0.0,0.0,0.0,0.0
2020-01-01 06:00:00,0.0,0.0,0.0,0.0


In [40]:
# Stack the 2016-2019 and 2020 frames. DataFrame.append was removed in
# pandas 2.0; pd.concat is its replacement and likewise aligns columns by name.
localized_2016_2020 = pd.concat([localized_2016_2019, localized_2020])
localized_2016_2020

Unnamed: 0,Cambodia_frp,Myanmar_frp,Thailand_frp,Lao_PDR_frp
2016-01-01 07:00:00,0.0,0.0,0.0,0.0
2016-01-01 08:00:00,0.0,0.0,0.0,0.0
2016-01-01 09:00:00,0.0,0.0,0.0,0.0
2016-01-01 10:00:00,0.0,0.0,0.0,0.0
2016-01-01 11:00:00,0.0,0.0,0.0,0.0
...,...,...,...,...
2020-05-31 09:00:00,0.0,0.0,0.0,0.0
2020-05-31 10:00:00,0.0,0.0,0.0,0.0
2020-05-31 11:00:00,0.0,0.0,0.0,0.0
2020-05-31 12:00:00,0.0,0.0,0.0,0.0


In [43]:
# Write the combined 2016-2020 fire table once; an existing file is left as is.
full_fire_path = './data/fire_2016_to_2020_localize.csv'
if os.path.exists(full_fire_path):
    print('Full fire already')
else:
    localized_2016_2020.to_csv(full_fire_path)

Full fire already


## Test: fire integrated
Once full fire is ready, we integrate it to the `Test set`.

In [48]:
# Reload the combined 2016-2020 fire table; the first CSV column is the
# datetime index.
full_fire_localized = pd.read_csv(
    './data/fire_2016_to_2020_localize.csv',
    index_col=0,
    parse_dates=True,
)
full_fire_localized

Unnamed: 0,Cambodia_frp,Myanmar_frp,Thailand_frp,Lao_PDR_frp
2016-01-01 07:00:00,0.0,0.0,0.0,0.0
2016-01-01 08:00:00,0.0,0.0,0.0,0.0
2016-01-01 09:00:00,0.0,0.0,0.0,0.0
2016-01-01 10:00:00,0.0,0.0,0.0,0.0
2016-01-01 11:00:00,0.0,0.0,0.0,0.0
...,...,...,...,...
2020-05-31 09:00:00,0.0,0.0,0.0,0.0
2020-05-31 10:00:00,0.0,0.0,0.0,0.0
2020-05-31 11:00:00,0.0,0.0,0.0,0.0
2020-05-31 12:00:00,0.0,0.0,0.0,0.0


In [54]:
# Integrate to full,imputed Test data
mega = {}
for province in provinces:
    if province == 'Khon Kaen':
        continue

    # Read imputed data & Integrate the localized fire (left join keeps every
    # timestamp of the province data)
    df = pd.read_csv(f"./data/Test/{province}_imputed.csv", index_col=0, parse_dates=True)
    mega[province] = df.merge(full_fire_localized, how='left', left_index=True, right_index=True)

    # Save imputed, fire integrated Test data; an existing file is left as is
    path = f'./data/Test/fire_integrated/{province}_fire_integrated.csv'
    if os.path.exists(path):
        print(f"{province} already")
    else:
        mega[province].to_csv(path)

In [53]:
# Inspect the frame of the last province handled by the loop
# (`province` is the variable left over from that loop).
mega[province]

Unnamed: 0,PM2.5,Temp(C),WindDir,Wind Speed(km/h),Cambodia_frp,Myanmar_frp,Thailand_frp,Lao_PDR_frp
2019-03-18 08:00:00,32.7,26.1,90.0,42.0,0.000000,0.0000,0.000000,0.000000
2019-03-18 09:00:00,31.7,26.1,90.0,42.0,0.000000,0.0000,0.000000,0.000000
2019-03-18 10:00:00,26.4,29.3,90.0,42.0,0.000000,0.0000,0.000000,0.000000
2019-03-18 11:00:00,24.6,29.3,90.0,42.0,0.000000,0.0000,0.000000,0.000000
2019-03-18 12:00:00,24.2,29.3,90.0,42.0,10.406856,25.2634,6.844051,26.476113
...,...,...,...,...,...,...,...,...
2020-03-18 16:00:00,11.3,29.6,100.0,31.0,0.000000,0.0000,1.495826,0.000000
2020-03-18 17:00:00,11.7,29.6,100.0,31.0,0.000000,0.0000,0.000000,0.000000
2020-03-18 18:00:00,11.7,29.6,100.0,31.0,0.000000,0.0000,0.000000,0.000000
2020-03-18 19:00:00,10.9,26.5,100.0,29.0,0.000000,0.0000,0.000000,0.000000


In [55]:
# Same inspection again: frame of the province left in the loop variable.
mega[province]

Unnamed: 0,PM2.5,Temp(C),WindDir,Wind Speed(km/h),Cambodia_frp,Myanmar_frp,Thailand_frp,Lao_PDR_frp
2019-03-18 08:00:00,32.7,26.1,90.0,42.0,0.000000,0.0000,0.000000,0.000000
2019-03-18 09:00:00,31.7,26.1,90.0,42.0,0.000000,0.0000,0.000000,0.000000
2019-03-18 10:00:00,26.4,29.3,90.0,42.0,0.000000,0.0000,0.000000,0.000000
2019-03-18 11:00:00,24.6,29.3,90.0,42.0,0.000000,0.0000,0.000000,0.000000
2019-03-18 12:00:00,24.2,29.3,90.0,42.0,10.406856,25.2634,6.844051,26.476113
...,...,...,...,...,...,...,...,...
2020-03-18 16:00:00,11.3,29.6,100.0,31.0,0.000000,0.0000,1.495826,0.000000
2020-03-18 17:00:00,11.7,29.6,100.0,31.0,0.000000,0.0000,0.000000,0.000000
2020-03-18 18:00:00,11.7,29.6,100.0,31.0,0.000000,0.0000,0.000000,0.000000
2020-03-18 19:00:00,10.9,26.5,100.0,29.0,0.000000,0.0000,0.000000,0.000000
