In [None]:
import glob
from datetime import datetime
from tqdm import tqdm

In [None]:
import pandas as pdo

In [None]:
import modin.pandas as pdm

In [None]:
# Print the version of the `python` executable found on the shell PATH.
# NOTE(review): this may not be the interpreter the kernel itself runs on — confirm.
!python --version

In [None]:
!pip show pandas

In [None]:
!pip show modin

In [None]:
# List the data directory (long format, human-readable sizes, hidden entries included).
!ls -lah ../wildfire

In [None]:
# Recursively collect every CSV under the wildfire data directory.
# glob returns paths in arbitrary (filesystem-dependent) order, so sort them to keep
# the file order, and therefore the row order of the combined frame, reproducible.
all_csv_files = sorted(glob.glob('../wildfire/wildfire-data/**/*.csv', recursive=True))

In [None]:
# Report how many CSV files were found, then display the full list of paths.
print(len(all_csv_files))
all_csv_files

In [None]:
# Select the files for each region of interest by substring match on the full path.
# Some patterns include the '.csv' suffix and some do not; the latter match any path
# that contains the name anywhere.
all_us_files = [path for path in all_csv_files if 'United_States' in path]
all_aus_files = [path for path in all_csv_files if 'Australia.csv' in path]
all_greenland_files = [path for path in all_csv_files if 'Greenland.csv' in path]
all_Papua_New_Guinea_files = [path for path in all_csv_files if 'Papua_New_Guinea.csv' in path]
all_Mexico_files = [path for path in all_csv_files if 'Mexico' in path]
all_China_files = [path for path in all_csv_files if 'China' in path]

In [None]:
# Merge the per-region selections into a single list, group by group in this order.
all_fire_files = [
    *all_aus_files,
    *all_us_files,
    *all_greenland_files,
    *all_Papua_New_Guinea_files,
    *all_Mexico_files,
    *all_China_files,
]

In [None]:
# Display the combined list of selected file paths.
all_fire_files

In [None]:
# Report how many files were selected in total.
print(len(all_fire_files))

In [None]:
# Idea taken from the H2O.ai wildfire competition GitHub repo and adapted to meet local needs.
# Loads every selected CSV with plain pandas, records per-file summary statistics,
# concatenates all frames into one master frame, and times the whole run.
rows = []      # one summary record per CSV file
fire_df = []   # loaded per-file frames, concatenated after the loop
row_sum = 0    # running total of rows across all files
t_start = datetime.now()
for f in tqdm(all_fire_files):
    # Using pandas
    # NOTE(review): `acq_time` is parsed as a date while `acq_date` is left as read —
    # confirm this is the intended column.
    df = pdo.read_csv(f, parse_dates=['acq_time'], low_memory=False)
    csv_name = f.split('/')[-1]
    # The summary is taken before `confidence` is recoded below, so `confs`
    # counts the distinct values as they appear in the file.
    row = [
        f, csv_name, df.shape[0], df.shape[1], df.acq_date.min(), df.acq_date.max(),
        df.satellite.unique(), df.instrument.max(), df.version.max(),
        df.latitude.nunique(), df.longitude.nunique(),
        df.confidence.nunique(), df.satellite.nunique(), df.acq_date.nunique()
    ]
    # Recode letter confidence levels to numbers. The length guard keeps an empty
    # file from raising when its (non-existent) first row is inspected.
    if len(df) > 0 and isinstance(df.confidence.iloc[0], str):
        df.confidence = df.confidence.replace({'l': 0, 'n': 50, 'h': 100})
    rows.append(row)
    row_sum = row_sum + df.shape[0]
    fire_df.append(df)
cols = [
    'path', 'csv', 'rows', 'cols', 'start', 'end',
    'satellite', 'instrument', 'version',
    'lats', 'lons', 'confs', 'sats', 'days'
]
# Using pandas
filestats = pdo.DataFrame(rows, columns=cols)
# sort_values returns a new frame; assign it back or the sort has no effect.
filestats = filestats.sort_values(by=['start', 'instrument'])
print("Total Rows: " + str(row_sum))

# Using pandas
# No ignore_index is passed, so each file's own row labels are kept in the result.
master_fire_df = pdo.concat(fire_df)
t_end = datetime.now()
# total_seconds() covers the full elapsed span; timedelta.seconds alone drops whole days.
f'Total time {int((t_end - t_start).total_seconds())} (s)'

```
Total Rows: 37164492
'Total time 77 (s)'
```

## Initializing Modin and setting its configuration ##

In [None]:
# Select Dask as Modin's execution engine before any Modin operation is run.
from modin.config import Engine
Engine.put("dask")
# Create a Dask distributed client with default settings.
# NOTE(review): with no arguments this is expected to start a local cluster, and
# re-running the cell would create another client — confirm this is acceptable.
from distributed import Client
client = Client()

In [None]:
# Idea taken from the H2O.ai wildfire competition GitHub repo and adapted to meet local needs.
# Same workflow as the plain-pandas run, but using Modin: loads every selected CSV,
# records per-file summary statistics, concatenates all frames, and times the whole run.
rows = []      # one summary record per CSV file
fire_df = []   # loaded per-file frames, concatenated after the loop
row_sum = 0    # running total of rows across all files
t_start = datetime.now()
for f in tqdm(all_fire_files):
    # Using modin pandas
    # NOTE(review): `acq_time` is parsed as a date while `acq_date` is left as read —
    # confirm this is the intended column.
    df = pdm.read_csv(f, parse_dates=['acq_time'], low_memory=False)
    csv_name = f.split('/')[-1]
    # The summary is taken before `confidence` is recoded below, so `confs`
    # counts the distinct values as they appear in the file.
    row = [
        f, csv_name, df.shape[0], df.shape[1], df.acq_date.min(), df.acq_date.max(),
        df.satellite.unique(), df.instrument.max(), df.version.max(),
        df.latitude.nunique(), df.longitude.nunique(),
        df.confidence.nunique(), df.satellite.nunique(), df.acq_date.nunique()
    ]
    # Recode letter confidence levels to numbers. The length guard keeps an empty
    # file from raising when its (non-existent) first row is inspected.
    if len(df) > 0 and isinstance(df.confidence.iloc[0], str):
        df.confidence = df.confidence.replace({'l': 0, 'n': 50, 'h': 100})
    rows.append(row)
    row_sum = row_sum + df.shape[0]
    fire_df.append(df)
cols = [
    'path', 'csv', 'rows', 'cols', 'start', 'end',
    'satellite', 'instrument', 'version',
    'lats', 'lons', 'confs', 'sats', 'days'
]
# Using modin pandas
filestats = pdm.DataFrame(rows, columns=cols)
# sort_values returns a new frame; assign it back or the sort has no effect.
filestats = filestats.sort_values(by=['start', 'instrument'])
print("Total Rows: " + str(row_sum))

# Using modin pandas
# No ignore_index is passed, so each file's own row labels are kept in the result.
master_fire_df = pdm.concat(fire_df)
t_end = datetime.now()
# total_seconds() covers the full elapsed span; timedelta.seconds alone drops whole days.
f'Total time {int((t_end - t_start).total_seconds())} (s)'

In [None]:
# Re-display the elapsed time of the most recent timed run.
# total_seconds() covers the full elapsed span; timedelta.seconds alone drops whole days.
f'Total time {int((t_end - t_start).total_seconds())} (s)'

In [None]:
# Check the (rows, columns) size of the combined frame.
master_fire_df.shape

In [None]:
# Display the combined frame.
master_fire_df

In [None]:
# Write the combined frame to a gzip-compressed CSV, omitting the index column.
master_fire_df.to_csv('mix_fire.csv.gz', index=False, compression='gzip')
# Uncompressed alternative, kept for reference:
#master_fire_df.to_csv('mix_fire.csv', index=False)