# Scale your pandas workflows by changing one line of code #
- https://github.com/modin-project/modin
    
## With Pandas ##
```
import pandas as pd
```
## With modin pandas ##
```
import modin.pandas as pd
```

In [None]:
import glob
import os
from datetime import datetime

import pandas as pd
from tqdm import tqdm

In [None]:
# Print the version of the Python interpreter found on the shell PATH.
!python --version

In [None]:
# Show the installed pandas package metadata (version, location, dependencies).
!pip show pandas

In [None]:
# Show the installed modin package metadata (version, location, dependencies).
!pip show modin

In [None]:
# List the wildfire data directory (long format, human-readable sizes, hidden files included).
!ls  -lha ../../wildfire/wildfire-data/

In [None]:
# Recursively collect every CSV file below the wildfire data directory.
all_csv_files = glob.glob(
    '../../wildfire/wildfire-data/**/*.csv',
    recursive=True,
)

In [None]:
# Print how many CSV files were found, then show the full list as the cell output.
print(len(all_csv_files))
all_csv_files

In [None]:
# Keep only the paths that contain 'modis' anywhere in them.
all_modis_files = [path for path in all_csv_files if 'modis' in path]

In [None]:
# Print how many MODIS files were selected, then show the list as the cell output.
print(len(all_modis_files))
all_modis_files

In [None]:
# Keep only the paths that contain 'viirs' anywhere in them.
all_viirs_files = [path for path in all_csv_files if 'viirs' in path]

In [None]:
# Print how many VIIRS files were selected, then show the list as the cell output.
print(len(all_viirs_files))
all_viirs_files

In [None]:
# Load every VIIRS CSV, collect one row of summary statistics per file, and
# keep the loaded frames in `fire_df` for later concatenation.
# Idea taken from the H2O.ai wildfire competition GitHub repo and adapted to
# meet the local need.
rows = []
fire_df = []
row_sum = 0
for f in tqdm(all_viirs_files):
    # NOTE(review): `acq_time` is parsed as a date here — confirm that is the
    # intended treatment for this column.
    df = pd.read_csv(f, parse_dates=['acq_time'], low_memory=False)
    # basename handles whichever path separator the platform's glob returned.
    csv_name = os.path.basename(f)
    row = [
        f, csv_name, df.shape[0], df.shape[1], df.acq_date.min(), df.acq_date.max(),
        df.satellite.unique(), df.instrument.max(), df.version.max(),
        df.latitude.nunique(), df.longitude.nunique(),
        df.confidence.nunique(), df.satellite.nunique(), df.acq_date.nunique()
    ]
    # When the first confidence value is a string, translate the letter codes
    # to numbers. The emptiness guard avoids a lookup error on a file that
    # contains a header but no rows.
    if not df.empty and isinstance(df['confidence'].iloc[0], str):
        df['confidence'] = df['confidence'].replace({'l': 0, 'n': 50, 'h': 100})
    rows.append(row)
    row_sum = row_sum + df.shape[0]
    fire_df.append(df)
cols = [
    'path', 'csv', 'rows', 'cols', 'start', 'end',
    'satellite', 'instrument', 'version',
    'lats', 'lons', 'confs', 'sats', 'days'
]
filestats = pd.DataFrame(rows, columns=cols)
# sort_values returns a new frame; assign it back so the ordering is kept.
filestats = filestats.sort_values(by=['start', 'instrument'])
print("Total Rows: " + str(row_sum))

In [None]:
# Stack all per-file frames into one DataFrame. ignore_index is not passed, so
# each row keeps the index label it had in its source file.
viirs_fire_df = pd.concat(fire_df)
# Show (rows, columns) as the cell output.
viirs_fire_df.shape

In [None]:
# Convert the acquisition date column to datetime values.
viirs_fire_df['acq_date'] = pd.to_datetime(viirs_fire_df.acq_date)

In [None]:
# Write the combined VIIRS frame to a gzip-compressed CSV without the index.
viirs_fire_df.to_csv(
    'viirs_fire_data_raw.csv.gz',
    compression='gzip',
    index=False,
)

In [None]:
# One row per (latitude, longitude, date, satellite, instrument) holding the
# highest confidence value seen in that group.
daily_fires_viirs = (
    viirs_fire_df
    .groupby(['latitude', 'longitude', 'acq_date', 'satellite', 'instrument'])['confidence']
    .max()
    .reset_index()
)

In [None]:
# Calendar year of each acquisition date.
daily_fires_viirs['year'] = daily_fires_viirs['acq_date'].dt.year

In [None]:
# Calendar month of each acquisition date.
daily_fires_viirs['month'] = daily_fires_viirs['acq_date'].dt.month

In [None]:
# Write the daily aggregate to a gzip-compressed CSV without the index.
daily_fires_viirs.to_csv(
    'daily_fires_viirs.csv.gz',
    compression='gzip',
    index=False,
)

In [None]:
# Number of decimal places latitude and longitude are rounded to before the
# detections are grouped into grid cells.
PRECISION = 1

## Using Pandas

In [None]:
# Timed pandas benchmark: round coordinates to PRECISION decimals and
# aggregate fire detections per grid cell.
# NOTE(review): `df_pd_gz` is not defined in any visible cell — presumably the
# raw VIIRS export loaded with pd.read_csv; confirm and restore that cell.
t_start = datetime.now()
## ----
# Work on a copy of the column subset so the assignments below modify an
# independent frame instead of a slice of df_pd_gz (avoids SettingWithCopyWarning).
df = df_pd_gz[['latitude', 'longitude', 'acq_date', 'confidence']].copy()
df['latitude'] = df['latitude'].round(PRECISION)
df['longitude'] = df['longitude'].round(PRECISION)
df['acq_date'] = pd.to_datetime(df['acq_date'])
df['year'] = df['acq_date'].dt.year
df['month'] = df['acq_date'].dt.month
# Number of detections per grid cell and calendar month.
total_fires = (
    df.groupby(['latitude', 'longitude', 'year', 'month'])
    .size()
    .reset_index(name='fire_count')
)
# NOTE(review): count() yields the number of monthly rows per cell and year
# (in both remaining columns), not the summed fire_count — confirm intent.
yearly_fires = total_fires.groupby(['longitude', 'latitude', 'year', ]).count().reset_index()
# Sums every remaining column (year, month, fire_count) per grid cell.
global_fires = total_fires.groupby(['latitude', 'longitude']).sum().reset_index()
# ----
t_end = datetime.now()
# total_seconds() reports the whole elapsed duration with sub-second
# precision; timedelta.seconds drops microseconds and whole days.
f'Total time {(t_end - t_start).total_seconds():.2f} (s)'

In [None]:
# Print the (rows, columns) of the yearly aggregate, then show it as the cell output.
print(yearly_fires.shape)
yearly_fires

In [None]:
# Print the (rows, columns) of the per-cell aggregate, then show it as the cell output.
print(global_fires.shape)
global_fires

In [None]:
import plotly.express as px
import plotly.figure_factory as ff
import numpy as np
# Mapbox access token handed to plotly express. The MAPBOX_TOKEN environment
# variable takes precedence so a personal token can be used without editing
# the notebook; the literal below is only the fallback.
# NOTE(review): a token committed to source control is effectively public —
# consider rotating it and relying on the environment variable alone.
MAPBOX_TOKEN = (
    os.environ.get('MAPBOX_TOKEN')
    or 'pk.eyJ1IjoiZm9kZ2Fib3JtYXRoIiwiYSI6ImNrZmY3Nzc2bjBiemkyeG8zdGNzcXgzMGIifQ.J0dZhMiuZTPVexL8nrpS6Q'
)
px.set_mapbox_access_token(MAPBOX_TOKEN)

In [None]:
# Timed rendering of a hexbin map of fire counts; each hexagon's colour is the
# sum of `fire_count` over the grid cells falling inside it.
t_start = datetime.now()
## ----
fig = ff.create_hexbin_mapbox(
    data_frame=global_fires, lat='latitude', lon='longitude',
    nx_hexagon=100, opacity=0.9, labels={"color": "Hotspot records"},
    color='fire_count', agg_func=np.sum, color_continuous_scale="Reds"
)
fig.show()
# ----
t_end = datetime.now()
# total_seconds() reports the whole elapsed duration with sub-second
# precision; timedelta.seconds drops microseconds and whole days.
f'Total time {(t_end - t_start).total_seconds():.2f} (s)'

In [None]:
# Show the column labels of the per-cell aggregate as the cell output.
global_fires.columns

In [None]:
# Density heat map of the fire counts per rounded lat/lon grid cell.
# Columns are referenced by name: plotly express treats an integer argument as
# a column label, not a position, so 0/1/4 do not match the string-labelled
# columns of global_fires.
# NOTE(review): the "stamen-terrain" tiles are served by a third party —
# confirm this style still renders with the installed plotly version.
fig = px.density_mapbox(global_fires, lat='latitude', lon='longitude',
                        z='fire_count', radius=10,
                        center=dict(lat=0, lon=180), zoom=0,
                        mapbox_style="stamen-terrain")
fig.show()