#### Import block

In [1]:
import os
import pandas as pd
import datetime as dt
import numpy as np
import altair as alt
import geopy.distance as gpd
from scipy.sparse import coo_matrix

#### Function definitions

In [2]:
def one_hot_stack(px_combined,
                 centroid_lat = 0,
                 centroid_lon = 0,
                 centroid_date = dt.datetime.strptime('2017/01/01','%Y/%m/%d'),
                 span = 100,
                 days_back = 10,
                 days_forward = 5):
    """Rasterise the pixels around a centroid into one grid per day.

    Rows of ``px_combined`` are kept when their 'Date' lies between
    ``centroid_date - days_back`` and ``centroid_date + days_forward`` (both
    inclusive) and their 'Lat'/'Lon' lie inside a box of ``span`` grid cells
    around the centroid. One grid cell is the latitude / longitude change
    obtained by moving 375 meters North / East from the centroid.

    Parameters
    ----------
    px_combined : pandas.DataFrame
        Must provide the columns 'Date', 'Lat', 'Lon' and 'value'.
    centroid_lat, centroid_lon : float
        Centre of the grid.
    centroid_date : datetime
        Last day of the input window; label days start the day after.
    span : int
        Number of grid cells kept on each side of the centroid.
    days_back : int
        Days before ``centroid_date`` included in the input stack.
    days_forward : int
        Days after ``centroid_date`` included in the output stack.

    Returns
    -------
    (input_stack, output_stack) : tuple of numpy.ndarray
        ``input_stack`` has shape (days_back + 1, 2 * span + 1, 2 * span + 1)
        and ``output_stack`` has shape (days_forward, 2 * span + 1, 2 * span + 1).
        Axis 1 is the longitude cell and axis 2 the latitude cell. A cell holds
        the sum of 'value' over the rows that fall into it on that day.
    """
    side = 2 * span + 1

    min_date = centroid_date - dt.timedelta(days = days_back)
    max_date = centroid_date + dt.timedelta(days = days_forward)

    # Calculate location if we move 375 meters North / East and use to get grid steps.
    lat_375 = gpd.distance(kilometers=0.375).destination((centroid_lat, centroid_lon), bearing=0).latitude
    lat_step = abs(centroid_lat - lat_375)
    lon_375 = gpd.distance(kilometers=0.375).destination((centroid_lat, centroid_lon), bearing=90).longitude
    lon_step = abs(centroid_lon - lon_375)

    # Get bounding box of centroid and `span` grid cells in all directions.
    min_lat = centroid_lat - (lat_step * span)
    max_lat = centroid_lat + (lat_step * span)
    min_lon = centroid_lon - (lon_step * span)
    max_lon = centroid_lon + (lon_step * span)

    # Filter data down to the date window and bounding box.
    in_window = (
        (px_combined['Date'] >= min_date) & (px_combined['Date'] <= max_date)
        & (px_combined['Lat'] >= min_lat) & (px_combined['Lat'] <= max_lat)
        & (px_combined['Lon'] >= min_lon) & (px_combined['Lon'] <= max_lon)
    )
    # .copy() so the column assignments below write to a frame we own instead
    # of a slice of the caller's data (avoids pandas' SettingWithCopyWarning).
    t_df = px_combined[in_window].copy()

    # Cell index relative to the centroid, shifted by `span` so that no index
    # is negative. The clip only matters for a point sitting exactly on the box
    # edge, where floating-point rounding could push the floor division one
    # cell outside the grid.
    lat_grid = (t_df['Lat'] - centroid_lat) // lat_step + span
    lon_grid = (t_df['Lon'] - centroid_lon) // lon_step + span
    t_df['lat_grid'] = lat_grid.clip(0, side - 1).astype(int)
    t_df['lon_grid'] = lon_grid.clip(0, side - 1).astype(int)

    def day_grid(offset):
        # Grid for the single day that is `offset` days away from centroid_date.
        s_df = t_df[t_df['Date'] == (centroid_date + dt.timedelta(days = offset))]
        if len(s_df) == 0:
            return np.zeros((side, side))
        # An explicit shape makes every day the same size without padding.
        return coo_matrix(
            (s_df['value'].to_numpy(),
             (s_df['lon_grid'].to_numpy(), s_df['lat_grid'].to_numpy())),
            shape = (side, side)).toarray()

    def stack_days(offsets):
        grids = [day_grid(offset) for offset in offsets]
        # np.stack rejects an empty list (e.g. days_forward = 0).
        return np.stack(grids) if grids else np.zeros((0, side, side))

    # Inputs cover centroid_date and the days before it; outputs the days after.
    input_stack = stack_days(range(-days_back, 1))
    output_stack = stack_days(range(1, days_forward + 1))

    return input_stack, output_stack

In [3]:
def feature_stack(px_combined,
                 centroid_lat = 0,
                 centroid_lon = 0,
                 centroid_date = dt.datetime.strptime('2017/01/01','%Y/%m/%d'),
                 span = 100,
                 days_back = 10,
                 days_forward = 5,
                 feature_cols = ('T4', 'T5', 'FRP', 'Rad13')):
    """Rasterise per-pixel features around a centroid into one grid per day.

    Rows of ``px_combined`` are kept when their 'Date' lies between
    ``centroid_date - days_back`` and ``centroid_date + days_forward`` (both
    inclusive) and their 'Lat'/'Lon' lie inside a box of ``span`` grid cells
    around the centroid. One grid cell is the latitude / longitude change
    obtained by moving 375 meters North / East from the centroid.

    Parameters
    ----------
    px_combined : pandas.DataFrame
        Must provide 'Date', 'Lat', 'Lon', 'value' and every column named in
        ``feature_cols``.
    centroid_lat, centroid_lon : float
        Centre of the grid.
    centroid_date : datetime
        Last day of the input window; label days start the day after.
    span : int
        Number of grid cells kept on each side of the centroid.
    days_back : int
        Days before ``centroid_date`` included in the input stack.
    days_forward : int
        Days after ``centroid_date`` included in the output stack.
    feature_cols : sequence of str
        Non-empty list of columns rasterised into the input channels, in order.

    Returns
    -------
    (input_stack, output_stack) : tuple of numpy.ndarray
        ``input_stack`` has shape
        (days_back + 1, len(feature_cols), 2 * span + 1, 2 * span + 1) and
        ``output_stack`` has shape (days_forward, 2 * span + 1, 2 * span + 1),
        built from the 'value' column. The last two axes are the longitude
        cell and the latitude cell. A cell holds the sum of the column over
        the rows that fall into it on that day.
    """
    # Every grid is sized from `span`. The previous version padded the first
    # axis to a fixed 201, which is only right for span = 100.
    side = 2 * span + 1

    min_date = centroid_date - dt.timedelta(days = days_back)
    max_date = centroid_date + dt.timedelta(days = days_forward)

    # Calculate location if we move 375 meters North / East and use to get grid steps.
    lat_375 = gpd.distance(kilometers=0.375).destination((centroid_lat, centroid_lon), bearing=0).latitude
    lat_step = abs(centroid_lat - lat_375)
    lon_375 = gpd.distance(kilometers=0.375).destination((centroid_lat, centroid_lon), bearing=90).longitude
    lon_step = abs(centroid_lon - lon_375)

    # Get bounding box of centroid and `span` grid cells in all directions.
    min_lat = centroid_lat - (lat_step * span)
    max_lat = centroid_lat + (lat_step * span)
    min_lon = centroid_lon - (lon_step * span)
    max_lon = centroid_lon + (lon_step * span)

    # Filter data down to the date window and bounding box.
    in_window = (
        (px_combined['Date'] >= min_date) & (px_combined['Date'] <= max_date)
        & (px_combined['Lat'] >= min_lat) & (px_combined['Lat'] <= max_lat)
        & (px_combined['Lon'] >= min_lon) & (px_combined['Lon'] <= max_lon)
    )
    # .copy() so the column assignments below write to a frame we own instead
    # of a slice of the caller's data (avoids pandas' SettingWithCopyWarning).
    t_df = px_combined[in_window].copy()

    # Cell index relative to the centroid, shifted by `span` so that no index
    # is negative. The clip only matters for a point sitting exactly on the box
    # edge, where floating-point rounding could push the floor division one
    # cell outside the grid.
    lat_grid = (t_df['Lat'] - centroid_lat) // lat_step + span
    lon_grid = (t_df['Lon'] - centroid_lon) // lon_step + span
    t_df['lat_grid'] = lat_grid.clip(0, side - 1).astype(int)
    t_df['lon_grid'] = lon_grid.clip(0, side - 1).astype(int)

    def rows_for(offset):
        # Rows observed `offset` days away from centroid_date.
        return t_df[t_df['Date'] == (centroid_date + dt.timedelta(days = offset))]

    def grid(s_df, column):
        # Dense (side, side) grid of `column` for one day's rows.
        if len(s_df) == 0:
            return np.zeros((side, side))
        return coo_matrix(
            (s_df[column].to_numpy(),
             (s_df['lon_grid'].to_numpy(), s_df['lat_grid'].to_numpy())),
            shape = (side, side)).toarray()

    # Inputs: one channel per feature column for centroid_date and the days before it.
    input_stack = np.stack([
        np.stack([grid(s_df, column) for column in feature_cols])
        for s_df in (rows_for(offset) for offset in range(-days_back, 1))
    ])

    # Outputs: one-hot grids for the days after centroid_date.
    label_grids = [grid(rows_for(offset), 'value') for offset in range(1, days_forward + 1)]
    # np.stack rejects an empty list (e.g. days_forward = 0).
    output_stack = np.stack(label_grids) if label_grids else np.zeros((0, side, side))

    return input_stack, output_stack

In [4]:
def one_hot_stack_and_save(
                 centroid_df,
                 px_combined,
                 meta_folder = '/data/',
                 feature_folder = '/data/features/',
                 label_folder = '/data/labels/',
                 span = 100,
                 days_back = 10,
                 days_forward = 5):
    """Build and save the one-hot input/label stacks for every centroid.

    For each row of ``centroid_df`` the stacks are computed with
    ``one_hot_stack`` and written as ``<feature_folder><index>.npy`` and
    ``<label_folder><index>.npy``. A copy of ``centroid_df`` with two extra
    columns, 'features' and 'labels', holding those paths is written to
    ``<meta_folder>meta.csv``.

    Parameters
    ----------
    centroid_df : pandas.DataFrame
        One row per sample, with columns 'Lat', 'Lon' and 'Date'. The index
        labels become the file names.
    px_combined : pandas.DataFrame
        Pixel data handed to ``one_hot_stack``.
    meta_folder, feature_folder, label_folder : str
        Path prefixes. They are concatenated with the file name as-is, so a
        folder must end with a path separator.
    span, days_back, days_forward : int
        Forwarded to ``one_hot_stack``.

    Returns
    -------
    None
    """
    meta_df = centroid_df.copy()
    meta_df['features'] = feature_folder + meta_df.index.astype(str) + '.npy'
    meta_df['labels'] = label_folder + meta_df.index.astype(str) + '.npy'

    # np.save and to_csv do not create missing folders. The parent is taken
    # from the concatenated path because the prefixes are plain strings.
    for prefix in (meta_folder, feature_folder, label_folder):
        parent = os.path.dirname(prefix + 'meta.csv')
        if parent:
            os.makedirs(parent, exist_ok = True)

    # Iterating over the paths stored in meta_df guarantees that the files on
    # disk are exactly the ones that meta.csv points to.
    samples = zip(meta_df['features'], meta_df['labels'],
                  centroid_df['Lat'], centroid_df['Lon'], centroid_df['Date'])
    for feature_path, label_path, centroid_lat, centroid_lon, centroid_date in samples:
        input_stack, output_stack = one_hot_stack(px_combined,
                                                  centroid_lat = centroid_lat,
                                                  centroid_lon = centroid_lon,
                                                  centroid_date = centroid_date,
                                                  span = span,
                                                  days_back = days_back,
                                                  days_forward = days_forward)

        # Save to disk
        np.save(feature_path, input_stack)
        np.save(label_path, output_stack)

    meta_df.to_csv(meta_folder + 'meta.csv')

#### Data import

In [5]:
# Load saved data back into memory.
# NOTE(review): the path is relative, so it resolves against the notebook's
# working directory.
px_combined = pd.read_csv('pixel_data_filtered.csv')

In [6]:
# Adjust date format.
# An explicit format skips pandas' per-value format inference and fails loudly
# on anything that is not an 8-digit YYYYMMDD value.
px_combined['Date'] = pd.to_datetime(px_combined['YYYYMMDD'].astype(str), format='%Y%m%d')

In [7]:
# Add a column of ones for the 1-hot encoding strategy. It is the per-row
# value handed to coo_matrix, so each row contributes 1 to its grid cell.
px_combined['value'] = 1

In [8]:
# Keep only the rows flagged as night observations.
px_combined = px_combined.loc[px_combined['DNFlag'].eq('night')]

#### Testing code concepts on one example

In [9]:
# Worked example: one centroid pixel and the window of days around it.
centroid_lat = -17.859758
centroid_lon = -57.332260
centroid_date = dt.datetime.strptime('2020/07/06','%Y/%m/%d')
days_back = 10
days_forward = 5
span = 100  # grid cells kept on each side of the centroid
min_date = centroid_date - dt.timedelta(days = days_back)
max_date = centroid_date + dt.timedelta(days = days_forward)

# Calculate location if we move 375 meters North and use to get grid steps.
lat_375 = gpd.distance(kilometers=0.375).destination((centroid_lat, centroid_lon), bearing=0).latitude
lat_step = abs(centroid_lat - lat_375)

# Calculate location if we move 375 meters East and use to get grid steps.
lon_375 = gpd.distance(kilometers=0.375).destination((centroid_lat, centroid_lon), bearing=90).longitude
lon_step = abs(centroid_lon - lon_375)

# Get bounding box of centroid and `span` grid cells in all directions.
min_lat = centroid_lat - (lat_step * span)
max_lat = centroid_lat + (lat_step * span)
min_lon = centroid_lon - (lon_step * span)
max_lon = centroid_lon + (lon_step * span)

# Filter data down to the range we want to plot.
# The final .copy() gives this cell its own frame, so the column assignments
# below do not write into a slice of px_combined (SettingWithCopyWarning).
t_df = px_combined[(px_combined['Date'] >= min_date) & (px_combined['Date'] <= max_date)]
t_df = t_df[(t_df['Lat'] >= min_lat) & (t_df['Lat'] <= max_lat) & (t_df['Lon'] >= min_lon) & (t_df['Lon'] <= max_lon)].copy()

# Calculate pixel locations in the new grid.
t_df['delta_lat'] = t_df['Lat'] - centroid_lat
t_df['delta_lon'] = t_df['Lon'] - centroid_lon
t_df['lat_grid'] = t_df['delta_lat'] // lat_step
t_df['lon_grid'] = t_df['delta_lon'] // lon_step

# Express date difference
t_df['delta_day'] = (t_df['Date'] - centroid_date).dt.days

In [10]:
# Tooltip text: "<lon cell> x <lat cell>" for every pixel.
t_df['tool'] = t_df['lon_grid'].astype(str).str.cat(t_df['lat_grid'].astype(str), sep=' x ')

# Plot the pixels on the new grid, coloured by observation date.
(
    alt.Chart(t_df[['lat_grid', 'lon_grid', 'Date', 'tool']])
    .mark_circle(size=250)
    .encode(
        x=alt.X('lon_grid', title='Lon', axis=alt.Axis(labels=False)),
        y=alt.Y('lat_grid', title='Lat', axis=alt.Axis(labels=False)),
        color=alt.Color('Date:N', scale=alt.Scale(scheme='category20')),
        tooltip='tool',
    )
    .properties(height=600, width=600)
    .interactive()
)

In [11]:
# Illustrate what we are asking model to predict: pixels up to and including
# the centroid date (left) versus pixels on the following days (right).
def _window_chart(frame, title):
    """Scatter the grid cells of `frame` on a fixed -100..100 grid, coloured by date."""
    return (
        alt.Chart(frame[['lat_grid', 'lon_grid', 'Date', 'tool']], title=title)
        .mark_circle(size=250)
        .encode(
            x=alt.X('lon_grid', title='Lon', axis=alt.Axis(labels=False), scale=alt.Scale(domain=[-100, 100])),
            y=alt.Y('lat_grid', title='Lat', axis=alt.Axis(labels=False), scale=alt.Scale(domain=[-100, 100])),
            color=alt.Color('Date:N', scale=alt.Scale(scheme='category20')),
            tooltip='tool',
        )
        .properties(height=300, width=300)
        .interactive()
    )

source = _window_chart(t_df[t_df['Date'] <= centroid_date], 'Source Data')
target = _window_chart(t_df[t_df['Date'] > centroid_date], 'Target Prediction')

source | target

### One-hot encoding strategy

In [12]:
# Convert grids to not have negative locations.
# The shifted index is recomputed from the raw offsets instead of incremented
# in place, so running this cell twice cannot shift the grid a second time.
t_df['lat_grid'] = t_df['delta_lat'] // lat_step + 100
t_df['lon_grid'] = t_df['delta_lon'] // lon_step + 100

In [13]:
# Build the 201 x 201 array for the centroid date only.
s_df = t_df.loc[t_df['Date'].eq(centroid_date)]
cm = coo_matrix(
    (
        s_df['value'],
        (s_df['lon_grid'].astype(int), s_df['lat_grid'].astype(int)),
    )
).toarray()
cm = np.pad(
    cm,
    pad_width=((0, 201 - cm.shape[0]), (0, 201 - cm.shape[1])),
    mode='constant',
)

# The total must equal the number of rows in s_df.
cm.sum()

6

In [14]:
# Show the rows behind the centroid-date array, to compare with the count above.
s_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,YYYYMMDD,HHMM,sat,Line,Sample,Lat,Lon,T4,...,Type,PixArea,Date,value,delta_lat,delta_lon,lat_grid,lon_grid,delta_day,tool
6219262,16125,319850,20200706,412,VNP,5210,530,-17.852758,-57.335457,298.63,...,0,0.321,2020-07-06,1,0.007,-0.003197,102.0,99.0,0,-1.0 x 2.0
6219263,16126,319851,20200706,412,VNP,5211,531,-17.859758,-57.33226,299.43,...,0,0.32,2020-07-06,1,0.0,0.0,100.0,100.0,0,0.0 x 0.0
6219445,16308,324131,20200706,554,VNP,3511,6256,-17.867916,-57.319241,325.2,...,0,0.505,2020-07-06,1,-0.008158,0.013019,97.0,103.0,0,3.0 x -3.0
6219446,16309,324132,20200706,554,VNP,3511,6255,-17.867306,-57.325962,313.15,...,0,0.504,2020-07-06,1,-0.007548,0.006298,97.0,101.0,0,1.0 x -3.0
6219450,16313,324136,20200706,554,VNP,3528,6256,-17.86689,-57.324802,321.05,...,0,0.505,2020-07-06,1,-0.007132,0.007458,97.0,102.0,0,2.0 x -3.0
6219451,16314,324137,20200706,554,VNP,3528,6257,-17.867458,-57.318073,316.99,...,0,0.506,2020-07-06,1,-0.0077,0.014187,97.0,104.0,0,4.0 x -3.0


In [15]:
# Build an array for each day in our date range then stack them. One for the inputs and one for the outputs.
GRID_SIDE = 201  # 100 cells on each side of the centroid, plus the centroid cell


def _day_grid(day_offset):
    """Return the GRID_SIDE x GRID_SIDE one-hot grid for centroid_date + day_offset days."""
    day_df = t_df[t_df['Date'] == (centroid_date + dt.timedelta(days = day_offset))]
    if len(day_df) == 0:
        return np.zeros((GRID_SIDE, GRID_SIDE))
    # An explicit shape makes every day the same size without padding.
    return coo_matrix(
        (day_df['value'], (day_df['lon_grid'].astype(int), day_df['lat_grid'].astype(int))),
        shape = (GRID_SIDE, GRID_SIDE)).toarray()


# Inputs cover centroid_date and the days before it; outputs the days after.
input_stack = np.stack([_day_grid(x) for x in range(-days_back, 1)])
output_stack = np.stack([_day_grid(x) for x in range(1, days_forward + 1)])

In [16]:
print('Input Stack Shape:', input_stack.shape)
print('Input Stack Sum:', input_stack.sum())
print('Output Stack Shape:', output_stack.shape)
print('Output Stack Sum:', output_stack.sum())
# The two stacks together must account for every filtered row.
print('Validation Check:', input_stack.sum() + output_stack.sum() == t_df.value.sum())

Input Stack Shape: (11, 201, 201)
Input Stack Sum: 72.0
Outpt Stack Shape: (5, 201, 201)
Output Stack Sum: 73.0
Validation Check: True


In [17]:
# Test function call on the same centroid and window as the manual walkthrough.
i,o = one_hot_stack(px_combined,
                    centroid_lat = centroid_lat,
                    centroid_lon = centroid_lon,
                    centroid_date = centroid_date,
                    days_back = days_back,
                    days_forward = days_forward)

# The function has to reproduce the stacks that were built by hand.
assert np.array_equal(i, input_stack), 'one_hot_stack inputs differ from the manual stack'
assert np.array_equal(o, output_stack), 'one_hot_stack labels differ from the manual stack'

print('Input Stack Shape:', i.shape)
print('Input Stack Sum:', i.sum())
print('Output Stack Shape:', o.shape)
print('Output Stack Sum:', o.sum())

Input Stack Shape: (11, 201, 201)
Input Stack Sum: 72.0
Outpt Stack Shape: (5, 201, 201)
Output Stack Sum: 73.0


### Feature-based encoding strategy

In [18]:
# One padded 201 x 201 array per feature column, for the centroid date.
s_df = t_df[t_df['Date'] == centroid_date]


def _padded_grid(column):
    """Scatter s_df[column] onto the grid and zero-pad the result to 201 x 201."""
    grid = coo_matrix(
        (s_df[column], (s_df['lon_grid'].astype(int), s_df['lat_grid'].astype(int)))
    ).toarray()
    return np.pad(
        grid,
        pad_width=((0, 201 - grid.shape[0]), (0, 201 - grid.shape[1])),
        mode='constant',
    )


cm1, cm2, cm3, cm4 = (_padded_grid(column) for column in ('T4', 'T5', 'FRP', 'Rad13'))

np.stack([cm1, cm2, cm3, cm4]).shape

(4, 201, 201)

In [19]:
# Test function call on the same centroid and window as the manual walkthrough.
i,o = feature_stack(px_combined,
                    centroid_lat = centroid_lat,
                    centroid_lon = centroid_lon,
                    centroid_date = centroid_date,
                    days_back = days_back,
                    days_forward = days_forward)
print('Input Stack Shape:', i.shape)
print('Input Stack Sum:', i.sum())
print('Output Stack Shape:', o.shape)
print('Output Stack Sum:', o.sum())

Input Stack Shape: (11, 4, 201, 201)
Input Stack Sum: 43676.83200000001
Outpt Stack Shape: (5, 201, 201)
Output Stack Sum: 73.0


### Create sample datasets

In [20]:
# Training centroids: a seeded 0.1% sample of the rows dated before 2018.
train_df = (
    px_combined
    .loc[px_combined['Date'] < '2018/1/1']
    .sample(frac=0.001, random_state=321)
    .loc[:, ['Lon', 'Lat', 'Date']]
)
train_df

Unnamed: 0,Lon,Lat,Date
356962,-53.345535,-6.535028,2017-07-30
546517,-47.329685,-8.260676,2017-08-20
799359,-50.967861,-8.255809,2017-09-04
714590,-66.144379,-12.624272,2017-08-31
240012,-48.350338,-11.838207,2017-07-14
...,...,...,...
1983526,-51.431313,0.659630,2017-11-26
2079183,-45.239655,-4.660801,2017-12-12
1934639,-47.025230,-2.296389,2017-11-18
1417977,-53.128605,-2.962739,2017-09-30


In [21]:
# Validation centroids: a seeded 0.002% sample of the rows dated after
# 2017/12/31 and before 2019/1/1.
val_df = (
    px_combined
    .loc[(px_combined['Date'] > '2017/12/31') & (px_combined['Date'] < '2019/1/1')]
    .sample(frac=0.00002, random_state=321)
    .loc[:, ['Lon', 'Lat', 'Date']]
)
val_df

Unnamed: 0,Lon,Lat,Date
2590445,-61.569847,-15.477804,2018-08-14
2512882,-57.37524,-28.650763,2018-08-01
2206666,-61.064445,1.956841,2018-02-21
2475186,-46.623047,-14.629879,2018-07-25
3312005,-57.839279,-2.900392,2018-10-18
3509161,-49.368294,-17.581152,2018-12-13
2839259,-53.822853,-13.254734,2018-09-05


In [22]:
## Construct training DataFrame.
#train_df[['Source','Target']] = train_df.apply(lambda x: one_hot_stack(px_combined, x['Lat'], x['Lon'], x['Date']), axis=1, result_type='expand')
#
## Construct validation DataFrame.
#val_df[['Source','Target']] = val_df.apply(lambda x: one_hot_stack(px_combined, x['Lat'], x['Lon'], x['Date']), axis=1, result_type='expand')

In [23]:
#train_df.to_pickle('train.pkl')
#val_df.to_pickle('val.pkl')

In [24]:
# Index labels of the sampled training rows; one_hot_stack_and_save uses them
# as the .npy file names.
train_df.index

Int64Index([ 356962,  546517,  799359,  714590,  240012, 1629036, 1550906,
             435911,  642953, 1756323,
            ...
             433748,  457203, 1508652,  605367, 1144958, 1983526, 2079183,
            1934639, 1417977,  797891],
           dtype='int64', length=650)

In [28]:
# Write one feature array and one label array per training centroid, plus meta.csv.
one_hot_stack_and_save(
    centroid_df = train_df,
    px_combined = px_combined,
    meta_folder = 'Sample_Dataset/',
    feature_folder = 'Sample_Dataset/train_features/',
    label_folder = 'Sample_Dataset/train_labels/',
    span = 127,
    days_back = 15,
    days_forward = 8,
)