In [109]:
import os
import pickle
import datetime
import threading
import numpy as np
import pandas as pd
import multiprocessing
from tqdm import tqdm, tqdm_notebook
from matplotlib import pyplot as plt

%matplotlib notebook

# What is done
<br>
The splitting for each data-point will be done day by day:

Like:
               
        ____   ____   ____   ____   ____            ____   ____   ____
      _/1day\_/1day\_/1day\_/1day\_/1day\........._/1day\_/1day\_/1day\
    1996.01------1996.07------1997.01------1997.07---------------2018.01
    ____________________________________________________________________

So as a result we are getting a vector each element of which shows how many events occured the corresponding day. This format is needed for Poisson Regression model that we are going to implement.
<br> <br>

In [74]:
########################
##    Reading data    ##
########################

data = pd.read_pickle('selected_data.pkl')
print('data :', data.shape)
data.head()

data : (1301399, 5)


Unnamed: 0,STATE,EVENT_TYPE,BEGIN_YEARMONTH,DATE,DIST_FROM_START
22087,NEW JERSEY,Astronomical Low Tide,199703,1997-03-12,17226
22088,NEW JERSEY,Astronomical Low Tide,199703,1997-03-01,17215
22089,NEW JERSEY,Astronomical Low Tide,199703,1997-03-14,17228
26506,DELAWARE,Astronomical Low Tide,199703,1997-03-18,17232
26892,NEW JERSEY,Astronomical Low Tide,199703,1997-03-03,17217


In [114]:
def vectorize(data, event_type, window):
    event_data = data[data.EVENT_TYPE==event_type]
    
    start = event_data.DIST_FROM_START.min()   # start of the timeline
    finish = event_data.DIST_FROM_START.max()  # end of the timeline
    
    result = []
    for date in np.arange(start, finish, step=1):
        hold = event_data[np.logical_and(event_data.DIST_FROM_START>=date,
                                         event_data.DIST_FROM_START<date+1)]
        result.append(hold.shape[0])
    return np.array(result)

def process(data, event_types, window, file_name):
    result = {event_type:vectorize(data, event_type, window) for event_type in tqdm(event_types)}
    file = open(file_name, 'wb')
    pickle.dump(result, file)
    file.close()


In [115]:
event_types = data.EVENT_TYPE.unique()

event_types_0 = event_types[:event_types.shape[0]//4]
event_types_1 = event_types[event_types.shape[0]//4:event_types.shape[0]//2]
event_types_2 = event_types[event_types.shape[0]//2:event_types.shape[0]*3//4]
event_types_3 = event_types[event_types.shape[0]*3//4:]

assert (event_types_0.shape[0]+
        event_types_1.shape[0]+
        event_types_2.shape[0]+
        event_types_3.shape[0]) == event_types.shape[0]

In [116]:
window = 1

p0 = multiprocessing.Process(target=process, args=(data, event_types_0, window, 'vecs/process_0.npy'))
p1 = multiprocessing.Process(target=process, args=(data, event_types_1, window, 'vecs/process_1.npy'))
p2 = multiprocessing.Process(target=process, args=(data, event_types_2, window, 'vecs/process_2.npy'))
p3 = multiprocessing.Process(target=process, args=(data, event_types_3, window, 'vecs/process_3.npy'))

p0.start()
p1.start()
p2.start()
p3.start()

p0.join()
p1.join()
p2.join()
p3.join()

100%|██████████| 13/13 [01:36<00:00,  7.70s/it]
100%|██████████| 13/13 [01:59<00:00,  8.27s/it]
100%|██████████| 14/14 [02:03<00:00,  8.61s/it]
100%|██████████| 13/13 [02:09<00:00,  8.01s/it]
