In [3]:
import os
import datetime
import threading
import numpy as np
import pandas as pd
import multiprocessing
from tqdm import tqdm, tqdm_notebook
from matplotlib import pyplot as plt

# %matplotlib notebook


<br>
This is how data splitting shall be done:
               
                         -------------------------->                     ------------>
                      -------------------------->                     ------------>
                   ------------------------->                     ------------>
                __________   ___________   ___________   _______   ____________
              _/ 3 window \_/ 33 window \_/ 63 window \_/ ..... \_/ N+2 window \_
            __________   ___________   ___________   _______   ____________
          _/ 2 window \_/ 32 window \_/ 62 window \_/ ..... \_/ N+1 window \_
        __________   ___________   ___________   _______   __________
      _/ 1 window \_/ 31 window \_/ 61 window \_/ ..... \_/ N window \_
    1996.01------1996.07------1997.01------1997.07------_____............------2018.01
    __________________________________________________________________________________

Each resulting data point will be a 1-D histogram over the states with a depth dimension: each layer holds the count of one event type that took place in the given state.
<br> <br>

In [5]:
########################
##    Reading data    ##
########################

# Load the pre-selected events table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so 'selected_data.pkl'
# must come from a trusted source.
data = pd.read_pickle('selected_data.pkl')
print('data :', data.shape)
# Last expression of the cell: shows the first rows as the cell output.
data.head()

data : (1301399, 5)


Unnamed: 0,STATE,EVENT_TYPE,BEGIN_YEARMONTH,DATE,DIST_FROM_START
22087,NEW JERSEY,Astronomical Low Tide,199703,1997-03-12,17226
22088,NEW JERSEY,Astronomical Low Tide,199703,1997-03-01,17215
22089,NEW JERSEY,Astronomical Low Tide,199703,1997-03-14,17228
26506,DELAWARE,Astronomical Low Tide,199703,1997-03-18,17232
26892,NEW JERSEY,Astronomical Low Tide,199703,1997-03-03,17217


In [6]:
#######################################
##    Split with the given window    ##
#######################################

def split(data, window):
    """Cut ``data`` into overlapping sliding windows over ``DIST_FROM_START``.

    One window is produced for every integer step ``date`` from the smallest
    ``DIST_FROM_START`` value up to, but not including, the largest one
    (``np.arange`` excludes its end point). Each window holds the rows with
    ``date <= DIST_FROM_START < date + window``, in their original order.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a numeric ``DIST_FROM_START`` column (presumably a day
        offset, judging by the sample output -- confirm).
    window : int
        Window length, in the same unit as ``DIST_FROM_START``.

    Returns
    -------
    list of pandas.DataFrame
        One filtered frame per window start. Every window is materialised,
        so the list can be much larger than ``data`` itself.
    """
    dist = data.DIST_FROM_START          # loop-invariant column lookup
    start = dist.min()                   # start of the timeline
    finish = dist.max()                  # end of the timeline

    splits = []
    for date in tqdm(np.arange(start, finish)):
        in_window = (dist >= date) & (dist < date + window)
        splits.append(data[in_window])
    return splits

In [7]:
#########################################################
##   Heaviest step of the notebook.                    ##
##   Once it has run, "splits" takes roughly 22GB.     ##
#########################################################

# Window length: 6 * 30 = 180 units of DIST_FROM_START.
splits = split(data=data, window=6 * 30)

100%|██████████| 8397/8397 [00:50<00:00, 165.88it/s]


In [10]:
############################################
##   Saving the resulting 11 GB of data   ##
############################################

# exist_ok=True replaces the separate "exists?" check and its
# check-then-create race.
os.makedirs('splited_data', exist_ok=True)

# The loop variable is deliberately not called `split`: that name is the
# window-splitting function, and rebinding it to a DataFrame would make the
# splitting cell fail when it is run again.
# tqdm wraps `splits` itself (not the enumerate object) so it knows the total.
for i, split_frame in enumerate(tqdm(splits)):
    split_frame.to_pickle('splited_data/split_{}.pkl'.format(i))

8397it [04:39, 30.06it/s] 


<br>
Now we have the 6-month splits, and it's time to build the final data structure for the model.
<hr>
It will be a matrix with len(states) rows and len(event_types) columns, so that each element matrix[i, j] holds the number of events of the j-th type that occurred in the i-th state.
<br> <br>

In [6]:
# Distinct state and event-type labels found in the full data set.
states = data.STATE.unique()
event_types = data.EVENT_TYPE.unique()

print('event_types  :', len(event_types))
print('states       :', len(states))
print()
print('So our final data-point will be a matrix with shape ({}, {})'.format(len(states),
                                                                            len(event_types)))

event_types  : 53
states       : 68

So our final data-point will be a matrix with shape (68, 53)


In [19]:
###############################
##    Making the matrixes    ##
###############################

def make_matrixes(file_names, save_path):
    """Build one state-by-event-type count matrix per split file and save them.

    For every pickle in ``file_names`` a matrix is built whose rows follow the
    module-level ``states`` and whose columns follow the module-level
    ``event_types``; element ``[i, j]`` is the number of rows in that file
    with ``STATE == states[i]`` and ``EVENT_TYPE == event_types[j]``.
    All matrices are stacked in file order and written to
    ``<save_path>/matrixes.npy``.

    Parameters
    ----------
    file_names : iterable of str
        Paths of the pickled split DataFrames; each must have ``STATE`` and
        ``EVENT_TYPE`` columns.
    save_path : str
        Directory for ``matrixes.npy``; created if it does not exist.

    Notes
    -----
    Reads the globals ``states`` and ``event_types``, so they must be defined
    before this function is called (including in child processes).
    """
    # Create the output directory before the long loop so a missing
    # directory cannot make np.save fail after all the work is done.
    os.makedirs(save_path, exist_ok=True)

    matrix_data = []
    for file_name in tqdm(file_names):
        hold = pd.read_pickle(file_name)
        matrix = []
        for state in states:
            # Tally all event types of this state in one pass instead of
            # scanning the state's rows once per event type.
            event_counts = hold.EVENT_TYPE[hold.STATE == state].value_counts()
            matrix.append([event_counts.get(event_type, 0)
                           for event_type in event_types])
        matrix_data.append(matrix)
    np.save(os.path.join(save_path, 'matrixes.npy'), matrix_data)
    print('\n\t Done ! \n\n')

In [26]:
###############################################################
##   Separating the task into 4 processes (processor cores)  ##
###############################################################

# List the directory and parse the integer index out of every
# 'split_<i>.pkl' name once; the slice drops the 'split_' prefix and the
# '.pkl' suffix.
split_indices = [int(name[len('split_'):-len('.pkl')])
                 for name in os.listdir('splited_data')]
first_index = min(split_indices)
last_index = max(split_indices)

#####################################
##   Checking correct indexation   ##
#####################################
# Every index between the smallest and the largest must be present.
assert set(split_indices) == set(range(first_index, last_index + 1)), \
    'split files in splited_data are not contiguously numbered'

# Deal the files out round-robin: file i goes to group i % 4.
file_name_groups = [[], [], [], []]
for i in range(first_index, last_index + 1):
    file_name_groups[i % 4].append('splited_data/split_{}.pkl'.format(i))

# Keep the individual names used by the process-launching cell.
file_names_0, file_names_1, file_names_2, file_names_3 = file_name_groups

for group_id, group in enumerate(file_name_groups):
    print('file_names_{} :'.format(group_id), len(group))

file_names_0 : 2100
file_names_1 : 2099
file_names_2 : 2099
file_names_3 : 2099


In [28]:
#################################
##    Running the processes    ##
#################################

# One (input files, output directory) pair per worker process.
jobs = [
    (file_names_0, 'matrix_data/process_0'),
    (file_names_1, 'matrix_data/process_1'),
    (file_names_2, 'matrix_data/process_2'),
    (file_names_3, 'matrix_data/process_3'),
]

processes = []
for job_file_names, job_save_path in jobs:
    # Make sure the output directory exists before the long run starts;
    # otherwise the final np.save in the worker would fail.
    os.makedirs(job_save_path, exist_ok=True)
    processes.append(multiprocessing.Process(target=make_matrixes,
                                             args=(job_file_names, job_save_path)))

# Keep the original per-process names available to later cells.
p0, p1, p2, p3 = processes

# NOTE(review): workers defined inside a notebook presumably rely on the
# "fork" start method to see make_matrixes, states and event_types -- confirm
# on platforms that default to "spawn".
for process in processes:
    process.start()

for process in processes:
    process.join()

# A crashed worker only prints its traceback; surface the failure here so
# missing output is not discovered later.
failed = [job_save_path
          for (_, job_save_path), process in zip(jobs, processes)
          if process.exitcode != 0]
if failed:
    raise RuntimeError('make_matrixes failed for: {}'.format(', '.join(failed)))

100%|██████████| 2099/2099 [1:08:37<00:00,  1.36s/it]
100%|█████████▉| 2093/2099 [1:08:37<00:08,  1.37s/it]


	 Done ! 




100%|██████████| 2100/2100 [1:08:38<00:00,  1.34s/it]
100%|█████████▉| 2094/2099 [1:08:39<00:06,  1.35s/it]


	 Done ! 




100%|██████████| 2099/2099 [1:08:39<00:00,  1.35s/it]



	 Done ! 




100%|██████████| 2099/2099 [1:08:45<00:00,  1.25s/it]



	 Done ! 


