In [1]:
import os
import datetime
import threading
import numpy as np
import pandas as pd
import multiprocessing
from tqdm import tqdm, tqdm_notebook

# What I did here
<br>
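Here we split the data into sliding 6-month windows (each window spans 6×30 = 180 days, so the difference between the latest and earliest dates inside it is at most 6 months), moving the window start forward one day at a time from the beginning of the timeline.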

This looks something like this:

               
                         -------------------------->                     ------------>
                      -------------------------->                     ------------>
                   ------------------------->                     ------------>
                __________   ___________   ___________   _______   ____________
              _/ 3 window \_/ 33 window \_/ 63 window \_/ ..... \_/ N+3 window \_
            __________   ___________   ___________   _______   ____________
          _/ 2 window \_/ 32 window \_/ 62 window \_/ ..... \_/ N+1 window \_
        __________   ___________   ___________   _______   __________
      _/ 1 window \_/ 31 window \_/ 61 window \_/ ..... \_/ N window \_
    1950.01------1950.07------1951.01------1951.07------_____............------2018.01
    __________________________________________________________________________________


First, I split the initial data using a 6-month window and save each resulting piece to disk as a pickled DataFrame.

Afterwards, I read each piece back, build the corresponding 2-D histogram, and save the histograms in .npy format (NumPy array format).
<br> <br> <br>


In [3]:
########################
##    Reading data    ##
########################

# Load the complete events table from its pickle file.
data = pd.read_pickle('total_data.pkl')

# Report the (rows, columns) shape, then preview the first five rows.
print('data :', data.shape)
data.head(n=5)

data : (948193, 15)


Unnamed: 0,BEGIN_LAT,BEGIN_LON,BEGIN_YEARMONTH,STATE,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,EVENT_TYPE,DAMAGE_CROPS,MAGNITUDE,MAGNITUDE_TYPE,FLOOD_CAUSE,DATE
0,35.12,-99.2,195004,OKLAHOMA,0,0,0,0,250K,Tornado,0,0.0,,,1950-04-13
1,31.9,-98.6,195004,TEXAS,0,0,0,0,25K,Tornado,0,0.0,,,1950-04-22
2,40.58,-75.7,195007,PENNSYLVANIA,2,0,0,0,25K,Tornado,0,0.0,,,1950-07-11
3,40.6,-76.75,195007,PENNSYLVANIA,0,0,0,0,2.5K,Tornado,0,0.0,,,1950-07-10
4,41.63,-79.68,195007,PENNSYLVANIA,0,0,0,0,2.5K,Tornado,0,0.0,,,1950-07-18


In [32]:
###########################################################
##   Forming DIST_FROM_START column for fast splitting   ##
###########################################################

# DIST_FROM_START holds, for every row, the number of whole days between the
# earliest DATE in the table and that row's DATE.
#
# The subtraction is done on the whole column at once instead of calling a
# Python lambda per row.  pd.to_datetime normalises the column first so that
# the `.dt` accessor is available even if DATE is stored as Python date
# objects (object dtype) rather than datetime64.
event_dates = pd.to_datetime(data.DATE)
start = data.DATE.min()
data['DIST_FROM_START'] = (event_dates - event_dates.min()).dt.days

In [9]:
#######################################
##    Split with the given window    ##
#######################################

def split(data, window):
    """Cut ``data`` into overlapping windows that advance one unit at a time.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a numeric ``DIST_FROM_START`` column.
    window : int
        Window length, in the same units as ``DIST_FROM_START``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per window start ``d`` taken from
        ``np.arange(min, max)`` of ``DIST_FROM_START``; each frame holds the
        rows with ``d <= DIST_FROM_START < d + window``.  The maximum value
        itself is never used as a window start.  An empty ``data`` yields an
        empty list.
    """
    # min()/max() of an empty column are NaN, which np.arange cannot handle.
    if len(data) == 0:
        return []

    # Look the column up once instead of twice per iteration.
    days = data.DIST_FROM_START
    start = days.min()   # start of the timeline
    finish = days.max()  # end of the timeline

    splits = []
    for day in tqdm(np.arange(start, finish)):
        in_window = (days >= day) & (days < day + window)
        splits.append(data[in_window])

    return splits

In [6]:
##################################################
##          This is the heaviest part           ##
##   After running, "splits" takes about 22GB   ##
##################################################

WINDOW_DAYS = 6 * 30  # six months, counted as 30 days each
splits = split(data, WINDOW_DAYS)

100%|██████████| 25193/25193 [02:02<00:00, 193.65it/s]


In [8]:
############################################
##   Saving the resulting 22 GB of data   ##
############################################

# Make sure the target directory exists before the long write loop starts.
os.makedirs('splited_data', exist_ok=True)

# The loop variable must not be called `split`: that would overwrite the
# split() function and break any later re-run of the splitting cell.
# tqdm wraps the list itself (not the enumerate object) so it knows the total
# and can show a percentage.
# Files are named 'split_<index>' with no extension; the index is the position
# of the window in `splits`.
for i, split_frame in enumerate(tqdm(splits)):
    split_frame.to_pickle('splited_data/split_'+str(i))

25193it [10:22, 40.47it/s] 


<br>
Now we have the 6-month splits and it's time to make the histograms/heatmaps. 
<br> <br>

In [5]:
######################################
##    Making the heatmap dataset    ##
######################################

def make_heatmaps(file_names, save_path, bins=(50, 20),
                  lon_lat_range=((-130, -64), (24, 50))):
    """Build one 2-D longitude/latitude histogram per pickled split.

    Parameters
    ----------
    file_names : iterable of str
        Paths of pickled DataFrames, each with ``BEGIN_LON`` and
        ``BEGIN_LAT`` columns.
    save_path : str
        Directory that receives ``heatmaps.npy``.  It is created if missing.
    bins : (int, int), optional
        Number of bins along longitude and latitude.
    lon_lat_range : ((lon_min, lon_max), (lat_min, lat_max)), optional
        Histogram extent; points outside it are not counted.

    Returns
    -------
    None
        The stacked histograms (one per file, in ``file_names`` order) are
        written to ``<save_path>/heatmaps.npy`` and a completion message is
        printed.
    """
    # Create the output directory up front so that a missing folder cannot
    # make np.save fail only after every file has already been processed.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    heatmap_data = []
    for file_name in tqdm(file_names):
        # NOTE: read_pickle can execute arbitrary code; only feed it files
        # produced by this notebook.
        hold = pd.read_pickle(file_name)
        counts, _lon_edges, _lat_edges = np.histogram2d(
            hold.BEGIN_LON, hold.BEGIN_LAT, bins=bins, range=lon_lat_range)
        heatmap_data.append(counts)

    np.save(os.path.join(save_path, 'heatmaps.npy'), heatmap_data)
    print('\n\t Done ! \n\n')

In [3]:
###############################################################
##   Separating the task into 4 processes (processor cores)  ##
###############################################################

SPLIT_DIR = 'splited_data'
SPLIT_PREFIX = 'split_'

# List the directory once and parse the numeric suffix of every file name
# ('split_<index>' -> <index>).
split_ids = [int(name[len(SPLIT_PREFIX):]) for name in os.listdir(SPLIT_DIR)]
assert split_ids, 'no split files found in ' + SPLIT_DIR
first_id, last_id = min(split_ids), max(split_ids)

#####################################
##   Checking correct indexation   ##
#####################################
# Every index between the smallest and the largest one must be present.
assert set(split_ids) == set(range(first_id, last_id + 1))

# Deal the files out round-robin: index i goes to list number i % 4, and each
# list keeps ascending index order.
file_names_0, file_names_1, file_names_2, file_names_3 = [
    [SPLIT_DIR + '/' + SPLIT_PREFIX + str(i)
     for i in range(first_id, last_id + 1) if i % 4 == worker]
    for worker in range(4)
]

print('file_names_0 :', len(file_names_0))
print('file_names_1 :', len(file_names_1))
print('file_names_2 :', len(file_names_2))
print('file_names_3 :', len(file_names_3))

file_names_0 : 6299
file_names_1 : 6298
file_names_2 : 6298
file_names_3 : 6298


In [6]:
################################
##   Running the processes    ##
################################

# One worker process per (file list, output directory) pair.
jobs = [
    (file_names_0, 'hist_data/process_0'),
    (file_names_1, 'hist_data/process_1'),
    (file_names_2, 'hist_data/process_2'),
    (file_names_3, 'hist_data/process_3'),
]

processes = []
for names, out_dir in jobs:
    # Create the output directory before launching the worker so it cannot
    # fail on a missing folder at the very end of its run.
    os.makedirs(out_dir, exist_ok=True)
    processes.append(
        multiprocessing.Process(target=make_heatmaps, args=(names, out_dir)))

# Keep the individual handles available under their usual names.
p0, p1, p2, p3 = processes

# Start every worker first, then wait for all of them, so they run in parallel.
for proc in processes:
    proc.start()
for proc in processes:
    proc.join()

# join() does not raise when a worker crashes; the only trace is a non-zero
# exit code, so check it explicitly instead of continuing with missing output.
failed = [out_dir for (_, out_dir), proc in zip(jobs, processes)
          if proc.exitcode != 0]
if failed:
    raise RuntimeError('heatmap worker(s) failed for: ' + ', '.join(failed))

100%|██████████| 6298/6298 [01:37<00:00, 64.52it/s] 
 99%|█████████▊| 6208/6298 [01:37<00:02, 34.20it/s]


	 Done ! 



 99%|█████████▉| 6230/6299 [01:37<00:03, 19.55it/s]




100%|██████████| 6298/6298 [01:39<00:00, 63.41it/s]
100%|██████████| 6299/6299 [01:39<00:00, 63.36it/s]
 99%|█████████▉| 6244/6298 [01:39<00:02, 21.23it/s]


	 Done ! 



	 Done ! 




100%|██████████| 6298/6298 [01:40<00:00, 62.97it/s]



	 Done ! 


