In [1]:
import argparse
import os
import sys
import psutil
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import math
from IPython.display import display
from multiprocessing import cpu_count,Pool 
import multiprocessing
from joblib import Parallel, delayed
from orderedset import OrderedSet
import datetime
import pickle

In [2]:
def print_memory_usage():
    print ("memory log:")
    process = psutil.Process(os.getpid())
    print("%5.2f GB (RSS)" % (process.memory_info().rss / 2**30))
    print("%5.2f GB (VMS)" % (process.memory_info().vms / 2**30))
    print("%5.2f GB (Used)" % (psutil.virtual_memory().used / 2**30))
    print("%5.2f GB (Available)" % (psutil.virtual_memory().available / 2**30))
    print("%5.2f GB (Total)" % (psutil.virtual_memory().total / 2**30))




def distance(data,lat,lng,idx):
    fLat = np.radians(data.LocationLat)
    fLon = np.radians(data.LocationLng)
    sLat = np.radians(lat)
    sLon = np.radians(lng)
    R = 3958.7564 #mi
    #R = 6371000.0 #meters
    #R = 6371.0 #km
    
    dLon = sLon - fLon
    dLat = sLat - fLat
    a = np.sin(dLat/2.0)**2 + (np.cos(fLat) * np.cos(sLat) * np.power(np.sin(dLon/2.0), 2))
    
    c = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))
    data['dis2event_'+str(idx)] =  R * c
    return data

class WithExtraArgs(object):
    def __init__(self, func, **args):
        self.func = func
        self.args = args
    def __call__(self, df):
        return self.func(df, **self.args)

def parjob_long_event_group_T(data,filepath,key_ds):
    process_name = str(multiprocessing.current_process())
    id = int(process_name.split(',')[0].split('-')[1])
    print("process ",id," started")
    
    
    ds = pd.read_hdf(filepath,key=key_ds)
    print ("data set is loaded data size is ",ds.shape[0])
    traffic_events = ds[ds.Type!='W']
    
    re_list=[]
    raduis=14
    total = data.shape[0]
    counter=0
    event_duration_week_offset = 60*24*60
    print ("partial long event for process ",id," is ",data.shape[0])
    out_dict={}
    
    for idx,long_event in data.iterrows():
        #(traffic_events.City == long_event.City) &
        
        temp_df = traffic_events[  (traffic_events.State == long_event.State)  & 
                                 ((traffic_events.StartTime >= long_event.StartTime- 
                                     pd.Timedelta(event_duration_week_offset, unit='D')) & 
                                (traffic_events.EndTime <= long_event.EndTime+
                                      pd.Timedelta(event_duration_week_offset, unit='D')))]#time limit
        temp_df = distance(temp_df,long_event.LocationLat,long_event.LocationLng,idx)
        
        
        filtered_by_distance = temp_df [(temp_df['dis2event_'+str(idx)] <=raduis) & (temp_df.index != idx) ]
        #['dis2event_'+str(idx)] != 0.0)
        out_dict[idx] = filtered_by_distance.index
        #re_list.append(filtered_by_distance.index)
        
        counter+=1
        if counter%100==0:
            print ("process ", id, counter,"/",total, "long event proccesed ",datetime.datetime.now().time())        
    
    f = open('dict_files/T_'+'split_2_'+str(id)+'.pkl',"wb")
    pickle.dump(out_dict,f)
    f.close()
    return True #re_list

def applyParallel_list(pool,data, func, kwargs):
    data_split = np.array_split(data,partitions)
    data_tag =pool.map(WithExtraArgs(func, **kwargs), data_split)
    return data_tag


In [3]:
cores = 8#cpu_count() #Number of CPU cores on your system
partitions = cores #Define as many partitions as you want

In [4]:
import os, glob

path = "dict_files/"
dirpath = os.getcwd()
dirpath+='/'+path
for filename in glob.glob(os.path.join(dirpath, 'T_split_2*')):
    os.remove(filename)

In [5]:
effective_days_list=[300]
for effective_days in effective_days_list:
    long_events_ = pd.read_hdf('../../LG.h5',key='T_split_1')
    print ("long event size is ",long_events_.shape[0])
    
    subset_df = long_events_[long_events_.Type != 'W']
    print ("Traffic long events size is ",subset_df.shape[0])
    pool_t = Pool(cores)
    Traffic_list = applyParallel_list(pool_t,subset_df,parjob_long_event_group_T,{"filepath":'../../data_set_.h5','key_ds':'DS_'+str(effective_days)})
    pool_t.close()
    pool_t.join()
    print ("done with traffic set")
    print ("*"*80)
    print_memory_usage()
    
    #np.save("event_list_45_t"+str(effective_days),Traffic_list)

long event size is  13036
Traffic long events size is  13036
process  1  started
process  2  started
process  3  started
process  4  started
process  5  started
process  6  started
process  7  started
process  8  started
data set is loaded data size is  15192678
data set is loaded data size is  15192678
data set is loaded data size is  15192678
data set is loaded data size is  15192678
data set is loaded data size is  15192678
data set is loaded data size is  15192678
data set is loaded data size is  15192678
data set is loaded data size is  15192678
partial long event for process  6  is  1629
partial long event for process  1  is  1630
partial long event for process  3  is  1630
partial long event for process  7  is  1629
partial long event for process  4  is  1630


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


partial long event for process  2  is  1630
partial long event for process  8  is  1629


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


partial long event for process  5  is  1629


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

process  4 100 / 1630 long event proccesed  20:43:50.252613
process  3 100 / 1630 long event proccesed  20:43:52.029649
process  8 100 / 1629 long event proccesed  20:43:52.953860
process  6 100 / 1629 long event proccesed  20:43:57.447358
process  7 100 / 1629 long event proccesed  20:43:57.717372
process  5 100 / 1629 long event proccesed  20:44:01.022181
process  2 100 / 1630 long event proccesed  20:44:07.360098
process  1 100 / 1630 long event proccesed  20:44:31.869075
process  4 200 / 1630 long event proccesed  20:45:33.869509
process  8 200 / 1629 long event proccesed  20:45:37.874733
process  3 200 / 1630 long event proccesed  20:45:46.836427
process  7 200 / 1629 long event proccesed  20:45:47.918437
process  6 200 / 1629 long event proccesed  20:45:49.138141
process  5 200 / 1629 long event proccesed  20:45:56.053560
process  2 200 / 1630 long event proccesed  20:45:58.782037
process  1 200 / 1630 long event proccesed  20:46:56.475041
process  4 300 / 1630 long event procces