In [1]:
import os
import time
import pandas as pd
import numpy as np
from multiprocessing import Pool
import boto3

In [2]:
s3 = boto3.resource('s3')
client = boto3.client('s3')
bucket = s3.Bucket('usa-edges-for-speed-mapping')

In [3]:
edges_csv = os.listdir('/home/ec2-user/osm/edges/')
n_csvs = len(edges_csv)
edges_csv[0]
n_csvs


3110

In [None]:
def processCSVs(n):
    edges = pd.read_csv('/home/ec2-user/osm/edges/'+edges_csv[n], low_memory=False)
    # number of edges
    nedges = edges.geometry.size
    # get longitude/latitude dataFrame for edges
    df_edge_loc = pd.DataFrame()
    print('Creating exploded dataframe for '+edges_csv[n].replace('.csv',''))
    for i in range(0, nedges):
        edge_string = edges['geometry'][i][12:-1].split(",")
        tmp = []
        for j in range(0, len(edge_string)):
            edge_array = np.fromstring(edge_string[j], sep=' ')
            edge_array = np.flip(edge_array,axis=0)
            tmp.append(edge_array)
            tmp2 = np.concatenate(tmp)
            dftmp = pd.DataFrame(tmp2)
        df_edge_loc = pd.concat([df_edge_loc, dftmp], axis=1, ignore_index=True)
    path = '/home/ec2-user/osm/edges_exploded/'+edges_csv[n].replace('.csv','_exploded.csv')
    df_edge_loc.to_csv(path)
    del df_edge_loc
    # write CSV to s3
    s3.meta.client.upload_file(path, 'usa-edges-for-speed-mapping', 'edges_exploded/{}'.format(edges_csv[n].replace('.csv','_exploded.csv')))
    print('Saved '+path+' to S3.')    
    s3dir = client.list_objects(Bucket='usa-edges-for-speed-mapping',
                Prefix='edges_exploded/')
    # remove CSV from local EC2 instance.
    os.remove(path)
    files = []
    for key in bucket.objects.filter(Prefix='edges_exploded/'):
        files.append(key.key)
    print((len(files)-1)/len(edges_csv)*100, 'percent complete.')

    
    
    


In [None]:
#t0 = time.time();
if __name__ == '__main__':
    with Pool(processes=61) as p:
#        p.map(processCSVs, range(0,n_csvs))
        # Rerun for missing n's due to crash:
        p.map(processCSVs, isnotin)

#print(time.time() - t0)

Creating exploded dataframe for OR_ShermanCounty
Creating exploded dataframe for ID_TetonCounty
Creating exploded dataframe for GA_WarrenCounty
Creating exploded dataframe for ND_RansomCounty
Creating exploded dataframe for AR_SearcyCounty
Creating exploded dataframe for OH_ErieCounty
Creating exploded dataframe for MI_OgemawCounty
Creating exploded dataframe for VA_NottowayCounty
Creating exploded dataframe for WY_ParkCounty
Creating exploded dataframe for PerryCounty_AL
Creating exploded dataframe for TX_UpshurCounty
Creating exploded dataframe for IL_ColesCounty
Creating exploded dataframe for MO_ClintonCounty
Creating exploded dataframe for IA_SiouxCounty
Creating exploded dataframe for GA_SumterCounty
Creating exploded dataframe for MD_DorchesterCounty
Creating exploded dataframe for KS_SalineCounty
Creating exploded dataframe for OH_AuglaizeCounty
Creating exploded dataframe for LA_SabineParish
Creating exploded dataframe for WV_MercerCounty
Creating exploded dataframe for FL_Mar

In [None]:
# Test parallelization speed up of CSV processing...

# Serial Test:

t0 = time.time();
for n in range(0,2):
    processCSVs(n)
print('Serial process took ',time.time() - t0, ' seconds')

In [None]:
os.cpu_count()

### Check for missing files in s3 due to jupyter notebook crash

In [5]:
files = []
for key in bucket.objects.filter(Prefix='edges_exploded/'):
    files.append(key.key)
files = files[1:]
len(files)

2552

In [6]:
for i in range(0,len(files)):
    files[i] = files[i].replace('edges_exploded/','')
for i in range(0,len(files)):
    files[i] = files[i].replace('_exploded','')
files

['AR_ArkansasCounty.csv',
 'AR_AshleyCounty.csv',
 'AR_BaxterCounty.csv',
 'AR_BentonCounty.csv',
 'AR_BooneCounty.csv',
 'AR_BradleyCounty.csv',
 'AR_CalhounCounty.csv',
 'AR_CarrollCounty.csv',
 'AR_ChicotCounty.csv',
 'AR_ClarkCounty.csv',
 'AR_ClayCounty.csv',
 'AR_CleburneCounty.csv',
 'AR_ClevelandCounty.csv',
 'AR_ColumbiaCounty.csv',
 'AR_ConwayCounty.csv',
 'AR_CrawfordCounty.csv',
 'AR_CrittendenCounty.csv',
 'AR_CrossCounty.csv',
 'AR_DallasCounty.csv',
 'AR_DeshaCounty.csv',
 'AR_DrewCounty.csv',
 'AR_FaulknerCounty.csv',
 'AR_FranklinCounty.csv',
 'AR_FultonCounty.csv',
 'AR_GarlandCounty.csv',
 'AR_GrantCounty.csv',
 'AR_GreeneCounty.csv',
 'AR_HempsteadCounty.csv',
 'AR_HotSpringCounty.csv',
 'AR_HowardCounty.csv',
 'AR_IndependenceCounty.csv',
 'AR_IzardCounty.csv',
 'AR_JacksonCounty.csv',
 'AR_JeffersonCounty.csv',
 'AR_JohnsonCounty.csv',
 'AR_LafayetteCounty.csv',
 'AR_LawrenceCounty.csv',
 'AR_LeeCounty.csv',
 'AR_LincolnCounty.csv',
 'AR_LittleRiverCounty.csv',
 '

In [7]:
edges_csv

['WV_TuckerCounty.csv',
 'KY_MarshallCounty.csv',
 'OR_ColumbiaCounty.csv',
 'WV_WetzelCounty.csv',
 'IA_BlackHawkCounty.csv',
 'ID_FranklinCounty.csv',
 'IL_PikeCounty.csv',
 'VA_VirginiaBeach.csv',
 'OK_KiowaCounty.csv',
 'SD_MeadeCounty.csv',
 'MT_McConeCounty.csv',
 'EtowahCounty_AL.csv',
 'CA_SanJoaquinCounty.csv',
 'IL_PeoriaCounty.csv',
 'VA_ArlingtonCounty.csv',
 'IN_AdamsCounty.csv',
 'SD_DeweyCounty.csv',
 'LA_PointeCoupeeParish.csv',
 'MT_PowellCounty.csv',
 'NC_WashingtonCounty.csv',
 'MS_NoxubeeCounty.csv',
 'FL_MonroeCounty.csv',
 'CO_DenverCounty.csv',
 'NE_BannerCounty.csv',
 'ID_MadisonCounty.csv',
 'MS_ClarkeCounty.csv',
 'ND_LaMoureCounty.csv',
 'NC_OrangeCounty.csv',
 'NY_FultonCounty.csv',
 'HenryCounty_AL.csv',
 'AR_MontgomeryCounty.csv',
 'SD_TrippCounty.csv',
 'SC_OconeeCounty.csv',
 'NM_McKinleyCounty.csv',
 'TN_ObionCounty.csv',
 'TX_GrayCounty.csv',
 'OH_HenryCounty.csv',
 'KY_JeffersonCounty.csv',
 'KY_RobertsonCounty.csv',
 'KS_EllsworthCounty.csv',
 'MS_Ja

In [8]:
isin = []
isnotin = []
for n in range(0, n_csvs):
    if edges_csv[n] in files:
        print('Match for n = ',n)
        #print(n)
        isin.append(n)
    else:
        isnotin.append(n)

Match for n =  0
Match for n =  1
Match for n =  2
Match for n =  3
Match for n =  4
Match for n =  5
Match for n =  6
Match for n =  7
Match for n =  8
Match for n =  9
Match for n =  10
Match for n =  11
Match for n =  12
Match for n =  13
Match for n =  14
Match for n =  15
Match for n =  16
Match for n =  17
Match for n =  18
Match for n =  19
Match for n =  20
Match for n =  21
Match for n =  22
Match for n =  23
Match for n =  24
Match for n =  25
Match for n =  26
Match for n =  27
Match for n =  28
Match for n =  29
Match for n =  30
Match for n =  31
Match for n =  32
Match for n =  33
Match for n =  34
Match for n =  35
Match for n =  36
Match for n =  37
Match for n =  38
Match for n =  39
Match for n =  40
Match for n =  41
Match for n =  42
Match for n =  43
Match for n =  44
Match for n =  45
Match for n =  46
Match for n =  47
Match for n =  48
Match for n =  49
Match for n =  50
Match for n =  51
Match for n =  52
Match for n =  53
Match for n =  54
Match for n =  55
Ma

Match for n =  476
Match for n =  477
Match for n =  478
Match for n =  479
Match for n =  480
Match for n =  481
Match for n =  482
Match for n =  483
Match for n =  484
Match for n =  485
Match for n =  486
Match for n =  487
Match for n =  488
Match for n =  489
Match for n =  490
Match for n =  491
Match for n =  492
Match for n =  493
Match for n =  494
Match for n =  495
Match for n =  496
Match for n =  497
Match for n =  498
Match for n =  499
Match for n =  500
Match for n =  501
Match for n =  502
Match for n =  503
Match for n =  504
Match for n =  505
Match for n =  506
Match for n =  507
Match for n =  508
Match for n =  509
Match for n =  510
Match for n =  511
Match for n =  512
Match for n =  513
Match for n =  514
Match for n =  515
Match for n =  516
Match for n =  517
Match for n =  518
Match for n =  519
Match for n =  520
Match for n =  521
Match for n =  522
Match for n =  523
Match for n =  524
Match for n =  525
Match for n =  526
Match for n =  528
Match for n 

Match for n =  984
Match for n =  985
Match for n =  987
Match for n =  988
Match for n =  989
Match for n =  990
Match for n =  991
Match for n =  992
Match for n =  993
Match for n =  994
Match for n =  995
Match for n =  996
Match for n =  997
Match for n =  998
Match for n =  999
Match for n =  1000
Match for n =  1001
Match for n =  1002
Match for n =  1003
Match for n =  1004
Match for n =  1005
Match for n =  1006
Match for n =  1007
Match for n =  1008
Match for n =  1009
Match for n =  1010
Match for n =  1012
Match for n =  1013
Match for n =  1014
Match for n =  1015
Match for n =  1016
Match for n =  1017
Match for n =  1018
Match for n =  1019
Match for n =  1020
Match for n =  1021
Match for n =  1022
Match for n =  1023
Match for n =  1024
Match for n =  1025
Match for n =  1026
Match for n =  1027
Match for n =  1028
Match for n =  1029
Match for n =  1030
Match for n =  1031
Match for n =  1032
Match for n =  1033
Match for n =  1034
Match for n =  1035
Match for n =  

Match for n =  1503
Match for n =  1504
Match for n =  1505
Match for n =  1506
Match for n =  1507
Match for n =  1508
Match for n =  1509
Match for n =  1510
Match for n =  1511
Match for n =  1512
Match for n =  1513
Match for n =  1514
Match for n =  1515
Match for n =  1516
Match for n =  1517
Match for n =  1518
Match for n =  1519
Match for n =  1520
Match for n =  1521
Match for n =  1522
Match for n =  1523
Match for n =  1524
Match for n =  1525
Match for n =  1526
Match for n =  1527
Match for n =  1528
Match for n =  1529
Match for n =  1530
Match for n =  1531
Match for n =  1532
Match for n =  1533
Match for n =  1534
Match for n =  1535
Match for n =  1536
Match for n =  1537
Match for n =  1538
Match for n =  1539
Match for n =  1540
Match for n =  1541
Match for n =  1542
Match for n =  1543
Match for n =  1544
Match for n =  1545
Match for n =  1547
Match for n =  1548
Match for n =  1549
Match for n =  1550
Match for n =  1551
Match for n =  1552
Match for n =  1553


Match for n =  2016
Match for n =  2017
Match for n =  2018
Match for n =  2019
Match for n =  2020
Match for n =  2021
Match for n =  2022
Match for n =  2023
Match for n =  2024
Match for n =  2025
Match for n =  2026
Match for n =  2028
Match for n =  2030
Match for n =  2033
Match for n =  2034
Match for n =  2035
Match for n =  2036
Match for n =  2037
Match for n =  2038
Match for n =  2039
Match for n =  2040
Match for n =  2041
Match for n =  2042
Match for n =  2043
Match for n =  2044
Match for n =  2045
Match for n =  2046
Match for n =  2047
Match for n =  2048
Match for n =  2049
Match for n =  2050
Match for n =  2051
Match for n =  2052
Match for n =  2053
Match for n =  2054
Match for n =  2055
Match for n =  2056
Match for n =  2057
Match for n =  2058
Match for n =  2059
Match for n =  2060
Match for n =  2061
Match for n =  2062
Match for n =  2063
Match for n =  2064
Match for n =  2065
Match for n =  2066
Match for n =  2067
Match for n =  2068
Match for n =  2069


Match for n =  2543
Match for n =  2544
Match for n =  2545
Match for n =  2546
Match for n =  2547
Match for n =  2548
Match for n =  2550
Match for n =  2553
Match for n =  2554
Match for n =  2555
Match for n =  2558
Match for n =  2559
Match for n =  2560
Match for n =  2561
Match for n =  2564
Match for n =  2565
Match for n =  2566
Match for n =  2567
Match for n =  2568
Match for n =  2569
Match for n =  2572
Match for n =  2573
Match for n =  2574
Match for n =  2575
Match for n =  2576
Match for n =  2577
Match for n =  2581
Match for n =  2582
Match for n =  2583
Match for n =  2587
Match for n =  2588
Match for n =  2589
Match for n =  2590
Match for n =  2591
Match for n =  2592
Match for n =  2593
Match for n =  2594
Match for n =  2595
Match for n =  2596
Match for n =  2597
Match for n =  2598
Match for n =  2602
Match for n =  2603
Match for n =  2604
Match for n =  2605
Match for n =  2606
Match for n =  2607
Match for n =  2608
Match for n =  2609
Match for n =  2610


In [9]:

print('lenth of files array =', len(isin))
print('length of edges array =', len(edges_csv))
print('difference =',len(isnotin))
len(edges_csv) - len(isin) - len(isnotin)
isnotin

lenth of files array = 2552
length of edges array = 3110
difference = 558


[123,
 124,
 125,
 196,
 320,
 390,
 401,
 423,
 527,
 580,
 581,
 711,
 712,
 832,
 896,
 980,
 986,
 1011,
 1128,
 1219,
 1220,
 1243,
 1246,
 1271,
 1301,
 1367,
 1368,
 1369,
 1392,
 1393,
 1439,
 1441,
 1442,
 1463,
 1464,
 1465,
 1546,
 1620,
 1630,
 1632,
 1633,
 1702,
 1771,
 1772,
 1802,
 1882,
 1911,
 1948,
 1987,
 2027,
 2029,
 2031,
 2032,
 2093,
 2094,
 2095,
 2155,
 2156,
 2169,
 2170,
 2176,
 2192,
 2209,
 2214,
 2261,
 2358,
 2359,
 2363,
 2427,
 2441,
 2475,
 2476,
 2487,
 2494,
 2512,
 2528,
 2530,
 2549,
 2551,
 2552,
 2556,
 2557,
 2562,
 2563,
 2570,
 2571,
 2578,
 2579,
 2580,
 2584,
 2585,
 2586,
 2599,
 2600,
 2601,
 2622,
 2624,
 2625,
 2627,
 2628,
 2631,
 2638,
 2639,
 2640,
 2650,
 2651,
 2652,
 2654,
 2655,
 2656,
 2657,
 2658,
 2662,
 2663,
 2664,
 2667,
 2668,
 2669,
 2670,
 2671,
 2672,
 2673,
 2674,
 2675,
 2676,
 2677,
 2678,
 2679,
 2680,
 2681,
 2682,
 2683,
 2684,
 2685,
 2686,
 2687,
 2688,
 2689,
 2690,
 2691,
 2692,
 2693,
 2694,
 2695,
 2696,
 2

In [None]:
import re
string = '_AL_exploded.csv'
regex=re.compile(".*(_AL_exploded.csv).*")
#if string in files:
#    print('True')
x = [m.group(0) for l in files for m in [regex.search(l)] if m]

In [None]:
x

In [None]:
type(isnotin)
type(edges_csv)

In [9]:
isnotin[0:10]

[123, 124, 125, 126, 127, 128, 129, 196, 197, 198]

In [14]:
edges_csv[198]

'NE_BoxButteCounty.csv'

In [13]:
isnotin[9:10]

[198]