# Select the k nearest stores using Python Geopy
Purpose
 - Apply Python Geopy to get the 'latitude' and 'longitude' of each store
 - Construct a distance matrix by applying the geopy function distance() to calculate the distance between stores
 - Convert the distance matrix to a dataframe and load it into a table
 
## This module includes the following steps:
 1. Initialization : Import packages/libraries and set up Teradata connection 
 2. Load store address
 3. Apply Geopy to get the Geopy location and coordinates, and save them to an Excel file
 4. Read the Excel file and construct the distance matrix
 5. Convert the distance matrix to a dataframe and load it into a table: temp_tables.rs_store_dist
 6. Create a table to store the k nearest stores: temp_tables.rs_nearst_store
    (This step might not be needed; the top k stores can be obtained from temp_tables.rs_store_dist)
    
## Input table
 - dw_bi_vw.dim_2_fac 
   - with  fac_nbr > 0 and end_dt ='9998-12-31'
     - There are 3429 stores that meet the criteria
       - There are 101 stores with a blank address
         - These stores do not operate
       - There are 2613 stores with an address not recognized by Geopy
       
## Output
- 'C:\SYUE\RecSys\df_str_addr_jpoint.xlsx'
  - store_nbr with physical address, geopy location and point 
- temp_tables.rs_store_dist
  - distance between stores
-  temp_tables.rs_nearst_store

## Module name: sel_nbrK_store
- Author: Sophia Yue
- Date  : Oct 2019  
 
## Comments
- The distance between stores can be used to cluster stores
  - Application(s) : TBD
- Might need to correct the address to be able to get 'latitude', 'longitude' from Geopy   

## Main process starts here
## Initialization 
 - Python compile() function is used to compile the source into code object or AST module object. 
 - The returned code object can be executed using exec()
 - Use compile function to execute the following codes which might be used by other modules 
   - c_import.py        : Import packages/libraries 
   - c_setup_dbs_con.py : Set up Teradata connection
   - c_time_dte.py      : Calculate elapsed time 


In [1]:
import getpass
import os

prg_name = ""
path_code = "C:\\Users\\syue003\\wip_RecSys\\"
c_import  = path_code + "c_import.py"
c_setup_dbs_con = path_code + "c_setup_dbs_con.py"
c_timedte = path_code + "c_time_dte.py"

# Run the shared helper scripts inside this notebook's namespace:
#   c_import.py        - imports packages/libraries
#   c_setup_dbs_con.py - sets up the Teradata connection
#   c_time_dte.py      - elapsed-time helper
# Each file is opened with a context manager so its handle is closed as soon
# as the source has been read.
for _src_path in (c_import, c_setup_dbs_con, c_timedte):
    with open(_src_path, 'rb') as _src_file:
        exec(compile(_src_file.read(), _src_path, 'exec'))

# The database password must not live in the notebook source. Take it from the
# TD_PASSWORD environment variable, or prompt for it when the variable is unset.
# The user name can be overridden with TD_USERNAME and defaults to the original.
_td_user = os.environ.get("TD_USERNAME", "syue003")
_td_password = os.environ.get("TD_PASSWORD") or getpass.getpass(
    "Teradata password for {0}: ".format(_td_user)
)

# cf_setup_dbs_con is presumably defined by c_setup_dbs_con.py (verify there);
# it returns the session and engine used by the cells below.
session, td_enginex = cf_setup_dbs_con(userName = _td_user, passWord = _td_password)


t_engine teradata://syue003:********@tqdpr02/temp_tables


## Extract store address from dw_bi_vw.dim_2_fac

In [2]:

# Read one row per store (fac_nbr) from dw_bi_vw.dim_2_fac into a pandas
# dataframe; the address parts are trimmed and joined into a single string.
query = """
    select fac_nbr,   
           ( trim(addr_line1_txt) || ' ' ||  trim( city_nm) || ' ' ||  trim( state_id) || ' '  || trim( zip5_id) ) 
           as address  
    from   dw_bi_vw.dim_2_fac   
    where  fac_nbr > 0 and end_dt ='9998-12-31' 
    group  by fac_nbr, address order by  fac_nbr 
    """
df_str_address = pd.read_sql(sql=query, con=session)

In [3]:
# Store FAC_NBR as 64-bit integers
fac_nbr_int64 = df_str_address['FAC_NBR'].astype(np.int64)
df_str_address['FAC_NBR'] = fac_nbr_int64

In [4]:
"""
 Create store internal id
 
"""      
unq_fac_nbr = set(df_str_address.FAC_NBR.values)

# Number the stores 0, 1, 2, ... in order of first appearance in the dataframe.
# The ids used to be handed out while iterating the set above, but set
# iteration order is arbitrary, so an id was not guaranteed to equal the row
# position of its store. Later cells index the distance matrix by row position
# and translate it back through these ids, so the two must agree.
facnbr_idx = {}
for FAC_NBR in df_str_address.FAC_NBR.values:
    # setdefault keeps the id of the first occurrence if a FAC_NBR repeats.
    facnbr_idx.setdefault(FAC_NBR, len(facnbr_idx))
count = len(facnbr_idx)

# Vectorised lookup instead of a row-by-row DataFrame.apply.
df_str_address['facnbr_idx'] = df_str_address.FAC_NBR.map(facnbr_idx)

## Get geopy location and point

In [5]:
"""
 Define RateLimiter object : geopy.extra.rate_limiter.RateLimiter
"""
# Nominatim's public service allows at most one request per second, and the
# geopy documentation pairs Nominatim with min_delay_seconds=1.
# With no delay the service rejects or times out requests; RateLimiter retries
# and then, by default, swallows the error and returns None, which is
# indistinguishable from "address not found".
# NOTE(review): the ~10 s per address of the recorded 9.4 h run is consistent
# with the default 2 retries x 5 s error wait - confirm by re-running a sample.
locator = Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=1.0)

In [6]:
# Alias, not a copy: df and df_str_address name the same DataFrame object, so
# columns added to df in the following cells also appear on df_str_address.
df = df_str_address

In [7]:
"""
 Add a column "location" by applying "geocode" to df column "address" to get location
  - geocode will return geopy.Location.Location 
  - Very time consumming
"""
# Geocode every address (one geocode call per row) and report how long it took.
fnc_name = "Apply Geocode"
start_time = time.time()
df['location'] = df['address'].apply(geocode)
end_time = time.time()
message = "Function {0} completed.".format(fnc_name)
cf_elapse_time(start_time, end_time, message)

 Function Apply Geocode completed. It took 33919.002931 seconds - 9hh:25mm:19ss.
 start time: Oct 09 2019 09:16:51  end time:  Oct 09 2019 18:42:10


In [9]:
"""
 Create column 'point' by applying point function to "location" 
 - point will return 'latitude', 'longitude', 'altitude'
   - will not extract  'altitude'
   - Define value of point as None if the value of location is None  
"""
def _lat_lon(loc):
    """Return the first two components of loc.point, or None when loc is falsy."""
    if not loc:
        return None
    return tuple(loc.point)[0:2]


# Derive the 'point' column from 'location' and report the elapsed time.
start_time = time.time()
df['point'] = df['location'].apply(_lat_lon)
end_time = time.time()
message = "Function {0} completed.".format("point")
cf_elapse_time(start_time, end_time, message)

 Function point completed. It took 0.015000 seconds - 0hh:0mm:0ss.
 start time: Oct 09 2019 19:46:51  end time:  Oct 09 2019 19:46:51


## Save dataframe to an excel file
 - It is time consuming to apply 'geocode' to get the geotype location
 - Save dataframe to an excel file for future use 

In [10]:
# Raw string: '\S', '\R' and '\d' are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning on
# current Python versions. The resulting path is unchanged.
df.to_excel(r'C:\SYUE\RecSys\df_str_addr_jpoint.xlsx', index=False)

## Construct a distance matrix
 - Calculate distance between stores and save them in a  matrix 

   

In [2]:
# Reload the geocoded stores saved earlier (input for the distance matrix).
# Raw string: '\S', '\R' and '\d' are not valid escape sequences, so the plain
# literal only worked by accident. The resulting path is unchanged.
df = pd.read_excel(r'C:\SYUE\RecSys\df_str_addr_jpoint.xlsx')


In [222]:
df.head()

Unnamed: 0,FAC_NBR,address,facnbr_idx,location,point
0,1,1340 PATRIOT BLVD GLENVIEW IL 60026,0,"Jewel-Osco, 1340, Patriot Boulevard, Glenview,...","(42.07856325, -87.8194154543438)"
1,2,201 SOUTH STEPHANIE STREET Henderson NV 89012,1,"South Stephanie Street, Macdonald Ranch, Hende...","(36.0324434, -115.0471777)"
2,3,14800 S E SUNNYSIDE RD Clackamas OR 97015,2,,
3,4,3130 WEST CAREFREE HIGHWAY Phoenix AZ 85086,3,"3130, West Carefree Highway, North Gateway, Ph...","(33.7984748371083, -112.127522406678)"
4,5,11120 S. LAKE DR. RESTON VA 20191,4,,


### Construct a list l_point  to save the latitude, longitude to be ready for geopy.Point
- If the address is valid, the column point is read back from the Excel file as a string of the form
  (latitude, longitude), e.g. (42.07856325, -87.8194154543438)
   - It must be converted to a string of the form 'latitude, longitude'
      - A format accepted by geopy.Point
- If the address is invalid, the value of point is nan and its type is float
  - "" is saved in the list in that case
- Use iterrows to iterate over DataFrame rows as (index, Series) pairs.
  - The columns of df are
    - FAC_NBR, address, facnbr_id, location, point
- l_point would be a list of strings with 'latitude, longitude' or empty string. 


In [3]:
def _point_to_lat_lon(point):
    """Convert one value of the 'point' column to a 'latitude, longitude' string.

    Returns "" for a missing value (None or a float NaN, including NumPy
    floats), strips the surrounding parentheses from a '(lat, lon)' string,
    and formats the first two items of any other sequence (e.g. a tuple that
    never went through the Excel round trip).
    """
    if point is None or (isinstance(point, float) and math.isnan(point)):
        return ""
    if isinstance(point, str):
        return point[1:-1]  # remove the parentheses
    return "{0}, {1}".format(point[0], point[1])


# Read the column by name rather than by position (list(row)[4]), so the cell
# keeps working if the dataframe gains, loses or reorders columns.
l_point = [_point_to_lat_lon(point) for point in df["point"]]

In [4]:
len (l_point)

3429

In [5]:
l_point[0:9]

['42.07856325, -87.8194154543438',
 '36.0324434, -115.0471777',
 '',
 '33.7984748371083, -112.127522406678',
 '',
 '',
 '46.000883, -112.519532',
 '39.5762384, -104.914441523727',
 '45.7727214646465, -111.185115020202']

## Calculate distance
- Convert the string 'latitude, longitude' in l_point to a Point object and calculate the distance
  - If p1 is missing (empty string), nan is assigned to the distance to all stores
  - If p2 is missing (empty string), nan is assigned to the distance between p1 and p2
  - Apply the geopy function distance.distance with parameters p1 and p2 to calculate the distance
    - Return the distance in miles
    
 ### Notes
  - Need to import math first, otherwise will get the message that 'nan' is not defined;
    - nan can be replaced by NaN

In [6]:
# Build the full store-to-store distance matrix (miles, rounded to 4 decimals).
# Row i / column j follow the order of l_point; a store without coordinates
# gets nan in its whole row and its whole column.

# Parse every coordinate string once, instead of re-parsing both points for
# each of the n*n pairs. None marks a store with no coordinates.
# Strings are compared by value (!=); identity tests against a literal
# (`is ""`) depend on interning and raise a SyntaxWarning.
l_geo_point = [Point(s_lat_lon) if s_lat_lon != "" else None for s_lat_lon in l_point]
n_store = len(l_geo_point)

distMatrix = []
for p1 in l_geo_point:
    if p1 is None:
        # A fresh list per row: rows must not share one mutable nan list.
        distMatrix.append([float("nan")] * n_store)
        continue
    l_dis_x = []
    for p2 in l_geo_point:
        if p2 is None:
            dist = float("nan")
        else:
            dist = round(distance.distance(p1, p2).mi, 4)
        l_dis_x.append(dist)
    distMatrix.append(l_dis_x)

In [7]:
len(distMatrix)

3429

In [8]:
distMatrix[0: 2]

[[0.0,
  1515.3969,
  nan,
  1438.8518,
  nan,
  nan,
  1254.2021,
  910.8979,
  1188.9415,
  nan,
  nan,
  nan,
  1524.9841,
  1511.6606,
  nan,
  nan,
  nan,
  1116.285,
  1329.7978,
  nan,
  nan,
  1437.8323,
  1328.2048,
  nan,
  nan,
  nan,
  910.2298,
  nan,
  nan,
  891.24,
  nan,
  1206.9245,
  1062.0418,
  nan,
  nan,
  920.4738,
  nan,
  1331.067,
  1515.8737,
  1521.8449,
  nan,
  nan,
  25.0304,
  24.0656,
  990.5019,
  nan,
  29.0291,
  948.811,
  1517.8377,
  943.6534,
  34.9493,
  977.5544,
  878.8068,
  881.5891,
  903.9133,
  20.2054,
  812.8167,
  8.4919,
  50.4853,
  13.2879,
  1014.4444,
  nan,
  nan,
  nan,
  nan,
  1513.4191,
  1519.24,
  1754.7492,
  27.5744,
  1509.4865,
  9.5029,
  907.0901,
  1438.7938,
  1329.1673,
  1462.6336,
  nan,
  nan,
  813.8568,
  613.6513,
  1747.4767,
  nan,
  1737.3546,
  1747.1957,
  nan,
  836.6195,
  673.5382,
  1747.4991,
  nan,
  754.0131,
  1519.9898,
  1438.195,
  851.5094,
  783.5468,
  nan,
  23.7505,
  615.4535,
  1720.91

## Build a dictionary to convert the internal store id to raw store id
   - value  : FAC_NBR
   - key    : Inner store id      

In [9]:
# Column values as plain lists, in row order.
keys = df['facnbr_idx'].tolist()
val_fac_nbr = df['FAC_NBR'].tolist()
val_addr = df['address'].tolist()

# Lookups keyed by internal store id: id -> FAC_NBR and id -> address.
d_fac_nbr = {inner_id: fac_nbr for inner_id, fac_nbr in zip(keys, val_fac_nbr)}
d_fac_nbr_addr = {inner_id: addr for inner_id, addr in zip(keys, val_addr)}

In [19]:
d_fac_nbr[0]

1

## Get k nearest stores
- Build a list of all the distances
   -  store_dist: a list of (inner store id, distance) tuples relative to the target store siid
      -  store_dist is passed to heapq.nsmallest to get the K nearest stores
         - heapq.nsmallest returns the n smallest elements of an iterable, ordered by the given key (here the distance)
      - innerID: inner store id
         - kNeighbors is a list of tuples (innerID, dist), e.g.
            [(iid0, dist0), (iid1, dist1) .... (iid9, dist9)]
            - iid0 has the smallest distance, i.e. it is the store closest to siid
 

In [43]:
# For every store, collect its K nearest stores as rows of
#   [FAC_NBR, neighbour FAC_NBR, distance, rank, neighbour address]
# Rank 0 is the store itself (distance 0.0), so K = 11 yields the store plus
# its 10 nearest neighbours. Stores without coordinates produce no rows.
K = 11
l_knbr = []
for siid, distRow in enumerate(distMatrix):
    # Keep only the stores whose distance to store `siid` is known.
    store_dist = [
        (innerID, dist)
        for innerID, dist in enumerate(distRow)
        if not math.isnan(float(dist))
    ]
    if not store_dist:
        continue

    kNeighbors = heapq.nsmallest(K, store_dist, key=lambda strDist: strDist[1])
    knbr_id = [innerID for innerID, _ in kNeighbors]   # row positions in df
    knbr_dist = [dist for _, dist in kNeighbors]

    # distMatrix rows and columns are positional (same order as df), so raw
    # ids and addresses are looked up by position in val_fac_nbr / val_addr.
    # The facnbr_idx-keyed dicts are only correct if facnbr_idx happens to
    # equal the row position.
    knbr_strid = [val_fac_nbr[x] for x in knbr_id]
    knbr_addr = [val_addr[x] for x in knbr_id]

    rid = val_fac_nbr[siid]
    # Iterate over the neighbours actually found: there may be fewer than K,
    # in which case indexing with range(K) would raise IndexError.
    for dist_rnk, (strid, dist, addr) in enumerate(zip(knbr_strid, knbr_dist, knbr_addr)):
        l_knbr.append([rid, strid, dist, dist_rnk, addr])
      

## Create a dataframe from l_knbr

In [46]:

# One dataframe row per (store, neighbour) record collected in l_knbr.
df_knbr = pd.DataFrame(
    data=l_knbr,
    columns=['FAC_NBR', 'knbr_strid', 'knbr_dist', 'dist_rnk', 'knbr_addr'],
)

In [47]:
df_knbr.head(20)

Unnamed: 0,FAC_NBR,knbr_strid,knbr_dist,dist_rnk,knbr_addr
0,1,1,0.0,0,1340 PATRIOT BLVD GLENVIEW IL 60026
1,1,1014,1.0821,1,1020 WAUKEGAN RD. GLENVIEW IL 60025
2,1,1052,5.1567,2,4125 DUNDEE RD. NORTHBROOK IL 60062
3,1,1028,5.2291,3,1145-55 MT. PROSPECT PLAZA MT. PROSPECT IL 60056
4,1,1101,5.3156,4,1555 LEE ST. DES PLAINES IL 60018
5,1,1163,5.498,5,1919 SKOKIE VALLEY RD. HIGHLAND PARK IL 60035
6,1,1113,5.9151,6,6312 N. NAGLE AVE. CHICAGO IL 60646
7,1,1137,6.0654,7,2748 GREEN BAY RD. EVANSTON IL 60201
8,1,1032,6.368,8,1900 S. CUMBERLAND AVE. PARK RIDGE IL 60068
9,1,1093,8.0538,9,525 CHICAGO AVE. EVANSTON IL 60202


## load the dataframe to a table

In [57]:
# Make sure the neighbour address column holds Python strings.
df_knbr['knbr_addr'] = df_knbr['knbr_addr'].astype(str)

In [60]:
    # Replace table rs_nearst_store with the contents of df_knbr and report
    # the elapsed time.
    tbl_nm = "rs_nearst_store"
    fnc_name = 'Load table'

    start_time = time.time()
    df_knbr.to_sql(name=tbl_nm, con=td_enginex, if_exists='replace', index=False)
    end_time = time.time()

    message = "Function {0} to load table {1} completed.".format(fnc_name, tbl_nm)
    cf_elapse_time(start_time, end_time, message)

    

 Function Load table to load table rs_nearst_store completed. It took 127.388979 seconds - 0hh:2mm:7ss.
 start time: Oct 10 2019 09:07:48  end time:  Oct 10 2019 09:09:56


In [10]:
df.location.isnull().sum()

2714

In [11]:
df.shape


(3429, 5)

In [232]:
l_knbr[0:1]

[[1, 1017, 1.082132327351341, 1, '1414 LOOP 336 W. CONROE TX 77304']]

In [12]:
# Square dataframe view of distMatrix, labelled by FAC_NBR on both axes.
df_matrix = pd.DataFrame(distMatrix, index=val_fac_nbr, columns=val_fac_nbr)

In [13]:
df_matrix.head() 

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,4932,4933,4964,4967,4970,4979,4984,4995,4999,9953
1,0.0,1515.3969,,1438.8518,,,1254.2021,910.8979,1188.9415,,...,723.2867,602.1927,603.7575,,,,,,,1825.6769
2,1515.3969,0.0,,226.2499,,,700.3509,605.4764,701.6488,,...,2227.8388,2077.4211,2081.8431,,,,,,,398.2385
3,,,,,,,,,,,...,,,,,,,,,,
4,1438.8518,226.2499,,0.0,,,842.1355,564.5797,827.6273,,...,2131.098,1969.9663,1975.2583,,,,,,,612.6175
5,,,,,,,,,,,...,,,,,,,,,,


## Create a dataframe from distMatrix
    - There are 3429 columns from the distMatrix
      - Since too many columns, can not save the DF directly from distMatrix to an excel or a table
        - Need to unpivot (melt) each row of length 3429 into 3429 rows and create a dataframe
          -  distMatrix[0] is the row of distances between fac_nbr = 1 and all the stores
          -  distMatrix[0] = [0, 1515.3969, nan, 1438.8518,nan, ....]
             -  1515.3969 is the distance between fac_nbr = 1 and fac_nbr = 2
             -  The address of fac_nbr = 3 is invalid,  the distance between fac_nbr = 1 and fac_nbr = 3 
                would be nan 
                - The address of fac_nbr = 3 is 14800 S E SUNNYSIDE RD Clackamas OR 97015
                  - The valid address of fac_nbr = 3 S.B.  14800 SE SUNNYSIDE RD Clackamas OR 97015 
          -  will create an list with value like
             [[1,1,0], [1,2, 1515.3969], [1,3,nan], [1, 4, 1438.8518],......]
          -  The lists will create an array and create a dataframe which will be loaded into a table
          -  l_str_dis is a list, use extend to add the list to a_str_dis
             - append adds its argument as a single element to the end of a list. The length of the list itself will                  increase by one.
             - extend iterates over its argument adding each element to the list, extending the list. The length of the            list will increase by however many elements were in the iterable argument

In [24]:
# Flatten the square distance matrix into one flat list of
#   [fac_nbr, str_id, distance]
# triples, one per ordered pair of stores.
# distMatrix rows are positional (same order as df), so the row's FAC_NBR is
# taken from val_fac_nbr by position. This is the same list that labels the
# columns, and it avoids shadowing the builtin `id`.
a_str_dis = []
for fac_nbr, l_dis in zip(val_fac_nbr, distMatrix):
    # extend adds each triple as its own element; append would add the whole
    # row as a single nested element.
    a_str_dis.extend([fac_nbr, str_id, dist] for str_id, dist in zip(val_fac_nbr, l_dis))

In [16]:
# Long-format distance table: one row per (fac_nbr, str_id) pair.
df_dist = pd.DataFrame(
    data=a_str_dis,
    columns=['fac_nbr', 'str_id', 'distance'],
)

In [31]:
# Keep only the rows with no missing value in any column.
df_dist_x = df_dist.dropna(axis=0, how='any')

In [32]:
df_dist_x.shape

(511225, 3)

In [34]:

# Replace table rs_store_dist with the contents of df_dist_x and report the
# elapsed time.
tbl_nm = "rs_store_dist"
fnc_name = 'Load matrix table'

start_time = time.time()
df_dist_x.to_sql(name=tbl_nm, con=td_enginex, if_exists='replace', index=False)
end_time = time.time()

message = "Function {0} to load table {1} completed.".format(fnc_name, tbl_nm)
cf_elapse_time(start_time, end_time, message)

 Function Load matrix table to load table rs_store_dist completed. It took 5559.345044 seconds - 1hh:32mm:39ss.
 start time: Oct 15 2019 15:35:50  end time:  Oct 15 2019 17:08:29


In [144]:
df_dist.shape

(11758041, 3)

In [36]:
pwd


'C:\\Users\\syue003\\wip_RecSys'

In [38]:
import pandas
pandas.__path__

['C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas']

In [92]:
distMatrix[0]

[0.0,
 1515.3969,
 nan,
 1438.8518,
 nan,
 nan,
 1254.2021,
 910.8979,
 1188.9415,
 nan,
 nan,
 nan,
 1524.9841,
 1511.6606,
 nan,
 nan,
 nan,
 1116.285,
 1329.7978,
 nan,
 nan,
 1437.8323,
 1328.2048,
 nan,
 nan,
 nan,
 910.2298,
 nan,
 nan,
 891.24,
 nan,
 1206.9245,
 1062.0418,
 nan,
 nan,
 920.4738,
 nan,
 1331.067,
 1515.8737,
 1521.8449,
 nan,
 nan,
 25.0304,
 24.0656,
 990.5019,
 nan,
 29.0291,
 948.811,
 1517.8377,
 943.6534,
 34.9493,
 977.5544,
 878.8068,
 881.5891,
 903.9133,
 20.2054,
 812.8167,
 8.4919,
 50.4853,
 13.2879,
 1014.4444,
 nan,
 nan,
 nan,
 nan,
 1513.4191,
 1519.24,
 1754.7492,
 27.5744,
 1509.4865,
 9.5029,
 907.0901,
 1438.7938,
 1329.1673,
 1462.6336,
 nan,
 nan,
 813.8568,
 613.6513,
 1747.4767,
 nan,
 1737.3546,
 1747.1957,
 nan,
 836.6195,
 673.5382,
 1747.4991,
 nan,
 754.0131,
 1519.9898,
 1438.195,
 851.5094,
 783.5468,
 nan,
 23.7505,
 615.4535,
 1720.9153,
 nan,
 43.4947,
 577.6,
 nan,
 1344.8521,
 1474.876,
 nan,
 890.5663,
 nan,
 nan,
 905.1217,


In [8]:
df.shape

(3429, 4)

In [25]:
distMatrix[0]

[0.0,
 1515.3969,
 nan,
 1438.8518,
 nan,
 nan,
 1254.2021,
 910.8979,
 1188.9415,
 nan,
 nan,
 nan,
 1524.9841,
 1511.6606,
 nan,
 nan,
 nan,
 1116.285,
 1329.7978,
 nan,
 nan,
 1437.8323,
 1328.2048,
 nan,
 nan,
 nan,
 910.2298,
 nan,
 nan,
 891.24,
 nan,
 1206.9245,
 1062.0418,
 nan,
 nan,
 920.4738,
 nan,
 1331.067,
 1515.8737,
 1521.8449,
 nan,
 nan,
 25.0304,
 24.0656,
 990.5019,
 nan,
 29.0291,
 948.811,
 1517.8377,
 943.6534,
 34.9493,
 977.5544,
 878.8068,
 881.5891,
 903.9133,
 20.2054,
 812.8167,
 8.4919,
 50.4853,
 13.2879,
 1014.4444,
 nan,
 nan,
 nan,
 nan,
 1513.4191,
 1519.24,
 1754.7492,
 27.5744,
 1509.4865,
 9.5029,
 907.0901,
 1438.7938,
 1329.1673,
 1462.6336,
 nan,
 nan,
 813.8568,
 613.6513,
 1747.4767,
 nan,
 1737.3546,
 1747.1957,
 nan,
 836.6195,
 673.5382,
 1747.4991,
 nan,
 754.0131,
 1519.9898,
 1438.195,
 851.5094,
 783.5468,
 nan,
 23.7505,
 615.4535,
 1720.9153,
 nan,
 43.4947,
 577.6,
 nan,
 1344.8521,
 1474.876,
 nan,
 890.5663,
 nan,
 nan,
 905.1217,


In [26]:
len(l_point)

3429

In [28]:
l_point[100:200]

['',
 '43.521182, -114.316583',
 '44.024655693727, -116.985470858081',
 '',
 '35.247991, -101.831244',
 '',
 '',
 '39.7402925414353, -104.863000850084',
 '',
 '43.4824539, -112.0160211',
 '39.64018055, -104.790295011917',
 '33.9430653, -118.1387202',
 '39.78621225, -104.808503140611',
 '',
 '',
 '',
 '43.496737, -112.068687',
 '33.5895130232558, -99.2601204186046',
 '40.1023575555556, -75.0287513737374',
 '',
 '',
 '32.510672, -99.744151',
 '',
 '',
 '33.8449489, -118.3536298',
 '43.6181556, -116.272858275256',
 '40.8513049761959, -115.744700226718',
 '43.652449, -116.253135',
 '41.7058956, -87.720382',
 '33.9397633422819, -117.968001107383',
 '',
 '43.5903729246862, -116.313335200837',
 '',
 '43.5899212815534, -116.244909203883',
 '42.389242, -71.117934',
 '',
 '',
 '43.6522825, -116.671174099734',
 '32.427073, -99.5775145',
 '',
 '43.59072035, -116.175995811305',
 '33.7121068, -117.9545398',
 '43.835485, -111.77797',
 '32.7151089, -102.6437775',
 '',
 '33.4885786, -112.081809380963',

In [39]:
knbr_id

[0, 873, 908, 886, 957, 1016, 968, 991, 890, 949, 57]

In [34]:
range(0)

range(0, 0)

In [40]:
knbr_strid 

[1, 1014, 1052, 1028, 1101, 1163, 1113, 1137, 1032, 1093, 74]

In [41]:
knbr_addr 

['1340 PATRIOT BLVD GLENVIEW IL 60026',
 '1020 WAUKEGAN RD. GLENVIEW IL 60025',
 '4125 DUNDEE RD. NORTHBROOK IL 60062',
 '1145-55 MT. PROSPECT PLAZA MT. PROSPECT IL 60056',
 '1555 LEE ST. DES PLAINES IL 60018',
 '1919 SKOKIE VALLEY RD. HIGHLAND PARK IL 60035',
 '6312 N. NAGLE AVE. CHICAGO IL 60646',
 '2748 GREEN BAY RD. EVANSTON IL 60201',
 '1900 S. CUMBERLAND AVE. PARK RIDGE IL 60068',
 '525 CHICAGO AVE. EVANSTON IL 60202',
 '7000 W. FOREST PRESERVE DR. NORRIDGE IL 60176']

In [45]:
l_knbr[0:10]

[[1, 1, 0.0, 0, '1340 PATRIOT BLVD GLENVIEW IL 60026'],
 [1, 1014, 1.0821, 1, '1020 WAUKEGAN RD. GLENVIEW IL 60025'],
 [1, 1052, 5.1567, 2, '4125 DUNDEE RD. NORTHBROOK IL 60062'],
 [1, 1028, 5.2291, 3, '1145-55 MT. PROSPECT PLAZA MT. PROSPECT IL 60056'],
 [1, 1101, 5.3156, 4, '1555 LEE ST. DES PLAINES IL 60018'],
 [1, 1163, 5.498, 5, '1919 SKOKIE VALLEY RD. HIGHLAND PARK IL 60035'],
 [1, 1113, 5.9151, 6, '6312 N. NAGLE AVE. CHICAGO IL 60646'],
 [1, 1137, 6.0654, 7, '2748 GREEN BAY RD. EVANSTON IL 60201'],
 [1, 1032, 6.368, 8, '1900 S. CUMBERLAND AVE. PARK RIDGE IL 60068'],
 [1, 1093, 8.0538, 9, '525 CHICAGO AVE. EVANSTON IL 60202']]

In [50]:
df_knbr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7865 entries, 0 to 7864
Data columns (total 5 columns):
FAC_NBR       7865 non-null int64
knbr_strid    7865 non-null int64
knbr_dist     7865 non-null float64
dist_rnk      7865 non-null int64
knbr_addr     7865 non-null object
dtypes: float64(1), int64(3), object(1)
memory usage: 307.3+ KB


In [67]:
df_knbr.dtypes

FAC_NBR         int64
knbr_strid      int64
knbr_dist     float64
dist_rnk        int64
knbr_addr      object
dtype: object

In [66]:
# np.str was only an alias of the builtin str; it was deprecated in NumPy 1.20
# and removed in 1.24, so use the builtin directly.
df_knbr.knbr_addr = df_knbr.knbr_addr.astype(str)

In [64]:
# '|S' is NumPy's byte-string dtype code, so this casts the column to bytes.
# NOTE(review): presumably a scratch experiment on the column type for the
# table load - confirm before reusing.
df_knbr.knbr_addr = df_knbr.knbr_addr.astype('|S')

In [87]:
# Scratch variant of the flattening loop that uses append instead of extend:
# each store's list of [fac_nbr, str_id, distance] triples is added as ONE
# element, so a_str_dis becomes a list of per-store lists (nested).
a_str_dis = []
str_cnt = len(df)
for siid in range(str_cnt): 
    id = d_fac_nbr[siid]  # NOTE: rebinds the builtin name `id` in the notebook namespace
    l_id  = [id for i in range(str_cnt)]  # the row's FAC_NBR repeated once per column
    l_dis = distMatrix[siid]
    l_str_dis_tuple = list(zip(l_id, val_fac_nbr, l_dis))
    l_str_dis = [list(x) for x in l_str_dis_tuple]  # tuples -> lists
    a_str_dis.append(l_str_dis) 

In [70]:
id = 1
l_id = [id for i in range(len(l_point))]

In [81]:
l_dis = distMatrix[0]
l_str_dis_tuple = list(zip(l_id, val_fac_nbr, l_dis))
#z_str_dis  = zip(l_id, val_fac_nbr, l_dis) 
#l_str_dis = list(itertools.chain(*z_str_dis))

In [90]:
len(a_str_dis[0]) 

3429

In [84]:
l_str_dis = [list(x) for x in l_str_dis_tuple]

In [96]:
a_str_dis[0]

[[1, 1, 0.0],
 [1, 2, 1515.3969],
 [1, 3, nan],
 [1, 4, 1438.8518],
 [1, 5, nan],
 [1, 6, nan],
 [1, 7, 1254.2021],
 [1, 8, 910.8979],
 [1, 9, 1188.9415],
 [1, 10, nan],
 [1, 11, nan],
 [1, 12, nan],
 [1, 13, 1524.9841],
 [1, 14, 1511.6606],
 [1, 15, nan],
 [1, 17, nan],
 [1, 18, nan],
 [1, 19, 1116.285],
 [1, 20, 1329.7978],
 [1, 21, nan],
 [1, 22, nan],
 [1, 23, 1437.8323],
 [1, 24, 1328.2048],
 [1, 25, nan],
 [1, 27, nan],
 [1, 28, nan],
 [1, 29, 910.2298],
 [1, 30, nan],
 [1, 33, nan],
 [1, 34, 891.24],
 [1, 35, nan],
 [1, 37, 1206.9245],
 [1, 38, 1062.0418],
 [1, 39, nan],
 [1, 40, nan],
 [1, 41, 920.4738],
 [1, 42, nan],
 [1, 43, 1331.067],
 [1, 45, 1515.8737],
 [1, 46, 1521.8449],
 [1, 47, nan],
 [1, 52, nan],
 [1, 54, 25.0304],
 [1, 56, 24.0656],
 [1, 57, 990.5019],
 [1, 58, nan],
 [1, 59, 29.0291],
 [1, 60, 948.811],
 [1, 61, 1517.8377],
 [1, 62, 943.6534],
 [1, 63, 34.9493],
 [1, 64, 977.5544],
 [1, 65, 878.8068],
 [1, 66, 881.5891],
 [1, 67, 903.9133],
 [1, 68, 20.2054],
 [1

In [97]:
df_dist = pd.DataFrame(a_str_dis)
#(, columns =['fac_nbr', 'str_id', 'distance']) 

In [98]:
df_dist.shape

(3429, 3429)

In [99]:
df_dist.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3419,3420,3421,3422,3423,3424,3425,3426,3427,3428
0,"[1, 1, 0.0]","[1, 2, 1515.3969]","[1, 3, nan]","[1, 4, 1438.8518]","[1, 5, nan]","[1, 6, nan]","[1, 7, 1254.2021]","[1, 8, 910.8979]","[1, 9, 1188.9415]","[1, 10, nan]",...,"[1, 4932, 723.2867]","[1, 4933, 602.1927]","[1, 4964, 603.7575]","[1, 4967, nan]","[1, 4970, nan]","[1, 4979, nan]","[1, 4984, nan]","[1, 4995, nan]","[1, 4999, nan]","[1, 9953, 1825.6769]"
1,"[2, 1, 1515.3969]","[2, 2, 0.0]","[2, 3, nan]","[2, 4, 226.2499]","[2, 5, nan]","[2, 6, nan]","[2, 7, 700.3509]","[2, 8, 605.4764]","[2, 9, 701.6488]","[2, 10, nan]",...,"[2, 4932, 2227.8388]","[2, 4933, 2077.4211]","[2, 4964, 2081.8431]","[2, 4967, nan]","[2, 4970, nan]","[2, 4979, nan]","[2, 4984, nan]","[2, 4995, nan]","[2, 4999, nan]","[2, 9953, 398.2385]"
2,"[3, 1, nan]","[3, 2, nan]","[3, 3, nan]","[3, 4, nan]","[3, 5, nan]","[3, 6, nan]","[3, 7, nan]","[3, 8, nan]","[3, 9, nan]","[3, 10, nan]",...,"[3, 4932, nan]","[3, 4933, nan]","[3, 4964, nan]","[3, 4967, nan]","[3, 4970, nan]","[3, 4979, nan]","[3, 4984, nan]","[3, 4995, nan]","[3, 4999, nan]","[3, 9953, nan]"
3,"[4, 1, 1438.8518]","[4, 2, 226.2499]","[4, 3, nan]","[4, 4, 0.0]","[4, 5, nan]","[4, 6, nan]","[4, 7, 842.1355]","[4, 8, 564.5797]","[4, 9, 827.6273]","[4, 10, nan]",...,"[4, 4932, 2131.098]","[4, 4933, 1969.9663]","[4, 4964, 1975.2583]","[4, 4967, nan]","[4, 4970, nan]","[4, 4979, nan]","[4, 4984, nan]","[4, 4995, nan]","[4, 4999, nan]","[4, 9953, 612.6175]"
4,"[5, 1, nan]","[5, 2, nan]","[5, 3, nan]","[5, 4, nan]","[5, 5, nan]","[5, 6, nan]","[5, 7, nan]","[5, 8, nan]","[5, 9, nan]","[5, 10, nan]",...,"[5, 4932, nan]","[5, 4933, nan]","[5, 4964, nan]","[5, 4967, nan]","[5, 4970, nan]","[5, 4979, nan]","[5, 4984, nan]","[5, 4995, nan]","[5, 4999, nan]","[5, 9953, nan]"


In [127]:
# case 1
# Scratch experiment: append the zipped triples of each store as one element,
# keeping them as tuples (the string below records the shape of the result).
""" result 
[[(1, 1, 0.0),
  (1, 2, 1515.3969),
"""  
a_str_dis=[]
str_cnt = len(df)

for siid in range(str_cnt): 
    id = d_fac_nbr[siid]  # NOTE: rebinds the builtin name `id` in the notebook namespace
    l_id  = [id for i in range(str_cnt)]  # the row's FAC_NBR repeated once per column
    l_dis = distMatrix[siid]
    # One nested list of (fac_nbr, str_id, distance) tuples per store.
    a_str_dis.append( list(list(zip(l_id, val_fac_nbr, l_dis))))


In [129]:
# case2
# Scratch experiment: same as case 1 but each triple is converted to a list,
# and only the first store is processed (str_cnt = 1).
"""
[[[1, 1, 0.0],
  [1, 2, 1515.3969],
  [1, 3, nan],
  [1, 4, 1438.8518],
  [1, 5, nan],

"""
a_str_dis=[]
str_cnt = 1              # number of stores (rows) to process
str_cnt1 = len(df)       # number of columns per row
for siid in range(str_cnt): 
    id = d_fac_nbr[siid]  # NOTE: rebinds the builtin name `id` in the notebook namespace
    l_id  = [id for i in range(str_cnt1)]
    l_dis = distMatrix[siid]
    l_str_dis_tuple = list(zip(l_id, val_fac_nbr, l_dis))
    # append keeps the store's triples together as one nested element.
    a_str_dis.append([list(x) for x in l_str_dis_tuple])
 

In [131]:
# Scratch experiment: the extend variant, limited to the first store
# (str_cnt = 1). extend adds every [fac_nbr, str_id, distance] triple as its
# own element, giving a flat list.
a_str_dis=[]
str_cnt = 1              # number of stores (rows) to process
str_cnt1 = len(df)       # number of columns per row
for siid in range(str_cnt): 
    id = d_fac_nbr[siid]  # NOTE: rebinds the builtin name `id` in the notebook namespace
    l_id  = [id for i in range(str_cnt1)]
    l_dis = distMatrix[siid]
    l_str_dis_tuple = list(zip(l_id, val_fac_nbr, l_dis))
    l_str_dis = [list(x) for x in l_str_dis_tuple]  # tuples -> lists
    a_str_dis.extend(l_str_dis) 

In [124]:
l_str_dis_tuple

[(1, 1, 0.0)]

In [103]:
type(a_str_dis[0][0])

list

In [132]:
a_str_dis[0:3] 

[[1, 1, 0.0], [1, 2, 1515.3969], [1, 3, nan]]

In [113]:
df_dist = pd.DataFrame(a_str_dis[0:3])

In [114]:
df_dist.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3419,3420,3421,3422,3423,3424,3425,3426,3427,3428
0,"[1, 1, 0.0]","[1, 2, 1515.3969]","[1, 3, nan]","[1, 4, 1438.8518]","[1, 5, nan]","[1, 6, nan]","[1, 7, 1254.2021]","[1, 8, 910.8979]","[1, 9, 1188.9415]","[1, 10, nan]",...,"[1, 4932, 723.2867]","[1, 4933, 602.1927]","[1, 4964, 603.7575]","[1, 4967, nan]","[1, 4970, nan]","[1, 4979, nan]","[1, 4984, nan]","[1, 4995, nan]","[1, 4999, nan]","[1, 9953, 1825.6769]"
1,"[2, 1, 1515.3969]","[2, 2, 0.0]","[2, 3, nan]","[2, 4, 226.2499]","[2, 5, nan]","[2, 6, nan]","[2, 7, 700.3509]","[2, 8, 605.4764]","[2, 9, 701.6488]","[2, 10, nan]",...,"[2, 4932, 2227.8388]","[2, 4933, 2077.4211]","[2, 4964, 2081.8431]","[2, 4967, nan]","[2, 4970, nan]","[2, 4979, nan]","[2, 4984, nan]","[2, 4995, nan]","[2, 4999, nan]","[2, 9953, 398.2385]"
2,"[3, 1, nan]","[3, 2, nan]","[3, 3, nan]","[3, 4, nan]","[3, 5, nan]","[3, 6, nan]","[3, 7, nan]","[3, 8, nan]","[3, 9, nan]","[3, 10, nan]",...,"[3, 4932, nan]","[3, 4933, nan]","[3, 4964, nan]","[3, 4967, nan]","[3, 4970, nan]","[3, 4979, nan]","[3, 4984, nan]","[3, 4995, nan]","[3, 4999, nan]","[3, 9953, nan]"


In [119]:
l_str_dis_tuple

[(4999, 1, 1825.6769),
 (4999, 2, 398.2385),
 (4999, 3, nan),
 (4999, 4, 612.6175),
 (4999, 5, nan),
 (4999, 6, nan),
 (4999, 7, 750.2526),
 (4999, 8, 928.3711),
 (4999, 9, 785.6855),
 (4999, 10, nan),
 (4999, 11, nan),
 (4999, 12, nan),
 (4999, 13, 382.459),
 (4999, 14, 403.1102),
 (4999, 15, nan),
 (4999, 17, nan),
 (4999, 18, nan),
 (4999, 19, 907.8385),
 (4999, 20, 750.0924),
 (4999, 21, nan),
 (4999, 22, nan),
 (4999, 23, 646.3279),
 (4999, 24, 752.0524),
 (4999, 25, nan),
 (4999, 27, nan),
 (4999, 28, nan),
 (4999, 29, 927.7882),
 (4999, 30, nan),
 (4999, 33, nan),
 (4999, 34, 1079.8451),
 (4999, 35, nan),
 (4999, 37, 866.0897),
 (4999, 38, 886.9749),
 (4999, 39, nan),
 (4999, 40, nan),
 (4999, 41, 916.4524),
 (4999, 42, nan),
 (4999, 43, 751.4245),
 (4999, 45, 391.3062),
 (4999, 46, 385.0551),
 (4999, 47, nan),
 (4999, 52, nan),
 (4999, 54, 1810.745),
 (4999, 56, 1813.2452),
 (4999, 57, 2781.9296),
 (4999, 58, nan),
 (4999, 59, 1807.5536),
 (4999, 60, 892.4953),
 (4999, 61, 385.

In [133]:
df_dist = pd.DataFrame(a_str_dis, columns =['fac_nbr', 'str_id', 'distance']) 

In [134]:
df_dist .head(30)

Unnamed: 0,fac_nbr,str_id,distance
0,1,1,0.0
1,1,2,1515.3969
2,1,3,
3,1,4,1438.8518
4,1,5,
5,1,6,
6,1,7,1254.2021
7,1,8,910.8979
8,1,9,1188.9415
9,1,10,


In [141]:
df_dist.shape 

(3429, 3)

In [142]:
len(a_str_dis)

11758041

In [20]:
len(l_dis)

3429

In [22]:
df_dist.head(30)

Unnamed: 0,fac_nbr,str_id,distance
0,1,1,0.0
1,1,2,1515.3969
2,1,3,
3,1,4,1438.8518
4,1,5,
5,1,6,
6,1,7,1254.2021
7,1,8,910.8979
8,1,9,1188.9415
9,1,10,
