In [94]:
#import local files
from geo_address.processing import split_address
from geo_address.processing import begin_geocode
from geo_address.distances import address_distance
#import pandas for dataframe 
import pandas as pd 
import numpy as np


In [95]:
# Load the publications table and keep a small random sample for the demo.
# numpy's global seed is set first so the 5 sampled rows are repeatable
# (presumably DataFrame.sample draws from numpy's global RNG when no
# random_state is passed -- confirm against the pandas docs).
np.random.seed(123)
df = pd.read_csv('path/to/data.csv').sample(n=5)  # replace with your data path
df.loc[:, ['DOI', 'Addresses']].head()

Unnamed: 0,DOI,Addresses
2133,10.1021/jp402440u,"[Chavan, Sachin; Bonino, Francesca; Civalleri,..."
2652,10.1016/j.actbio.2012.12.008,"[Munoz Noval, A.; Torres Costa, V.; Martin Pal..."
552,10.1021/es403368j,"[Servin, Alia D.; Morales, Maria Isabel; Heman..."
3401,10.1002/macp.201100507,"[Gamys, Ce Guinto; Beyou, Emmanuel; David, Lau..."
4597,10.1021/bi201171h,"[Goncalves, Susana; Esteves, Ana M.; Santos, H..."


In [96]:
# Each record packs several affiliations into one string, formatted as
# "[Authors] Address; [Authors] Address; ..."
df.iloc[0]['Addresses']

'[Chavan, Sachin; Bonino, Francesca; Civalleri, Bartolomeo; Lamberti, Carlo; Bordiga, Silvia] Univ Turin, NIS Ctr Excellence, Dept Chem, I-10135 Turin, Italy; [Chavan, Sachin; Bonino, Francesca; Civalleri, Bartolomeo; Lamberti, Carlo; Bordiga, Silvia] Univ Turin, INSTM Reference Ctr, I-10135 Turin, Italy; [Valenzano, Loredana] Michigan Technol Univ, Dept Chem, Houghton, MI 49931 USA; [Acerbi, Nadia] Johnson Matthey Technol Ctr, Reading RG4 9NH, Berks, England; [Cavka, Jasmina H.] SINTEF Mat & Chem, N-0373 Oslo, Norway; [Leistner, Matthias] IWS, Fraunhofer Inst Mat & Beam Technol, Dept Chem Surface & React Technol, D-01277 Dresden, Germany'

In [97]:
# split_address separates the author groups from their addresses and adds
# the Multiplier / Num_Addresses columns shown in the output.
df_addresses = split_address(df, 'Addresses')
df_addresses.iloc[:1]

Processing row 4: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1553.45it/s]


Unnamed: 0,Author_names,Addresses,Multiplier,Num_Addresses,DOI,Year
0,"[[Chavan, Sachin; Bonino, Francesca; Civalleri...",Univ Turin NIS Ctr Excellence Dept Chem I-1013...,"[5, 5, 1, 1, 1, 1]",6,10.1021/jp402440u,2013.0


In [98]:
# Author_names keeps the bracketed author groups, one entry per address, so
# they can be checked against Multiplier ([5, 5, 1, 1, 1, 1] in this example).
# Here the same five authors are listed under two Univ Turin affiliations.
df_addresses['Author_names'][0]

['[Chavan, Sachin; Bonino, Francesca; Civalleri, Bartolomeo; Lamberti, Carlo; Bordiga, Silvia]',
 '[Chavan, Sachin; Bonino, Francesca; Civalleri, Bartolomeo; Lamberti, Carlo; Bordiga, Silvia]',
 '[Valenzano, Loredana]',
 '[Acerbi, Nadia]',
 '[Cavka, Jasmina H.]',
 '[Leistner, Matthias]']

In [99]:
# Addresses of the first publication with the author groups removed,
# joined into a single ';'-separated string.
df_addresses['Addresses'][0]

'Univ Turin NIS Ctr Excellence Dept Chem I-10135 Turin Italy;Univ Turin INSTM Reference Ctr I-10135 Turin Italy;Michigan Technol Univ Dept Chem Houghton MI 49931 USA;Johnson Matthey Technol Ctr Reading RG4 9NH Berks England;SINTEF Mat & Chem N-0373 Oslo Norway;IWS Fraunhofer Inst Mat & Beam Technol Dept Chem Surface & React Technol D-01277 Dresden Germany'

**Note**: Remember to add your API key to the text file if you're using GoogleV3, as in the article.
- If you use another API, consult the geopy documentation for the requirements. 
- Not providing a valid API Key for GoogleV3 will result in an error, such as: *GeocoderQueryError: The provided API key is invalid.*

In [100]:
# begin_geocode() separates and geocodes each address of every row (per the
# original note it also creates a checkpoint file). The run below reports using
# geopy's GoogleV3 geocoder, which needs a valid API key.
# The result is kept in `coords` and also stored as a new 'coords' column.
coords = begin_geocode(df_addresses)
df_addresses['coords'] = coords

Using <geopy.geocoders.google.GoogleV3 object at 0x000001A7771DA2F0> geocoder from geopy
Preparing address field for geocoding...


Geocoding row 4 of 5: 100%|██████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.43it/s]


In [101]:
# Coordinates for the first publication: one (lat, lon) pair per address.
# Useful for maps and/or spatial statistics.
df_addresses.iloc[0]['coords']

[(45.0358565, 7.6695071),
 (45.0358565, 7.6695071),
 (47.1196551, -88.5482234),
 (51.5201606, -0.9715678),
 (59.9451407, 10.7122286),
 (51.0296177, 13.7825461)]

In [102]:
# Distances can be computed between a publication's addresses and from each
# address to a reference point given as (lat, lon); this origin is the ESRF.
origin = 45.2089892, 5.692755
df_addresses = address_distance(df_addresses, origin)

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 523.63it/s]


In [103]:
# address_distance added the per-address distance to the origin (the ESRF in
# this example), the mean distance between addresses, and the mean distance
# to the origin, all in kms.
df_addresses.loc[:, [
    'DOI',
    'Addresses',
    'distance_to_facility',
    'mean_distance_between_addresses',
    'mean_distance_to_facility',
]]

Unnamed: 0,DOI,Addresses,distance_to_facility,mean_distance_between_addresses,mean_distance_to_facility
0,10.1021/jp402440u,Univ Turin NIS Ctr Excellence Dept Chem I-1013...,"[156.28, 156.28, 6784.54, 856.48, 1672.14, 881...",2853.728412,1751.268333
1,10.1016/j.actbio.2012.12.008,Univ Autonoma Madrid Dept Fis Aplicada E-28049...,"[923.08, 931.6, 927.03, 923.08, 932.53, 922.52]",7.631725,926.64
2,10.1021/es403368j,Univ Texas El Paso Dept Chem El Paso TX 79968 ...,"[9065.24, 9065.24, 9065.24, 9065.24, 9060.86, ...",3027.431289,7553.636667
3,10.1002/macp.201100507,Univ Lyon 1 Ingenierie Mat Polymeres CNRS UMR5...,"[88.5, 88.5]",0.0,88.5
4,10.1021/bi201171h,Univ Nova Lisboa ITQB P-2781901 Oeiras Portugal,[1434.07],,1434.07


In [104]:
#example result: print the distances computed for one randomly chosen publication
import random
# random.randint is inclusive on both ends, so the upper bound is derived from
# the frame length instead of a hard-coded 4; the cell keeps working (and can
# reach every row) when the sample size changes.
n = random.randint(0, len(df_addresses) - 1) #choose a random DOI
row = df_addresses.iloc[n]  # look the row up once instead of once per print
print('DOI: {} \n'.format(row.DOI))
# Pair each ';'-separated address with its distance to the origin.
print('Distances by address: {}\n'.format(list(zip(row.Addresses.split(';'), row.distance_to_facility))))
print('Mean distance to facility: {} kms\n'.format(np.round(row.mean_distance_to_facility)))
# NOTE: the table above shows no mean distance between addresses for the
# single-address publication, so this line prints "nan kms" for such rows.
print('Mean distance between addresses: {} kms\n'.format(np.round(row.mean_distance_between_addresses)))

DOI: 10.1016/j.actbio.2012.12.008 

Distances by address: [('Univ Autonoma Madrid Dept Fis Aplicada E-28049 Madrid Spain', 923.08), ('Univ Politecn Madrid ETSIT Grp Bioingn & Telemed E-28040 Madrid Spain', 931.6), ('Hosp Univ Ramon y Cajal Dept Oftalmol Madrid Spain', 927.03), ('Univ Autonoma Madrid Dept Biol Mol E-28049 Madrid Spain', 923.08), ('CSIC Inst Ceram & Vidrio Madrid Spain', 932.53), ('IMDEA Nanociencia Madrid Spain', 922.52)]

Mean distance to facility: 927.0 kms

Mean distance between addresses: 8.0 kms

