# Example
This example mirrors the methodology of "Title" (add DOI here). This is a static HTML version; to see the interactive maps below, open the notebook **[here](https://nbviewer.org/github/soderstromkr/geoaddress/blob/main/example.ipynb?flush_cache=true)**.

In [1]:
#geo_address functions
from geo_address.processing import split_address, begin_geocode, to_coord_data
from geo_address.distances import address_distance
#pandas for dataframe and numpy for random seed
import pandas as pd 
import numpy as np
#plotly for mapping
import plotly.express as px

In [2]:
# Load the publication records and keep a random sample of 5 of them.
# The global NumPy seed is fixed first so that .sample() picks the same rows on every run.
np.random.seed(123)
df = (
    pd.read_csv('../path/to/data.csv')  # replace with your data path
    .sample(n=5)
)
df.loc[:, ['DOI', 'Addresses']].head()

Unnamed: 0,DOI,Addresses
2133,10.1021/jp402440u,"[Chavan, Sachin; Bonino, Francesca; Civalleri,..."
2652,10.1016/j.actbio.2012.12.008,"[Munoz Noval, A.; Torres Costa, V.; Martin Pal..."
552,10.1021/es403368j,"[Servin, Alia D.; Morales, Maria Isabel; Heman..."
3401,10.1002/macp.201100507,"[Gamys, Ce Guinto; Beyou, Emmanuel; David, Lau..."
4597,10.1021/bi201171h,"[Goncalves, Susana; Esteves, Ana M.; Santos, H..."


In [3]:
# Each record holds several addresses in the format "[Authors] Address;"
df['Addresses'].iat[0]

'[Chavan, Sachin; Bonino, Francesca; Civalleri, Bartolomeo; Lamberti, Carlo; Bordiga, Silvia] Univ Turin, NIS Ctr Excellence, Dept Chem, I-10135 Turin, Italy; [Chavan, Sachin; Bonino, Francesca; Civalleri, Bartolomeo; Lamberti, Carlo; Bordiga, Silvia] Univ Turin, INSTM Reference Ctr, I-10135 Turin, Italy; [Valenzano, Loredana] Michigan Technol Univ, Dept Chem, Houghton, MI 49931 USA; [Acerbi, Nadia] Johnson Matthey Technol Ctr, Reading RG4 9NH, Berks, England; [Cavka, Jasmina H.] SINTEF Mat & Chem, N-0373 Oslo, Norway; [Leistner, Matthias] IWS, Fraunhofer Inst Mat & Beam Technol, Dept Chem Surface & React Technol, D-01277 Dresden, Germany'

In [4]:
# split_address() splits the combined address string and adds the Multiplier column
df_addresses = split_address(df, 'Addresses')
df_addresses.iloc[:1]

Processing row 4: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1666.79it/s]


Unnamed: 0,Author_names,Addresses,Multiplier,Num_Addresses,DOI,Year
0,"[[Chavan, Sachin; Bonino, Francesca; Civalleri...",Univ Turin NIS Ctr Excellence Dept Chem I-1013...,"[5, 5, 1, 1, 1, 1]",6,10.1021/jp402440u,2013.0


In [5]:
# Author_names keeps the original bracketed author groups so they can be validated
# against Multiplier ([5, 5, 1, 1, 1, 1] for this row).
# Here the same five-author group is listed under two different addresses.
df_addresses.loc[0, 'Author_names']

['[Chavan, Sachin; Bonino, Francesca; Civalleri, Bartolomeo; Lamberti, Carlo; Bordiga, Silvia]',
 '[Chavan, Sachin; Bonino, Francesca; Civalleri, Bartolomeo; Lamberti, Carlo; Bordiga, Silvia]',
 '[Valenzano, Loredana]',
 '[Acerbi, Nadia]',
 '[Cavka, Jasmina H.]',
 '[Leistner, Matthias]']

In [6]:
# Addresses of the first publication, author groups removed, joined with ';'
df_addresses.loc[0, 'Addresses']

'Univ Turin NIS Ctr Excellence Dept Chem I-10135 Turin Italy;Univ Turin INSTM Reference Ctr I-10135 Turin Italy;Michigan Technol Univ Dept Chem Houghton MI 49931 USA;Johnson Matthey Technol Ctr Reading RG4 9NH Berks England;SINTEF Mat & Chem N-0373 Oslo Norway;IWS Fraunhofer Inst Mat & Beam Technol Dept Chem Surface & React Technol D-01277 Dresden Germany'

In [7]:
# begin_geocode() separates and geocodes each address and creates a checkpoint file.
# In the recorded run it reported the API domain maps.googleapis.com and started from
# scratch because no checkpoint was found.
# NOTE(review): presumably requires network access and API credentials — confirm in the package docs.
coords = begin_geocode(df_addresses)
# Store the geocoding result as a new column; in the recorded output each entry is a
# list of (lat, lon) tuples, one per address of the publication.
df_addresses['coords'] = coords

API domain: maps.googleapis.com
No checkpoint found, starting from scratch!


Geocoding row 4 of 5: 100%|██████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  5.03it/s]

Done!





In [8]:
# Coordinates of the first publication: one tuple per address,
# useful for maps and/or spatial statistics
df_addresses['coords'].iat[0]

[(45.0358565, 7.6695071),
 (45.0358565, 7.6695071),
 (47.1196551, -88.5482234),
 (51.5201606, -0.9715678),
 (59.9451407, 10.7122286),
 (51.0296177, 13.7825461)]

In [9]:
# Distances can be computed between the addresses and to a reference point,
# given as (lat, lon) coordinates
origin_lat, origin_lon = 45.2089892, 5.692755
origin = (origin_lat, origin_lon)
df_addresses = address_distance(df_addresses, origin)

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 384.64it/s]


In [10]:
# address_distance() added the distance between coordinates, the per-address distance
# to the origin and the mean distance to the origin (all in km).
# In this example the origin is the ESRF.
distance_columns = [
    'DOI',
    'Addresses',
    'distance_to_facility',
    'mean_distance_between_addresses',
    'mean_distance_to_facility',
]
df_addresses[distance_columns]

Unnamed: 0,DOI,Addresses,distance_to_facility,mean_distance_between_addresses,mean_distance_to_facility
0,10.1021/jp402440u,Univ Turin NIS Ctr Excellence Dept Chem I-1013...,"[156.28, 156.28, 6784.54, 856.48, 1672.14, 881...",2853.728412,1751.268333
1,10.1016/j.actbio.2012.12.008,Univ Autonoma Madrid Dept Fis Aplicada E-28049...,"[923.08, 931.6, 927.03, 923.08, 932.53, 922.52]",7.631725,926.64
2,10.1021/es403368j,Univ Texas El Paso Dept Chem El Paso TX 79968 ...,"[9065.24, 9065.24, 9065.24, 9065.24, 9060.86, ...",3027.431289,7553.636667
3,10.1002/macp.201100507,Univ Lyon 1 Ingenierie Mat Polymeres CNRS UMR5...,"[88.5, 88.5]",0.0,88.5
4,10.1021/bi201171h,Univ Nova Lisboa ITQB P-2781901 Oeiras Portugal,[1434.07],,1434.07


In [11]:
#example result
import random
# choose a random DOI; the bound follows the number of rows instead of a hard-coded 4,
# so the cell keeps working when the sample size changes
n = random.randrange(len(df_addresses))
example_row = df_addresses.iloc[n]  # look the row up once instead of once per print
# pair every address with its distance to the origin
address_distances = list(zip(example_row.Addresses.split(';'), example_row.distance_to_facility))
print('DOI: {} \n'.format(example_row.DOI))
print('Distances by address: {}\n'.format(address_distances))
print('Mean distance to facility: {} kms\n'.format(np.round(example_row.mean_distance_to_facility)))
print('Mean distance between addresses: {} kms\n'.format(np.round(example_row.mean_distance_between_addresses)))

DOI: 10.1021/jp402440u 

Distances by address: [('Univ Turin NIS Ctr Excellence Dept Chem I-10135 Turin Italy', 156.28), ('Univ Turin INSTM Reference Ctr I-10135 Turin Italy', 156.28), ('Michigan Technol Univ Dept Chem Houghton MI 49931 USA', 6784.54), ('Johnson Matthey Technol Ctr Reading RG4 9NH Berks England', 856.48), ('SINTEF Mat & Chem N-0373 Oslo Norway', 1672.14), ('IWS Fraunhofer Inst Mat & Beam Technol Dept Chem Surface & React Technol D-01277 Dresden Germany', 881.89)]

Mean distance to facility: 1751.0 kms

Mean distance between addresses: 2854.0 kms



# Mapping
This step disaggregates the addresses of each publication into one row per address.

In [13]:
# to_coord_data() returns one row per address (see the table below)
# NOTE(review): in the recorded output mean_distance_to_facility is 1434.07 for
# 10.1021/jp402440u, while the per-publication table earlier shows 1751.27 — confirm
# that to_coord_data() carries this column over correctly.
map_df = to_coord_data(df_addresses)
map_df.head(5)

Unnamed: 0,Address,Multiplier,Num_Addresses,DOI,Year,Coordinates,mean_distance_between_addresses,mean_distance_to_facility,distance_to_facility,Fractional pubs
0,Univ Turin NIS Ctr Excellence Dept Chem I-1013...,5,6,10.1021/jp402440u,2013.0,"45.0358565,7.6695071",2853.728412,1434.07,156.28,0.166667
1,Univ Turin INSTM Reference Ctr I-10135 Turin I...,5,6,10.1021/jp402440u,2013.0,"45.0358565,7.6695071",2853.728412,1434.07,156.28,0.166667
2,Michigan Technol Univ Dept Chem Houghton MI 49...,1,6,10.1021/jp402440u,2013.0,"47.1196551,-88.5482234",2853.728412,1434.07,6784.54,0.166667
3,Johnson Matthey Technol Ctr Reading RG4 9NH Be...,1,6,10.1021/jp402440u,2013.0,"51.5201606,-0.9715678",2853.728412,1434.07,856.48,0.166667
4,SINTEF Mat & Chem N-0373 Oslo Norway,1,6,10.1021/jp402440u,2013.0,"59.9451407,10.7122286",2853.728412,1434.07,1672.14,0.166667


In [14]:
#separate coordinates into numeric lat/lon columns
# Split the "lat,lon" string once and convert both parts to numbers, so the map and
# any later sorting work on numeric values rather than on strings; values that cannot
# be parsed become NaN instead of raising.
lat_lon = map_df['Coordinates'].str.split(',', expand=True).apply(pd.to_numeric, errors='coerce')
map_df['lat'] = lat_lon[0]
map_df['lon'] = lat_lon[1]

# Visualisation
This step aggregates the spatial data by individual coordinates.

In [63]:
# Aggregate the per-address rows to one row per unique coordinate string.
# A single groupby pass replaces the separate groupby/sort/merge steps.
coord_groups = map_df.groupby('Coordinates')

df_n = coord_groups.agg({
    'Address': 'first',               # first address seen at this coordinate (used as map label)
    'Multiplier': 'sum',              # summed Multiplier over all rows at this coordinate
    'Fractional pubs': 'sum',         # summed fractional publications
    'lat': 'first',
    'lon': 'first',
    'distance_to_facility': 'first',
})
# Counts = number of address rows that share the coordinate
df_n.insert(1, 'Counts', coord_groups.size())

# most frequent coordinates first; ties ordered by the coordinate string
df_n = (
    df_n.reset_index()
    .sort_values(['Counts', 'Coordinates'], ascending=[False, True])
    .reset_index(drop=True)
)
# column layout: Address, Counts, Multiplier, Fractional pubs, Coordinates, lat, lon, distance_to_facility
df_n.insert(4, 'Coordinates', df_n.pop('Coordinates'))

df_n

Unnamed: 0,Address,Counts,Multiplier,Fractional pubs,Coordinates,lat,lon,distance_to_facility
0,Univ Texas El Paso Dept Chem El Paso TX 79968 USA,4,13,0.666667,"31.77016069999999,-106.5047596",31.77016069999999,-106.5047596,9065.24
1,Univ Autonoma Madrid Dept Fis Aplicada E-28049...,2,6,0.333333,"40.5466983,-3.6943619",40.5466983,-3.6943619,923.08
2,Univ Turin NIS Ctr Excellence Dept Chem I-1013...,2,10,0.333333,"45.0358565,7.6695071",45.0358565,7.6695071,156.28
3,Univ Lyon 1 Ingenierie Mat Polymeres CNRS UMR5...,2,5,1.0,"45.771944,4.8901709",45.771944,4.8901709,88.5
4,Univ Texas El Paso UC Ctr Environm Implicat Na...,1,3,0.166667,"31.6691879,-106.2995104",31.6691879,-106.2995104,9060.86
5,Univ Nova Lisboa ITQB P-2781901 Oeiras Portugal,1,5,1.0,"38.695871,-9.3219645",38.695871,-9.3219645,1434.07
6,CSIC Inst Ceram & Vidrio Madrid Spain,1,1,0.166667,"40.4167754,-3.7037902",40.4167754,-3.7037902,932.53
7,Univ Politecn Madrid ETSIT Grp Bioingn & Telem...,1,3,0.166667,"40.4525869,-3.7264339",40.4525869,-3.7264339,931.6
8,Hosp Univ Ramon y Cajal Dept Oftalmol Madrid S...,1,3,0.166667,"40.4885412,-3.6946589",40.4885412,-3.6946589,927.03
9,IMDEA Nanociencia Madrid Spain,1,1,0.166667,"40.5485421,-3.6879105",40.5485421,-3.6879105,922.52


In [88]:
def plot_fig(df, z=None, animation_frame=None, description=None):
    """Print a short caption and display a density map of ``df``.

    Parameters
    ----------
    df : DataFrame
        Data to plot. Must contain the columns 'lat', 'lon' and 'Address',
        plus the columns named by ``z`` and ``animation_frame`` when given.
    z : str or None
        Column passed to the map as the density weight; ``None`` passes no weight.
    animation_frame : str or None
        Column passed through to plotly as the animation frame.
    description : str or None
        Text printed next to ``z`` before the map is shown.

    Returns
    -------
    The return value of ``fig.show`` (``None`` in the recorded run).
    """
    #config for snapshots taken with the figure's "download image" button
    config = {
        'toImageButtonOptions': {
            'format': 'png', # one of png, svg, jpeg, webp
            'filename': 'custom_image',
            'height': 800,
            'width': 600,
            'scale': 3, # Multiply title/legend/axis/canvas sizes by this factor
        }
    }
    print('Mapping {}: {}'.format(z,description))

    # plot the frame that was passed in, not the notebook-level df_n
    fig = px.density_mapbox(df,
                            z=z,
                            lat='lat',
                            lon='lon',
                            radius=10,
                            center=dict(lat=40.2, lon=15.715), zoom=0.5,
                            mapbox_style="open-street-map",
                            animation_frame=animation_frame,
                            width=800, height=600,
                            #color_continuous_scale=px.colors.sequential.Turbo,
                            hover_name='Address'
                            )

    return fig.show(config=config)

In [90]:
# Draw one map per measurement; each weighting column is paired with its caption
captions_by_measure = {
    None: 'Shows all locations with equal weight',
    'Counts': 'Shows the number of ocurrences in the sample',
    'Multiplier': 'Shows the number of author affiliations by address',
    'Fractional pubs': 'Shows the fractional publications by address',
}
var = list(captions_by_measure)
descriptions = list(captions_by_measure.values())

[
    plot_fig(df=df_n, z=measure, animation_frame=None, description=caption)
    for measure, caption in zip(var, descriptions)
]

Mapping None: Shows all locations with equal weight


Mapping Counts: Shows the number of ocurrences in the sample


Mapping Multiplier: Shows the number of author affiliations by address


Mapping Fractional pubs: Shows the fractional publications by address


[None, None, None, None]