## Minimal notebook

In [2]:
import itertools
import json
import time
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from matplotlib.colors import LogNorm, SymLogNorm, Normalize
from matplotlib.ticker import MaxNLocator, FormatStrFormatter

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Import database

In [3]:
# Source switch for the inspire_LA.json dump.
# Keep LOCAL = False (download from GitHub); LOCAL = True reads the copy
# under data/ instead.
LOCAL = False

LOCAL_PATH = 'data/inspire_LA.json'
REMOTE_URL = 'https://github.com/restrepo/inspireauth/raw/main/data/inspire_LA.json'

if LOCAL:
    # `with` guarantees the handle is closed even if parsing fails.
    with open(LOCAL_PATH, 'r', encoding='utf-8') as f:
        df = json.load(f)
else:
    # A timeout avoids hanging the kernel on a stalled connection, and
    # raise_for_status() turns an HTTP error page into an explicit exception
    # instead of an obscure JSON decoding failure.
    r = requests.get(REMOTE_URL, timeout=60)
    r.raise_for_status()
    df = r.json()

# `df` holds the decoded JSON; `db_master` is the table every later cell uses.
db_master = pd.DataFrame(df)
db_master.head()

Unnamed: 0,author_id,country,email_addresses,full_name,ids,institution,institution_id,name,papers,positions,profile_id,advisors
0,H.Albrecht.1,Venezuela,"[{'value': 'hartwig.albrecht@desy.de', 'curren...","Albrecht, Hermann","[{'value': 'H.Albrecht.1', 'schema': 'INSPIRE ...","Unlisted, VE",912061,"{'value': 'Albrecht, Hartwig', 'preferred_name...","[{'recid': 2023395, 'year': '2021', 'citation_...",[{'record': {'$ref': 'https://inspirehep.net/a...,1018731,[]
1,D.M.B.R.1,Venezuela,,"Bellorin, David","[{'value': 'D.M.B.R.1', 'schema': 'INSPIRE BAI'}]","Unlisted, VE",912061,"{'value': 'R., David M. Bellorin'}","[{'recid': 2023395, 'year': '2021', 'citation_...",,2023397,[]
2,D.F.Mundarain.1,Chile,,"Mundarain, Douglas F.","[{'value': 'D.F.Mundarain.1', 'schema': 'INSPI...",Catolica del Norte U.,908143,"{'value': 'Mundarain, Douglas F.'}","[{'recid': 2023395, 'year': '2021', 'citation_...",,2023398,[]
3,R.Gaitan.2,Venezuela,"[{'value': 'rgaitan@uc.edu.ve', 'current': Fal...","Dominguez, Yessica","[{'value': 'R.Gaitan.2', 'schema': 'INSPIRE BA...","Unlisted, VE",912061,"{'value': 'Gaitan, Rolando', 'preferred_name':...","[{'recid': 1863076, 'year': '2021', 'citation_...","[{'rank': 'PHD', 'record': {'$ref': 'https://i...",1864347,[]
4,J.Ntahompagaze.1,Rwanda,,"Ntahompagaze, Joseph","[{'value': 'J.Ntahompagaze.1', 'schema': 'INSP...","EAIFR, Kigali",1636656,"{'value': 'Ntahompagaze, Joseph'}","[{'recid': 2087212, 'year': '2022', 'citation_...",,2087213,[]


# All countries in the database

In [4]:
# Distinct country names found in the database, sorted alphabetically.
# dropna() discards every missing entry (None as well as NaN), so the sort
# only ever compares strings and cannot fail on a mixed str/float column.
all_countries = sorted(db_master['country'].dropna().unique())

print(len(all_countries))

for country in all_countries:
    print(country)
    
# format country 
def format_country(country):
    """Return ``country`` in lower case with every space turned into ``_``."""
    lowered = country.lower()
    # Splitting on a literal space and re-joining with "_" is the same
    # substitution as replacing each space individually.
    return "_".join(lowered.split(" "))

115
Algeria
Argentina
Armenia
Australia
Austria
Azerbaijan
Bahamas
Bangladesh
Belarus
Belgium
Benin
Bhutan
Bolivia
Bosnia and Herzegovina
Brazil
Brunei Darussalam
Bulgaria
Canada
Chile
China
Colombia
Congo
Costa Rica
Croatia
Cuba
Cyprus
Czechia
Denmark
Dominican Republic
Ecuador
Egypt
El Salvador
Estonia
Ethiopia
Finland
France
Georgia
Germany
Ghana
Greece
Guatemala
Holy See (Vatican City State)
Honduras
Hungary
Iceland
India
Indonesia
Iran, Islamic Republic of
Iraq
Ireland
Israel
Italy
Jamaica
Japan
Jordan
Kazakhstan
Korea, Republic of
Kuwait
Kyrgyzstan
Latvia
Lebanon
Lithuania
Luxembourg
Madagascar
Malaysia
Malta
Mexico
Moldova
Mongolia
Montenegro
Morocco
Netherlands
New Zealand
Niger
Nigeria
North Macedonia
Norway
Oman
Pakistan
Palestine, State of
Panama
Paraguay
Peru
Philippines
Poland
Portugal
Puerto Rico
Qatar
Romania
Russian Federation
Rwanda
Saudi Arabia
Serbia and Montenegro
Singapore
Slovakia
Slovenia
South Africa
Spain
Sri Lanka
Sweden
Switzerland
Syrian Arab Republic
Taiwan

# Latin American countries 

In [5]:
# The 19 Latin American countries indexed in the database.
# The "new" tags are carried over unchanged from the earlier version of this
# list (presumably marking later additions — confirm).
LA_countries = [
    'Argentina', 'Bolivia', 'Brazil', 'Chile', 'Colombia',
    'Costa Rica', 'Cuba',
    'Dominican Republic',  # new
    'Ecuador',
    'El Salvador',  # new
    'Guatemala', 'Honduras', 'Mexico',
    'Panama',  # new
    'Paraguay', 'Peru',
    'Puerto Rico',  # new
    'Uruguay', 'Venezuela',
]

# Jamaica and Bahamas are not included.

# Lower-case, underscore-separated form of every name, in the same order.
LA_countries_format = list(map(format_country, LA_countries))
print(LA_countries_format)

['argentina', 'bolivia', 'brazil', 'chile', 'colombia', 'costa_rica', 'cuba', 'dominican_republic', 'ecuador', 'el_salvador', 'guatemala', 'honduras', 'mexico', 'panama', 'paraguay', 'peru', 'puerto_rico', 'uruguay', 'venezuela']


# See which authors are duplicated for a given country

In [31]:
# Restrict the table to the rows whose country is Chile.
chile_mask = db_master['country'].eq('Chile')
db = db_master.loc[chile_mask]

# duplicated() flags every occurrence of an author_id after its first one.
is_repeat = db['author_id'].duplicated()
duplicated_indices = db.loc[is_repeat].index

# Show only the first two flagged rows.
first_two = duplicated_indices[:2]
result = db.loc[first_two]
print(result)

       author_id country email_addresses       full_name   ids  \
504   N.Calvet.1   Chile            None   Calvet, Nuria  None   
506  C.Briceno.1   Chile            None  Briceno, Cesar  None   

            institution institution_id  name  \
504  Chile U., Catolica         904336  None   
506  Chile U., Catolica         904336  None   

                                                papers positions profile_id  \
504  [{'recid': 662521, 'year': '2005', 'citation_c...      None       None   
506  [{'recid': 662521, 'year': '2005', 'citation_c...      None       None   

    advisors  
504       []  
506       []  


# Total and active authors per year

In [32]:
# Get number of authors 

def get_number_authors(country, db=None):
    """Count the distinct authors listed for ``country`` and print the result.

    Parameters
    ----------
    country : str
        Country name, compared for equality against the ``country`` column.
    db : pandas.DataFrame, optional
        Table with ``country`` and ``author_id`` columns. When omitted, the
        notebook-level ``db_master`` is used.

    Returns
    -------
    int
        Number of distinct ``author_id`` values among the rows of ``country``
        (0 when the country does not appear in the table).
    """
    if db is None:
        db = db_master

    in_country = db.loc[db['country'] == country]

    # Within a single country the same author_id can still appear on several
    # rows: per the original note, the author published under two
    # affiliations of that country. Such an author counts only once.
    # Deduplication uses `author_id` rather than `profile_id` because, again
    # per the original note, `profile_id` is mostly missing.
    unique_authors = in_country.drop_duplicates(subset=['author_id'])

    number_of_authors = len(unique_authors.index)
    print(f"{country}: {number_of_authors} authors.")
    return number_of_authors


In [29]:
def generate_number_authors_all():
    """Return a dict mapping each entry of ``LA_countries`` to its author count.

    Counts come from ``get_number_authors``, called once per country in list
    order, so one summary line is printed per country.
    """
    return {country: get_number_authors(country) for country in LA_countries}


authors = generate_number_authors_all()

Argentina: 1984 authors.
Bolivia: 50 authors.
Brazil: 8456 authors.
Chile: 2043 authors.
Colombia: 723 authors.
Costa Rica: 29 authors.
Cuba: 236 authors.
Dominican Republic: 1 authors.
Ecuador: 66 authors.
El Salvador: 6 authors.
Guatemala: 32 authors.
Honduras: 11 authors.
Mexico: 3867 authors.
Panama: 3 authors.
Paraguay: 5 authors.
Peru: 156 authors.
Puerto Rico: 10 authors.
Uruguay: 87 authors.
Venezuela: 370 authors.


# Generate the bar chart

In [34]:
# Population of each country, in millions of inhabitants.
#
# The dictionary has one key per entry of LA_countries.
#
# The 15 figures filled in below are the values of the unlabeled list that
# was previously commented out in this cell:
#   [45.606, 11.833, 213.993, 19.212, 51.266,
#    5.139, 11.318, 17.888, 18.250, 10.063,
#    130.262, 7.220, 33.359, 3.485, 28.705]
# NOTE(review): that list carried no country labels. The mapping used here
# assumes it followed the order of LA_countries without the four entries
# tagged "# new" — confirm against the original data source.
population = {element: None for element in LA_countries}

population['Argentina'] = 45.606
population['Bolivia'] = 11.833
population['Brazil'] = 213.993
population['Chile'] = 19.212
population['Colombia'] = 51.266
population['Costa Rica'] = 5.139
population['Cuba'] = 11.318
population['Dominican Republic'] = None  # TODO: no figure recorded yet
population['Ecuador'] = 17.888
population['El Salvador'] = None  # TODO: no figure recorded yet
population['Guatemala'] = 18.250
population['Honduras'] = 10.063
population['Mexico'] = 130.262
population['Panama'] = None  # TODO: no figure recorded yet
population['Paraguay'] = 7.220
population['Peru'] = 33.359
population['Puerto Rico'] = None  # TODO: no figure recorded yet
population['Uruguay'] = 3.485
population['Venezuela'] = 28.705

In [None]:
# Active-author counts, hard-coded for 15 countries.
# NOTE(review): the list carries no country labels. The labels below assume
# it follows the order of LA_countries without the four entries tagged
# "# new" — confirm before publishing the figure.
act_authors = [406, 1, 1843, 461, 159,
               7, 31, 21, 12, 1,
               832, 2, 24, 23, 37]
act_authors_countries = [
    'Argentina', 'Bolivia', 'Brazil', 'Chile', 'Colombia',
    'Costa Rica', 'Cuba', 'Ecuador', 'Guatemala', 'Honduras',
    'Mexico', 'Paraguay', 'Peru', 'Uruguay', 'Venezuela',
]
assert len(act_authors) == len(act_authors_countries), \
    "act_authors and its country labels must have the same length"

# Align every quantity on LA_countries so that each array has exactly one
# entry per plotted label. Countries without a value (population still None,
# or no active-author count) become NaN and simply yield no bar.
population_by_country = pd.Series(population, dtype=float).reindex(LA_countries)
authors_by_country = pd.Series(authors, dtype=float).reindex(LA_countries)
act_authors_by_country = pd.Series(
    act_authors, index=act_authors_countries, dtype=float
).reindex(LA_countries)

# Authors per million inhabitants (population is expressed in millions).
authors_pc = (authors_by_country / population_by_country).to_numpy()
act_authors_pc = (act_authors_by_country / population_by_country).to_numpy()
#print(act_authors_pc)

def gen_all_auths_pc():
    """Draw, save and show the grouped bar chart of authors per country.

    Reads three notebook-level names at call time: ``LA_countries`` (x tick
    labels), ``authors_pc`` (bars labelled 'All authors') and
    ``act_authors_pc`` (bars labelled 'Active authors'). Both value sequences
    are expected to hold one entry per label.

    The figure is written to ``plots_for_tables/all_authors_pc.pdf``; the
    output directory is created when it does not exist yet.
    """
    labels = LA_countries
    all_auths = authors_pc
    act_auths = act_authors_pc

    x = np.arange(len(labels))  # the label locations
    width = 0.35  # the width of the bars

    fig, ax = plt.subplots()
    # The two series sit side by side, centred on each tick.
    ax.bar(x - width/2, all_auths, width, label='All authors')
    ax.bar(x + width/2, act_auths, width, label='Active authors')

    # Labels, title and custom x-axis tick labels.
    ax.set_ylabel('Value')
    ax.set_title('Total and active authors per million inhabitants')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.tick_params(axis='x', rotation=90)
    ax.legend()
    ax.grid()
    ax.set_axisbelow(True)  # keep the grid behind the bars

    fig.tight_layout()

    # savefig does not create missing directories, so make sure it exists.
    out_path = Path('plots_for_tables/all_authors_pc.pdf')
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, dpi=150, bbox_inches='tight')

    plt.show()


gen_all_auths_pc()

