# Save Tables Cleaned Data

Notebook used to create and save the tables that are used in the analysis.

In [None]:
from pathlib import Path
from time import time
from tqdm.auto import tqdm
import pandas as pd 
import numpy as np
import os
import pickle
import networkx as nx
import random
tqdm.pandas()

import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
plt.rcParams.update({'font.size': 22})
sns.set(style="ticks", context="talk")
plt.style.use("dark_background")
pd.options.plotting.backend = 'plotly'
pio.templates.default = 'plotly_dark+presentation'

import matplotlib.dates as dates
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
def form(x,pos):
    """Format a tick value compactly: plain below 1e3, 'K' below 1e6, 'M' above.

    ``pos`` is the tick position passed by the formatter callback protocol;
    it is not used.
    """
    # (exclusive upper bound, format template, scale factor), checked in order.
    bands = (
        (1e2, '%1.0f', 1),
        (1e3, '%1.3f', 1),
        (1e6, '%1.1fK', 1e-3),
    )
    for upper, template, scale in bands:
        if x < upper:
            return template % (x * scale)
    # Everything that matched no band above is shown in millions.
    return '%1.1fM' % (x * 1e-6)
formatter = FuncFormatter(form)

def plot_(df_,x_column,y_column,x_label,title):
    """Plot ``y_column`` against ``x_column`` of ``df_`` as a cyan line with markers.

    A red vertical line is drawn at 2020-03-01 and the y axis uses the
    module-level ``formatter``. The figure is created with the dark-background
    style; nothing is returned.
    """
    plt.style.use("dark_background")
    fig, ax = plt.subplots(figsize=(15, 5))

    # The x values are materialised as a plain list before plotting.
    ax.plot(list(df_[x_column]), df_[y_column], "co-", markersize=6,label='dataset')
    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')

    plt.grid(True, linewidth=0.5)
    ax.yaxis.set_major_formatter(formatter)

    ax.set_title(title,size=30)
    ax.set_xlabel(x_label,size=20)
    
def read_parquet(name, **args):
    """Read a parquet dataset with the fastparquet engine and log size and timing.

    Parameters
    ----------
    name : str or pathlib.Path
        Location of the parquet file or directory.
    **args
        Forwarded unchanged to ``pd.read_parquet`` (e.g. ``columns``, ``filters``).

    Returns
    -------
    pandas.DataFrame
        The table as read; no rows are removed.
    """
    # Accept plain strings too: the summary line below needs ``Path.stem``.
    path = Path(name)
    print(f'Reading {name!r}')
    tic = time()
    df = pd.read_parquet(path, engine='fastparquet', **args)
    toc = time()
    # De-duplication is not performed here, so the reported count is always 0;
    # the field is kept so the log line keeps its format.
    n_duplicates = 0

    print(f'Read {len(df):,} rows from {path.stem!r} in {toc-tic:.2f} sec. {n_duplicates:,} duplicates.')
    return df

In [None]:
# Make sure the local output folder exists before anything is written to it.
if not os.path.exists("Tables"):
    os.makedirs("Tables")

basepath = Path('/N/project/openalex/slices/arxiv-preprints/dec-2024') # folder containing the preprint data
basepath2 = Path('/N/project/openalex/slices/subset-1990-2022/dec-2024') # folder containing all OpenAlex data
basepath3 = Path('./Tables') # folder where the final tables are saved
basepath4 = Path('/N/project/openalex/ssikdar/processed-snapshots/csv-files/dec-2024') # folder containing the institutions information

## Works

In [None]:
works = read_parquet(basepath / 'works')

# Two identifier sets for the preprints: the table index and the OpenAlex work id column.
preprint_id_set = set(works.index)
preprint_work_id_set = set(works.openalex_work_id)
print(f'{len(preprint_id_set)} preprint ids')
print(f'{len(preprint_work_id_set)} preprint work ids')

# Map each preprint (table index) to its DOI and persist the mapping.
preprint_id_doi_dict = works[['doi']].to_dict()['doi']
my_file = "preprint_id_doi_dict.pickle"
# Context manager so the file is flushed and closed as soon as the dump is done.
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(preprint_id_doi_dict, fp)

# Keep only the submission date (renamed to publication_date), one row per work_id.
works = works[['preprint_submission_date']].reset_index().rename(columns={'preprint_submission_date':'publication_date'}).drop_duplicates('work_id')

In [None]:
# All OpenAlex works reduced to (work_id, publication_date), without the
# records whose work_id is linked to a preprint.
works_all = (
    read_parquet(basepath2 / 'works')
    .reset_index()[['work_id','publication_date']]
    .loc[lambda df_: ~df_.work_id.isin(preprint_work_id_set)]
)
# The preprint rows are appended in place of the removed records.
works_all = pd.concat([works_all,works])

In [None]:
#add month
# publication_date_1 is the first day of the publication month, assembled from
# the year and month of publication_date; done for the preprints table and for
# the combined table.
works['publication_date_1'] = pd.to_datetime(pd.DataFrame({'day': 1, 
                                              'month': works.publication_date.dt.month, 
                                              'year': works.publication_date.dt.year},
                                              index=works.index))

works_all['publication_date_1'] = pd.to_datetime(pd.DataFrame({'day': 1, 
                                              'month': works_all.publication_date.dt.month, 
                                              'year': works_all.publication_date.dt.year},
                                              index=works_all.index))

In [None]:
# Order both tables chronologically and use the month stamp as their index.
works = works.sort_values(by='publication_date_1').set_index('publication_date_1')
works_all = works_all.sort_values(by='publication_date_1').set_index('publication_date_1')

In [None]:
#save
# Parquet tables: preprints and the combined works table.
my_file = 'works'
works.to_parquet(os.path.join(basepath3, my_file))
my_file = 'works_all'
works_all.to_parquet(os.path.join(basepath3, my_file))

# Pickled id sets; each file is written inside a context manager so the handle
# is closed (and the data flushed) right after the dump.
all_work_id_set = set(works_all.work_id)
my_file = "all_work_id_set.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(all_work_id_set, fp)
my_file = "preprint_work_id_set.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(preprint_work_id_set, fp)
my_file = "preprint_id_set.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(preprint_id_set, fp)

In [None]:
# work_id -> month stamp (publication_date_1, currently the index of works_all).
works_all_month_dict = works_all[['work_id']].reset_index().set_index('work_id').to_dict()['publication_date_1']
my_file = "works_all_month_dict.pickle"
# Context manager so the file handle is closed after the dump.
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(works_all_month_dict, fp)

dictionaries <br>
* preprint_id_doi_dict: map preprint_id to doi 
* works_all_month_dict: map each work_id to its publication month (publication_date_1)
* inst_id_name_dict: map institution_id to institution_name
* inst_root_map: map institution to its root

sets <br>
* preprint_work_id_set: set preprints openalex_work_id
* preprint_id_set: set preprints ids 
* all_work_id_set: set works ids
* preprint_author_id_set: set authors preprints

tables <br>
* works: preprints (publication_date_1,work_id,publication_date)
* works_all: all papers (publication_date_1,work_id,publication_date)
* works_authorships_inst2: all works (work_id,author_id,author_name,institution_id,institution_name,publication_year)

PAPERS RESTRICTED TO: AUTHORS INFO, REFERENCES INFO, DATA INFO, CONCEPTS INFO

## Affiliations

* Restrict to preprints without missing authors information.
* Use root affiliations and threshold 1 km
* Infer a single affiliation for each author

In [None]:
#preprint
# Authorship rows of the preprints; only the listed columns are read.
# NOTE(review): the cast to int below relies on the 'isnotnull' filter removing
# every row without an author_id -- confirm that the fastparquet engine applies
# this filter row by row.
works_authorships = read_parquet(basepath / 'works_authorships',
                                 columns=['work_id','author_id','author_name','institution_id','institution_name'], 
                                 filters=[('author_id', 'isnotnull', True)]        
                                )
works_authorships['author_id'] = works_authorships['author_id'].astype('int')

In [None]:
#all openalex #9mins
# Authorship rows of the full OpenAlex slice; same columns and filter as for the preprints.
# NOTE(review): the cast to int below relies on the 'isnotnull' filter removing
# every row without an author_id -- confirm that the fastparquet engine applies
# this filter row by row.
works_authorships_all = read_parquet(basepath2 / 'works_authorships',
                                 columns=['work_id','author_id','author_name','institution_id','institution_name'], 
                                 filters=[('author_id', 'isnotnull', True)]        
                                )
works_authorships_all['author_id'] = works_authorships_all['author_id'].astype('int')

In [None]:
# Reload the set of OpenAlex work ids of the preprints from the output folder.
my_file = "preprint_work_id_set.pickle"
with open(os.path.join(basepath3, my_file),"rb") as fp:
    preprint_work_id_set = pickle.load(fp)
works_authorships_all = works_authorships_all[~works_authorships_all.work_id.isin(preprint_work_id_set)]# delete the work_ids connected with preprints
# Append the preprint authorship rows in their place.
works_authorships_all = pd.concat([works_authorships_all,works_authorships])

In [None]:
#for credit allocation calculation #works_id all and list authors - no institutions 
works_authorships_all['author_id'] = works_authorships_all['author_id'].astype('int')
# One row per (work_id, author_id) pair, whatever the number of listed institutions.
works_authorships_all_drop = works_authorships_all[['work_id','author_id']].drop_duplicates(['work_id','author_id'])
works_all = read_parquet(basepath3 / 'works_all')
# Inner merge: only works present in works_all (i.e. with a publication date) are kept.
works_authorships_all_drop = works_authorships_all_drop.merge(works_all[['work_id','publication_date']].reset_index(),on='work_id')  #restrict to works with publication_date info
# Index by month stamp, in chronological order.
works_authorships_all_drop = works_authorships_all_drop.set_index('publication_date_1',drop=True).sort_index()
#save
my_file = 'works_authorships_all_drop'
works_authorships_all_drop.to_parquet(os.path.join(basepath3, my_file))

In [None]:
# Keep only the authorship rows that carry an institution_id.
works_authorships_all = works_authorships_all[works_authorships_all['institution_id'].notnull()]
#works_authorships_all.to_parquet(os.path.join(basepath3, my_file = 'works_authorships_all'))

### institutions' tree

In [None]:
## institutions' tree  ## substitute every institutions with the root #ex. IU Bloomington and IU
# Only this table is bound here; the original line also bound the name
# institutions_geo_df to it, which is a different table (geo data) that is
# read from its own CSV further down.
institutions_associated_institutions = pd.read_csv(basepath4 / 'institutions_associated_institutions.csv.gz')
# Keep the 'parent' relationships only and call the related institution 'parent'.
institutions_associated_institutions = (
    institutions_associated_institutions[institutions_associated_institutions.relationship == 'parent']
    .drop(columns='relationship')
    .rename(columns={'associated_institution_id':'parent'})
)
display(institutions_associated_institutions.head())

# Directed graph with one edge parent -> institution_id.
T = nx.from_pandas_edgelist(institutions_associated_institutions,source='parent',target='institution_id',create_using=nx.DiGraph())
my_file = "T.pickle"
# Context manager so the file handle is closed after the dump.
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(T, fp)

In [None]:
# Reload the institution graph from the output folder.
my_file = "T.pickle"
with open(os.path.join(basepath3, my_file),"rb") as fp:
    T = pickle.load(fp)
# A root is a node without incoming edge, i.e. an institution without parent.
roots = {v for v, d in T.in_degree() if d == 0}
print(f'{len(roots)} roots')
print(f'{len(T.nodes)} noods')

# Connected components of the undirected graph, and how many roots each holds.
T2 = T.to_undirected()
T2_cc = list(nx.connected_components(T2))
count_roots = [len(i.intersection(roots)) for i in tqdm(T2_cc)]
# Positions (in T2_cc) of the components with exactly one root, and of all the others.
index_1root = list(np.where(np.array(count_roots) == 1)[0])
index_no1root = list(np.where(np.array(count_roots) != 1)[0])
print(f'components: 1-root {len(index_1root)}, more roots {len(index_no1root)}')

In [None]:
#components with just one root: easy
#components more roots: each node path up and pick randomly one of the neighbours
random.seed(0)

# d maps every institution id to the id of its root institution.
d = {}

# Components whose number of roots is not exactly one: roots map to themselves;
# any other node climbs the hierarchy, following one randomly chosen parent at
# each step, until it reaches a root.
# NOTE(review): a component without any root (a cycle of parents) would make
# the while-loop below run forever -- confirm that none exists in the data.
for i in tqdm(index_no1root):
    cc = T2_cc[i]
    cc_roots = list(cc.intersection(roots))
    d.update(dict(zip(cc_roots,cc_roots)))
    cc_noroots = list(cc - set(cc_roots))
    for v in cc_noroots:
        w = v
        while w not in roots:
            w_list = list(T.predecessors(w))
            # shuffle + first element (rather than random.choice) keeps the
            # random stream, and hence the mapping, identical to earlier runs.
            random.shuffle(w_list)
            w = w_list[0]
        d[v] = w

# Components with exactly one root: every node maps to that root.
for i in tqdm(index_1root):
    cc = T2_cc[i]
    cc_root = list(cc.intersection(roots))[0]
    d[cc_root] = cc_root
    # Fix: subtract this component's own root. The previous code subtracted
    # cc_roots, a leftover of the loop above, which raised NameError whenever
    # that loop had not run at all.
    d.update(dict.fromkeys(cc - {cc_root}, cc_root))

# institution_id -> institution_name, as found in the authorship rows.
inst_id_name_dict = works_authorships_all[['institution_id','institution_name']].drop_duplicates(['institution_id','institution_name']).set_index('institution_id').to_dict()['institution_name']
my_file = "inst_id_name_dict.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(inst_id_name_dict, fp)

#missing affiliations #isolated #no parents or childs
# Institutions that are not in the graph are their own root.
missing_aff = list(set(inst_id_name_dict.keys()) - set(d.keys()))
d.update(dict(zip(missing_aff,missing_aff)))
my_file = "inst_root_map.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(d, fp)

#example IU: 592451, 4210119109, 4210101670

#example IU: 592451, 4210119109, 4210101670

In [None]:
# Reload the institution -> root mapping from the output folder.
my_file = "inst_root_map.pickle"
with open(os.path.join(basepath3, my_file),"rb") as fp:
    d = pickle.load(fp)

In [None]:
# Replace every institution by its root institution.
works_authorships_all['institution_id'] = works_authorships_all['institution_id'].map(d)
#restrict to preprints' authors 
preprint_author_id_set = set(works_authorships.author_id)
my_file = "preprint_author_id_set.pickle"
# Context manager so the file handle is closed after the dump.
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(preprint_author_id_set, fp)
print(f'{len(preprint_author_id_set)} authors preprints')
# All authorship rows (preprints and other works) of the authors who wrote a preprint.
works_authorships_inst = works_authorships_all.query('author_id.isin(@preprint_author_id_set)')

In [None]:
#add publication year
# work_id -> publication year, taken from the month stamp of the saved works_all table.
works_all = (
    read_parquet(basepath3 / 'works_all')
    .reset_index()[['work_id','publication_date_1']]
    .assign(publication_year=lambda df_: df_['publication_date_1'].dt.year)
    [['work_id','publication_year']]
)
# Attach the year to the authorship rows; the helper 'index' column created by
# reset_index() is removed again right after the merge.
works_authorships_inst = (
    works_authorships_inst
    .merge(works_all.reset_index(),on='work_id')
    .drop(columns='index')
)

In [None]:
# Save the authorship rows of the preprint authors (root institutions, with publication year).
my_file = 'works_authorships_inst2'
works_authorships_inst.to_parquet(os.path.join(basepath3, my_file))

### frequency - one inst

Keep each author's most frequent institution per year, so that no author has multiple affiliations in the same year.

In [None]:
# Reload the table saved above, so the section can be run from this point.
my_file = 'works_authorships_inst2'
works_authorships_inst = read_parquet(basepath3 / my_file)

In [None]:
# Inputs of the one-institution-per-year inference.
focal_authors_ids = preprint_author_id_set  # authors to process
works_authors_df = works_authorships_inst   # their authorship rows
# Minimum thresholds applied further down to the top institution of an
# (author, year): absolute number of rows and share of the rows.
persist_absolute=1.0
persist_rate=0.3

# Column that plays the role of the "venue".
venue = 'institution_id'

In [None]:
# Count the distinct institutions of every focal author and keep the authors
# that have at least two of them.
print('  for each author, find the list of '+venue+'; keep only authors with at least 2 '+venue)
authors_with_at_least_2_vanues = (
    works_authors_df
    .loc[lambda df_: df_['author_id'].isin(set(focal_authors_ids))]  # only the focal authors
    .groupby(by='author_id',sort=False)
    .agg(n_venues=(venue,'nunique'))
    .loc[lambda counts: counts['n_venues'] >= 2]
)

# All remaining focal authors: their institution never changes.
authors_with_1_vanues = focal_authors_ids.difference(authors_with_at_least_2_vanues.index)

print(f'{len(focal_authors_ids)} total authors: {len(authors_with_1_vanues)} 1-venue, {len(authors_with_at_least_2_vanues)} more venues')

In [None]:
# Persist both author groups as sets; context managers close the files after each dump.
my_file = "authors_with_at_least_2_vanues.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(set(authors_with_at_least_2_vanues.index), fp)
my_file = "authors_with_1_vanues.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(set(authors_with_1_vanues), fp)

In [None]:
print('  Building the lists of '+venue+' per year')
works_authors_df = works_authors_df.sort_values(by=['author_id','publication_year'])
# One row per (author, year) for the authors with at least two institutions.
authors_vanues = (
    works_authors_df[
            works_authors_df['author_id'].isin(set(authors_with_at_least_2_vanues.index)) # authors with at least two institutions
        ]
    .groupby(by=['author_id','publication_year'], as_index=False)  # check the groupby, considering author_id, publication_date and also work_id
    #.groupby(by=['author_id','publication_year'], sort=True, as_index=False)  # check the groupby, considering author_id, publication_date and also work_id
    .agg(
        #vanues=(venue,list), 
        #num_works=('work_id','nunique'),
        most_common_vanues=(venue, lambda x: x.mode().tolist()), # all institutions tied for the highest count, e.g. [1, 2, 3]
        frequency_rate=(venue, lambda x: x.value_counts(normalize=True).values[0]), # share of rows of the top institution, e.g. 0.33
        frequency_absolute=(venue, lambda x: x.value_counts().values.tolist()[:2]), # row counts of the two most frequent institutions, e.g. [1, 1]
        author_name=('author_name', 'first'),
    )
)

# Checkpoint of the per-year table.
my_file = 'authors_vanues'
authors_vanues.to_parquet(os.path.join(basepath3, my_file))

In [None]:
# Reload the checkpoint of the per-year table.
authors_vanues = read_parquet(basepath3 / 'authors_vanues')
print('  Checking for ties')
# A tie means the two most frequent institutions of the year have the same
# count; a year with exactly one count entry cannot be a tie.
authors_vanues['tie'] = authors_vanues['frequency_absolute'].apply(lambda x: x[0] == x[1] if len(x)!=1 else False)

In [None]:
def check_ties(df_):
    """Choose one venue per row for a single author, resolving ties.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Rows of one author, in the order in which they must be processed, with
        columns ``publication_year``, ``most_common_vanues`` (sequence of the
        most frequent venues of the row) and ``tie`` (bool).

    Returns
    -------
    pandas.DataFrame
        One row per input row, with columns ``publication_year`` and
        ``most_common_vanues`` (the single venue retained). The spurious empty
        column ``most_common_vanue`` of the previous version is no longer created.

    Rules
    -----
    * no tie: the first listed venue is taken;
    * tie on a later row: the previous row's choice is kept when it is among
      the candidates, otherwise the first candidate is taken;
    * tie on the first row: the first venue of the second row is taken, when
      there is a second row.
    """
    years = df_['publication_year'].values
    # Plain lists avoid building a row Series with .iloc at every step.
    candidates = df_['most_common_vanues'].tolist()
    ties = df_['tie'].tolist()

    if not candidates:
        return pd.DataFrame({'publication_year': years, 'most_common_vanues': []})

    chosen = [candidates[0][0]]
    if ties[0] and len(candidates) > 1:
        chosen[0] = candidates[1][0]

    for i in range(1, len(candidates)):
        previous = chosen[i-1]
        if ties[i] and previous in candidates[i]:
            chosen.append(previous)
        else:
            chosen.append(candidates[i][0])

    return pd.DataFrame({'publication_year': years, 'most_common_vanues': chosen})

print('  Solving ties')
# check_ties is applied per author; the resolved venues are written back as one
# single-element list per row.
# NOTE(review): the positional write-back assumes that the concatenated groups
# come out in the same row order as authors_vanues (rows of an author
# contiguous, authors in order of first appearance) -- confirm.
authors_vanues['most_common_vanues'] = (
    authors_vanues
    .groupby(by=['author_id'],sort=False)
    .apply(
        lambda df_: check_ties(df_)
    )[['most_common_vanues']]
    .values
    .tolist()
)

# Checkpoint of the tie-resolved table.
my_file = 'authors_vanues2'
authors_vanues.to_parquet(os.path.join(basepath3, my_file))

In [None]:
def check_ties(df_):
    """Choose one venue per row for a single author, resolving ties.

    NOTE(review): a function with this name is already defined in an earlier
    cell of this notebook; this later definition shadows it. Consider keeping
    a single definition.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Rows of one author, in the order in which they must be processed, with
        columns ``publication_year``, ``most_common_vanues`` (sequence of the
        most frequent venues of the row) and ``tie`` (bool).

    Returns
    -------
    pandas.DataFrame
        One row per input row, with columns ``publication_year`` and
        ``most_common_vanues`` (the single venue retained). The spurious empty
        column ``most_common_vanue`` of the previous version is no longer created.

    Rules
    -----
    * no tie: the first listed venue is taken;
    * tie on a later row: the previous row's choice is kept when it is among
      the candidates, otherwise the first candidate is taken;
    * tie on the first row: the first venue of the second row is taken, when
      there is a second row.
    """
    years = df_['publication_year'].values
    # Plain lists avoid building a row Series with .iloc at every step.
    candidates = df_['most_common_vanues'].tolist()
    ties = df_['tie'].tolist()

    if not candidates:
        return pd.DataFrame({'publication_year': years, 'most_common_vanues': []})

    chosen = [candidates[0][0]]
    if ties[0] and len(candidates) > 1:
        chosen[0] = candidates[1][0]

    for i in range(1, len(candidates)):
        previous = chosen[i-1]
        if ties[i] and previous in candidates[i]:
            chosen.append(previous)
        else:
            chosen.append(candidates[i][0])

    return pd.DataFrame({'publication_year': years, 'most_common_vanues': chosen})

In [None]:
# Reload the tie-resolved checkpoint into authors_vanues, the name read by the
# next cells. It used to be loaded into authors_vanues2, which is overwritten
# further down before being read, so the reload had no effect and the next
# cell failed on a fresh kernel.
authors_vanues = read_parquet(basepath3 / 'authors_vanues2')

In [None]:
# Reduce the list of counts to the count of the most frequent institution.
# Not idempotent: running this cell twice fails, since the column no longer holds lists.
authors_vanues['frequency_absolute'] = authors_vanues['frequency_absolute'].apply(lambda x: x[0])
# removing rows where frequency is less than persist
print('  Removing rows where frequency is less than a minimum threshold')
# An (author, year) row is kept only if its top institution reaches both the
# absolute and the relative threshold.
authors_vanues = (
    authors_vanues[
        (authors_vanues['frequency_absolute']>=persist_absolute) &
        (authors_vanues['frequency_rate']>=persist_rate)
    ]
)
# Checkpoint of the filtered table.
my_file = 'authors_vanues3'
authors_vanues.to_parquet(os.path.join(basepath3, my_file))

In [None]:
# Reload the filtered per-year table and both author groups from the output folder.
authors_vanues = read_parquet(basepath3 / 'authors_vanues3')
# From here on both names are plain sets of author ids.
my_file = "authors_with_at_least_2_vanues.pickle"
with open(os.path.join(basepath3, my_file),"rb") as fp:
    authors_with_at_least_2_vanues = pickle.load(fp)
my_file = "authors_with_1_vanues.pickle"
with open(os.path.join(basepath3, my_file),"rb") as fp:
    authors_with_1_vanues = pickle.load(fp)

In [None]:
# Authors with several institutions: unwrap the single retained institution of
# each (author, year) row and call the column institution_id.
authors_vanues = (
    authors_vanues[['author_id','publication_year','most_common_vanues']]
    .assign(most_common_vanues=lambda df_: df_['most_common_vanues'].apply(lambda x: x[0]))
    .rename(columns={'most_common_vanues':'institution_id'})
)

# Authors of the one-venue group: one row per (author, year), taken from the authorship rows.
authors_vanues2 = (
    works_authors_df
    .loc[works_authors_df['author_id'].isin(set(authors_with_1_vanues)), ['author_id','publication_year','institution_id']]
    .drop_duplicates(['author_id','publication_year'])
    .sort_values(by=['author_id','publication_year'])
)

# Both groups stacked in a single table.
authors_vanues_final = pd.concat([authors_vanues,authors_vanues2])

In [None]:
# Checkpoint: one institution per (author, year), before the missing years are filled in.
my_file = 'authors_vanues_final'
authors_vanues_final.to_parquet(os.path.join(basepath3, my_file))

In [None]:
#add affiliations to works_authors table

In [None]:
authors_vanues_final = read_parquet(basepath3 / 'authors_vanues_final')
# Preprint authorship rows, one per (work_id, author_id), joined with the
# publication dates of the saved preprints table.
works_authorships = read_parquet(basepath / 'works_authorships')
works_authorships = works_authorships[['work_id','publication_year','author_id','author_name']]
works_authorships = works_authorships.drop_duplicates(['work_id','author_id'])
works = read_parquet(basepath3 / 'works')
works_authorships = works_authorships.merge(works.reset_index(),on='work_id').set_index('publication_date_1').reset_index()

# Expand the affiliation table to every (year, author) combination between the
# first and the last year present; combinations without data get NaN.
author_id_set = authors_vanues_final['author_id'].unique()
years_set = np.arange(authors_vanues_final.publication_year.min(), authors_vanues_final.publication_year.max() + 1)
idx = pd.MultiIndex.from_product((years_set, author_id_set), names=['publication_year', 'author_id'])
authors_vanues_final = authors_vanues_final.set_index(['publication_year', 'author_id']).reindex(idx, fill_value=np.nan).reset_index()

#fill-in #if nan take the previous one #if no one - take the next one
# Per author, in chronological order: forward fill first, then backward fill
# for the years that precede the first known institution.
authors_vanues_final = authors_vanues_final.sort_values(by=['author_id','publication_year'])
authors_vanues_final['institution_id'] = authors_vanues_final.groupby('author_id').institution_id.ffill()
authors_vanues_final['institution_id'] = authors_vanues_final.groupby('author_id').institution_id.bfill()

# Drop the rows that are still empty, so the ids can be stored as integers.
authors_vanues_final = authors_vanues_final[authors_vanues_final.institution_id.notnull()]
authors_vanues_final['institution_id'] = authors_vanues_final['institution_id'].astype(int)

### city, country, continent

In [None]:
#add: institution name, city, country, continent
my_file = "inst_id_name_dict.pickle"
with open(os.path.join(basepath3, my_file),"rb") as fp:
    inst_id_name_dict = pickle.load(fp)
authors_vanues_final['institution_name'] = authors_vanues_final['institution_id'].map(inst_id_name_dict)

#add country (no city)
institutions_geo_df = pd.read_csv(basepath4 / 'institutions_geo.csv.gz')
# Fill in missing information. The statements below depend on each other and
# reuse the name inst_geo_dict for three different mappings, so their order matters.
# Namibia's code is set by hand -- presumably because read_csv parses the
# string 'NA' as a missing value by default (TODO confirm).
institutions_geo_df.loc[institutions_geo_df['country']=='Namibia', 'country_code'] = 'NA' # University of Namibia
# 1) country_code -> country: recompute every country name from its code.
inst_geo_dict = institutions_geo_df[['country_code','country']].dropna().drop_duplicates(['country_code','country']).set_index('country_code').to_dict()['country']
institutions_geo_df['country'] = institutions_geo_df['country_code'].map(inst_geo_dict)
# 2) city -> country: where the country is still missing, take it from the city.
inst_geo_dict = institutions_geo_df[['city','country']].dropna().drop_duplicates(['city','country'],keep='last').set_index('city').to_dict()['country']
institutions_geo_df.loc[institutions_geo_df.country.isna(),'country'] = institutions_geo_df.loc[institutions_geo_df.country.isna(),'city'].map(inst_geo_dict)
# 3) country -> country_code: recompute every code from the country name.
#    inst_geo_dict keeps this last mapping after the cell has run.
inst_geo_dict = institutions_geo_df[['country','country_code']].dropna().drop_duplicates(['country','country_code']).set_index('country').to_dict()['country_code']
institutions_geo_df['country_code'] = institutions_geo_df['country'].map(inst_geo_dict)
# Manual assignments for three institutions, by institution id.
institutions_geo_df.loc[institutions_geo_df['institution_id']==194744927,'country_code'] = 'SG'
institutions_geo_df.loc[institutions_geo_df['institution_id']==194744927,'country'] = 'Singapore'
institutions_geo_df.loc[institutions_geo_df['institution_id']==142504963,'country_code'] = 'IN'
institutions_geo_df.loc[institutions_geo_df['institution_id']==142504963,'country'] = 'India'
institutions_geo_df.loc[institutions_geo_df['institution_id']==4387155609,'country_code'] = 'FR'
institutions_geo_df.loc[institutions_geo_df['institution_id']==4387155609,'country'] = 'France'

# Left merge: affiliation rows whose institution has no geo record keep NaN.
authors_vanues_final = authors_vanues_final.merge(institutions_geo_df[['institution_id','country_code','country']],on='institution_id',how='left')

# Manually curated country assignments, keyed by institution id:
# institution_id -> (country_code, country). One table and one loop replace
# the former pair of copy-pasted statements per institution.
manual_country_overrides = {
    4387152698: ('IE', 'Ireland'),
    4387152931: ('JP', 'Japan'),
    4387152970: ('IN', 'India'),
    4387153060: ('KE', 'Kenya'),
    4387153083: ('IN', 'India'),
    4387153738: ('IN', 'India'),
    4387154466: ('IN', 'India'),
    4387154815: ('NP', 'Nepal'),
    4387154860: ('IN', 'India'),
    4387155965: ('IN', 'India'),
    4387156305: ('NP', 'Nepal'),
    4391012538: ('PK', 'Pakistan'),
    4392738202: ('IN', 'India'),
    4396570495: ('JO', 'Jordan'),
    4396570510: ('NP', 'Nepal'),
    4399657981: ('IN', 'India'),
    4400008961: ('GH', 'Ghana'),
    4400009046: ('IN', 'India'),
    4400573231: ('GH', 'Ghana'),
    4400573235: ('GH', 'Ghana'),
    4402554220: ('IN', 'India'),
    4403386676: ('CN', 'China'),
    4403928290: ('BD', 'Bangladesh'),
    4210144721: ('US', 'United States'),
    4403928291: ('FR', 'France'),
}
for inst_id, (code, country_name) in manual_country_overrides.items():
    # An id that is absent from the table selects no row and changes nothing.
    is_inst = authors_vanues_final['institution_id'] == inst_id
    authors_vanues_final.loc[is_inst, 'country_code'] = code
    authors_vanues_final.loc[is_inst, 'country'] = country_name

In [None]:
# Institutions that still have no country after the merge and the manual assignments.
missing_inst_country = list(set(authors_vanues_final[authors_vanues_final.country.isna()].institution_id))
print(len(missing_inst_country))

In [None]:
import requests
import json
# Look up the country of each remaining institution in the OpenAlex API.
# Best effort: an institution whose request fails, or whose record has no
# country, is simply left out of the dictionary.
url="https://api.openalex.org/institutions/I"
missing_inst_country_dict = {}
for i in tqdm(missing_inst_country):
    url_i = url+str(i)
    try:
        # The timeout keeps a stalled connection from blocking the loop forever.
        response = requests.get(url_i, timeout=30)
    except requests.RequestException:
        continue
    if response.status_code == 200:
        data = response.json()
        # Guard against a record without a usable 'geo' object.
        country = (data.get('geo') or {}).get('country')
        if country is not None:
            missing_inst_country_dict[i] = country

In [None]:
# Write the countries obtained from the API into the table, one institution at a time.
for i in tqdm(list(missing_inst_country_dict.keys())):
    authors_vanues_final.loc[authors_vanues_final.institution_id==i,'country'] = missing_inst_country_dict[i] 
    # The code is derived from the country name through inst_geo_dict, which at
    # this point must be the country -> country_code mapping built in the geo
    # cell above; names absent from that mapping give NaN.
    authors_vanues_final.loc[authors_vanues_final.institution_id==i,'country_code'] = authors_vanues_final.loc[authors_vanues_final.institution_id==i,'country'].map(inst_geo_dict)

In [None]:
#add continent
#assign continent to country 
import pycountry_convert as pc

# Three codes (and the matching names) are replaced by another country's before the lookup.
authors_vanues_final['country_code'] = authors_vanues_final['country_code'].replace({'VA': 'IT', 'SX': 'NL', 'TL': 'ID'})
authors_vanues_final['country'] = authors_vanues_final['country'].replace({'Vatican': 'Italy', 'Sint Maarten': 'Netherlands', 'Timor Leste': 'Indonesia'})

def country_to_continent(country_code):
    """Return the continent code of a two-letter country code."""
    return pc.country_alpha2_to_continent_code(country_code)

def continent_to_continent(continent_code):
    """Return the continent name of a continent code."""
    return  pc.convert_continent_code_to_continent_name(continent_code)

# Country code -> continent code -> continent name, value by value.
authors_vanues_final['continent_code'] = authors_vanues_final['country_code'].apply(country_to_continent)
authors_vanues_final['continent'] = authors_vanues_final['continent_code'].apply(continent_to_continent)

# Checkpoint of the filled-in affiliation table.
my_file = 'authors_vanues_final_fill'
authors_vanues_final.to_parquet(os.path.join(basepath3, my_file))

### merge with works_authorships table

Merge the inferred affiliations into the works_authorships table.

In [None]:
works_authorships = read_parquet(basepath / 'works_authorships')
works_authorships['author_id'] = works_authorships['author_id'].astype(int)
#set preprints with at least one nan
# Share of preprints with at least one missing institution in the raw data,
# i.e. before the inferred affiliations are used.
print(f'Delate all preprints with not full affiliation info (not fillin): lose {(len(set(works_authorships[works_authorships.institution_id.isna()].work_id))/len(set(works_authorships.work_id)))*100:.2f}%')
# The raw institution columns are dropped here ...
works_authorships = works_authorships[['work_id','author_id','author_name','publication_year']]
#works_authorships['publication_year'] = works_authorships['publication_year'].astype(int)
authors_vanues_final = read_parquet(basepath3 / 'authors_vanues_final_fill')
# ... and replaced by the inferred affiliation of each author in the year of
# the preprint. Without this merge the table has no institution_id column and
# every following cell of this section fails. The merge is a left join, so
# authorships without inferred affiliation keep NaN and can be counted and
# removed below.
# NOTE(review): this step was missing and has been reconstructed from the
# section title and from the columns the next cells read -- confirm the join
# keys, and that publication_year has a compatible dtype on both sides.
works_authorships = works_authorships.merge(authors_vanues_final, on=['author_id','publication_year'], how='left')

In [None]:
# Percentage of authorship rows without institution_id.
print(f'{(len(works_authorships[works_authorships.institution_id.isna()])/len(works_authorships))*100:.2f}% nan institution rows')

In [None]:
# Option 1 - delete every preprint with at least one author without institution:
# percentage of works that would be lost.
print(f'Delate all preprints with not full affiliation info: lose {100*len(set(works_authorships[works_authorships.institution_id.isna()].work_id))/len(set(works_authorships.work_id)):.2f}% of works')

In [None]:
# Option 2 - delete only the rows without institution: a work is lost only when
# none of its rows has an institution.
print(f'Delate only rows with missing institutions info: lose {100 - (100*len(set(works_authorships[works_authorships.institution_id.notnull()].work_id))/len(set(works_authorships.work_id))):.2f}% of works')

In [None]:
# Option 1 is the one applied: a work is removed entirely as soon as one of its
# rows has no institution.
#works_authorships = works_authorships[works_authorships.institution_id.notnull()]
works_missinginfo = set(works_authorships[works_authorships.institution_id.isna()].work_id)
works_authorships = works_authorships[~works_authorships.work_id.isin(works_missinginfo)]

In [None]:
##add publication date
works = read_parquet(basepath3 / 'works')
# NOTE(review): the second reset_index() adds a helper column named 'index'
# that is never dropped, so it ends up in the saved table -- confirm whether
# that is intended.
works_authorships = works_authorships.merge(works.reset_index()[['work_id','publication_date_1']].reset_index(),on='work_id')
# One row per (work_id, author_id), integer ids, chronological order.
works_authorships = works_authorships.drop_duplicates(['work_id','author_id'])
works_authorships['author_id'] = works_authorships['author_id'].astype(int)
works_authorships['institution_id'] = works_authorships['institution_id'].astype(int)
works_authorships = works_authorships.sort_values(by=['publication_date_1','work_id'])
works_authorships = works_authorships.reset_index(drop=True)

In [None]:
# Difference between the number of preprint OpenAlex work ids saved at the
# start and the number of works left in the table.
my_file = "preprint_work_id_set.pickle"
with open(os.path.join(basepath3, my_file),"rb") as fp:
    preprint_work_id_set = pickle.load(fp)
print(f'{len(preprint_work_id_set) - len(set(works_authorships.work_id))} lost preprints')

In [None]:
# Time window: keep rows with 2000-01-01 <= publication_date_1 <= 2024-12-01 (both bounds inclusive).
in_time_window = works_authorships.publication_date_1.between('2000-01-01', '2024-12-01')
works_authorships = works_authorships.loc[in_time_window].reset_index(drop=True)

In [None]:
# Persist the cleaned authorship table; the institution-distance section reloads it.
my_file = 'works_authorships1'
works_authorships1_path = os.path.join(basepath3, my_file)
works_authorships.to_parquet(works_authorships1_path)

### institutions distance

* Calculate distances
* Threshold 1 km

In [None]:
#too many institutions to compute all pairwise distances - calculate the distance only for pairs linked by an edge in the institution collaboration graph

In [None]:
from itertools import combinations
# Reload the cleaned authorship table, indexed by publication month.
works_authorships = read_parquet(basepath3 / 'works_authorships1').set_index('publication_date_1')
# Every institution that appears in at least one authorship row.
institutions_set = set(works_authorships['institution_id'])
print(f'{len(institutions_set)} institutions')

In [None]:
# Coordinates table for institutions; first count how many institutions it covers in total.
institutions_geo_df = pd.read_csv(basepath4 / 'institutions_geo.csv.gz')
institutions_set_geo = set(institutions_geo_df.institution_id)
print(f'{len(institutions_set_geo)} institutions')
# Restrict to institutions present in the authorship table and count again.
institutions_geo_df = institutions_geo_df.query('institution_id.isin(@institutions_set)')
print(f'{len(set(institutions_geo_df.institution_id))} institutions')
# location = (latitude, longitude) tuple per row; the map goes institution_id -> location.
institutions_geo_df = institutions_geo_df.assign(
    location=lambda geo: list(zip(geo.latitude, geo.longitude))
)
institutions_geo_map = institutions_geo_df.set_index('institution_id')['location'].to_dict()

In [None]:
import requests
import json
# Institutions used in the authorship table but absent from the local geo table:
# look their coordinates up one by one in the OpenAlex API and add them to institutions_geo_map.
missing_inst_geo_map = set(works_authorships.institution_id) - set(institutions_geo_df.institution_id)
url="https://api.openalex.org/institutions/I"
missing_inst_country_dict = {}
failed_geo_lookups = []  # ids whose request raised or returned a non-200 status
with requests.Session() as session:  # reuse one connection for all lookups
    for i in tqdm(missing_inst_geo_map):
        url_i = url+str(i)
        try:
            # Without a timeout a single stalled request would hang the whole loop.
            response = session.get(url_i, timeout=30)
        except requests.RequestException:
            failed_geo_lookups.append(i)
            continue
        if response.status_code != 200:
            failed_geo_lookups.append(i)
            continue
        # 'geo' can be absent or null in the payload; treat that as "no coordinates".
        geo = response.json().get('geo') or {}
        latitude, longitude = geo.get('latitude'), geo.get('longitude')
        if (latitude is not None) and (longitude is not None):
            institutions_geo_map[i] = (latitude, longitude)
print(f'{len(failed_geo_lookups)} institution lookups failed')

In [None]:
import geopy.distance
def make_institution_graph(works_authors_rows):
    """Build the institution collaboration graph from (work, institution) rows.

    Parameters
    ----------
    works_authors_rows : DataFrame with columns 'work_id' and 'institution_id'.

    Returns
    -------
    networkx graph whose nodes are the institution ids; two institutions are
    linked when they share at least one work (bipartite projection).
    """
    # NOTE(review): works and institutions live in the same node namespace here;
    # presumably their ids never coincide - confirm.
    work_institution_graph = nx.from_pandas_edgelist(
        works_authors_rows, source='work_id', target='institution_id'
    )
    institution_nodes = set(works_authors_rows.institution_id)
    return nx.bipartite.projected_graph(work_institution_graph, nodes=institution_nodes)
def _distance_km(source_loc, target_loc):
    """Geodesic distance in km between two (lat, lon) tuples; NaN when either is missing."""
    for loc in (source_loc, target_loc):
        # Institutions without an entry in institutions_geo_map come out of .map() as NaN,
        # and a tuple may itself hold NaN coordinates; geopy cannot build a point from those.
        if not isinstance(loc, tuple) or any(pd.isna(coord) for coord in loc):
            return np.nan
    return geopy.distance.distance(source_loc, target_loc).km

# One edge per pair of institutions that co-appear on at least one work.
I = make_institution_graph(works_authors_rows = works_authorships[['work_id','institution_id']].drop_duplicates(['work_id','institution_id']))
I_dist = nx.to_pandas_edgelist(I)
I_dist['source_loc'] = I_dist['source'].map(institutions_geo_map)
I_dist['target_loc'] = I_dist['target'].map(institutions_geo_map)
# Row-wise geodesic distance (faster alternative: distance.great_circle).
I_dist['dist'] = [
    _distance_km(source_loc, target_loc)
    for source_loc, target_loc in zip(I_dist['source_loc'], I_dist['target_loc'])
]

In [None]:
# Make the edge list symmetric: every pair is stored in both directions.
forward_edges = I_dist[['source','target','dist']]
backward_edges = forward_edges.rename(columns={'source':'target','target':'source'})
I_dist = pd.concat([forward_edges, backward_edges]).drop_duplicates(['target','source'])
## add same institution-loops 0 dist
loop_nodes = list(institutions_set)
self_loops = pd.DataFrame.from_dict({'source': loop_nodes, 'target': loop_nodes, 'dist': [0]*len(loop_nodes)})
I_dist = pd.concat([I_dist, self_loops])
my_file = 'I_dist'
I_dist.to_parquet(os.path.join(basepath3, my_file))

In [None]:
## Define two institutions as the same if their distance is < 1 km

In [None]:
# Reload the saved institution distance table (the next cell reads the same file again).
I_dist = pd.read_parquet(basepath3 / "I_dist")

In [None]:
# Statistics on how many institution pairs fall below the 1 km threshold.
I_dist = pd.read_parquet(basepath3 / "I_dist").astype({'source': int, 'target': int})
I_set = set(I_dist['source']) | set(I_dist['target'])
# Drop duplicated edges: orient every pair as (larger id, smaller id) so that
# both directions of an edge collapse onto the same row.
endpoint_hi = np.maximum(I_dist['source'].to_numpy(), I_dist['target'].to_numpy())
endpoint_lo = np.minimum(I_dist['source'].to_numpy(), I_dist['target'].to_numpy())
I_dist['source'] = endpoint_hi
I_dist['target'] = endpoint_lo
I_dist = I_dist.drop_duplicates(subset=['source', 'target'])
n_pairs = len(I_dist)
I_dist0 = I_dist[I_dist.dist<1]
print(f'{(len(I_dist0)/n_pairs)*100:.2f}% inst dist 0')
# intra = self-loops (same institution id); inter = distinct ids closer than 1 km.
is_self_loop = I_dist0.source == I_dist0.target
I_dist0_intra = I_dist0[is_self_loop]
I_dist0_inter = I_dist0[~is_self_loop]
print(f'{(len(I_dist0_intra)/n_pairs)*100:.2f}% inst intra, {(len(I_dist0_inter)/n_pairs)*100:.2f}% inst inter')

In [None]:
# Apply the 1 km threshold: distances below 1 km become 0 and are flagged as 'intra'.
I_dist = pd.read_parquet(basepath3 / "I_dist")
I_dist['dist'] = I_dist['dist'].mask(I_dist['dist'] < 1, 0)
# intra = 1 for every pair at distance 0 (not only source == target), else 0.
I_dist['intra'] = (I_dist['dist'] == 0).astype('int64')
my_file = 'I_dist_threshold'
I_dist.to_parquet(os.path.join(basepath3, my_file))

In [None]:
# Remove the leftover positional 'index' column before saving.
# errors='ignore' keeps the cell re-runnable: on a second run the column is already gone
# and a plain drop would raise KeyError.
works_authorships = works_authorships.drop(columns='index', errors='ignore')
my_file = 'works_authors_aff'
works_authorships.to_parquet(os.path.join(basepath3, my_file))

### number of authors

In [None]:
# Number of authors per preprint. Only works with at least one authorship row appear
# here, so tables merged with this frame get restricted to them.
works_authors_aff = read_parquet(basepath3 / 'works_authors_aff')
num_auhors_df = (
    works_authors_aff
    .groupby('work_id')['author_id']
    .count()
    .rename('num_authors')
    .reset_index()
)
my_file = 'num_auhors_df'
num_auhors_df.to_parquet(os.path.join(basepath3, my_file))

In [None]:
# works2 = works + num_authors; the inner merge keeps only works that have authors.
num_auhors_df = read_parquet(basepath3 / 'num_auhors_df')
works = read_parquet(basepath3 / 'works')
works_with_counts = works.reset_index().merge(num_auhors_df, on='work_id')
works = works_with_counts.set_index('publication_date_1')
my_file = 'works2'
works.to_parquet(os.path.join(basepath3, my_file))

### solo preprints

In [None]:
# Split preprints into single-author ("solo") and multi-author ("nosolo") id sets.
num_auhors_df = read_parquet(basepath3 / 'num_auhors_df')
preprint_id_set_solo = set(num_auhors_df[num_auhors_df.num_authors == 1].work_id)
preprint_id_set_nosolo = set(num_auhors_df[num_auhors_df.num_authors > 1].work_id)
# Write through a context manager so each file handle is closed (and flushed) right away.
my_file = "preprint_id_set_solo.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(preprint_id_set_solo, fp)
my_file = "preprint_id_set_nosolo.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(preprint_id_set_nosolo, fp)
print(f'{(len(preprint_id_set_solo)/len(num_auhors_df))*100:.2f}% solo preprints')

## Concepts

### level 1

In [None]:
# Concepts of the preprints, joined with the works table to obtain publication dates.
# NOTE: `works` is the frame left in memory by the number-of-authors section.
works_concepts = pd.read_parquet(os.path.join(basepath, 'works_concepts'), engine='pyarrow')
my_file = "preprint_id_set.pickle"
with open(os.path.join(basepath3, my_file),"rb") as fp:
    preprint_id_set = pickle.load(fp)
works_concepts = works_concepts.query('work_id.isin(@preprint_id_set)') #restrict work ids
# NOTE(review): the level-1 subset is computed but the saved table keeps every level - confirm intent.
works_concepts_l1 = works_concepts[works_concepts.level == 1]
concept_columns = ['work_id','publication_date_1','publication_date','concept_id','concept_name','level','score']
works_concepts = (
    works_concepts
    .merge(works.reset_index(), on='work_id')
    [concept_columns]
    .astype({'level': int})
)
my_file = 'works_concepts'
works_concepts.to_parquet(os.path.join(basepath3, my_file))

### COVID

In [None]:
# COVID papers, criterion 1: the work is tagged with one of these OpenAlex concept ids.
COVID_concepts = {3008058167, 3006700255, 3007834351}
works_concepts_COVID = works_concepts.query('concept_id.isin(@COVID_concepts)')
preprint_id_set_COVID = set(works_concepts_COVID['work_id'])
n_covid_by_concept = len(preprint_id_set_COVID)
print(f'{n_covid_by_concept} papers COVID concepts')

In [None]:
# COVID papers, criterion 2: the (lower-cased) title contains the word 'covid'.
works_ = read_parquet(basepath / 'works')
works_ = works_[['title']].reset_index()
works_['title'] = works_['title'].astype(str)

# NOTE(review): titles are read from the whole slice, so this set is not limited to
# preprint_id_set; presumably the later intersection with the final id set handles that - confirm.
preprint_id_set_COVIDtitle = set(
      works_
    .query('title.str.lower().str.contains("covid")')
    .work_id
)
# A paper is a COVID paper if it satisfies either criterion.
preprint_id_set_COVID = preprint_id_set_COVID.union(preprint_id_set_COVIDtitle)
print(f'{len(preprint_id_set_COVIDtitle)} papers COVID word in the title')

preprint_id_set_noCOVID = preprint_id_set - preprint_id_set_COVID
print(f'{len(preprint_id_set_COVID)} ({(len(preprint_id_set_COVID)/len(preprint_id_set))*100:.2f}%) COVID papers')
# Context managers close the pickle files as soon as they are written.
my_file = 'preprint_id_set_COVID'
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(preprint_id_set_COVID, fp)
my_file = 'preprint_id_set_noCOVID'
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(preprint_id_set_noCOVID, fp)

## References

In [None]:
# Citation edges (citing work -> referenced work) with the publication date of both ends.
reference_columns = [
    'work_id',
    'referenced_work_id',
    'work_publication_date',
    'referenced_work_publication_date',
]
works_referenced_works = read_parquet(basepath / 'works_referenced_works', columns=reference_columns)

In [None]:
# Preprint ids and the works table with author counts.
my_file = "preprint_id_set.pickle"
preprint_id_set_path = os.path.join(basepath3, my_file)
with open(preprint_id_set_path, "rb") as fp:
    preprint_id_set = pickle.load(fp)
works2 = read_parquet(basepath3 / 'works2')
# The reference table is deliberately NOT restricted to citing preprints here
# (a .query('work_id.isin(@preprint_id_set)') filter was considered and left out).

In [None]:
# Lookup work_id -> publication_date_1 (works2 is indexed by publication_date_1).
works_month_dict = works2[['work_id']].reset_index().set_index('work_id').to_dict()['publication_date_1']
my_file = "works_month_dict.pickle"
# Context manager so the file handle is closed once the dict is written.
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(works_month_dict, fp)

In [None]:
# For preprints, overwrite the publication dates in the reference table with the
# dates stored in works_month_dict - on the citing side and on the cited side.
citing_is_preprint = works_referenced_works.work_id.isin(preprint_id_set)
works_referenced_works.loc[citing_is_preprint, 'work_publication_date'] = (
    works_referenced_works.loc[citing_is_preprint, 'work_id'].map(works_month_dict)
)
cited_is_preprint = works_referenced_works.referenced_work_id.isin(preprint_id_set)
works_referenced_works.loc[cited_is_preprint, 'referenced_work_publication_date'] = (
    works_referenced_works.loc[cited_is_preprint, 'referenced_work_id'].map(works_month_dict)
)

In [None]:
# ref_count: number of references made by each preprint.
count_ref_df = works_referenced_works.query('work_id.isin(@preprint_id_set)').groupby('work_id').referenced_work_id.count().to_frame().reset_index().rename(columns={'referenced_work_id':'ref_count'})
# cit_count: number of citations received by each preprint.
count_cit_df = works_referenced_works.query('referenced_work_id.isin(@preprint_id_set)').groupby('referenced_work_id').work_id.count().to_frame().reset_index().rename(columns={'work_id':'cit_count','referenced_work_id':'work_id'})

# Preprints that never appear as citing work / as referenced work.
preprint_id_set_noref = preprint_id_set - set(count_ref_df.work_id)
preprint_id_set_nocit = preprint_id_set - set(count_cit_df.work_id)
preprint_id_set_noref_nocit = preprint_id_set_noref.union(preprint_id_set_nocit)
print(f'{len(preprint_id_set_noref)/len(preprint_id_set)*100:.2f}% papers no references')
print(f'{len(preprint_id_set_nocit)/len(preprint_id_set)*100:.2f}% papers no citations')
print(f'{len(preprint_id_set_noref_nocit)/len(preprint_id_set)*100:.2f}% papers no references or no citations')
# All pickles below are written through context managers so the handles get closed.
my_file = "preprint_id_set_noref.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(preprint_id_set_noref, fp)
my_file = "preprint_id_set_nocit.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(preprint_id_set_nocit, fp)

#add works zero citations and references
# Uncited preprints get cit_count = 0, preprints without references get ref_count = NaN.
# NOTE(review): presumably NaN marks "reference list unknown" rather than "zero references" - confirm.
count_cit_df = pd.concat([count_cit_df,pd.DataFrame.from_dict({'work_id':list(preprint_id_set_nocit),'cit_count':[0]*len(preprint_id_set_nocit)})])
count_ref_df = pd.concat([count_ref_df,pd.DataFrame.from_dict({'work_id':list(preprint_id_set_noref),'ref_count':[np.nan]*len(preprint_id_set_noref)})])
my_file = "count_ref_df.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(count_ref_df, fp)
my_file = "count_cit_df.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(count_cit_df, fp)

# works3 = works2 + cit_count + ref_count (inner merges on work_id).
works3 = works2.reset_index().merge(count_cit_df.merge(count_ref_df,on='work_id'),on='work_id').set_index('publication_date_1')

my_file = 'works3'
works3.to_parquet(os.path.join(basepath3, my_file))

In [None]:
#differences years
def _first_of_month(dates):
    """Return the datetime Series `dates` with the day set to 1 (same index)."""
    return pd.to_datetime(pd.DataFrame({'day': 1,
                                        'month': dates.dt.month,
                                        'year': dates.dt.year},
                                       index=dates.index))

# Length of np.timedelta64(1, 'Y') in seconds (365.2425 days).
_SECONDS_PER_YEAR = 31_556_952

works_referenced_works['work_publication_date_1'] = _first_of_month(works_referenced_works.work_publication_date)
works_referenced_works['referenced_work_publication_date_1'] = _first_of_month(works_referenced_works.referenced_work_publication_date)

works_referenced_works = works_referenced_works.drop(columns=['work_publication_date','referenced_work_publication_date'])

# Discard edges where either date is not after 1800-01-01.
works_referenced_works = works_referenced_works[works_referenced_works['work_publication_date_1']> pd.Timestamp(1800, 1, 1)]
works_referenced_works = works_referenced_works[works_referenced_works['referenced_work_publication_date_1']> pd.Timestamp(1800, 1, 1)]
# Age of the reference in years. Dividing by np.timedelta64(1, 'Y') is rejected by
# current pandas (the 'Y' unit is ambiguous), so divide the elapsed seconds explicitly.
publication_gap = works_referenced_works['work_publication_date_1'] - works_referenced_works['referenced_work_publication_date_1']
works_referenced_works['diff_publication_date_1'] = publication_gap.dt.total_seconds() / _SECONDS_PER_YEAR

my_file = 'works_referenced_works'
works_referenced_works.to_parquet(os.path.join(basepath3, my_file))

## Save final tables

In [None]:
# Output folder for the final tables; created on first run only.
my_path_ = Path('./Tables_final')
if not my_path_.exists():
    my_path_.mkdir(parents=True)

In [None]:
# Preprints considered: no missing information (publication_date, authors, concepts,
# references), inside the 2000-2024 time window.

# works (with author, citation and reference counts)
works = read_parquet(basepath3 / 'works3').sort_values(by=['publication_date','work_id'])

# authors
works_authors_aff = read_parquet(basepath3 / 'works_authors_aff')

# concepts
works_concepts = (
    pd.read_parquet(os.path.join(basepath3, 'works_concepts'), engine='pyarrow')
    .sort_values(by=['publication_date','work_id'])
    .set_index('publication_date_1', drop=True)
)

# restrict to ArXiv preprints
basepath6 = Path('/N/project/openalex/slices/arxiv-preprints/dec-2024')
arxiv_categories = read_parquet(basepath6 / 'preprint_categories')
is_arxiv = arxiv_categories['where'] == 'arxiv'
arxiv_categories = arxiv_categories[is_arxiv]

In [None]:
# A preprint is kept only if it appears in all four tables:
# works, authorships, concepts and arXiv categories.
preprints_set1 = set(works['work_id'])
preprints_set2 = set(works_authors_aff['work_id'])
preprints_set3 = set(works_concepts['work_id'])
preprints_set4 = set(arxiv_categories['work_id'])
preprint_id_set_final = preprints_set1 & preprints_set2 & preprints_set3 & preprints_set4
print(f'{len(preprint_id_set_final)} preprints (arxiv, no missing info)')

In [None]:
#restrict to no solo papers
with open(os.path.join(basepath3, "preprint_id_set_solo.pickle"),"rb") as fp:
    preprint_id_set_solo = pickle.load(fp)
with open(os.path.join(basepath3, "preprint_id_set_nosolo.pickle"),"rb") as fp:
    preprint_id_set_nosolo = pickle.load(fp)
# Limit both sets to the final preprints and store them next to the final tables.
preprint_id_set_solo = preprint_id_set_solo.intersection(preprint_id_set_final)
preprint_id_set_nosolo = preprint_id_set_nosolo.intersection(preprint_id_set_final)
# Context managers close the output files once written.
with open(os.path.join(my_path_, "preprint_id_set_solo.pickle"), 'wb') as fp:
    pickle.dump(preprint_id_set_solo, fp)
with open(os.path.join(my_path_, "preprint_id_set_nosolo.pickle"), 'wb') as fp:
    pickle.dump(preprint_id_set_nosolo, fp)
# From here on the final set contains multi-author preprints only.
preprint_id_set_final = preprint_id_set_final.intersection(preprint_id_set_nosolo)

print(f'{len(preprint_id_set_final)} preprints (arxiv, no missing info, no solo)')

In [None]:
# Save the final preprint id set; the context manager closes the file after writing.
with open(os.path.join(my_path_, "preprint_id_set.pickle"), 'wb') as fp:
    pickle.dump(preprint_id_set_final, fp)

In [None]:
# Restrict the three core tables to the final preprints, then write them out.
works = works.loc[works.work_id.isin(preprint_id_set_final)]
works_authors_aff = works_authors_aff.loc[works_authors_aff.work_id.isin(preprint_id_set_final)]
works_concepts = works_concepts.loc[works_concepts.work_id.isin(preprint_id_set_final)]
for table_name, table in (
    ("works", works),
    ("works_authors_aff", works_authors_aff),
    ("works_concepts", works_concepts),
):
    table.to_parquet(os.path.join(my_path_, table_name))

In [None]:
# COVID / non-COVID id sets, limited to the final preprints, plus the matching works tables.
with open(os.path.join(basepath3, "preprint_id_set_COVID"),"rb") as fp:
    preprint_id_set_COVID = pickle.load(fp)
with open(os.path.join(basepath3, "preprint_id_set_noCOVID"),"rb") as fp:
    preprint_id_set_noCOVID = pickle.load(fp)
preprint_id_set_COVID = preprint_id_set_COVID.intersection(preprint_id_set_final)
preprint_id_set_noCOVID = preprint_id_set_noCOVID.intersection(preprint_id_set_final)
# Context managers close the output files once written.
with open(os.path.join(my_path_, 'preprint_id_set_COVID'), 'wb') as fp:
    pickle.dump(preprint_id_set_COVID, fp)
with open(os.path.join(my_path_, 'preprint_id_set_noCOVID'), 'wb') as fp:
    pickle.dump(preprint_id_set_noCOVID, fp)
works_COVID = works.query('work_id.isin(@preprint_id_set_COVID)')
works_noCOVID = works.query('work_id.isin(@preprint_id_set_noCOVID)')
works_COVID.to_parquet(os.path.join(my_path_, "works_COVID"))
works_noCOVID.to_parquet(os.path.join(my_path_, "works_noCOVID"))

In [None]:
# Reference / citation counts and the no-reference / no-citation id sets,
# all limited to the final preprints.
with open(os.path.join(basepath3, "count_ref_df.pickle"),"rb") as fp:
    count_ref_df = pickle.load(fp)
with open(os.path.join(basepath3, "count_cit_df.pickle"),"rb") as fp:
    count_cit_df = pickle.load(fp)
with open(os.path.join(basepath3, "preprint_id_set_noref.pickle"),"rb") as fp:
    preprint_id_set_noref = pickle.load(fp)
with open(os.path.join(basepath3, "preprint_id_set_nocit.pickle"),"rb") as fp:
    preprint_id_set_nocit = pickle.load(fp)

count_ref_df = count_ref_df[count_ref_df.work_id.isin(preprint_id_set_final)]
count_ref_df.to_parquet(os.path.join(my_path_, "count_ref_df"))
count_cit_df = count_cit_df[count_cit_df.work_id.isin(preprint_id_set_final)]
count_cit_df.to_parquet(os.path.join(my_path_, "count_cit_df"))


preprint_id_set_noref = preprint_id_set_noref.intersection(preprint_id_set_final)
preprint_id_set_nocit = preprint_id_set_nocit.intersection(preprint_id_set_final)
# Context managers close the output files once written.
with open(os.path.join(my_path_, 'preprint_id_set_noref.pickle'), 'wb') as fp:
    pickle.dump(preprint_id_set_noref, fp)
with open(os.path.join(my_path_, 'preprint_id_set_nocit.pickle'), 'wb') as fp:
    pickle.dump(preprint_id_set_nocit, fp)

In [None]:
# Copy the table to the final folder with a fresh 0..n-1 index.
preprint_ref_concepts_max_unique = (
    read_parquet(basepath3 / 'preprint_ref_concepts_max_unique')
    .reset_index(drop=True)
)
preprint_ref_concepts_max_unique.to_parquet(os.path.join(my_path_, "preprint_ref_concepts_max_unique"))

In [None]:
# Copy the table unchanged to the final folder.
works_referenced_works_max_unique = read_parquet(basepath3 / 'works_referenced_works_max_unique')
refs_max_unique_out = os.path.join(my_path_, "works_referenced_works_max_unique")
works_referenced_works_max_unique.to_parquet(refs_max_unique_out)

In [None]:
# Copy the reference table (with publication-date differences) unchanged to the final folder.
works_referenced_works = read_parquet(basepath3 / 'works_referenced_works')
refs_out = os.path.join(my_path_, "works_referenced_works")
works_referenced_works.to_parquet(refs_out)

In [None]:
# Copy the thresholded institution distance table unchanged to the final folder.
I_dist = read_parquet(basepath3 / 'I_dist_threshold')
I_dist_out = os.path.join(my_path_, "I_dist_threshold")
I_dist.to_parquet(I_dist_out)

In [None]:
#tables for the whole dataset, not only the preprints

In [None]:
# Tables covering all works.
works_all = read_parquet(basepath3 / 'works_all')
# authorships without institution information
works_authorships_all_drop = read_parquet(basepath3 / 'works_authorships_all_drop')
works_referenced_works_credit = read_parquet(basepath3 / 'works_referenced_works_credit')

In [None]:
# No 2000 lower bound for the whole dataset: keep everything from 1800 up to 2024-12-01.
window_start, window_end = '1800-01-01', '2024-12-01'
works_all = works_all.loc[window_start : window_end]
works_authorships_all_drop = works_authorships_all_drop.loc[window_start : window_end]
# Reference edges: both the citing and the cited date must lie in [1800, 2025).
citing_date = works_referenced_works_credit.work_publication_date_1
cited_date = works_referenced_works_credit.referenced_work_publication_date_1
both_dates_in_range = (
    (citing_date < '2025') & (cited_date < '2025')
    & (citing_date >= '1800') & (cited_date >= '1800')
)
works_referenced_works_credit = works_referenced_works_credit[both_dates_in_range]

In [None]:
# Check that every final preprint (solo or not) is present in the whole-dataset tables.
with open(os.path.join(my_path_, "preprint_id_set_solo.pickle"), "rb") as fp:
    preprint_id_set_solo = pickle.load(fp)
with open(os.path.join(my_path_, "preprint_id_set_nosolo.pickle"), "rb") as fp:
    preprint_id_set_nosolo = pickle.load(fp)
preprint_id_set_final_ = preprint_id_set_nosolo | preprint_id_set_solo

# Works that have both a works_all row and at least one authorship row.
works_set1 = set(works_all.work_id)
works_set2 = set(works_authorships_all_drop.work_id)
works_set_final = works_set1 & works_set2
print(f'{len(works_set_final)} considered works')

# The three numbers below coincide when no preprint is missing from either table.
print(len(preprint_id_set_final_))
print(len(works_set1 & preprint_id_set_final_))
print(len(works_set2 & preprint_id_set_final_))

In [None]:
# Only works_all needs filtering here:
#  - works_authorships_all_drop is already limited to work ids in works_all (it comes from a merge);
#  - works_referenced_works_credit is already limited to works with author and publication-date info.
has_authorship = works_all.work_id.isin(works_set_final)
works_all = works_all.loc[has_authorship]

In [None]:
# Write the three whole-dataset tables to the final folder.
for table_name, table in (
    ("works_all", works_all),
    ("works_authorships_all_drop", works_authorships_all_drop),
    ("works_referenced_works_credit", works_referenced_works_credit),
):
    table.to_parquet(os.path.join(my_path_, table_name))

In [None]:
#copy manually because less slow
with open(os.path.join(basepath3, "works_month_dict.pickle"),"rb") as fp:
    works_month_dict = pickle.load(fp)
# Context manager so the output file is closed once the dict is written.
with open(os.path.join(my_path_, "works_month_dict.pickle"), 'wb') as fp:
    pickle.dump(works_month_dict, fp)

## Categories

In [None]:
# work_id -> DOI map, and its inverse DOI -> work_id.
my_file = "preprint_id_doi_dict.pickle"
with open(os.path.join(basepath3, my_file), "rb") as fp:
    preprint_id_doi_dict = pickle.load(fp)
preprint_doi_id_dict = {doi: work_id for work_id, doi in preprint_id_doi_dict.items()}
# Final preprint ids (read from the final-tables folder).
my_file = "preprint_id_set.pickle"
with open(os.path.join(my_path_, my_file), "rb") as fp:
    preprint_id_set = pickle.load(fp)

In [None]:
# Category assignments of the preprints, limited to rows with a work_id in the final set.
basepath6 = Path('/N/project/openalex/slices/arxiv-preprints/dec-2024')
arxiv_categories = read_parquet(basepath6 / 'preprint_categories')
has_work_id = arxiv_categories['work_id'].notnull()
arxiv_categories = arxiv_categories[has_work_id].reset_index(drop=True)
arxiv_categories['work_id'] = arxiv_categories['work_id'].astype(int)
arxiv_categories = arxiv_categories.query('work_id.isin(@preprint_id_set)') #restrict work ids

In [None]:
# Display the distinct values of the 'where' column.
set(arxiv_categories['where'])

In [None]:
#Arxiv multiple categories: compare the number of distinct ids with the number of rows
# (more rows than ids means some ids have several category rows)
print(len(set(arxiv_categories.id)))
print(len(arxiv_categories))

In [None]:
# Analysis arxiv: how many papers have 1, 2, 3, ... category codes.
# Step 1: number of category codes per paper id.
codes_per_paper = arxiv_categories.groupby('id').cat_code.count().to_frame().reset_index()
# Step 2: number of paper ids for each such count (index = count, column 'id' = papers).
cat_code_count_df = codes_per_paper.groupby('cat_code').id.count().to_frame()
cat_code_count_df.plot.bar()

In [None]:
my_file = "cat_code_count_df.pickle"
# Context manager so the pickle file is closed once written.
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(cat_code_count_df, fp)

# Bar chart: x = number of category codes per paper, y = number of papers.
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(list(cat_code_count_df.index), list(cat_code_count_df.id))
#ax.set_title('Number of ArXiv categories assigned to each paper') 
#plt.show()
plt.savefig(os.path.join(basepath3, 'cat_code_count.png'), bbox_inches='tight', pad_inches=0.02)   

In [None]:
# Archive prefix of the category code, e.g. 'cs.LG' -> 'cs', 'hep-th' -> 'hep-th'.
# `n` must be passed by keyword: recent pandas rejects positional arguments after the pattern.
arxiv_categories['taxonomy'] = arxiv_categories['cat_code'].str.split('.', n=1).str[0]

#https://arxiv.org/category_taxonomy #https://arxiv.org/
# Archive prefix -> top-level discipline.
taxt_name_dict = {
    'cs':'Computer Science',
    'econ':'Economics',
    'eess':'Electrical Engineering and Systems Science',
    'math':'Mathematics',
    'astro-ph':'Physics',
    'cond-mat':'Physics',
    'gr-qc':'Physics',
    'hep-ex':'Physics',
    'hep-lat':'Physics',
    'hep-ph':'Physics',
    'hep-th':'Physics',
    'math-ph':'Physics',
    'nlin':'Physics',
    'nucl-ex':'Physics',
    'nucl-th':'Physics',
    'physics':'Physics',
    'quant-ph':'Physics',
    'q-bio':'Biology', #'Quantitative Biology', # union with BioMed
    'q-fin':'Quantitative Finance',
    'stat':'Statistics',
    'acc-phys':'Physics',
    'adap-org':'Physics',
    'alg-geom':'Mathematics',
    'ao-sci':'Physics',
    'atom-ph':'Physics',
    'bayes-an':'Physics',
    'chao-dyn':'Physics',
    'chem-ph':'Physics',
    'cmp-lg':'Computer Science',
    'comp-gas':'Physics',
    'dg-ga':'Mathematics',
    'funct-an':'Mathematics',
    'mtrl-th':'Physics',
    'patt-sol':'Physics',
    'plasm-ph':'Physics',
    'q-alg':'Mathematics',
    'solv-int':'Physics',
    'supr-con':'Physics',
}

# Prefixes missing from the dictionary give NaN in 'tax_name'.
arxiv_categories['tax_name'] = arxiv_categories['taxonomy'].map(taxt_name_dict)
arxiv_categories_list = list(set(arxiv_categories['tax_name']))
arxiv_categories_list

In [None]:
# Keep one row per (paper id, discipline): several category codes of a paper can
# map to the same tax_name.
arxiv_categories = arxiv_categories.drop_duplicates(['id','tax_name'])

In [None]:
# How many papers belong to 1, 2, 3, ... disciplines.
# Step 1: number of disciplines per paper id.
disciplines_per_paper = arxiv_categories.groupby('id').tax_name.count().to_frame().reset_index()
# Step 2: number of paper ids for each such count (index = count, column 'id' = papers).
tax_name_count_df = disciplines_per_paper.groupby('tax_name').id.count().to_frame()
tax_name_count_df.plot.bar()

In [None]:
# Bar chart: x = number of disciplines per paper, y = number of papers.
fig, ax = plt.subplots(figsize=(15, 5))
n_disciplines = list(tax_name_count_df.index)
n_papers = list(tax_name_count_df.id)
ax.bar(n_disciplines, n_papers)
fig.savefig(os.path.join(basepath3, 'tax_name_count.png'), bbox_inches='tight', pad_inches=0.02)

In [None]:
# Same counts as a percentage of all distinct paper ids.
tax_name_count_df['perc'] = (tax_name_count_df['id']/len(set(arxiv_categories.id)))*100
my_file = "tax_name_count_df.pickle"
# Context manager so the pickle file is closed once written.
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(tax_name_count_df, fp)

In [None]:
# Bar chart: x = number of disciplines per paper, y = percentage of papers.
fig, ax = plt.subplots(figsize=(15, 5))
n_disciplines = list(tax_name_count_df.index)
share_of_papers = list(tax_name_count_df.perc)
ax.bar(n_disciplines, share_of_papers)
fig.savefig(os.path.join(basepath3, 'tax_name_perc.png'), bbox_inches='tight', pad_inches=0.02)

In [None]:
# One row per (work, discipline) with the category code and source of the assignment.
preprint_df = arxiv_categories[['work_id','cat_code','where','tax_name']].reset_index(drop=True)
my_file = "preprint_df.pickle"
# Context manager so the pickle file is closed once written.
with open(os.path.join(my_path_, my_file), 'wb') as fp:
    pickle.dump(preprint_df, fp)

In [None]:
# discipline name -> set of work ids assigned to it.
preprint_df = preprint_df[['work_id','tax_name']]
preprint_dict = preprint_df.groupby('tax_name').work_id.apply(set).to_dict()
my_file = "preprint_dict.pickle"
# Context manager so the pickle file is closed once written.
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(preprint_dict, fp)

In [None]:
## level 1
basepath5 = Path('/N/project/openalex/slices/arxiv-preprints/dec-2024')
arxiv_categories = read_parquet(basepath5 / 'preprint_categories')

# .copy() so the assignments below write to an independent frame.
preprint_level1_df = arxiv_categories.drop_duplicates(['id','cat_code']).copy()

# Manual category overrides, keyed by preprint id (DOI).
# NOTE(review): 'Scientific Communication And Education' is spelled the same way for both
# of its entries so they form a single category (one was previously written
# '... and Education'); confirm the spelling against the labels used in the data.
manual_cat_by_doi = {
    '10.1101/006734': 'Evolutionary Biology',
    '10.1101/010041': 'Ecology',
    '10.1101/010181': 'Ecology',
    '10.1101/010520': 'Ecology',
    '10.1101/010793': 'Ecology',
    '10.1101/011742': 'Ecology',
    '10.1101/011932': 'Ecology',
    '10.1101/012716': 'Ecology',
    '10.1101/012666': 'Ecology',
    '10.1101/013086': 'Ecology',
    '10.1101/022285': 'Genetics',
    '10.1101/023598': 'Evolutionary Biology',
    '10.1101/024802': 'Ecology',
    '10.1101/026575': 'Ecology',
    '10.1101/029173': 'Evolutionary Biology',
    '10.1101/029165': 'Ecology',
    '10.1101/029330': 'Neuroscience',
    '10.1101/030031': 'Ecology',
    '10.1101/030601': 'Biochemistry',
    '10.1101/031161': 'Ecology',
    '10.1101/031849': 'Ecology',
    '10.1101/033282': 'Ecology',
    '10.1101/033373': 'Evolutionary Biology',
    '10.1101/034926': 'Evolutionary Biology',
    '10.1101/035295': 'Genomics',
    '10.1101/035915': 'Evolutionary Biology',
    '10.1101/035980': 'Neuroscience',
    '10.1101/036731': 'Evolutionary Biology',
    '10.1101/045872': 'Developmental Biology',
    '10.1101/058453': 'Evolutionary Biology',
    '10.1101/061754': 'Animal Behavior And Cognition',
    '10.1101/064311': 'Evolutionary Biology',
    '10.1101/065516': 'Biophysics',
    '10.1101/065664': 'Biophysics',
    '10.1101/070631': 'Scientific Communication And Education',
    '10.1101/074104': 'Ecology',
    '10.1101/073734': 'Neuroscience',
    '10.1101/074310': 'Genomics',
    '10.1101/076653': 'Cancer Biology',
    '10.1101/077057': 'Genomics',
    '10.1101/079533': 'Biophysics',
    '10.1101/008813': 'Ecology',
    '10.1101/009597': 'Ecology',
    '10.1101/009589': 'Ecology',
    '10.1101/011965': 'Ecology',
    '10.1101/012104': 'Ecology',
    '10.1101/013680': 'Ecology',
    '10.1101/029777': 'Ecology',
    '10.1101/030015': 'Ecology',
    '10.1101/036491': 'Biophysics',
    '10.1101/052076': 'Cancer Biology',
    '10.1101/058933': 'Biophysics',
    '10.1101/069468': 'Scientific Communication And Education',
    '10.1101/073999': 'Bioinformatics',
}
# One pass over the table instead of one full scan per id.
needs_override = preprint_level1_df['id'].isin(list(manual_cat_by_doi))
# .to_numpy() assigns by position, so the row labels of the frame do not matter.
preprint_level1_df.loc[needs_override, 'cat_code'] = (
    preprint_level1_df.loc[needs_override, 'id'].map(manual_cat_by_doi).to_numpy()
)

In [None]:
# Drop rows without a category; .copy() so the assignments below write to an independent frame.
preprint_level1_df = preprint_level1_df[preprint_level1_df['cat_code'].notnull()].copy()

# Strip trailing whitespace from the labels (vectorized instead of a per-row lambda).
preprint_level1_df['cat_code'] = preprint_level1_df['cat_code'].str.rstrip()

#overlapping categories
# q-bio codes are renamed to plain-text category names (exact-match replacement).
qbio_code_to_name = {
    'q-bio.BM': 'Molecular Biology',
    'q-bio.CB': 'Cell Biology',
    'q-bio.GN': 'Genomics',
    'q-bio.NC': 'Neuroscience',
    'q-bio.PE': 'Evolutionary Biology',
    'q-bio.SC': 'Subcellular Processes',
    'q-bio.TO': 'Tissues and Organs',
}
preprint_level1_df['cat_code'] = preprint_level1_df['cat_code'].replace(qbio_code_to_name)

preprint_cat_set = set(preprint_level1_df.cat_code)
print(f'{len(preprint_cat_set)} preprint categories')

my_file = "preprint_level1_df.pickle"
# Context manager so the pickle file is closed once written.
with open(os.path.join(my_path_, my_file), 'wb') as fp:
    pickle.dump(preprint_level1_df, fp)

## Plots

In [None]:
#time restriction
# Reload the final tables written above.
works = read_parquet(my_path_ / 'works')
works_all = read_parquet(my_path_ / 'works_all')
works_authors_aff = read_parquet(my_path_ / 'works_authors_aff')
print(f"{len(works)} preprints between [2000-01-01: 2024-12-01]")
# works_all was cut to the 1800-01-01 : 2024-12-01 window before saving, not to 2000.
print(f"{len(works_all)} OpenAlex works between [1800-01-01: 2024-12-01]")

In [None]:
# Number of papers: count of work_id for each publication_date_1 value.
works_count = (
    works.reset_index()
    .groupby('publication_date_1')
    .work_id.count()
    .to_frame()
    .reset_index()
)
my_file = "works_count.pickle"
# Context manager so the pickle file is flushed and closed deterministically.
with open(os.path.join(my_path_, my_file), 'wb') as fp:
    pickle.dump(works_count, fp)

In [None]:
# Reload the saved counts and plot them.
my_file = "works_count.pickle"
with open(os.path.join(my_path_, my_file), "rb") as fp:
    works_count = pickle.load(fp)
print(f'{works_count.work_id.sum()} preprints')  # 1.8M
# Title spelling fixed ("Mounthly" -> "Monthly").
plot_(works_count, 'publication_date_1', 'work_id', 'month', 'Monthly count preprints')

In [None]:
# Number of authors: distinct author_id values for each publication_date_1 value.
authors_count = (
    works_authors_aff.groupby('publication_date_1')
    .author_id.nunique()
    .to_frame()
    .reset_index()
)
my_file = "authors_count.pickle"
# Context manager so the pickle file is flushed and closed deterministically.
with open(os.path.join(my_path_, my_file), 'wb') as fp:
    pickle.dump(authors_count, fp)

In [None]:
# Reload the saved per-date author counts and plot them.
my_file = "authors_count.pickle"
with open(os.path.join(my_path_, my_file), "rb") as fp:
    authors_count = pickle.load(fp)
# NOTE: authors_count holds distinct authors per publication_date_1, so this sum
# counts an author once for every date value in which they appear.
print(f'{authors_count.author_id.sum()} authors preprints')
# Title spelling fixed ("Mounthly" -> "Monthly").
plot_(authors_count, 'publication_date_1', 'author_id', 'month', 'Monthly count authors')

In [None]:
# Average number of authors per work (mean of num_authors) for each
# publication_date_1 value.
authors_mean = (
    works.reset_index()
    .groupby('publication_date_1')
    .num_authors.mean()
    .to_frame()
    .reset_index()
)
my_file = "authors_mean.pickle"
# Context manager so the pickle file is flushed and closed deterministically.
with open(os.path.join(my_path_, my_file), 'wb') as fp:
    pickle.dump(authors_mean, fp)

In [None]:
# Reload the saved averages and plot them.
my_file = "authors_mean.pickle"
with open(os.path.join(my_path_, my_file), "rb") as fp:
    authors_mean = pickle.load(fp)
# Title spelling fixed ("Mounthly" -> "Monthly").
plot_(authors_mean, 'publication_date_1', 'num_authors', 'month', 'Monthly average number authors')

In [None]:
#tables

works_COVID = read_parquet(my_path_ / 'works_COVID')
works_noCOVID = read_parquet(my_path_ / 'works_noCOVID')

# Count of work_id per publication_date_1 for the COVID table, saved as a pickle.
works_count_COVID_df = works_COVID.groupby(by='publication_date_1').work_id.count().to_frame().reset_index()
my_file = "works_count_COVID_df.pickle"
# Context managers so each pickle file is flushed and closed deterministically.
with open(os.path.join(my_path_, my_file), 'wb') as fp:
    pickle.dump(works_count_COVID_df, fp)

# Same aggregation for the non-COVID table.
works_count_noCOVID_df = works_noCOVID.groupby(by='publication_date_1').work_id.count().to_frame().reset_index()
my_file = "works_count_noCOVID_df.pickle"
with open(os.path.join(my_path_, my_file), 'wb') as fp:
    pickle.dump(works_count_noCOVID_df, fp)

In [None]:
# Restrict both series to the plotted window (bounds are inclusive).
plot_start, plot_end = '2019-06-01', '2024-12-01'
works_count_COVID_df = works_count_COVID_df[
    works_count_COVID_df.publication_date_1.between(plot_start, plot_end)
]
works_count_noCOVID_df = works_count_noCOVID_df[
    works_count_noCOVID_df.publication_date_1.between(plot_start, plot_end)
]

plt.style.use("dark_background")
fig, ax = plt.subplots(figsize=(15, 5))
# Each series is plotted against its OWN dates. Reusing the COVID dates for the
# non-COVID counts fails (or silently misaligns the points) whenever the two
# tables do not cover exactly the same set of dates.
ax.plot(works_count_COVID_df['publication_date_1'], works_count_COVID_df['work_id'],
        "o-", markersize=3, label='COVID')
ax.plot(works_count_noCOVID_df['publication_date_1'], works_count_noCOVID_df['work_id'],
        "o-", markersize=3, label='not COVID')
ax.set_yscale('log')
ax.axvline(pd.Timestamp(2020, 3, 1), color='r')
plt.grid(True, linewidth=0.5)
ax.set_xlabel('month', size=20)
# Title spelling fixed ("Mounthly" -> "Monthly").
ax.set_title('Monthly count preprints - COVID/noCOVID (log)', size=30)
ax.legend()

In [None]:
# Keep only work_id; reset_index() moves the index into regular columns
# (presumably this is where 'publication_date_1' comes from — confirm).
works_COVID = works_COVID[['work_id']].reset_index()
works_COVID_count_df = (
    works_COVID.groupby('publication_date_1')
    .work_id.count()
    .to_frame()
    .rename(columns={'work_id': 'work_id_COVID'})
    .reset_index()
)
# Attach the overall count per date so the COVID share can be computed.
works_count = works_count.rename(columns={'work_id': 'work_id_tot'})
works_COVID_count_df = works_COVID_count_df.merge(works_count, on='publication_date_1')
# NOTE: despite the '_perc' suffix this is the plain ratio COVID / total
# (it is not multiplied by 100).
works_COVID_count_df['work_id_COVID_perc'] = works_COVID_count_df['work_id_COVID'] / works_COVID_count_df['work_id_tot']
my_file = "works_COVID_count_df.pickle"
# Context manager so the pickle file is flushed and closed deterministically.
with open(os.path.join(my_path_, my_file), 'wb') as fp:
    pickle.dump(works_COVID_count_df, fp)

In [None]:
# Reload the COVID-share table written by the previous cell.
my_file = "works_COVID_count_df.pickle"
with open(my_path_ / my_file, "rb") as fp:
    works_COVID_count_df = pickle.load(fp)

# Values to draw: work_id_COVID_perc against publication_date_1.
x_data = works_COVID_count_df['publication_date_1'].tolist()
y_mean = works_COVID_count_df['work_id_COVID_perc']

plt.style.use("dark_background")
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(x_data, y_mean, "o-", color='orange', markersize=3)

# Red vertical reference line at 2020-03-01.
ax.axvline(pd.Timestamp(2020, 3, 1), color='r')
plt.grid(True, linewidth=0.5)
ax.set_title('Monthly Percentage Works tagged with COVID', size=30)
ax.set_xlabel('month', size=20)

In [None]:
# Average number of references (mean of ref_count) for each publication_date_1 value.
ref_mean = (
    works.reset_index()
    .groupby('publication_date_1')
    .ref_count.mean()
    .to_frame()
    .reset_index()
)
my_file = "ref_mean.pickle"
# Context manager so the pickle file is flushed and closed deterministically.
with open(os.path.join(my_path_, my_file), 'wb') as fp:
    pickle.dump(ref_mean, fp)
# Title spelling fixed ("Mounthly" -> "Monthly").
plot_(ref_mean, 'publication_date_1', 'ref_count', 'month', 'Monthly average number references')

In [None]:
# Show the keys of preprint_dict.
# NOTE(review): within the visible part of the notebook, preprint_dict is loaded
# from its pickle only in the following cell — confirm it is already defined
# when this cell runs on a fresh kernel.
list(preprint_dict.keys())

In [None]:
# Alternative source for the category list, kept for reference:
# my_file = "preprint_df.pickle"
# with open(os.path.join(basepath3, my_file),"rb") as fp:
#     preprint_df = pickle.load(fp)
# preprint_categories_list = list(set(preprint_df['tax_name']))

# Load preprint_dict; its keys are used as the list of preprint categories
# (the later cells look up work ids with preprint_dict[category]).
my_file = "preprint_dict.pickle"
with open(os.path.join(basepath3, my_file), "rb") as fp:
    preprint_dict = pickle.load(fp)
preprint_categories_list = list(preprint_dict.keys())

# Save the category list next to the dictionary. Context manager so the pickle
# file is flushed and closed deterministically.
my_file = "preprint_categories_list.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(preprint_categories_list, fp)

preprint_categories_list

In [None]:
#plot number of works and authors per month

import matplotlib.dates as dates
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
def form(x,pos):
    """Format a tick value compactly for a matplotlib ``FuncFormatter``.

    This redefines the ``form`` helper from the top of the notebook, so it is
    kept behaviourally identical to that definition: whichever cell ran last,
    every plot gets the same tick labels.

    Parameters
    ----------
    x : number
        Tick value.
    pos : any
        Tick position; part of the ``FuncFormatter`` callback signature, unused.

    Returns
    -------
    str
        ``x`` with no decimals below 100, three decimals below 1,000, in
        thousands with a ``K`` suffix below 1,000,000, otherwise in millions
        with an ``M`` suffix.
    """
    if x < 1e2:
        # Branch restored from the earlier definition: small counts print as
        # '5' rather than '5.000'.
        return '%1.0f' % (x)
    elif x < 1e3:
        return '%1.3f' % (x)
    elif x < 1e6:
        return '%1.1fK' % (x * 1e-3)
    else:
        return '%1.1fM' % (x * 1e-6)
formatter = FuncFormatter(form)

fig, ax = plt.subplots(figsize=(15, 5))

# Whole dataset: distinct works per publication_date_1. Its index defines the
# x axis shared by every subfield series below.
print('Whole Dataset')
WORKS_COUNT = works.groupby('publication_date_1').work_id.nunique().to_frame()
x_data = WORKS_COUNT.index
ax.plot(x_data, WORKS_COUNT.work_id, markersize=6,label='whole dataset')
WORKS_COUNT = WORKS_COUNT.rename(columns={'work_id':'Whole Dataset'})

# Collect one single-column frame per series and join them once after the loop,
# instead of growing WORKS_COUNT_DF with a merge on every iteration.
works_count_columns = [WORKS_COUNT]

for subfield in preprint_categories_list:
    print(subfield)

    # Work ids that belong to this category.
    work_id_set = preprint_dict[subfield]
    works_sub = works.query('work_id.isin(@work_id_set)').sort_index()

    WORKS_COUNT = works_sub.groupby('publication_date_1').work_id.nunique().to_frame()
    # Align on the whole-dataset dates; dates without works in this subfield
    # become NaN.
    WORKS_COUNT = WORKS_COUNT.reindex(x_data)
    ax.plot(x_data, WORKS_COUNT.work_id, markersize=6,label=subfield)
    WORKS_COUNT = WORKS_COUNT.rename(columns={'work_id':subfield})
    works_count_columns.append(WORKS_COUNT)

# All frames share the same index (x_data), so a column-wise concat gives the
# same table as merging them one by one on the index.
WORKS_COUNT_DF = pd.concat(works_count_columns, axis=1)

ax.yaxis.set_major_formatter(formatter)
ax.set_title('Subfields - works',size=30)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %y'))
ax.axvline(pd.Timestamp(2020, 3, 1),color='r')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
ax.grid(True, linewidth=0.5)
#plt.savefig(os.path.join('Graphs_collab', 'Subfields - works (2).png'), bbox_inches='tight', pad_inches=0.02)

In [None]:
# Save the per-subfield works table. Context manager so the pickle file is
# flushed and closed deterministically.
my_file = "WORKS_COUNT_DF.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(WORKS_COUNT_DF, fp)

In [None]:
# Reorder the columns by their total (column sum), largest first.
column_totals = WORKS_COUNT_DF.sum(axis=0).to_frame()
columns_by_total = column_totals.sort_values(by=0, ascending=False).index
WORKS_COUNT_DF = WORKS_COUNT_DF[list(columns_by_total)]

In [None]:
# Redraw the works-per-subfield figure, one line per column of WORKS_COUNT_DF
# in its current column order (which also fixes the legend order).
fig, ax = plt.subplots(figsize=(15, 5))
x_data = WORKS_COUNT_DF.index
for subfield in WORKS_COUNT_DF.columns:
    ax.plot(x_data, WORKS_COUNT_DF[subfield], markersize=6, label=subfield)

# Axis formatting and decorations.
ax.yaxis.set_major_formatter(formatter)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %y'))
ax.set_title('Subfields - works', size=30)
ax.axvline(pd.Timestamp(2020, 3, 1), color='r')  # ax.axvline(x_dates[84],color='r')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
ax.grid(True, linewidth=0.5)
#plt.savefig(os.path.join('Graphs_collab', 'Subfields - works (2).png'), bbox_inches='tight', pad_inches=0.02)

In [None]:
fig2, ax2 = plt.subplots(figsize=(15, 5))

# Whole dataset: distinct authors per publication_date_1. Its index defines the
# x axis shared by every subfield series below.
print('Whole Dataset')
AUTHORS_COUNT =  works_authors_aff.reset_index().groupby('publication_date_1').author_id.nunique().to_frame()
x_data = AUTHORS_COUNT.index
ax2.plot(x_data, AUTHORS_COUNT.author_id, markersize=6,label='whole dataset')
AUTHORS_COUNT = AUTHORS_COUNT.rename(columns={'author_id':'Whole Dataset'})

# Collect one single-column frame per series and join them once after the loop,
# instead of growing AUTHORS_COUNT_DF with a merge on every iteration.
authors_count_columns = [AUTHORS_COUNT]

for subfield in preprint_categories_list:
    print(subfield)

    # Work ids that belong to this category.
    work_id_set = preprint_dict[subfield]

    # Only the authorship table is needed here; the per-subfield filter of
    # `works` was computed but never used, so it has been dropped.
    works_authorships_fill_sub = works_authors_aff.query('work_id.isin(@work_id_set)').sort_index()
    works_authorships_fill_drop_sub = works_authorships_fill_sub.drop_duplicates(subset=['work_id','author_id'])

    AUTHORS_COUNT = works_authorships_fill_drop_sub.groupby('publication_date_1').author_id.nunique().to_frame()
    # Align on the whole-dataset dates; dates without authors in this subfield
    # become NaN.
    AUTHORS_COUNT = AUTHORS_COUNT.reindex(x_data)
    ax2.plot(x_data, AUTHORS_COUNT.author_id, markersize=6,label=subfield)
    AUTHORS_COUNT = AUTHORS_COUNT.rename(columns={'author_id':subfield})
    authors_count_columns.append(AUTHORS_COUNT)

# All frames share the same index (x_data), so a column-wise concat gives the
# same table as merging them one by one on the index.
AUTHORS_COUNT_DF = pd.concat(authors_count_columns, axis=1)

ax2.yaxis.set_major_formatter(formatter)
ax2.set_title('Subfields - authors',size=30)
ax2.xaxis.set_major_formatter(mdates.DateFormatter('%b %y'))
ax2.axvline(pd.Timestamp(2020, 3, 1),color='r')
ax2.legend(loc='center left', bbox_to_anchor=(1, 0.5))
ax2.grid(True, linewidth=0.5)
#plt.savefig(os.path.join('Graphs_collab', 'Subfields - authors (2).png'), bbox_inches='tight', pad_inches=0.02)

In [None]:
# Save the per-subfield authors table. Context manager so the pickle file is
# flushed and closed deterministically.
my_file = "AUTHORS_COUNT_DF.pickle"
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(AUTHORS_COUNT_DF, fp)

In [None]:
# Total number of works per column of WORKS_COUNT_DF (the whole dataset plus
# each subfield), largest first.
WORKS_COUNT_TOT_DF = (
    WORKS_COUNT_DF.sum()
    .to_frame()
    .sort_values(by=0, ascending=False)
    .rename(columns={0: 'work_count'})
)
# Share of all works, in percent. The denominator is the 'Whole Dataset' total;
# dividing by the number of categories (as before) does not yield a percentage
# of works.
total_works = WORKS_COUNT_TOT_DF.loc['Whole Dataset', 'work_count']
WORKS_COUNT_TOT_DF['perc'] = (WORKS_COUNT_TOT_DF['work_count'] / total_works) * 100
my_file = "WORKS_COUNT_TOT_DF.pickle"
# Context manager so the pickle file is flushed and closed deterministically.
with open(os.path.join(basepath3, my_file), 'wb') as fp:
    pickle.dump(WORKS_COUNT_TOT_DF, fp)
WORKS_COUNT_TOT_DF