# Team Distances Analysis

In [None]:
from pathlib import Path
from time import time
from tqdm.auto import tqdm
import pandas as pd 
import numpy as np
import os
import pickle
import networkx as nx
import random
tqdm.pandas()

from scipy import optimize
import numpy.polynomial.polynomial as npoly

import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
plt.rcParams.update({'font.size': 22})
sns.set(style="ticks", context="talk")
plt.style.use("dark_background")
pd.options.plotting.backend = 'plotly'
pio.templates.default = 'plotly_dark+presentation'

import matplotlib.dates as dates
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
def form(x,pos):
    """Format a tick value compactly for a matplotlib axis.

    Values below 100 are printed without decimals, values below 1,000 with
    three decimals, values below one million in thousands (``K``) and
    everything else in millions (``M``).  ``pos`` is the tick position
    passed by the ``FuncFormatter`` callback protocol; it is not used.
    """
    # (exclusive upper bound, template, factor applied before formatting)
    tiers = (
        (1e2, '%1.0f', None),
        (1e3, '%1.3f', None),
        (1e6, '%1.1fK', 1e-3),
    )
    for upper, template, scale in tiers:
        if x < upper:
            shown = x if scale is None else x * scale
            return template % (shown)
    # Anything that matched no tier is reported in millions.
    return '%1.1fM' % (x * 1e-6)
formatter = FuncFormatter(form)

def plot_(df_,x_column,y_column,x_label,title):
    """Draw ``y_column`` against ``x_column`` as a cyan line with markers.

    A red vertical line is added at 1 March 2020 and the y axis uses the
    module-level compact ``formatter``.  Nothing is returned; the figure is
    left as the current matplotlib figure.
    """
    plt.style.use("dark_background")
    fig, ax = plt.subplots(figsize=(15, 5))

    ax.plot(list(df_[x_column]), df_[y_column], "co-", markersize=6, label='dataset')
    ax.axvline(pd.Timestamp(2020, 3, 1), color='r')

    plt.grid(True, linewidth=0.5)
    ax.yaxis.set_major_formatter(formatter)

    ax.set_title(title, size=30)
    ax.set_xlabel(x_label, size=20)
    # Legend and saving to disk are currently disabled:
    #ax.legend()
    #plt.savefig(os.path.join(my_path_plots, title+'.png'), bbox_inches='tight', pad_inches=0.02)
    
def read_parquet(name, **args):
    """Read a parquet dataset with the ``fastparquet`` engine and log timing.

    Parameters
    ----------
    name : str or os.PathLike
        Location of the parquet file/directory.
    **args
        Extra keyword arguments forwarded to ``pandas.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The loaded table, unmodified.
    """
    # Normalise to a Path so ``.stem`` below also works for plain strings;
    # previously a string argument raised AttributeError only after the whole
    # file had already been read.
    path = Path(name)
    print(f'Reading {name!r}')
    tic = time()
    df = pd.read_parquet(path, engine='fastparquet', **args)
    toc = time()

    # De-duplication is disabled (df.drop_duplicates is not called), so the
    # number of dropped rows is always zero; the message layout is kept as is.
    n_rows = len(df)
    n_dropped = 0
    print(f'Read {n_rows:,} rows from {path.stem!r} in {toc-tic:.2f} sec. {n_dropped:,} duplicates.')
    return df

In [None]:
# Input tables are read from ``basepath``; everything this notebook writes goes to ``my_path_``.
basepath = Path('Tables_final')
my_path_ = Path('TeamDistance')
# Create the output folder on first run.
if not my_path_.exists():
    my_path_.mkdir(parents=True)

## Average Team Distance - ATD

In [None]:
# Load the works table, the work/author/affiliation table and the institution-pair distance table.
works = read_parquet(basepath / 'works')
works_authors_aff = read_parquet(basepath / 'works_authors_aff')
I_dist = read_parquet(basepath / 'I_dist_threshold')
# Make the distance table usable in both orientations (source->target and target->source).
# NOTE: I_dist1 is only another name for the same object as I_dist (no copy), so the
# in-place column swap two lines below is visible through I_dist1 as well.
I_dist1 = I_dist
# Non-self pairs, selected BEFORE the swap, so they keep the stored orientation.
I_dist2 = I_dist[I_dist.source!=I_dist.target]
# Swap the two endpoint columns in place (presumably assigned by position - TODO confirm).
I_dist[['target','source']] = I_dist[['source','target']]
# Result: every row in swapped orientation plus every non-self pair in stored orientation.
I_dist = pd.concat([I_dist1,I_dist2])

In [None]:
# Report the number of rows in the works table.
print(f'{len(works.work_id)} considered preprints (no missing info)')

In [None]:
# Keep only the identifier columns, then collect the distinct authors of each work into a set.
works_authors_aff_ = works_authors_aff[['work_id','institution_id','author_id']]
work_authors_set_df = works_authors_aff_.groupby('work_id').author_id.apply(set).to_frame().reset_index()

#8 mins
def create_edges(author_list):
    """Return the edge list of the complete graph on ``author_list``.

    The graph is built with networkx and exported with
    ``nx.to_pandas_edgelist``, i.e. as a pandas DataFrame of edges.
    """
    clique = nx.complete_graph(author_list)
    edge_table = nx.to_pandas_edgelist(clique)
    return edge_table

# Build one edge list per work (all author pairs) and stack them into a single table.
# The stacked table has no work_id column yet; rows follow the order of work_authors_set_df.
edge_list_dfs = work_authors_set_df.author_id.apply(create_edges)
work_authors_edges_df = pd.concat(edge_list_dfs.tolist(), ignore_index=True)

# Team size = number of distinct authors of the work.
work_authors_set_df['num_authors'] = work_authors_set_df['author_id'].apply(lambda x:len(x))

In [None]:
import scipy.special
# Lookup table n -> C(n, 2), the number of author pairs in a team of n authors.
# NOTE(review): only n = 0..30 is covered; a larger team maps to NaN and the astype(int)
# below would then fail - presumably teams were capped at 30 authors upstream, confirm.
binom_list = [scipy.special.binom(x, 2) for x in range(31)]
binom_dict = dict(zip(list(range(31)),binom_list))

work_authors_set_df['binom_n_authors'] = work_authors_set_df['num_authors'].map(binom_dict)
work_authors_set_df['binom_n_authors'] = work_authors_set_df['binom_n_authors'].astype(int)

# Repeat each work_id once per author pair and attach the flattened list as the work_id
# column of the stacked edge table. This relies on the edge table having been concatenated
# in the same work order and on each work contributing exactly C(n, 2) rows.
work_id_list = work_authors_set_df['work_id']
binom_list = work_authors_set_df['binom_n_authors']
work_mult_binom = [[work_id_list[i]]*binom_list[i] for i in range(len(work_id_list))]
import itertools
work_mult_binom_ = list(itertools.chain(*work_mult_binom))
work_authors_edges_df['work_id'] = work_mult_binom_
work_authors_edges_df = work_authors_edges_df[['work_id','source','target']]

# Pipeline per paper: author pairs -> institutions of both authors -> distance per pair -> mean.
# Attach the institution of the source author, then of the target author (left joins on
# work and author). NOTE(review): if an author has several rows for the same work in
# works_authors_aff, the join yields one edge row per combination - confirm this is intended.
work_authors_edges_df = work_authors_edges_df.merge(works_authors_aff[['work_id','author_id','institution_id']].rename(columns={'author_id':'source','institution_id':'source_inst'}),on=['work_id','source'], how='left')
work_authors_edges_df = work_authors_edges_df.merge(works_authors_aff[['work_id','author_id','institution_id']].rename(columns={'author_id':'target','institution_id':'target_inst'}),on=['work_id','target'], how='left')

# Attach the distance between the two institutions (left join on the institution pair).
work_authors_edges_df = work_authors_edges_df.merge(I_dist.rename(columns={'source':'source_inst','target':'target_inst'}),on=['source_inst','target_inst'], how='left')

# Persist the enriched edge table in the output folder.
my_file = 'work_authors_edges_df_dist'
work_authors_edges_df.to_parquet(os.path.join(my_path_, my_file))

In [None]:
# Mean pairwise distance per work (arithmetic mean of the 'dist' column over its author pairs).
work_edges_dist_mean = work_authors_edges_df.groupby('work_id').dist.mean().to_frame().reset_index()
# Bring in the index of `works` as a column (presumably publication_date_1, which the
# groupby below expects - TODO confirm) via an inner join on work_id.
work_edges_dist_mean = work_edges_dist_mean.merge(works[['work_id']].reset_index(),on='work_id')
my_file = "work_edges_dist_mean.csv"    
work_edges_dist_mean.to_csv(os.path.join(my_path_, my_file),index=False)
# Average the per-work means over each distinct publication_date_1 value.
work_edges_dist_mean_monthly = work_edges_dist_mean.groupby('publication_date_1').dist.mean().to_frame().reset_index()

In [None]:
#plot only point #not rolling average #fit 3 lines

# Reload the per-work mean distances from the CSV written earlier, so this cell can run
# without recomputing the edge table.
my_file = "work_edges_dist_mean.csv"    
work_edges_dist_mean = pd.read_csv(os.path.join(my_path_, my_file))
# CSV stores dates as text; convert back to datetimes before grouping.
work_edges_dist_mean['publication_date_1'] = pd.to_datetime(work_edges_dist_mean['publication_date_1'])
work_edges_dist_mean_monthly = work_edges_dist_mean.groupby('publication_date_1').dist.mean().to_frame().reset_index()

In [None]:
def plot_fit_rolling_breakpoints2(df,x_column,x_label,title,window_size,num_breakpoints,ff):
    """Plot a rolling-mean series and overlay its best piecewise-linear fit.

    The series is split at ``num_breakpoints`` index positions chosen by an
    exhaustive grid search (``scipy.optimize.brute``) that minimises the
    summed squared error of independent straight-line fits on each piece.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``publication_date_1`` column (x axis) and the column
        named by ``x_column``.
    x_column : str
        Name of the column holding the values to plot and fit (despite the
        name, this is the y series).
    x_label : str
        Label of the x axis.
    title : str
        Figure title.
    window_size : int
        Rolling-mean window; the first ``window_size - 1`` points are dropped.
        ``1`` plots the raw values.
    num_breakpoints : int
        Number of breakpoints of the piecewise fit.
    ff : int
        ``1`` prints slopes with 2 decimals in the legend, anything else
        with 5 decimals.

    Returns
    -------
    None
        The figure is left as the current matplotlib figure.
    """

    def fit_segments(breakpoints, x, y):
        # Fit an independent degree-1 polynomial on every piece that has at
        # least two points; shorter pieces are skipped.
        cuts = tuple(map(int, sorted(breakpoints)))
        segments = []
        for xi, yi in zip(np.split(x, cuts), np.split(y, cuts)):
            if len(xi) < 2:
                continue
            line = npoly.Polynomial(npoly.polyfit(xi, yi, 1))
            segments.append([line, xi, yi])
        return segments

    def total_squared_error(breakpoints, x, y, fcache):
        # Objective for the grid search. Results are cached per sorted
        # breakpoint tuple because the grid visits every permutation.
        cuts = tuple(map(int, sorted(breakpoints)))
        if cuts not in fcache:
            total_error = 0
            for line, xi, yi in fit_segments(cuts, x, y):
                total_error += ((line(xi) - yi)**2).sum()
            fcache[cuts] = total_error
        return fcache[cuts]

    fig, ax = plt.subplots(figsize=(15, 5))
    x_dates = list(df['publication_date_1'])
    # Work on a plain numpy array: np.split on a pandas Series goes through
    # Series.swapaxes, which pandas has deprecated.
    y_data = df[x_column].rolling(window=window_size).mean()[window_size-1:].to_numpy()
    x_data = x_dates[window_size-1:]

    ax.plot(x_data, y_data, "co", markersize=6)

    plt.grid(True, linewidth=0.5)

    ax.set_xlabel(x_label,size=20)
    ax.set_title(title,size=30)

    # Month/year tick labels, e.g. "Mar 20".
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %y'))

    # Red vertical marker at 1 March 2020.
    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')

    # Fit in matplotlib date numbers so the x values are numeric.
    x_num = dates.date2num(x_data)
    breakpoints = optimize.brute(
        total_squared_error,
        [slice(1, len(x_data), 1)]*num_breakpoints,
        args=(x_num, y_data, {}),
        finish=None,
    )
    if num_breakpoints==1:
        # brute returns a scalar for a one-dimensional search.
        breakpoints = [breakpoints]

    slope_digits = 2 if ff==1 else 5
    for line, xi, yi in fit_segments(breakpoints, x_num, y_data):
        x_interval = np.array([xi.min(), xi.max()])
        coef = line.convert().coef
        b = coef[0]  # intercept
        a = coef[1]  # slope
        # Print the intercept with an explicit sign (non-positive -> "- |b|").
        sign = '+' if b>0 else '-'
        ll = f'y = {a:.{slope_digits}f} x {sign} {abs(b):.0f}'
        ax.plot(x_interval, line(x_interval), 'yo-',label=ll,linewidth=2, markersize=7)

    ax.legend()

In [None]:
# Monthly mean team distance from 2000 up to (not including) 2024, fitted with two breakpoints.
df = work_edges_dist_mean_monthly
in_range = (df.publication_date_1>='2000') & (df.publication_date_1<'2024')
df = df[in_range]

# Column with the values to fit (the function's ``x_column`` argument is the y series).
x_column, x_label = 'dist', 'month'
title = 'Monthly average mean (euclidean) distances edges works (without rolling average)'
# window_size=1 means no smoothing; ff=1 prints slopes with two decimals.
window_size, num_breakpoints, ff = 1, 2, 1
plot_fit_rolling_breakpoints2(df,x_column,x_label,title,window_size,num_breakpoints,ff)

In [None]:
# Same plot as the previous cell (its variables are reused), with a single breakpoint.
num_breakpoints=1
plot_fit_rolling_breakpoints2(df,x_column,x_label,title,window_size,num_breakpoints,ff)

### COVID

In [None]:
# COVID vs non-COVID split: load the two pickled collections of work ids.
# NOTE: pickle.load executes whatever the file contains - only load files you created yourself.
my_file = 'preprint_id_set_COVID'
with open(os.path.join(basepath, my_file),"rb") as fp:
    preprint_id_set_COVID = pickle.load(fp)
my_file = 'preprint_id_set_noCOVID'
with open(os.path.join(basepath, my_file),"rb") as fp:
    preprint_id_set_noCOVID = pickle.load(fp)
# Restrict the affiliation table to each group of works.
works_authors_aff_COVID = works_authors_aff[works_authors_aff.work_id.isin(preprint_id_set_COVID)]
works_authors_aff_noCOVID = works_authors_aff[works_authors_aff.work_id.isin(preprint_id_set_noCOVID)]

# Distinct authors per work, for each group.
works_authors_aff_COVID_ = works_authors_aff_COVID[['work_id','author_id']]
works_authors_aff_noCOVID_ = works_authors_aff_noCOVID[['work_id','author_id']]
work_authors_set_df_COVID = works_authors_aff_COVID_.groupby('work_id').author_id.apply(set).to_frame().reset_index()
work_authors_set_df_noCOVID = works_authors_aff_noCOVID_.groupby('work_id').author_id.apply(set).to_frame().reset_index()

# COVID group: all author pairs per work, then label every pair with its work_id by
# repeating each work_id C(n, 2) times (same order-dependent approach as for all works).
# Requires create_edges, binom_dict and itertools from the earlier cells.
edge_list_dfs_COVID = work_authors_set_df_COVID.author_id.apply(create_edges)
work_authors_edges_df_COVID = pd.concat(edge_list_dfs_COVID.tolist(), ignore_index=True)
work_authors_set_df_COVID['num_authors'] = work_authors_set_df_COVID['author_id'].apply(lambda x:len(x))
# NOTE(review): binom_dict only covers 0..30 authors; larger teams map to NaN and break astype(int).
work_authors_set_df_COVID['binom_n_authors'] = work_authors_set_df_COVID['num_authors'].map(binom_dict)
work_authors_set_df_COVID['binom_n_authors'] = work_authors_set_df_COVID['binom_n_authors'].astype(int)
work_id_list_COVID = work_authors_set_df_COVID['work_id']
binom_list_COVID = work_authors_set_df_COVID['binom_n_authors']
work_mult_binom_COVID = [[work_id_list_COVID[i]]*binom_list_COVID[i] for i in range(len(work_id_list_COVID))]
work_mult_binom_COVID_ = list(itertools.chain(*work_mult_binom_COVID))
work_authors_edges_df_COVID['work_id'] = work_mult_binom_COVID_
work_authors_edges_df_COVID = work_authors_edges_df_COVID[['work_id','source','target']]
# Persist the labelled edge table in the output folder.
my_file = 'work_authors_edges_df_COVID'
work_authors_edges_df_COVID.to_parquet(os.path.join(my_path_, my_file))

In [None]:
# Non-COVID group: same steps as for the COVID group - all author pairs per work, then
# label every pair with its work_id by repeating each work_id C(n, 2) times.
edge_list_dfs_noCOVID = work_authors_set_df_noCOVID.author_id.apply(create_edges)
work_authors_edges_df_noCOVID = pd.concat(edge_list_dfs_noCOVID.tolist(), ignore_index=True)
work_authors_set_df_noCOVID['num_authors'] = work_authors_set_df_noCOVID['author_id'].apply(lambda x:len(x))
# NOTE(review): binom_dict only covers 0..30 authors; larger teams map to NaN and break astype(int).
work_authors_set_df_noCOVID['binom_n_authors'] = work_authors_set_df_noCOVID['num_authors'].map(binom_dict)
work_authors_set_df_noCOVID['binom_n_authors'] = work_authors_set_df_noCOVID['binom_n_authors'].astype(int)
work_id_list_noCOVID = work_authors_set_df_noCOVID['work_id']
binom_list_noCOVID = work_authors_set_df_noCOVID['binom_n_authors']
work_mult_binom_noCOVID = [[work_id_list_noCOVID[i]]*binom_list_noCOVID[i] for i in range(len(work_id_list_noCOVID))]
work_mult_binom_noCOVID_ = list(itertools.chain(*work_mult_binom_noCOVID))
work_authors_edges_df_noCOVID['work_id'] = work_mult_binom_noCOVID_
work_authors_edges_df_noCOVID = work_authors_edges_df_noCOVID[['work_id','source','target']]
# Persist the labelled edge table in the output folder.
my_file = 'work_authors_edges_df_noCOVID'
work_authors_edges_df_noCOVID.to_parquet(os.path.join(my_path_, my_file))

In [None]:
# COVID group: reload the saved edge table, attach the institution of both authors and
# then the distance between the two institutions.
work_authors_edges_df_COVID = read_parquet(my_path_ / 'work_authors_edges_df_COVID')
# Cast author ids to int64 before joining (presumably to match the dtype of the reloaded
# edge table - TODO confirm).
# NOTE(review): this assigns into a frame obtained by filtering works_authors_aff and may
# raise pandas' SettingWithCopyWarning.
works_authors_aff_COVID['author_id'] = works_authors_aff_COVID['author_id'].astype('int64')
work_authors_edges_df_COVID = work_authors_edges_df_COVID.merge(works_authors_aff_COVID[['work_id','author_id','institution_id']].rename(columns={'author_id':'source','institution_id':'source_inst'}),on=['work_id','source'], how='left')
work_authors_edges_df_COVID = work_authors_edges_df_COVID.merge(works_authors_aff_COVID[['work_id','author_id','institution_id']].rename(columns={'author_id':'target','institution_id':'target_inst'}),on=['work_id','target'], how='left')
# NOTE(review): astype(int) fails if the left joins above left any institution missing (NaN).
work_authors_edges_df_COVID['source_inst'] = work_authors_edges_df_COVID['source_inst'].astype(int)
work_authors_edges_df_COVID['target_inst'] = work_authors_edges_df_COVID['target_inst'].astype(int)
work_authors_edges_df_COVID = work_authors_edges_df_COVID.merge(I_dist.rename(columns={'source':'source_inst','target':'target_inst'}),on=['source_inst','target_inst'], how='left')

# Non-COVID group: identical steps.
work_authors_edges_df_noCOVID = read_parquet(my_path_ / 'work_authors_edges_df_noCOVID')
works_authors_aff_noCOVID['author_id'] = works_authors_aff_noCOVID['author_id'].astype('int64')
work_authors_edges_df_noCOVID = work_authors_edges_df_noCOVID.merge(works_authors_aff_noCOVID[['work_id','author_id','institution_id']].rename(columns={'author_id':'source','institution_id':'source_inst'}),on=['work_id','source'], how='left')
work_authors_edges_df_noCOVID = work_authors_edges_df_noCOVID.merge(works_authors_aff_noCOVID[['work_id','author_id','institution_id']].rename(columns={'author_id':'target','institution_id':'target_inst'}),on=['work_id','target'], how='left')
work_authors_edges_df_noCOVID['source_inst'] = work_authors_edges_df_noCOVID['source_inst'].astype(int)
work_authors_edges_df_noCOVID['target_inst'] = work_authors_edges_df_noCOVID['target_inst'].astype(int)
work_authors_edges_df_noCOVID = work_authors_edges_df_noCOVID.merge(I_dist.rename(columns={'source':'source_inst','target':'target_inst'}),on=['source_inst','target_inst'], how='left')

In [None]:
# COVID group: mean pairwise distance per work, joined with the index of `works`
# (presumably publication_date_1 - TODO confirm), then averaged per publication date.
work_edges_dist_mean_COVID = work_authors_edges_df_COVID.groupby('work_id').dist.mean().to_frame().reset_index()
work_edges_dist_mean_COVID = work_edges_dist_mean_COVID.merge(works[['work_id']].reset_index(),on='work_id')
work_edges_dist_mean_monthly_COVID = work_edges_dist_mean_COVID.groupby('publication_date_1').dist.mean().to_frame().reset_index()
# Only the per-work table is written to disk; the per-date table stays in memory.
my_file = "work_edges_dist_mean_COVID.csv"    
work_edges_dist_mean_COVID.to_csv(os.path.join(my_path_, my_file),index=False)

# Non-COVID group: identical steps.
work_edges_dist_mean_noCOVID = work_authors_edges_df_noCOVID.groupby('work_id').dist.mean().to_frame().reset_index()
work_edges_dist_mean_noCOVID = work_edges_dist_mean_noCOVID.merge(works[['work_id']].reset_index(),on='work_id')
work_edges_dist_mean_monthly_noCOVID = work_edges_dist_mean_noCOVID.groupby('publication_date_1').dist.mean().to_frame().reset_index()
my_file = "work_edges_dist_mean_noCOVID.csv"    
work_edges_dist_mean_noCOVID.to_csv(os.path.join(my_path_, my_file),index=False)

## Power-law

In [None]:
#all preprints - edges

def my_weight(G, u, v, weight="weight"):
    """Projection weight between ``u`` and ``v``.

    For every neighbour shared by ``u`` and ``v`` the two incident edge
    weights are averaged (a missing ``weight`` attribute counts as 1), and
    these averages are summed over all shared neighbours.
    """
    adj_u = G[u]
    adj_v = G[v]
    shared = set(adj_u) & set(adj_v)
    return sum(
        (adj_u[nbr].get(weight, 1) + adj_v[nbr].get(weight, 1))/2
        for nbr in shared
    )

def make_institution_graph(works_authors_rows):
    """Project the work-institution bipartite graph onto institutions.

    ``works_authors_rows`` needs the columns ``work_id``, ``institution_id``
    and ``weight``.  Edges of the returned institution graph are weighted
    with ``my_weight``.
    """
    # Bipartite graph work <-> institution carrying the per-row 'weight'.
    work_inst_graph = nx.from_pandas_edgelist(
        works_authors_rows,
        source='work_id', target='institution_id', edge_attr ='weight'
    )
    institutions = set(works_authors_rows.institution_id)

    # Alternative kept for reference: plain shared-neighbour counts.
    #inst_graph = nx.bipartite.weighted_projected_graph(bip_g,nodes=institution_id_set)
    return nx.bipartite.generic_weighted_projected_graph(
        work_inst_graph, nodes=institutions, weight_function=my_weight
    )

In [None]:
def mean_scatter_plot_multi(func,edges_df,degree_df,col,columnx,columny,labelx,labely,title,logx=False,logy=False,limity=False,geomspace=False):
    """Plot binned means of ``columny`` against ``columnx`` for every period.

    For each entry of the global ``periods_list`` the table returned by
    ``func(edges_df, degree_df, col, period)`` is binned along ``columnx``
    (100 edges, linearly or geometrically spaced) and the mean of
    ``columny`` in each bin is drawn at the bin centre.  Bins are half-open
    ``[lo, hi)``; empty bins yield NaN.  Colours and legend labels come from
    the globals ``color_dict`` and ``periods_labels``.
    """
    n_edges = 10**2
    make_edges = np.geomspace if geomspace else np.linspace

    fig, ax = plt.subplots(figsize=(8,5))
    for y in periods_list[0:]:
        table = func(edges_df,degree_df,col,y)
        xs = np.array(table[columnx])
        ys = np.array(table[columny])

        edges = make_edges(xs.min(), xs.max(), n_edges)
        lows, highs = edges[:-1], edges[1:]
        # Mean of the y values falling into each half-open bin.
        bin_means = [
            np.mean(ys[(xs >= lo) & (xs < hi)])
            for lo, hi in zip(lows, highs)
        ]
        centers = (lows + highs) / 2
        print(centers[0])
        ax.plot(centers, bin_means, '.',color=color_dict[y],markersize=5, label=periods_labels[y])

    ax.set_title(title,size=30)
    ax.set_xlabel(labelx,size=20)
    ax.set_ylabel(labely,size=20)
    # The flags are compared with == True on purpose, so only True (or 1) enables them.
    if logx == True:
        ax.set_xscale('log')
    if logy == True:
        ax.set_yscale('log')
    if limity == True:
        ax.set_ylim([0, 1])
    ax.legend(bbox_to_anchor=(1.0, 0.9), prop={'size': 15}, markerscale=3)
    plt.show()
def scipy_fun_multi(func1,func2,edges_df,degree_df,col,x_max,labelx,labely,title):
    """Fit and plot ``y = b * x`` on the points with ``x <= x_max``, per period.

    ``func1(func2, edges_df, degree_df, col, period)`` must return the x and
    y values to fit.  The fitted coefficient and its standard error are
    printed for every entry of the global ``periods_list``.
    """
    fig, ax = plt.subplots(figsize=(8,5))
    for y in periods_list[0:]:
        xs, ys = func1(func2,edges_df,degree_df,col,y)
        ys = np.array(ys)

        # Restrict both the fit and the drawing to x <= x_max.
        in_range = xs <= x_max
        x_fit = xs[in_range]
        y_fit = ys[in_range]

        popt, pcov = scipy.optimize.curve_fit(linear, x_fit, y_fit)
        perr = np.sqrt(np.diag(pcov))
        print(f'{periods_labels[y]}: gamma {popt[0]} (perr {perr[0]:.4f})')

        colour = color_dict[y]
        ax.scatter(x_fit, y_fit,marker='.',color=colour,linewidths=0.01, label=periods_labels[y])
        ax.plot(x_fit, linear(x_fit, *popt),color=colour,linewidth=2.0)

    ax.set_xlabel(labelx,size=20)
    ax.set_ylabel(labely,size=20)
    ax.legend(bbox_to_anchor=(1.0, 0.9), prop={'size': 15}, markerscale=3)
    ax.set_title(title+" - linear",size=30)
    plt.show()
def linear(x, b):
    """Line through the origin with slope ``b``, evaluated at ``x``."""
    slope = b
    return slope * x
def linear_fit(func,edges_df,degree_df,col,yy):
    """Bin ``col`` by the ``s`` column and return bin centres and bin means.

    The table comes from ``func(edges_df, degree_df, col, yy)``.  100
    linearly spaced edges span the range of ``s``; bins are half-open
    ``[lo, hi)`` and bins whose mean is NaN (no points) are dropped.
    Returns the pair ``(centres, means)`` as numpy arrays.
    """
    table = func(edges_df,degree_df,col,yy)
    strength = np.array(table["s"])
    values = np.array(table[col])

    edges = np.linspace(strength.min(), strength.max(), 10**2)
    lows, highs = edges[:-1], edges[1:]
    bin_means = np.array([
        np.mean(values[(strength >= lo) & (strength < hi)])
        for lo, hi in zip(lows, highs)
    ])
    centers = (lows + highs) / 2

    filled = ~np.isnan(bin_means)
    return centers[filled], bin_means[filled]
def fig_2B(edges_df,degree_df,col,y):
    """Self-loop values of period ``y`` joined with that period's degree table.

    Keeps the edges whose ``source`` equals ``target``, exposes ``source`` as
    ``institution_id`` together with ``col``, and inner-joins the result with
    the rows of ``degree_df`` for the same period.
    """
    period_edges = edges_df.query('period == @y')
    period_degrees = degree_df.query('period == @y')

    is_loop = period_edges.source == period_edges.target
    self_loops = period_edges.loc[is_loop, ['source', col]]
    self_loops = self_loops.rename(columns={'source':'institution_id'})
    return self_loops.merge(period_degrees,on='institution_id')
def fig_4B(df1,df2,col,y):
    """Edge value normalised by the geometric mean of the endpoint strengths.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Edge table with columns ``period``, ``source``, ``target``, ``col``
        and ``dist``.
    df2 : pandas.DataFrame
        Node table with columns ``period``, ``institution_id`` and ``s``.
    col : str
        Name of the edge column to normalise.
    y
        Period to select in both tables.

    Returns
    -------
    pandas.DataFrame
        Columns ``dist`` and ``ratio`` where
        ``ratio = col / sqrt(s_source * s_target)``; rows with ``dist <= 0``
        are removed.
    """
    edges = df1.query('period == @y')[['source','target',col,'dist']]
    nodes = df2.query('period == @y')[['institution_id','s']]
    strength_by_inst = nodes.set_index('institution_id')['s'].to_dict()

    # Compute on standalone Series instead of adding columns to the filtered
    # slice, which triggered pandas' chained-assignment warning.
    source_s = edges['source'].map(strength_by_inst)
    target_s = edges['target'].map(strength_by_inst)
    ratio = edges[col] / (source_s*target_s)**(1/2)

    result = edges[['dist']].assign(ratio=ratio)
    # Delete zero-distance pairs (e.g. self-loops).
    return result[result.dist>0]
def fig_4B_prods(df1,df2,col,y):
    """Edge value normalised by the product of the endpoint strengths.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Edge table with columns ``period``, ``source``, ``target``, ``col``
        and ``dist``.
    df2 : pandas.DataFrame
        Node table with columns ``period``, ``institution_id`` and ``s``.
    col : str
        Name of the edge column to normalise.
    y
        Period to select in both tables.

    Returns
    -------
    pandas.DataFrame
        Columns ``dist`` and ``ratio`` where
        ``ratio = col / (s_source * s_target)``; rows with ``dist <= 0`` are
        removed.
    """
    edges = df1.query('period == @y')[['source','target',col,'dist']]
    nodes = df2.query('period == @y')[['institution_id','s']]
    strength_by_inst = nodes.set_index('institution_id')['s'].to_dict()

    # Compute on standalone Series instead of adding columns to the filtered
    # slice, which triggered pandas' chained-assignment warning.
    source_s = edges['source'].map(strength_by_inst)
    target_s = edges['target'].map(strength_by_inst)
    ratio = edges[col] / (source_s*target_s)

    result = edges[['dist']].assign(ratio=ratio)
    # Delete zero-distance pairs (e.g. self-loops).
    return result[result.dist>0]
def scipy_fun_multi2(func1,func2,edges_df,degree_df,col,x_min,labelx,labely,title):
    """Fit and plot ``y = b * x**c`` on the points with ``x >= x_min``, per period.

    ``func1(func2, edges_df, degree_df, col, period)`` must return the x and
    y values to fit.  For every entry of the global ``periods_list`` the
    fitted parameters are printed (``alpha`` is ``-c``, ``beta`` is ``b``)
    and data plus fitted curve are drawn on log-log axes.
    """
    fig, ax = plt.subplots(figsize=(8,5))
    for y in periods_list[0:]:
        xs, ys = func1(func2,edges_df,degree_df,col,y)
        ys = np.array(ys)

        # Restrict both the fit and the drawing to x >= x_min.
        in_range = xs >= x_min
        x_fit = xs[in_range]
        y_fit = ys[in_range]

        popt, pcov = scipy.optimize.curve_fit(power2, x_fit, y_fit)
        perr = np.sqrt(np.diag(pcov))
        print(f'{periods_labels[y]}: alpha {(-1)*popt[1]:.3f} (perr {perr[1]:.4f}), beta {popt[0]:.6f} (perr {perr[0]:.4f})')

        colour = color_dict[y]
        ax.scatter(x_fit, y_fit,marker='.',color=colour,linewidths=0.01, label=periods_labels[y])
        ax.plot(x_fit, power2(x_fit, *popt),color=colour,linewidth=2.0)

    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel(labelx,size=20)
    ax.set_ylabel(labely,size=20)
    ax.legend(bbox_to_anchor=(1.0, 0.9), prop={'size': 15}, markerscale=3)
    ax.set_title(title+" - power law",size=30)
    plt.show()
def power2(x, b, c):
    """Power law ``b * x**c`` evaluated at ``x``."""
    scaled_power = b * x ** c
    return scaled_power
def powerlaw_fit(func,edges_df,degree_df,col,yy):
    """Bin ``ratio`` by ``dist`` on a geometric grid and return centres and means.

    The table comes from ``func(edges_df, degree_df, col, yy)`` and must
    have the columns ``dist`` and ``ratio``.  100 geometrically spaced edges
    span the range of ``dist``; bins are half-open ``[lo, hi)``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Bin centres (arithmetic midpoint of the edges) and the mean ratio in
        each bin.  Bins with no defined mean are dropped, as ``linear_fit``
        does, so the result can be passed straight to ``curve_fit``.
    """
    df = func(edges_df,degree_df,col,yy)
    x = np.array(df["dist"])
    y = np.array(df["ratio"])
    gridsize = 10**2
    xbins = np.geomspace(x.min(), x.max(), gridsize)

    # Mean ratio inside each bin.
    mean_values = []
    for lo, hi in zip(xbins[:-1], xbins[1:]):
        in_bin = y[(x >= lo) & (x < hi)]
        # An empty bin is recorded as NaN explicitly; calling np.mean on an
        # empty slice would give the same value but emit a RuntimeWarning.
        mean_values.append(np.mean(in_bin) if in_bin.size else np.nan)

    centers = (xbins[:-1] + xbins[1:]) / 2
    means = np.array(mean_values)
    # Drop bins without a defined mean: curve_fit rejects NaN input by default.
    valid = ~np.isnan(means)
    return centers[valid], means[valid]

In [None]:
#all period

# Reload the works table, derive the publication year and count works per year.
works = read_parquet(basepath / 'works')
# reset_index turns the index into a column (presumably publication_date_1, used on the next line).
works = works.reset_index()
works['publication_year'] = works['publication_date_1'].dt.year
works_yearly_count = works.groupby('publication_year').work_id.count().to_frame()#.reset_index()
works = works.set_index('publication_year')
 
#works_authors_aff = read_parquet(basepath3 / 'works_authorships_fill')
# Reload the affiliation table and reduce it to one row per (work, institution).
works_authors_aff = read_parquet(basepath / 'works_authors_aff')
works_authors_aff = works_authors_aff.reset_index()
works_authors_aff['publication_year'] = works_authors_aff['publication_date_1'].dt.year
works_authors_aff['publication_year'] = works_authors_aff['publication_year'].astype('int64')
works_authors_aff = works_authors_aff.drop_duplicates(['work_id','institution_id']).reset_index()
works_authors_aff = works_authors_aff[['publication_year','work_id','institution_id']]
works_authors_aff = works_authors_aff.sort_values(by='publication_year')
works_authors_aff['institution_id'] = works_authors_aff['institution_id'].astype(int)

# Reload the distance table and make it usable in both orientations.
# NOTE: I_dist1 is the same object as I_dist, so the in-place column swap below is visible
# through I_dist1; I_dist2 is selected before the swap and keeps the stored orientation.
I_dist = read_parquet(basepath / 'I_dist_threshold')
I_dist1 = I_dist
I_dist2 = I_dist[I_dist.source!=I_dist.target]
I_dist[['target','source']] = I_dist[['source','target']]
I_dist = pd.concat([I_dist1,I_dist2])

# NOTE: pickle.load executes whatever the file contains - only load files you created yourself.
# NOTE(review): inst_id_name_dict is not used anywhere else in this cell.
my_file = "inst_id_name_dict.pickle" #dictionary inst_id and inst_name
with open(os.path.join(Path('./Tables'), my_file),"rb") as fp:
    inst_id_name_dict = pickle.load(fp)
    
# Count the number of (unique) institutions per paper.
works_authors_aff['num_affs'] = works_authors_aff.groupby('work_id')['institution_id'].transform('size')
print(f'{min(works_authors_aff.num_affs)}-{max(works_authors_aff.num_affs)} min-max number (unique) affiliations per work')
# Per-row weight 2 / (n * (n - 1)), i.e. 1 / C(n, 2) for a work with n institutions.
# For n == 1 the denominator is zero; those rows are overwritten with 1 on the next line.
works_authors_aff['weight'] = 2 / ( works_authors_aff['num_affs']*(works_authors_aff['num_affs']-1) ) 
works_authors_aff.loc[works_authors_aff.num_affs==1,'weight'] = 1 #one affiliation

# Sorted list of publication years present in the data.
years_list = list(set(works_authors_aff.publication_year))
years_list.sort()

# Split into multi-institution works (links between institutions) and
# single-institution works (self-loops).
works_authors_aff_noloops = works_authors_aff[works_authors_aff.num_affs>1]
works_authors_aff_loops = works_authors_aff[works_authors_aff.num_affs==1]
# Second endpoint of the self-loop is the institution itself.
# NOTE(review): assigning into a filtered frame may raise pandas' SettingWithCopyWarning.
works_authors_aff_loops['institution_id2'] = works_authors_aff_loops['institution_id']
works_authors_aff_noloops = works_authors_aff_noloops.set_index('publication_year')
works_authors_aff_loops = works_authors_aff_loops.set_index('publication_year')

# Yearly counts restricted to the labels 2000..2024 (label-based slice, both ends included).
works_yearly_count = works_yearly_count.loc[2000:2024]
print(f"Total number preprints (from {min(works_yearly_count.index)} to {max(works_yearly_count.index)}): {works_yearly_count.work_id.sum()}")

# Build the institution network over all rows (no split by year or period in this cell).
noloops_df = works_authors_aff_noloops
loops_df = works_authors_aff_loops
# Self-loop weight of an institution = sum of the weights of its single-institution works.
loops_df['weight'] = loops_df[['institution_id','institution_id2','weight']].groupby(['institution_id']).weight.transform('sum')
loops_df = loops_df.drop_duplicates('institution_id')
# Projection of the work-institution graph, then the self-loops added on top.
I_year = make_institution_graph(noloops_df)
I_year.add_weighted_edges_from([tuple(r) for r in loops_df[['institution_id','institution_id2','weight']].to_numpy()])
# Separate graph holding only the self-loops.
Iloops_year = nx.Graph()
Iloops_year.add_weighted_edges_from([tuple(r) for r in loops_df[['institution_id','institution_id2','weight']].to_numpy()])


# One-row summary: node count, edge count and total weight of the full graph and of the
# self-loop graph. NOTE(review): info_df is not written to disk in this cell.
info_df = pd.DataFrame.from_dict({'period':[0], 'N': [I_year.number_of_nodes()], 'E': [I_year.size()], 'W' : [I_year.size(weight='weight')], 'Nloops': [Iloops_year.number_of_nodes()], 'Eloops': [Iloops_year.size()], 'Wloops' : [Iloops_year.size(weight='weight')]})  
# Degree (k) and weighted degree (s) per node, for the full graph and the self-loop graph.
degree = {'k': I_year.degree(), 's' : I_year.degree(weight='weight'),'kloops': Iloops_year.degree(), 'sloops' : Iloops_year.degree(weight='weight')}

# Turn the four degree views into one table keyed by institution_id.
dict_y = degree
df_k = pd.DataFrame.from_dict(dict_y['k']).rename(columns={0:'institution_id',1:'k'})
df_s = pd.DataFrame.from_dict(dict_y['s']).rename(columns={0:'institution_id',1:'s'})
df_kloops = pd.DataFrame.from_dict(dict_y['kloops']).rename(columns={0:'institution_id',1:'kloops'})
df_sloops = pd.DataFrame.from_dict(dict_y['sloops']).rename(columns={0:'institution_id',1:'sloops'})
if len(df_kloops)!=0:
    # Left join: institutions without a self-loop get NaN in kloops/sloops.
    df_year = (df_k.merge(df_s,on='institution_id')).merge(df_kloops.merge(df_sloops,on='institution_id'),on='institution_id',how='left')
else:
    # No self-loops at all: keep the columns but fill them with NaN.
    df_year = df_k.merge(df_s,on='institution_id')
    df_year["kloops"] = np.nan
    df_year["sloops"] = np.nan
degree_df = df_year
my_file = "degree_df_whole.csv"    
degree_df.to_csv(os.path.join(my_path_, my_file),index=False)  

# Edge list of the network with the distance of each institution pair (left join, so
# pairs without a known distance are kept with a missing dist).
I_year_df = nx.to_pandas_edgelist(I_year)
I_year_df = I_year_df.merge(I_dist,on=['source','target'],how='left')
edges_df = I_year_df
my_file = "edges_df_whole.csv"    
edges_df.to_csv(os.path.join(my_path_, my_file),index=False)   

# Complement graph = institution pairs that are NOT linked. The merge has no `how`
# argument (pandas default is an inner join), so only pairs with a known distance remain.
# NOTE(review): the complement can be very large for many institutions - watch memory.
I_comp = nx.complement(I_year) 
I_comp_df = nx.to_pandas_edgelist(I_comp)
I_comp_df = I_comp_df.merge(I_dist,on=['source','target'])
edges_comp_df = I_comp_df
my_file = "edges_comp_df_whole.csv"    
edges_comp_df.to_csv(os.path.join(my_path_, my_file),index=False)

In [None]:
# Plot inputs: reload the whole-period edge and degree tables written by the previous cell.
edges_df = pd.read_csv(my_path_ / 'edges_df_whole.csv')  
degree_df = pd.read_csv(my_path_ / 'degree_df_whole.csv')  
# Institutions without self-loops have NaN in kloops/sloops; treat them as 0.
degree_df.fillna(0, inplace=True)
# Unlinked institution pairs enter the analysis with weight 0.
edges_comp_df = pd.read_csv(my_path_ / 'edges_comp_df_whole.csv') 
edges_comp_df['weight'] = 0
edges_df_tot = pd.concat([edges_df,edges_comp_df])
# The plotting helpers filter on a 'period' column; the whole range is the single period 0.
edges_df.insert(0, 'period', 0)
edges_df_tot.insert(0, 'period', 0)
degree_df.insert(0, 'period', 0)
edges_comp_df.insert(0, 'period', 0)
# Globals read by the plotting helpers: periods to loop over, their legend labels and colours.
periods_list = [0]
periods_labels = ['[2000,2024]']
color_dict = {0: 'green'}

In [None]:
# Fig. 2B: self-loop weight w_ii against strength s_i - binned means, then a linear fit on s_i <= 2*10**4.
mean_scatter_plot_multi(fig_2B,edges_df_tot,degree_df,"weight","s","weight","s_i","w_ii",'Fig. 2B',False,False,False,False)
scipy_fun_multi(linear_fit,fig_2B,edges_df_tot,degree_df,"weight",2*10**4,"s_i","w_ii","Fig. 2B")
# Fig. 4B: weight normalised by sqrt(s_i*s_j) against distance, on log-log axes with geometric bins.
mean_scatter_plot_multi(fig_4B,edges_df_tot,degree_df,"weight","dist", "ratio","d_ij","w_ij / (s_i*s_j)^(1/2) ",'Fig. 4B - sqrt prod. strengths',True,True,False,True)
# Power-law fits restricted to d_ij >= 10 and d_ij >= 100.
# NOTE(review): scipy_fun_multi2 appends " - power law" to the title, so these titles say it twice.
scipy_fun_multi2(powerlaw_fit,fig_4B,edges_df_tot,degree_df,"weight",10**1,"d_ij","w_ij / (s_i*s_j)^(1/2) ","Fig. 4B - power law - sqrt prod. strengths")
scipy_fun_multi2(powerlaw_fit,fig_4B,edges_df_tot,degree_df,"weight",10**2,"d_ij","w_ij / (s_i*s_j)^(1/2) ","Fig. 4B - power law - sqrt prod. strengths")

In [None]:
# Fig. 4B variant: weight normalised by the plain product s_i*s_j against distance (log-log, geometric bins).
mean_scatter_plot_multi(fig_4B_prods,edges_df_tot,degree_df,"weight","dist", "ratio","d_ij","w_ij / (s_i*s_j) ",'Fig. 4B - prod. strengths',True,True,False,True)
# Power-law fits restricted to d_ij >= 10 and d_ij >= 100.
# NOTE(review): scipy_fun_multi2 appends " - power law" to the title, so these titles say it twice.
scipy_fun_multi2(powerlaw_fit,fig_4B_prods,edges_df_tot,degree_df,"weight",10**1,"d_ij","w_ij / (s_i*s_j) ","Fig. 4B - power law - prod. strengths")
scipy_fun_multi2(powerlaw_fit,fig_4B_prods,edges_df_tot,degree_df,"weight",10**2,"d_ij","w_ij / (s_i*s_j) ","Fig. 4B - power law - prod. strengths")

In [None]:
#divided into periods

# Works table: derive the publication year and count works per year.
works = read_parquet(basepath / 'works')
works = works.reset_index()
works['publication_year'] = works['publication_date_1'].dt.year
works_yearly_count = works.groupby('publication_year').work_id.count().to_frame()#.reset_index()
# The year index has repeated labels, and label slices such as
# works.loc[2000:2009] fail on a repeated index that is not monotonic.
# A stable sort keeps the within-year row order.
works = works.set_index('publication_year').sort_index(kind='mergesort')

# One row per (work, institution) pair, ordered by publication year.
works_authors_aff = read_parquet(basepath  / 'works_authors_aff')
works_authors_aff = works_authors_aff.reset_index()
works_authors_aff['publication_year'] = works_authors_aff['publication_date_1'].dt.year
works_authors_aff['publication_year'] = works_authors_aff['publication_year'].astype('int64')
works_authors_aff = works_authors_aff.drop_duplicates(['work_id','institution_id']).reset_index(drop=True)
works_authors_aff = works_authors_aff[['publication_year','work_id','institution_id']]
works_authors_aff = works_authors_aff.sort_values(by='publication_year')
works_authors_aff['institution_id'] = works_authors_aff['institution_id'].astype(int)

# Distance table made symmetric: every pair with source/target exchanged
# (self-pairs are unaffected by the exchange) followed by the original
# orientation of all non-self pairs. Built without mutating the frame that was
# read, so I_dist1 and I_dist2 are independent objects.
I_dist = read_parquet(basepath / 'I_dist_threshold')
I_dist1 = I_dist.rename(columns={'source': 'target', 'target': 'source'})[I_dist.columns]
I_dist2 = I_dist[I_dist.source!=I_dist.target]
I_dist = pd.concat([I_dist1,I_dist2])

my_file = "inst_id_name_dict.pickle" #dictionary inst_id and inst_name
with open(os.path.join(Path('./Tables'), my_file),"rb") as fp:
    inst_id_name_dict = pickle.load(fp)

#count number (unique) institutions per paper
works_authors_aff['num_affs'] = works_authors_aff.groupby('work_id')['institution_id'].transform('size')
print(f'{min(works_authors_aff.num_affs)}-{max(works_authors_aff.num_affs)} min-max number (unique) affiliations per work')
# weight = 1 / (number of unordered institution pairs of the work) = 2 / (n*(n-1));
# n == 1 divides by zero there, so it is overwritten with 1 on the next line.
works_authors_aff['weight'] = 2 / ( works_authors_aff['num_affs']*(works_authors_aff['num_affs']-1) )
works_authors_aff.loc[works_authors_aff.num_affs==1,'weight'] = 1 #one affiliation

years_list = sorted(set(works_authors_aff.publication_year))

# Works with several institutions give edges between institutions; works with a
# single institution give self-loops (institution_id2 == institution_id).
works_authors_aff_noloops = works_authors_aff[works_authors_aff.num_affs>1]
# .copy() so the new column is added to an independent frame, not to a slice.
works_authors_aff_loops = works_authors_aff[works_authors_aff.num_affs==1].copy()
works_authors_aff_loops['institution_id2'] = works_authors_aff_loops['institution_id']
works_authors_aff_noloops = works_authors_aff_noloops.set_index('publication_year')
works_authors_aff_loops = works_authors_aff_loops.set_index('publication_year')

works_yearly_count = works_yearly_count.loc[2000:2024]
print(f"Total number preprints (from {min(works_yearly_count.index)} to {max(works_yearly_count.index)}): {works_yearly_count.work_id.sum()}")

# Inclusive (start_year, end_year) bounds of the five analysis periods; the
# position in this list is the period number used below.
period_bounds = [(2000, 2009), (2010, 2014), (2015, 2019), (2020, 2020), (2021, 2024)]
(
    (start_year_0, end_year_0),
    (start_year_1, end_year_1),
    (start_year_2, end_year_2),
    (start_year_3, end_year_3),
    (start_year_4, end_year_4),
) = period_bounds

# Yearly counts restricted to each period (.loc label slices include both ends).
(
    works_yearly_count_0,
    works_yearly_count_1,
    works_yearly_count_2,
    works_yearly_count_3,
    works_yearly_count_4,
) = [works_yearly_count.loc[first:last] for first, last in period_bounds]
for (_first, _last), _counts in zip(
    period_bounds,
    [works_yearly_count_0, works_yearly_count_1, works_yearly_count_2,
     works_yearly_count_3, works_yearly_count_4],
):
    print(f"Number preprints (from {_first} to {_last}): {_counts.work_id.sum()}")

# Works of each period and the corresponding sets of work ids.
works_0, works_1, works_2, works_3, works_4 = [
    works.loc[first:last] for first, last in period_bounds
]
works_set_0, works_set_1, works_set_2, works_set_3, works_set_4 = [
    set(period_works.work_id)
    for period_works in (works_0, works_1, works_2, works_3, works_4)
]

# Affiliation rows of each period, tagged with the period number.
# .assign returns a new frame, so the tag is never written into a slice of the
# parent table.
(
    works_authors_aff_noloops_0,
    works_authors_aff_noloops_1,
    works_authors_aff_noloops_2,
    works_authors_aff_noloops_3,
    works_authors_aff_noloops_4,
) = [
    works_authors_aff_noloops.loc[first:last].assign(period=number)
    for number, (first, last) in enumerate(period_bounds)
]
(
    works_authors_aff_loops_0,
    works_authors_aff_loops_1,
    works_authors_aff_loops_2,
    works_authors_aff_loops_3,
    works_authors_aff_loops_4,
) = [
    works_authors_aff_loops.loc[first:last].assign(period=number)
    for number, (first, last) in enumerate(period_bounds)
]

# All periods stacked in period order.
works_authors_aff_noloops_periods = pd.concat([
    works_authors_aff_noloops_0,
    works_authors_aff_noloops_1,
    works_authors_aff_noloops_2,
    works_authors_aff_noloops_3,
    works_authors_aff_noloops_4,
])
works_authors_aff_loops_periods = pd.concat([
    works_authors_aff_loops_0,
    works_authors_aff_loops_1,
    works_authors_aff_loops_2,
    works_authors_aff_loops_3,
    works_authors_aff_loops_4,
])

# For every period build (a) the institution graph including self-loops and
# (b) a graph holding only the self-loops, and pickle both into my_path_.
for y in tqdm(range(5)):
    noloops_df = works_authors_aff_noloops_periods.query("period == @y")
    # .copy() so the weight column below is written to an independent frame.
    loops_df = works_authors_aff_loops_periods.query("period == @y").copy()
    # Total self-loop weight per institution, one row per institution.
    loops_df['weight'] = loops_df.groupby('institution_id')['weight'].transform('sum')
    loops_df = loops_df.drop_duplicates('institution_id')
    # Build the (u, v, w) triples column by column: a single mixed-dtype array
    # would turn the integer institution ids into floats.
    loop_edges = list(zip(
        loops_df['institution_id'].tolist(),
        loops_df['institution_id2'].tolist(),
        loops_df['weight'].tolist(),
    ))
    I = make_institution_graph(noloops_df)
    I.add_weighted_edges_from(loop_edges)
    my_file = "I_period"+str(y)+".pickle"
    with open(os.path.join(my_path_, my_file), 'wb') as fp:
        pickle.dump(I, fp)
    #only loops
    I_loops = nx.Graph()
    I_loops.add_weighted_edges_from(loop_edges)
    my_file = "Iloops_period"+str(y)+".pickle"
    with open(os.path.join(my_path_, my_file), 'wb') as fp:
        pickle.dump(I_loops, fp)
    
# Per-period summary: node count, edge count and total weight of the full graph
# and of the loops-only graph, plus the degree / strength views of both graphs.
info_frames = []
degree = {}
for y in tqdm(range(5)):
    my_file = "I_period"+str(y)+".pickle"
    with open(os.path.join(my_path_, my_file),"rb") as fp:
        I_year = pickle.load(fp)
    my_file = "Iloops_period"+str(y)+".pickle"
    with open(os.path.join(my_path_, my_file),"rb") as fp:
        Iloops_year = pickle.load(fp)
    info_y = pd.DataFrame.from_dict({
        'period': [y],
        'N': [I_year.number_of_nodes()],
        'E': [I_year.size()],
        'W': [I_year.size(weight='weight')],
        'Nloops': [Iloops_year.number_of_nodes()],
        'Eloops': [Iloops_year.size()],
        'Wloops': [Iloops_year.size(weight='weight')],
    })
    info_frames.append(info_y)
    degree[y] = {
        'k': I_year.degree(),
        's': I_year.degree(weight='weight'),
        'kloops': Iloops_year.degree(),
        'sloops': Iloops_year.degree(weight='weight'),
    }
# One concat over the collected one-row frames (each keeps its own index 0).
info_df = pd.concat(info_frames)
my_file = "info_period_df.csv"
info_df.to_csv(os.path.join(my_path_, my_file),index=False)
my_file = "degree_period.pickle"
# Context manager so the pickle is flushed and the handle closed.
with open(os.path.join(my_path_, my_file), 'wb') as fp:
    pickle.dump(degree, fp)
my_file = "info_period_df.csv"
info_period_df = pd.read_csv(os.path.join(my_path_, my_file))
display(info_df)

# Long table with one row per (period, institution): degree k and strength s in
# the full graph, kloops / sloops in the loops-only graph (0 when absent).
degree_frames = []
for y in tqdm(range(5)):
    dict_y = degree[y]
    df_k, df_s, df_kloops, df_sloops = [
        pd.DataFrame.from_dict(dict_y[measure]).rename(columns={0:'institution_id',1:measure})
        for measure in ('k', 's', 'kloops', 'sloops')
    ]
    df_year = df_k.merge(df_s,on='institution_id')
    if len(df_kloops)==0:
        # no self-loops in this period: still create the two columns
        df_year["kloops"] = np.nan
        df_year["sloops"] = np.nan
    else:
        loops_part = df_kloops.merge(df_sloops,on='institution_id')
        df_year = df_year.merge(loops_part,on='institution_id',how='left')
    df_year.insert(0, 'period', y)
    degree_frames.append(df_year)
degree_df = pd.concat(degree_frames).fillna(0)
# Written to CSV and read straight back, so degree_df is the on-disk version.
my_file = "degree_df.csv"
degree_csv = os.path.join(my_path_, my_file)
degree_df.to_csv(degree_csv,index=False)
degree_df = pd.read_csv(degree_csv)
display(degree_df)

# Edge list of every period graph, left-joined with the distance table so that
# edges without a distance entry are kept.
edge_frames = []
for y in tqdm(range(5)):
    my_file = "I_period"+str(y)+".pickle"
    with open(os.path.join(my_path_, my_file),"rb") as fp:
        I_year = pickle.load(fp)
    I_year_df = nx.to_pandas_edgelist(I_year).merge(I_dist,on=['source','target'],how='left')
    I_year_df.insert(0, 'period', y)
    edge_frames.append(I_year_df)
edges_df = pd.concat(edge_frames)
# Written to CSV and read straight back, so edges_df is the on-disk version.
my_file = "edges_df.csv"
edges_csv = os.path.join(my_path_, my_file)
edges_df.to_csv(edges_csv,index=False)
edges_df = pd.read_csv(edges_csv)
display(edges_df)

# Edge list of the complement of every period graph (pairs that are NOT linked),
# inner-joined with the distance table: only pairs that have a distance entry
# are kept.
comp_frames = []
for y in tqdm(range(5)):
    my_file = "I_period"+str(y)+".pickle"
    with open(os.path.join(my_path_, my_file),"rb") as fp:
        I = pickle.load(fp)
    I_comp = nx.complement(I) #20 mins
    # my_file = "I_period"+str(y)+"_comp.pickle"
    # pickle.dump(I_comp, open(os.path.join(my_path_, my_file), 'wb'))
    I_comp_df = nx.to_pandas_edgelist(I_comp).merge(I_dist,on=['source','target'])
    I_comp_df.insert(0, 'period', y)
    comp_frames.append(I_comp_df)
edges_comp_df = pd.concat(comp_frames)
my_file = "edges_comp_df.csv"
edges_comp_df.to_csv(os.path.join(my_path_, my_file),index=False)

In [None]:
periods_list = [0,1,2,3,4]
periods_labels = ['[2000,2009]','[2010,2014]','[2015,2019]','[2020,2020]','[2021,2024]']
color_dict = {0: 'yellow', 1: 'magenta', 2: 'orange', 3: 'red', 4: 'cyan'}


def _strength_totals(rows, period):
    """Sum s and sloops per institution over `rows` and tag the result with `period`."""
    totals = rows[['institution_id','s','sloops']].groupby(['institution_id']).sum().reset_index()
    totals.insert(0,'period',period)
    return totals


#strengths: (a) during the period (b) till that period (included) (c) till that period (not included)
degree_df = pd.read_csv(my_path_ / 'degree_df.csv')
degree_df_a = degree_df
degree_df_b = pd.concat(
    [_strength_totals(degree_df[degree_df.period<=y], y) for y in periods_list]
)
degree_df_c = pd.concat(
    [_strength_totals(degree_df[degree_df.period<y], y) for y in periods_list]
)

# Observed edges plus complement pairs; the complement pairs get weight 0.
edges_df = pd.read_csv(my_path_ / 'edges_df.csv')
edges_comp_df = pd.read_csv(my_path_ / 'edges_comp_df.csv')
edges_comp_df['weight'] = 0
edges_df_tot = pd.concat([edges_df,edges_comp_df])

In [None]:
# Fig. 2B: mean scatter plot followed by a linear fit.
# NOTE(review): the numeric argument of the fit helpers and the trailing
# booleans of the plot helper are defined outside this cell - confirm there.
mean_scatter_plot_multi(
    fig_2B, edges_df_tot, degree_df,
    "weight", "s", "weight",
    "s_i", "w_ii", 'Fig. 2B',
    False, False, False, False,
)
scipy_fun_multi(linear_fit, fig_2B, edges_df_tot, degree_df, "weight", 2*10**4, "s_i", "w_ii", "Fig. 2B")

# Fig. 4B: mean scatter plot, then the power-law fit for 10 and for 100.
mean_scatter_plot_multi(
    fig_4B, edges_df_tot, degree_df,
    "weight", "dist", "ratio",
    "d_ij", "w_ij / (s_i*s_j)^(1/2) ", 'Fig. 4B',
    True, True, False, True,
)
for fit_arg in (10**1, 10**2):
    scipy_fun_multi2(
        powerlaw_fit, fig_4B, edges_df_tot, degree_df,
        "weight", fit_arg,
        "d_ij", "w_ij / (s_i*s_j)^(1/2) ", "Fig. 4B - power law",
    )

## Inter- and intra-institution collaborations

In [None]:
# Fresh copy of the works table as stored on disk (the period analysis above
# re-indexed `works` by publication year).
works_path = basepath / 'works'
works = read_parquet(works_path)

In [None]:
# Per publication date: number of rows with a non-null `intra` value (total),
# sum of `intra` (intra), the remainder (inter), and both as fractions.
# NOTE(review): presumably `intra` is a 0/1 flag per collaboration edge - confirm.
work_authors_edges_df = read_parquet(my_path_ / 'work_authors_edges_df_dist')
work_authors_edges_df = works[['work_id']].reset_index().merge(work_authors_edges_df,on='work_id')
intra_by_date = work_authors_edges_df.groupby('publication_date_1').intra
df1 = intra_by_date.count().reset_index().rename(columns={'intra':'total'})
df2 = intra_by_date.sum().to_frame().reset_index()
intra_inter_df = df1.merge(df2,on='publication_date_1').assign(
    inter=lambda t: t['total'] - t['intra'],
    frac_intra=lambda t: t['intra'] / t['total'],
    frac_inter=lambda t: t['inter'] / t['total'],
)
my_file = "intra_inter_df.csv"
intra_inter_df.to_csv(os.path.join(my_path_, my_file),index=False)

In [None]:
# Pickled collections of preprint ids, used below to filter on work_id.
# NOTE(review): pickle files must come from a trusted source.
my_file = 'preprint_id_set_COVID'
with (basepath / my_file).open("rb") as fp:
    preprint_id_set_COVID = pickle.load(fp)

my_file = 'preprint_id_set_noCOVID'
with (basepath / my_file).open("rb") as fp:
    preprint_id_set_noCOVID = pickle.load(fp)

In [None]:
# Same intra/inter table as for all works, restricted to works whose id is in
# preprint_id_set_COVID.
work_authors_edges_df = read_parquet(my_path_ / 'work_authors_edges_df_dist')
is_covid = work_authors_edges_df.work_id.isin(preprint_id_set_COVID)
work_authors_edges_df = work_authors_edges_df[is_covid]
work_authors_edges_df = works[['work_id']].reset_index().merge(work_authors_edges_df,on='work_id')
intra_by_date = work_authors_edges_df.groupby('publication_date_1').intra
df1 = intra_by_date.count().reset_index().rename(columns={'intra':'total'})
df2 = intra_by_date.sum().to_frame().reset_index()
intra_inter_df = df1.merge(df2,on='publication_date_1').assign(
    inter=lambda t: t['total'] - t['intra'],
    frac_intra=lambda t: t['intra'] / t['total'],
    frac_inter=lambda t: t['inter'] / t['total'],
)
my_file = "intra_inter_df_COVID.csv"
intra_inter_df.to_csv(os.path.join(my_path_, my_file),index=False)

In [None]:
# Same intra/inter table as for all works, restricted to works whose id is in
# preprint_id_set_noCOVID.
work_authors_edges_df = read_parquet(my_path_ / 'work_authors_edges_df_dist')
is_not_covid = work_authors_edges_df.work_id.isin(preprint_id_set_noCOVID)
work_authors_edges_df = work_authors_edges_df[is_not_covid]
work_authors_edges_df = works[['work_id']].reset_index().merge(work_authors_edges_df,on='work_id')
intra_by_date = work_authors_edges_df.groupby('publication_date_1').intra
df1 = intra_by_date.count().reset_index().rename(columns={'intra':'total'})
df2 = intra_by_date.sum().to_frame().reset_index()
intra_inter_df = df1.merge(df2,on='publication_date_1').assign(
    inter=lambda t: t['total'] - t['intra'],
    frac_intra=lambda t: t['intra'] / t['total'],
    frac_inter=lambda t: t['inter'] / t['total'],
)
my_file = "intra_inter_df_noCOVID.csv"
intra_inter_df.to_csv(os.path.join(my_path_, my_file),index=False)

In [None]:
from scipy import optimize
import numpy.polynomial.polynomial as npoly

def _fit_linear_segments(breakpoints, x, y):
    """Fit one straight line to every segment of ``(x, y)``.

    ``breakpoints`` are positional split indices (any order; coerced to int).
    Returns a list of ``[polynomial, x_segment, y_segment]``. Segments with
    fewer than two points are skipped because no line can be fitted to them.
    """
    cuts = tuple(map(int, sorted(breakpoints)))
    segments = []
    for xi, yi in zip(np.split(x, cuts), np.split(y, cuts)):
        if len(xi) < 2:
            continue
        line = npoly.Polynomial(npoly.polyfit(xi, yi, 1))
        segments.append([line, xi, yi])
    return segments


def _best_breakpoints(x, y, num_breakpoints):
    """Grid-search the split indices that minimise the total squared residual.

    Every integer position from 1 to ``len(x) - 1`` is tried for each of the
    ``num_breakpoints`` breakpoints. Errors are cached per sorted index tuple,
    so permutations of the same breakpoints are fitted only once.
    Returns a sequence of breakpoint positions (always iterable).
    """
    error_cache = {}

    def total_squared_error(candidate):
        key = tuple(map(int, sorted(candidate)))
        if key not in error_cache:
            error_cache[key] = sum(
                ((line(xi) - yi)**2).sum()
                for line, xi, yi in _fit_linear_segments(key, x, y)
            )
        return error_cache[key]

    search_grid = [slice(1, len(x), 1)]*num_breakpoints
    found = optimize.brute(total_squared_error, search_grid, finish=None)
    # With a single breakpoint brute() returns a scalar; wrap it so callers can iterate.
    return [found] if num_breakpoints==1 else found


def plot_fit_rolling_breakpoints_2(df,x_column,x_column2,x_label,title,window_size,num_breakpoints):
    """Plot two rolling-mean series over time with piecewise-linear fits.

    Parameters
    ----------
    df : DataFrame with a ``publication_date_1`` column plus the two value columns.
    x_column, x_column2 : names of the columns drawn as the 'intra-institution'
        (orange) and 'inter-institution' (green) curves.
    x_label, title : axis label and figure title.
    window_size : rolling-mean window; the first ``window_size - 1`` points,
        where the window is incomplete, are dropped.
    num_breakpoints : number of breakpoints of the piecewise-linear fit, found
        by exhaustive search independently for each series.

    Draws on a new matplotlib figure; returns nothing.
    """
    fig, ax = plt.subplots(figsize=(15, 5))
    x_dates = list(df['publication_date_1'])
    y_data = df[x_column].rolling(window=window_size).mean()[window_size-1:]
    y_data2 = df[x_column2].rolling(window=window_size).mean()[window_size-1:]
    x_data = x_dates[window_size-1:]
    ax.set_ylim([0, 1])

    ax.plot(x_data, y_data, "o-", color='orange', markersize=3,label='intra-institution')
    ax.plot(x_data, y_data2, "o-", color='green', markersize=3,label='inter-institution')

    plt.grid(True, linewidth=0.5)
    #ax.yaxis.set_major_formatter(formatter)

    ax.set_xlabel(x_label,size=20)

    #ax.xaxis.set_major_locator(mdates.MonthLocator()) # Make ticks on occurrences of each month
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %y')) # Get only the month to show in the x-axis

    ax.axvline(pd.Timestamp(2020, 3, 1),color='r') #ax.axvline(x_dates[84],color='r')

    # Fit on matplotlib date numbers; the fitted segments are drawn with the
    # same numbers on the date axis.
    x_num = dates.date2num(x_data)
    for series in (y_data, y_data2):
        # Plain ndarray: the helpers split positionally and must not depend on
        # the pandas index left over from the rolling-window slice.
        y_values = series.to_numpy()
        breakpoints = _best_breakpoints(x_num, y_values, num_breakpoints)
        for line, xi, _ in _fit_linear_segments(breakpoints, x_num, y_values):
            x_interval = np.array([xi.min(), xi.max()])
            ax.plot(x_interval, line(x_interval), 'yo-',linewidth=2, markersize=7)

    ax.legend()
    ax.set_title(title,size=30)

### COVID

In [None]:
# Non-COVID works: intra/inter fractions before 2024 with a two-breakpoint fit.
my_file = "intra_inter_df_noCOVID.csv"
intra_inter_df = pd.read_csv(os.path.join(my_path_, my_file))
intra_inter_df['publication_date_1'] = pd.to_datetime(intra_inter_df['publication_date_1'])
before_2024 = intra_inter_df.publication_date_1<'2024-01-01'
intra_inter_df = intra_inter_df[before_2024]
x_column, x_column2 = 'frac_intra', 'frac_inter'
x_label = 'publication_date_1'
title = 'Fraction collabs. intra/inter-inst. - no COVID'
window_size = 1  # a window of 1 leaves the series unsmoothed
num_breakpoints = 2
plot_fit_rolling_breakpoints_2(intra_inter_df,x_column,x_column2,x_label,title,window_size,num_breakpoints)

In [None]:
# COVID works: intra/inter fractions from 2020 up to (not including) 2024 with a
# single-breakpoint fit.
my_file = "intra_inter_df_COVID.csv"
intra_inter_df = pd.read_csv(os.path.join(my_path_, my_file))
intra_inter_df['publication_date_1'] = pd.to_datetime(intra_inter_df['publication_date_1'])
in_range = (
    (intra_inter_df.publication_date_1>='2020-01-01')
    & (intra_inter_df.publication_date_1<'2024-01-01')
)
intra_inter_df = intra_inter_df[in_range]
x_column = 'frac_intra'
x_column2 = 'frac_inter'
x_label = 'publication_date_1'
title = 'Fraction collabs. intra/inter-inst. - COVID'
# Set here so the cell does not depend on a value left over from another cell;
# 1 is the value the no-COVID plot uses.
window_size = 1
num_breakpoints = 1
plot_fit_rolling_breakpoints_2(intra_inter_df,x_column,x_column2,x_label,title,window_size,num_breakpoints)

## Categories

In [None]:
# preprint_dict maps a category name to an iterable of work ids; flatten it to
# one (tax_name, work_id) row per pair and order the categories by number of
# works, largest first.
my_file = "preprint_dict.pickle"
with open(os.path.join(Path('./Tables'), my_file),"rb") as fp:
    preprint_dict = pickle.load(fp)
rows = []
for category, work_ids in preprint_dict.items():
    rows.extend((category, work_id) for work_id in work_ids)
preprint_df = pd.DataFrame(rows, columns=['tax_name', 'work_id'])
works_per_category = preprint_df.groupby('tax_name').work_id.count().to_frame()
preprint_categories_list = list(works_per_category.sort_values(by='work_id',ascending=False).index)

In [None]:
# Per category and publication date: mean / std / sem of `dist` over the works
# dated from 2000 up to (not including) 2024, one scatter plot per category.
my_file = "work_edges_dist_mean.csv"
work_edges_dist_mean = pd.read_csv(os.path.join(my_path_, my_file))
work_edges_dist_mean['publication_date_1'] = pd.to_datetime(work_edges_dist_mean['publication_date_1'])
df = work_edges_dist_mean
df = df[df.publication_date_1>='2000']
df = df[df.publication_date_1<'2024']
df = df.merge(preprint_df,on='work_id')
stat_frames = []
for k in preprint_categories_list:
    df_k = df[df.tax_name==k]
    df_stat = df_k[['publication_date_1','dist']].groupby('publication_date_1').dist.agg(['mean', 'std','sem']).reset_index()
    df_stat.insert(1, 'category', k)
    stat_frames.append(df_stat)

    plt.style.use("dark_background")
    fig, ax = plt.subplots(figsize=(15, 5))
    x_data = list(df_stat['publication_date_1'])
    y_mean = df_stat['mean']
    y_sem = df_stat['sem']
    ax.plot(x_data, y_mean, "o", color = 'green',markersize=5)
    #ax.fill_between(x_data, y_mean - y_sem, y_mean + y_sem, color = 'green',alpha=0.4)
    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')
    plt.grid(True, linewidth=0.5)
    ax.yaxis.set_major_formatter(formatter)

    ax.set_xlabel('month',size=20)
    title = 'ATD - '+k
    ax.set_title(title,size=30)

# Single concat of the per-category tables; an empty category list still
# yields an empty frame.
df_stat_tot = pd.concat(stat_frames) if stat_frames else pd.DataFrame()

my_file = "work_edges_dist_mean_cat.pickle"
# Context manager so the pickle is flushed and the handle closed.
with open(os.path.join(my_path_, my_file), 'wb') as fp:
    pickle.dump(df_stat_tot, fp)

In [None]:
# Per category and publication date: intra/inter counts and fractions for works
# dated from 2000 up to (not including) 2024, with one scatter plot of the
# inter-institution fraction per category.
works = read_parquet(basepath / 'works')
work_authors_edges_df = read_parquet(my_path_ / 'work_authors_edges_df_dist')
work_authors_edges_df = works[['work_id']].reset_index().merge(work_authors_edges_df,on='work_id')
work_authors_edges_df = work_authors_edges_df[work_authors_edges_df.publication_date_1>='2000-01-01']
work_authors_edges_df = work_authors_edges_df[work_authors_edges_df.publication_date_1<'2024-01-01']
df = work_authors_edges_df.merge(preprint_df,on='work_id')
category_frames = []
for k in preprint_categories_list:
    df_k = df[df.tax_name==k]
    intra_by_date = df_k.groupby('publication_date_1').intra
    df1 = intra_by_date.count().reset_index().rename(columns={'intra':'total'})
    df2 = intra_by_date.sum().to_frame().reset_index()
    intra_inter_df = df1.merge(df2,on='publication_date_1').assign(
        inter=lambda t: t['total'] - t['intra'],
        frac_intra=lambda t: t['intra'] / t['total'],
        frac_inter=lambda t: t['inter'] / t['total'],
    )
    intra_inter_df.insert(1, 'category', k)
    category_frames.append(intra_inter_df)

    plt.style.use("dark_background")
    fig, ax = plt.subplots(figsize=(15, 5))
    x_data = list(intra_inter_df['publication_date_1'])
    y_mean = intra_inter_df['frac_inter']
    ax.plot(x_data, y_mean, "o", color = 'magenta',markersize=5)
    #ax.fill_between(x_data, y_mean - y_sem, y_mean + y_sem, color = 'green',alpha=0.4)
    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')
    plt.grid(True, linewidth=0.5)
    #ax.yaxis.set_major_formatter(formatter)

    ax.set_xlabel('month',size=20)
    title = 'inter-institution collaborations - '+k
    ax.set_title(title,size=30)

# An empty category list still yields an empty frame.
intra_inter_df_tot = pd.concat(category_frames) if category_frames else pd.DataFrame()

my_file = "intra_inter_df_cat.csv"
intra_inter_df_tot.to_csv(os.path.join(my_path_, my_file),index=False)