# Model - info

In [None]:
import pandas as pd 
from pathlib import Path
import networkx as nx
import igraph as ig
import pickle
import numpy as np
from scipy.sparse import csr_matrix
from scipy.spatial import distance
import seaborn as sns
import time
from tqdm.auto import tqdm
import random 
import os
from itertools import chain, combinations
import itertools
import scipy

import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.io as pio
plt.rcParams.update({'font.size': 22})
sns.set(style="ticks", context="talk")
plt.style.use("dark_background")
pd.options.plotting.backend = 'plotly'
pio.templates.default = 'plotly_dark+presentation'

import numpy as np
from scipy.optimize import minimize

#pd.options.mode.chained_assignment = None 

tqdm.pandas()

import matplotlib.dates as dates
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
from scipy import optimize
import numpy.polynomial.polynomial as npoly

def form(x,pos):
    """Format a tick value for display.

    Values below 1e3 are shown with three decimals, values below 1e6 are
    shown in thousands with a ``K`` suffix, and everything else in millions
    with an ``M`` suffix. ``pos`` is accepted for the ``FuncFormatter``
    callback signature and is not used.
    """
    if x < 1e3:
        return f'{x:1.3f}'
    if x < 1e6:
        return f'{x * 1e-3:1.1f}K'
    return f'{x * 1e-6:1.1f}M'
# Tick formatter wrapping `form`; the plotting helpers in this notebook
# install it on the y axis.
formatter = FuncFormatter(form)

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

def read_parquet(name, **args):
    """Read a parquet table with the fastparquet engine and report timing.

    Parameters
    ----------
    name : str or path-like
        Location of the parquet table. It is coerced to ``Path`` so that the
        summary line can use ``.stem`` even when a plain string is passed.
    **args
        Extra keyword arguments forwarded to ``pd.read_parquet``.

    Returns
    -------
    The loaded DataFrame, unchanged.
    """
    path = Path(name)
    print(f'Reading {name!r}')
    tic = time.time()
    df = pd.read_parquet(path, engine='fastparquet', **args)
    before = len(df)
    # De-duplication is currently disabled, so the reported duplicate count
    # is always 0; re-enable the next line to make it meaningful.
    # df.drop_duplicates(inplace=True)
    toc = time.time()
    after = len(df)

    print(f'Read {len(df):,} rows from {path.stem!r} in {toc-tic:.2f} sec. {before-after:,} duplicates.')
    return df

In [None]:
def my_weight(G, u, v, weight="weight"):
    """Projection weight between ``u`` and ``v``.

    For every neighbour shared by ``u`` and ``v`` the two edge attributes
    named ``weight`` (defaulting to 1 when absent) are averaged, and the
    averages are summed. Returns 0 when there is no shared neighbour.
    """
    shared_neighbours = set(G[u]) & set(G[v])
    return sum(
        (G[u][nbr].get(weight, 1) + G[v][nbr].get(weight, 1)) / 2
        for nbr in shared_neighbours
    )
def make_institution_graph(works_authors_rows):
    """Project the work-institution bipartite graph onto institutions.

    ``works_authors_rows`` must provide the columns ``work_id``,
    ``institution_id`` and ``weight``. A bipartite graph is built from the
    (work, institution) rows and projected onto the institution nodes, using
    ``my_weight`` to weight each projected edge.
    """
    bipartite_graph = nx.from_pandas_edgelist(
        works_authors_rows,
        source='work_id',
        target='institution_id',
        edge_attr='weight',
    )
    institutions = set(works_authors_rows.institution_id)

    # Alternative projection kept for reference:
    # nx.bipartite.weighted_projected_graph(bipartite_graph, nodes=institutions)
    return nx.bipartite.generic_weighted_projected_graph(
        bipartite_graph,
        nodes=institutions,
        weight_function=my_weight,
    )


In [None]:
# Input tables are read from `basepath`; `my_path_` is created here if missing.
basepath = Path('Tables_final')
my_path_ = Path('Model_info')
# exist_ok=True replaces the separate existence check (no check-then-create race).
os.makedirs(my_path_, exist_ok=True)

## Percentage of works with respect to team size

In [None]:
# Load every work. The frame is sliced by date strings below, so its index is
# presumably `publication_date_1` -- TODO confirm against the table schema.
works = read_parquet(basepath / 'works')

# TW = training window: the first `end_index` distinct publication months.
months_list = sorted(set(works.reset_index().publication_date_1))
months_list = [month.strftime('%Y-%m-%d') for month in months_list]
start_index, end_index = 0, 120  # alternative window length: 180
start_month = months_list[start_index]
end_month = months_list[end_index - 1]
print(start_month, end_month)

# Keep co-authored works only, then split at the end of the training window.
works = works[works.num_authors > 1]
works_TW = works.loc[start_month:end_month]
works_AW = works.loc[months_list[end_index]:]

# Cumulative percentage of works by team size: first over all works, then
# over the works published after the training window.
for _frame in (works, works_AW):
    df_perc_works = (
        _frame.groupby('num_authors').work_id.count().to_frame().cumsum() / len(_frame) * 100
    ).rename(columns={'work_id': 'perc_works'})
    display(df_perc_works)

## Edges - 2 authors

In [None]:
# Output directory for every artefact of the 2-author model.
path_2authors = Path('Model_2authors')
# exist_ok=True replaces the separate existence check (no check-then-create race).
os.makedirs(path_2authors, exist_ok=True)

### Tables

In [None]:
# Build the tables restricted to works with exactly two authors and cache
# them as a pickle for the following cells.
works = read_parquet(basepath / 'works')
works_authors_aff = read_parquet(basepath / 'works_authors_aff')

works = works.loc['2000-01-01':'2023-12-01']
works = works[works.num_authors > 1]

works_2authors = set(works[works.num_authors == 2].work_id)
# The percentage is relative to all multi-author works in the date range.
print(f'{len(works_2authors)} ({(len(works_2authors)/len(works))*100:.2f}%) works 2 authors')

works = works[works.work_id.isin(works_2authors)]
works_authors_aff = works_authors_aff[works_authors_aff.work_id.isin(works_2authors)]

my_file = "dfs_2authors.pickle"
# Context manager so the file is flushed and closed even if pickling fails.
with open(os.path.join(path_2authors, my_file), 'wb') as fp:
    pickle.dump([works_2authors, works, works_authors_aff], fp)

In [None]:
# INITIALIZATION (preferential attachment). TW: 2000-2009
my_file = "dfs_2authors.pickle"
with open(os.path.join(path_2authors, my_file), "rb") as fp:
    works_2authors, works, works_authors_aff = pickle.load(fp)

_date_range = slice('2000-01-01', '2023-12-01')
works = works.loc[_date_range]
works_authors_aff = works_authors_aff.loc[_date_range]

# Number of distinct works per publication month, keyed by month.
N_dict = works_authors_aff.groupby('publication_date_1').work_id.nunique().to_dict()
months_list = [month.strftime('%Y-%m-%d') for month in sorted(N_dict)]

# The training window spans the first `end_index` months.
start_index, end_index = 0, 120  # alternative window length: 180
start_month = months_list[start_index]
end_month = months_list[end_index - 1]

df_TW = works_authors_aff.loc[start_month:end_month]
inst_set = set(df_TW.institution_id)
I = len(inst_set)
print(f'TW from {start_month} to {end_month} : {I} institutions')

In [None]:
# Initial strengths: consider only institutions present in the training window.
# Count the number of unique institutions per paper.
df_TW = df_TW.drop_duplicates(['work_id','institution_id'])
df_TW['num_affs'] = df_TW.groupby('work_id')['institution_id'].transform('size')
print(f'{min(df_TW.num_affs)}-{max(df_TW.num_affs)} min-max number (unique) affiliations per work')
# For a work with k unique institutions the weight is 1 / C(k, 2).
df_TW['weight'] = 2 / ( df_TW['num_affs']*(df_TW['num_affs']-1) )
df_TW.loc[df_TW.num_affs==1,'weight'] = 1  # one affiliation: replaces the division by zero above
df_TW_noloops = df_TW[df_TW.num_affs>1]

# Works with a single institution become self-loops. Take an explicit copy
# because new columns are assigned on this filtered frame.
df_TW_loops = df_TW[df_TW.num_affs==1].copy()
df_TW_loops['institution_id2'] = df_TW_loops['institution_id']
df_TW_loops['weight'] = df_TW_loops.groupby('institution_id').weight.transform('sum')
df_TW_loops = df_TW_loops.drop_duplicates('institution_id')

I_graph = make_institution_graph(df_TW_noloops)
# itertuples keeps the ids in their own dtype; going through to_numpy() on
# mixed int/float columns would cast the ids to float.
I_graph.add_weighted_edges_from(
    df_TW_loops[['institution_id','institution_id2','weight']].itertuples(index=False, name=None)
)

# Weighted degree (strength) of every institution, strongest first.
df_ = pd.DataFrame.from_dict(dict(I_graph.degree(weight='weight')),orient='index').reset_index().rename(columns={'index':'institution_id',0:'strength'})
df_['institution_id'] = df_['institution_id'].astype(int)
df = df_.sort_values(by='strength',ascending=False)
inst_set = set(df.institution_id)
I = len(inst_set)
print(f'TW from {start_month} to {end_month} : {I} institutions')

my_file = "df_strengths0.csv"
df.to_csv(os.path.join(path_2authors, my_file),index=False)

my_file = "inst_set.pickle"
# Context manager so the file is flushed and closed even if pickling fails.
with open(os.path.join(path_2authors, my_file), 'wb') as fp:
    pickle.dump(inst_set, fp)

In [None]:
# Pairwise institution distances, restricted to pairs whose two endpoints are
# both in `inst_set`.
I_dist = read_parquet(basepath / 'I_dist_threshold')[['source', 'target', 'dist']].reset_index(drop=True)
_both_in_sample = I_dist.source.isin(inst_set) & I_dist.target.isin(inst_set)
# reset_index() without drop=True keeps the previous row labels as an
# `index` column, which is therefore written to the CSV as well.
I_dist = I_dist[_both_in_sample].reset_index()
my_file = "I_dist_model.csv"
I_dist.to_csv(os.path.join(path_2authors, my_file), index=False)

### Data plots

In [None]:
# Reload the cached 2-author tables and rebuild the list of months.
my_file = "dfs_2authors.pickle"
with open(os.path.join(path_2authors, my_file), "rb") as fp:
    works_2authors, works, works_authors_aff = pickle.load(fp)

# Number of distinct works per publication month, keyed by month.
N_dict = works_authors_aff.groupby('publication_date_1').work_id.nunique().to_dict()

months_list = [month.strftime('%Y-%m-%d') for month in sorted(N_dict)]

In [None]:
def plot_fit_rolling_breakpoints_2(df,x_column,x_column2,x_label,title,window_size,num_breakpoints):
    """Plot two rolling-mean series with piecewise-linear fits.

    Parameters
    ----------
    df : DataFrame with a ``publication_date_1`` column plus the two value
        columns named by ``x_column`` and ``x_column2``.
    x_column, x_column2 : columns plotted as the 'intra-institution'
        (orange) and 'inter-institution' (green) curves.
    x_label : text of the x-axis label.
    title : figure title.
    window_size : window of the rolling mean; the first ``window_size - 1``
        points are dropped.
    num_breakpoints : number of breakpoints of the piecewise-linear fit.
        They are found by exhaustive grid search over the point indices, so
        the cost grows as ``(len(data) - 1) ** num_breakpoints``.

    The y axis is fixed to [0, 1] and a red vertical line marks 2020-03-01.
    Nothing is returned; the figure is drawn on a new matplotlib axes.
    """

    def fit_segments(breakpoints, x, y):
        """Split x/y at the sorted integer breakpoints and fit a line per piece."""
        cuts = tuple(map(int, sorted(breakpoints)))
        segments = []
        for xi, yi in zip(np.split(x, cuts), np.split(y, cuts)):
            if len(xi) < 2:
                # A single point cannot define a line.
                continue
            line = npoly.Polynomial(npoly.polyfit(xi, yi, 1))
            segments.append([line, xi, yi])
        return segments

    def total_squared_error(breakpoints, x, y, cache):
        """Sum of squared residuals of the piecewise fit, memoised in ``cache``."""
        cuts = tuple(map(int, sorted(breakpoints)))
        if cuts not in cache:
            cache[cuts] = sum(
                ((line(xi) - yi)**2).sum() for line, xi, yi in fit_segments(cuts, x, y)
            )
        return cache[cuts]

    fig, ax = plt.subplots(figsize=(15, 5))
    x_dates = list(df['publication_date_1'])
    y_data = df[x_column].rolling(window=window_size).mean()[window_size-1:]
    y_data2 = df[x_column2].rolling(window=window_size).mean()[window_size-1:]
    x_data = x_dates[window_size-1:]
    ax.set_ylim([0, 1])

    ax.plot(x_data, y_data, "o-", color='orange', markersize=3,label='intra-institution')
    ax.plot(x_data, y_data2, "o-", color='green', markersize=3,label='inter-institution')

    plt.grid(True, linewidth=0.5)
    ax.yaxis.set_major_formatter(formatter)

    ax.set_xlabel(x_label,size=20)

    # Show month and two-digit year on the x axis.
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %y'))

    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')

    x_num = dates.date2num(x_data)
    search_grid = [slice(1, len(x_data), 1)]*num_breakpoints
    # Same search and overlay for both curves; each gets its own error cache.
    for series in (y_data, y_data2):
        best = optimize.brute(total_squared_error, search_grid, args=(x_num, series, {}), finish=None)
        if num_breakpoints==1:
            # brute returns a scalar for a one-dimensional grid.
            best = [best]
        for line, xi, _ in fit_segments(best, x_num, series):
            x_interval = np.array([xi.min(), xi.max()])
            ax.plot(x_interval, line(x_interval), 'yo-',linewidth=2, markersize=7)

    ax.legend()
    ax.set_title(title,size=30)

In [None]:
import matplotlib.dates as dates
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
from scipy import optimize
import numpy.polynomial.polynomial as npoly

def form(x,pos):
    """Format a tick value for display.

    Values below 1e3 are shown with three decimals, values below 1e6 are
    shown in thousands with a ``K`` suffix, and everything else in millions
    with an ``M`` suffix. ``pos`` is accepted for the ``FuncFormatter``
    callback signature and is not used.
    """
    if x < 1e3:
        return f'{x:1.3f}'
    if x < 1e6:
        return f'{x * 1e-3:1.1f}K'
    return f'{x * 1e-6:1.1f}M'
# Rebuild the tick formatter from the `form` defined in this cell.
formatter = FuncFormatter(form)

def plot_fit_rolling_breakpoints(df,x_column,x_label,title,window_size,num_breakpoints,ff):
    """Plot one rolling-mean series with a piecewise-linear fit.

    Parameters
    ----------
    df : DataFrame with a ``publication_date_1`` column plus ``x_column``.
    x_column : name of the value column to smooth and plot.
    x_label : text of the x-axis label.
    title : figure title.
    window_size : window of the rolling mean; the first ``window_size - 1``
        points are dropped.
    num_breakpoints : number of breakpoints of the piecewise-linear fit.
        They are found by exhaustive grid search over the point indices, so
        the cost grows as ``(len(data) - 1) ** num_breakpoints``.
    ff : legend precision switch; ``ff == 1`` prints the fitted coefficients
        with 2 decimals, any other value with 5.

    A red vertical line marks 2020-03-01. Nothing is returned; the figure is
    drawn on a new matplotlib axes.
    """

    def fit_segments(breakpoints, x, y):
        """Split x/y at the sorted integer breakpoints and fit a line per piece."""
        cuts = tuple(map(int, sorted(breakpoints)))
        segments = []
        for xi, yi in zip(np.split(x, cuts), np.split(y, cuts)):
            if len(xi) < 2:
                # A single point cannot define a line.
                continue
            line = npoly.Polynomial(npoly.polyfit(xi, yi, 1))
            segments.append([line, xi, yi])
        return segments

    def total_squared_error(breakpoints, x, y, cache):
        """Sum of squared residuals of the piecewise fit, memoised in ``cache``."""
        cuts = tuple(map(int, sorted(breakpoints)))
        if cuts not in cache:
            cache[cuts] = sum(
                ((line(xi) - yi)**2).sum() for line, xi, yi in fit_segments(cuts, x, y)
            )
        return cache[cuts]

    fig, ax = plt.subplots(figsize=(15, 5))
    x_dates = list(df['publication_date_1'])
    y_data = df[x_column].rolling(window=window_size).mean()[window_size-1:]
    x_data = x_dates[window_size-1:]

    ax.plot(x_data, y_data, "o-", color='orange', markersize=3)

    plt.grid(True, linewidth=0.5)
    ax.yaxis.set_major_formatter(formatter)

    ax.set_xlabel(x_label,size=20)
    ax.set_title(title,size=30)

    # Show month and two-digit year on the x axis.
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %y'))

    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')

    x_num = dates.date2num(x_data)
    best = optimize.brute(total_squared_error, [slice(1, len(x_data), 1)]*num_breakpoints, args=(x_num, y_data, {}), finish=None)
    if num_breakpoints==1:
        # brute returns a scalar for a one-dimensional grid.
        best = [best]

    precision = 2 if ff == 1 else 5
    for line, xi, _ in fit_segments(best, x_num, y_data):
        x_interval = np.array([xi.min(), xi.max()])
        coef = line.convert().coef
        intercept = coef[0]
        slope = coef[1]
        # The sign is printed separately, so the magnitude is formatted.
        sign = '+' if intercept > 0 else '-'
        label = f'y = {slope:.{precision}f} x {sign} {abs(intercept):.{precision}f}'
        ax.plot(x_interval, line(x_interval), 'o-',color='yellow',label=label, markersize=6)

    ax.legend()
    
# Default settings shared by the plotting calls: axis label, rolling-mean
# window and number of breakpoints.
x_label, window_size, num_breakpoints = 'Month', 6, 1

In [None]:
def df_data1_(work_authors_edges_df):
    """Monthly counts and fractions of intra-/inter-institution edges.

    ``work_authors_edges_df`` needs the columns ``publication_date_1`` and
    ``intra``. ``intra`` is summed per month, so it is presumably a 0/1 flag
    for an edge whose endpoints share an institution -- TODO confirm.

    Returns one row per ``publication_date_1`` with the columns ``total``,
    ``intra``, ``inter``, ``frac_intra`` and ``frac_inter``.
    """
    monthly = (
        work_authors_edges_df.groupby('publication_date_1').intra
        .agg(total='count', intra='sum')
        .reset_index()
    )
    monthly['inter'] = monthly['total'] - monthly['intra']
    for kind in ('intra', 'inter'):
        monthly[f'frac_{kind}'] = monthly[kind] / monthly['total']
    return monthly

def work_edges_dist_mean_monthly_(works_outside,works_set,path):
    """Monthly mean of the per-work distances stored in ``path``.

    Reads ``work_edges_dist_mean.csv`` from ``path``, keeps the works that
    are in ``works_set`` and not in ``works_outside``, parses
    ``publication_date_1`` as datetimes and returns a frame with one row per
    month holding the mean of ``dist``.
    """
    per_work = pd.read_csv(os.path.join(path, "work_edges_dist_mean.csv"))
    selected = per_work.work_id.isin(works_set) & ~per_work.work_id.isin(works_outside)
    per_work = per_work[selected]
    per_work = per_work.assign(
        publication_date_1=pd.to_datetime(per_work['publication_date_1'])
    )
    return per_work.groupby('publication_date_1').dist.mean().reset_index()

In [None]:
# Attach the index of `works` (used below as `publication_date_1`) to the
# author-pair edges, keep the 2-author works, and average the edge distance
# per work and then per month.
_work_dates = works[['work_id']].reset_index()
work_authors_edges_df_dist = _work_dates.merge(
    read_parquet(Path('./TeamDistance') / 'work_authors_edges_df_dist'),
    on='work_id',
)
work_authors_edges_df_dist = work_authors_edges_df_dist[
    work_authors_edges_df_dist.work_id.isin(works_2authors)
]
work_edges_dist_mean = (
    work_authors_edges_df_dist.groupby('work_id').dist.mean().reset_index()
    .merge(_work_dates, on='work_id')
)
work_edges_dist_mean_monthly = (
    work_edges_dist_mean.groupby('publication_date_1').dist.mean().reset_index()
)
my_file = "work_edges_dist_mean.csv"
work_edges_dist_mean.to_csv(os.path.join(path_2authors, my_file), index=False)

In [None]:
# Persist the monthly mean distances of the 2-author works.
my_file = "work_edges_dist_mean_monthly.csv"
_target = os.path.join(path_2authors, my_file)
work_edges_dist_mean_monthly.to_csv(_target, index=False)

In [None]:
# Monthly intra/inter-institution statistics of the 2-author works, with the
# month column converted to datetimes before saving.
my_file = "df_data1.csv"
df_data1 = df_data1_(work_authors_edges_df_dist)
_parsed_months = df_data1['publication_date_1'].apply(pd.to_datetime)
df_data1['publication_date_1'] = _parsed_months
df_data1.to_csv(os.path.join(path_2authors, my_file), index=False)

## Edges - 3 authors

In [None]:
# Output directory for every artefact of the 3-author model.
path_3authors = Path('Model_3authors')
# exist_ok=True replaces the separate existence check (no check-then-create race).
os.makedirs(path_3authors, exist_ok=True)

### Tables

In [None]:
# Build the tables restricted to works with exactly three authors and cache
# them as a pickle for the following cells.
works = read_parquet(basepath / 'works')
works_authors_aff = read_parquet(basepath / 'works_authors_aff')

works = works.loc['2000-01-01':'2023-12-01']
works = works[works.num_authors > 1]

works_3authors = set(works[works.num_authors == 3].work_id)
# The percentage is relative to all multi-author works in the date range.
print(f'{len(works_3authors)} ({(len(works_3authors)/len(works))*100:.2f}%) works 3 authors')

works = works[works.work_id.isin(works_3authors)]
works_authors_aff = works_authors_aff[works_authors_aff.work_id.isin(works_3authors)]

my_file = "dfs_3authors.pickle"
# Context manager so the file is flushed and closed even if pickling fails.
with open(os.path.join(path_3authors, my_file), 'wb') as fp:
    pickle.dump([works_3authors, works, works_authors_aff], fp)

In [None]:
# INITIALIZATION (preferential attachment). TW: 2000-2009
my_file = "dfs_3authors.pickle"
with open(os.path.join(path_3authors, my_file), "rb") as fp:
    works_3authors, works, works_authors_aff = pickle.load(fp)

_date_range = slice('2000-01-01', '2023-12-01')
works = works.loc[_date_range]
works_authors_aff = works_authors_aff.loc[_date_range]

# Number of distinct works per publication month, keyed by month.
N_dict = works_authors_aff.groupby('publication_date_1').work_id.nunique().to_dict()
months_list = [month.strftime('%Y-%m-%d') for month in sorted(N_dict)]

# The training window spans the first `end_index` months.
start_index, end_index = 0, 120  # alternative window length: 180
start_month = months_list[start_index]
end_month = months_list[end_index - 1]

df_TW = works_authors_aff.loc[start_month:end_month]
inst_set = set(df_TW.institution_id)
I = len(inst_set)
print(f'TW from {start_month} to {end_month} : {I} institutions')
# my_file = "inst_set.pickle"
# pickle.dump(inst_set, open(os.path.join(path_3authors, my_file), 'wb'))

In [None]:
# Initial strengths: consider only institutions present in the training window.
# Count the number of unique institutions per paper.
df_TW = df_TW.drop_duplicates(['work_id','institution_id'])
df_TW['num_affs'] = df_TW.groupby('work_id')['institution_id'].transform('size')
print(f'{min(df_TW.num_affs)}-{max(df_TW.num_affs)} min-max number (unique) affiliations per work')
# For a work with k unique institutions the weight is 1 / C(k, 2).
df_TW['weight'] = 2 / ( df_TW['num_affs']*(df_TW['num_affs']-1) )
df_TW.loc[df_TW.num_affs==1,'weight'] = 1  # one affiliation: replaces the division by zero above
df_TW_noloops = df_TW[df_TW.num_affs>1]

# Works with a single institution become self-loops. Take an explicit copy
# because new columns are assigned on this filtered frame.
df_TW_loops = df_TW[df_TW.num_affs==1].copy()
df_TW_loops['institution_id2'] = df_TW_loops['institution_id']
df_TW_loops['weight'] = df_TW_loops.groupby('institution_id').weight.transform('sum')
df_TW_loops = df_TW_loops.drop_duplicates('institution_id')

I_graph = make_institution_graph(df_TW_noloops)
# itertuples keeps the ids in their own dtype; going through to_numpy() on
# mixed int/float columns would cast the ids to float.
I_graph.add_weighted_edges_from(
    df_TW_loops[['institution_id','institution_id2','weight']].itertuples(index=False, name=None)
)

# Weighted degree (strength) of every institution, strongest first.
df_ = pd.DataFrame.from_dict(dict(I_graph.degree(weight='weight')),orient='index').reset_index().rename(columns={'index':'institution_id',0:'strength'})
df_['institution_id'] = df_['institution_id'].astype(int)
df = df_.sort_values(by='strength',ascending=False)
inst_set = set(df.institution_id)
I = len(inst_set)
print(f'TW from {start_month} to {end_month} : {I} institutions')

my_file = "df_strengths0.csv"
df.to_csv(os.path.join(path_3authors, my_file),index=False)

my_file = "inst_set.pickle"
# Context manager so the file is flushed and closed even if pickling fails.
with open(os.path.join(path_3authors, my_file), 'wb') as fp:
    pickle.dump(inst_set, fp)

In [None]:
# Pairwise institution distances with each (source, target) pair stored in
# ascending order, so a pair has the same representation in both directions.
I_dist = read_parquet(basepath / 'I_dist_threshold')
I_dist = I_dist[['source','target','dist']].reset_index(drop=True)
I_dist_sort = pd.DataFrame(
    [[*sorted((source, target)), dist] for source, target, dist in I_dist.itertuples(index=False)],
    columns=['source', 'target', 'dist'],
)
my_file = "I_dist_model.csv"
I_dist_sort.to_csv(os.path.join(path_3authors, my_file), index=False)

### Data plots

In [None]:
# Reload the cached 3-author tables and rebuild the list of months.
my_file = "dfs_3authors.pickle"
with open(os.path.join(path_3authors, my_file), "rb") as fp:
    works_3authors, works, works_authors_aff = pickle.load(fp)

# Number of distinct works per publication month, keyed by month.
N_dict = works_authors_aff.groupby('publication_date_1').work_id.nunique().to_dict()

months_list = [month.strftime('%Y-%m-%d') for month in sorted(N_dict)]

In [None]:
def plot_fit_rolling_breakpoints_2(df,x_column,x_column2,x_label,title,window_size,num_breakpoints):
    """Plot two rolling-mean series with piecewise-linear fits.

    Parameters
    ----------
    df : DataFrame with a ``publication_date_1`` column plus the two value
        columns named by ``x_column`` and ``x_column2``.
    x_column, x_column2 : columns plotted as the 'intra-institution'
        (orange) and 'inter-institution' (green) curves.
    x_label : text of the x-axis label.
    title : figure title.
    window_size : window of the rolling mean; the first ``window_size - 1``
        points are dropped.
    num_breakpoints : number of breakpoints of the piecewise-linear fit.
        They are found by exhaustive grid search over the point indices, so
        the cost grows as ``(len(data) - 1) ** num_breakpoints``.

    The y axis is fixed to [0, 1] and a red vertical line marks 2020-03-01.
    Nothing is returned; the figure is drawn on a new matplotlib axes.
    """

    def fit_segments(breakpoints, x, y):
        """Split x/y at the sorted integer breakpoints and fit a line per piece."""
        cuts = tuple(map(int, sorted(breakpoints)))
        segments = []
        for xi, yi in zip(np.split(x, cuts), np.split(y, cuts)):
            if len(xi) < 2:
                # A single point cannot define a line.
                continue
            line = npoly.Polynomial(npoly.polyfit(xi, yi, 1))
            segments.append([line, xi, yi])
        return segments

    def total_squared_error(breakpoints, x, y, cache):
        """Sum of squared residuals of the piecewise fit, memoised in ``cache``."""
        cuts = tuple(map(int, sorted(breakpoints)))
        if cuts not in cache:
            cache[cuts] = sum(
                ((line(xi) - yi)**2).sum() for line, xi, yi in fit_segments(cuts, x, y)
            )
        return cache[cuts]

    fig, ax = plt.subplots(figsize=(15, 5))
    x_dates = list(df['publication_date_1'])
    y_data = df[x_column].rolling(window=window_size).mean()[window_size-1:]
    y_data2 = df[x_column2].rolling(window=window_size).mean()[window_size-1:]
    x_data = x_dates[window_size-1:]
    ax.set_ylim([0, 1])

    ax.plot(x_data, y_data, "o-", color='orange', markersize=3,label='intra-institution')
    ax.plot(x_data, y_data2, "o-", color='green', markersize=3,label='inter-institution')

    plt.grid(True, linewidth=0.5)
    ax.yaxis.set_major_formatter(formatter)

    ax.set_xlabel(x_label,size=20)

    # Show month and two-digit year on the x axis.
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %y'))

    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')

    x_num = dates.date2num(x_data)
    search_grid = [slice(1, len(x_data), 1)]*num_breakpoints
    # Same search and overlay for both curves; each gets its own error cache.
    for series in (y_data, y_data2):
        best = optimize.brute(total_squared_error, search_grid, args=(x_num, series, {}), finish=None)
        if num_breakpoints==1:
            # brute returns a scalar for a one-dimensional grid.
            best = [best]
        for line, xi, _ in fit_segments(best, x_num, series):
            x_interval = np.array([xi.min(), xi.max()])
            ax.plot(x_interval, line(x_interval), 'yo-',linewidth=2, markersize=7)

    ax.legend()
    ax.set_title(title,size=30)

In [None]:
import matplotlib.dates as dates
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
from scipy import optimize
import numpy.polynomial.polynomial as npoly

def form(x,pos):
    """Format a tick value for display.

    Values below 1e3 are shown with three decimals, values below 1e6 are
    shown in thousands with a ``K`` suffix, and everything else in millions
    with an ``M`` suffix. ``pos`` is accepted for the ``FuncFormatter``
    callback signature and is not used.
    """
    if x < 1e3:
        return f'{x:1.3f}'
    if x < 1e6:
        return f'{x * 1e-3:1.1f}K'
    return f'{x * 1e-6:1.1f}M'
# Rebuild the tick formatter from the `form` defined in this cell.
formatter = FuncFormatter(form)

def plot_fit_rolling_breakpoints(df,x_column,x_label,title,window_size,num_breakpoints,ff):
    """Plot one rolling-mean series with a piecewise-linear fit.

    Parameters
    ----------
    df : DataFrame with a ``publication_date_1`` column plus ``x_column``.
    x_column : name of the value column to smooth and plot.
    x_label : text of the x-axis label.
    title : figure title.
    window_size : window of the rolling mean; the first ``window_size - 1``
        points are dropped.
    num_breakpoints : number of breakpoints of the piecewise-linear fit.
        They are found by exhaustive grid search over the point indices, so
        the cost grows as ``(len(data) - 1) ** num_breakpoints``.
    ff : legend precision switch; ``ff == 1`` prints the fitted coefficients
        with 2 decimals, any other value with 5.

    A red vertical line marks 2020-03-01. Nothing is returned; the figure is
    drawn on a new matplotlib axes.
    """

    def fit_segments(breakpoints, x, y):
        """Split x/y at the sorted integer breakpoints and fit a line per piece."""
        cuts = tuple(map(int, sorted(breakpoints)))
        segments = []
        for xi, yi in zip(np.split(x, cuts), np.split(y, cuts)):
            if len(xi) < 2:
                # A single point cannot define a line.
                continue
            line = npoly.Polynomial(npoly.polyfit(xi, yi, 1))
            segments.append([line, xi, yi])
        return segments

    def total_squared_error(breakpoints, x, y, cache):
        """Sum of squared residuals of the piecewise fit, memoised in ``cache``."""
        cuts = tuple(map(int, sorted(breakpoints)))
        if cuts not in cache:
            cache[cuts] = sum(
                ((line(xi) - yi)**2).sum() for line, xi, yi in fit_segments(cuts, x, y)
            )
        return cache[cuts]

    fig, ax = plt.subplots(figsize=(15, 5))
    x_dates = list(df['publication_date_1'])
    y_data = df[x_column].rolling(window=window_size).mean()[window_size-1:]
    x_data = x_dates[window_size-1:]

    ax.plot(x_data, y_data, "o-", color='orange', markersize=3)

    plt.grid(True, linewidth=0.5)
    ax.yaxis.set_major_formatter(formatter)

    ax.set_xlabel(x_label,size=20)
    ax.set_title(title,size=30)

    # Show month and two-digit year on the x axis.
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %y'))

    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')

    x_num = dates.date2num(x_data)
    best = optimize.brute(total_squared_error, [slice(1, len(x_data), 1)]*num_breakpoints, args=(x_num, y_data, {}), finish=None)
    if num_breakpoints==1:
        # brute returns a scalar for a one-dimensional grid.
        best = [best]

    precision = 2 if ff == 1 else 5
    for line, xi, _ in fit_segments(best, x_num, y_data):
        x_interval = np.array([xi.min(), xi.max()])
        coef = line.convert().coef
        intercept = coef[0]
        slope = coef[1]
        # The sign is printed separately, so the magnitude is formatted.
        sign = '+' if intercept > 0 else '-'
        label = f'y = {slope:.{precision}f} x {sign} {abs(intercept):.{precision}f}'
        ax.plot(x_interval, line(x_interval), 'o-',color='yellow',label=label, markersize=6)

    ax.legend()
    
# Default settings shared by the plotting calls: axis label, rolling-mean
# window and number of breakpoints.
x_label, window_size, num_breakpoints = 'Month', 6, 1

In [None]:
def work_edges_dist_mean_monthly_(works_outside,works_set,path):
    """Monthly mean of the per-work distances stored in ``path``.

    Reads ``work_edges_dist_mean.csv`` from ``path``, keeps the works that
    are in ``works_set`` and not in ``works_outside``, parses
    ``publication_date_1`` as datetimes and returns a frame with one row per
    month holding the mean of ``dist``.
    """
    per_work = pd.read_csv(os.path.join(path, "work_edges_dist_mean.csv"))
    selected = per_work.work_id.isin(works_set) & ~per_work.work_id.isin(works_outside)
    per_work = per_work[selected]
    per_work = per_work.assign(
        publication_date_1=pd.to_datetime(per_work['publication_date_1'])
    )
    return per_work.groupby('publication_date_1').dist.mean().reset_index()

def work_edges_dist_max_monthly_(works_outside,works_set,path):
    """Average the per-work distances stored in ``work_edges_dist_max.csv``.

    The CSV is read from *path*. A row is kept when its ``work_id`` is in
    *works_set* and is not in *works_outside*. ``publication_date_1`` is
    parsed to datetimes and ``dist`` is averaged (mean) per distinct
    ``publication_date_1`` value.

    Returns a DataFrame with the columns ``publication_date_1`` and ``dist``.
    """
    per_work_max = pd.read_csv(os.path.join(path, "work_edges_dist_max.csv"))
    wanted = per_work_max.work_id.isin(works_set) & ~per_work_max.work_id.isin(works_outside)
    per_work_max = per_work_max[wanted].assign(
        publication_date_1=lambda frame: pd.to_datetime(frame['publication_date_1'])
    )
    return per_work_max.groupby('publication_date_1').dist.mean().reset_index()

In [None]:
# Team distances for the 3-author works:
# per-edge table -> mean per work -> mean per publication_date_1.
_work_dates = works[['work_id']].reset_index()  # work_id plus the index of `works` as columns
work_authors_edges_df_dist = (
    _work_dates
    .merge(
        read_parquet(Path('./TeamDistance') / 'work_authors_edges_df_dist'),
        on='work_id',
    )
    .loc[lambda edges: edges.work_id.isin(works_3authors)]
)
work_edges_dist_mean = (
    work_authors_edges_df_dist
    .groupby('work_id').dist.mean()
    .reset_index()
    .merge(_work_dates, on='work_id')
)
work_edges_dist_mean_monthly = (
    work_edges_dist_mean
    .groupby('publication_date_1').dist.mean()
    .reset_index()
)
# Only the per-work table is written here.
my_file = "work_edges_dist_mean.csv"
work_edges_dist_mean.to_csv(os.path.join(path_3authors, my_file), index=False)

In [None]:
# Write the per-date averages of the 3-author mean team distance to disk.
my_file = "work_edges_dist_mean_monthly.csv"
work_edges_dist_mean_monthly.to_csv(
    os.path.join(path_3authors, my_file),
    index=False,
)

In [None]:
# 3-author works: build the df_data1 table, save it, and plot `frac_intra`
# with a single fitted breakpoint.
df_data1 = df_data1_(work_authors_edges_df_dist)
# Parse the whole column in one vectorised call rather than element by
# element, the same way the work_edges_dist_*_monthly_ loaders parse it.
df_data1['publication_date_1'] = pd.to_datetime(df_data1['publication_date_1'])
my_file = "df_data1.csv"
df_data1.to_csv(os.path.join(path_3authors, my_file), index=False)

# Plot settings (module-level on purpose: later cells reuse/overwrite them).
x_column = 'frac_intra'
x_label = 'publication_date_1'
title = 'Fraction collabs. intra-inst. - works 3 authors'
window_size = 1
num_breakpoints = 1
plot_fit_rolling_breakpoints(df_data1, x_column, x_label, title, window_size, num_breakpoints, ff=1)

In [None]:
# Bare expression: shows df_data1 as the cell output for visual inspection.
df_data1

In [None]:
# 3-author works: per-date average of the per-work mean team distance,
# plotted with a single fitted breakpoint.
# `set()` (no works excluded) replaces the former `{}` literal, which is an
# empty dict; `isin` matches nothing in either case.
df_data2 = work_edges_dist_mean_monthly_(set(), works_3authors, path_3authors)
# The loader already parses this column; the vectorised call is a cheap
# safeguard that keeps the dtype explicit at the point of use.
df_data2['publication_date_1'] = pd.to_datetime(df_data2['publication_date_1'])

# Plot settings. A dead `x_column='publication_date_1'` assignment that was
# immediately overwritten by the line below has been removed.
x_column = 'dist'
x_label = 'month'
title = 'Monthly average mean team distance - works 3 authors'
window_size = 1
num_breakpoints = 1
ff = 1
plot_fit_rolling_breakpoints(df_data2, x_column, x_label, title, window_size, num_breakpoints, ff)

In [None]:
# Reload the cached 3-author tables.
my_file = "dfs_3authors.pickle"
with open(os.path.join(path_3authors, my_file), "rb") as fp:
    # NOTE(review): the original unpack target listed `works_3authors` twice,
    # so the first pickled element was overwritten by the second. That is
    # reproduced here by discarding the first element -- confirm this is the
    # intended binding.
    _discarded, works_3authors, works_authors_aff_3authors = pickle.load(fp)

# Every institution appearing in the full 3-author affiliation table.
_unique_inst_rows = works_authors_aff_3authors.drop_duplicates('institution_id')
inst_set_whole_3authors = set(_unique_inst_rows.institution_id)

In [None]:
# Number of distinct works per publication_date_1.
N_dict = (
    works_authors_aff_3authors
    .groupby('publication_date_1')
    .work_id.nunique()
    .to_frame()
    .to_dict()['work_id']
)
# Chronologically sorted dates, formatted as 'YYYY-MM-DD' strings.
months_list = [month.strftime('%Y-%m-%d') for month in sorted(N_dict)]

# Time window (TW): the first `end_index` entries of months_list.
start_index, end_index = 0, 120
start_month = months_list[start_index]
end_month = months_list[end_index - 1]
works_authors_aff_3authors_TW = works_authors_aff_3authors.loc[start_month:end_month]

# Institutions that appear inside the time window.
inst_set_TW_3authors = set(
    works_authors_aff_3authors_TW.drop_duplicates('institution_id').institution_id
)

In [None]:
# Keep only works with no affiliation row from an institution missing in the TW.
inst_set_TW_3authors_compl = inst_set_whole_3authors - inst_set_TW_3authors
_rows_outside = works_authors_aff_3authors.institution_id.isin(inst_set_TW_3authors_compl)
works_outside = set(works_authors_aff_3authors.loc[_rows_outside, 'work_id'])
works_3authors = works_3authors.loc[~works_3authors.work_id.isin(works_outside)]
works_authors_aff_3authors_new = works_authors_aff_3authors.loc[
    ~works_authors_aff_3authors.work_id.isin(works_outside)
]

In [None]:
# Report how many of all institutions are already present in the time window.
_n_inst_whole = len(inst_set_whole_3authors)
_n_inst_tw = len(inst_set_TW_3authors)
print(f'3 authors. [2000,2023] {_n_inst_whole} institutions, [2000,2009] {_n_inst_tw} institutions ({(_n_inst_tw/_n_inst_whole)*100:.2f}%)')

In [None]:
# Report how many works survive the restriction to TW institutions.
_n_works_all = len(set(works_authors_aff_3authors.work_id))
_n_works_tw = len(set(works_authors_aff_3authors_new.work_id))
print(f'3 authors. All institutions: {_n_works_all} works, restrict to institutions in TW {_n_works_tw} works ({(_n_works_tw/_n_works_all)*100:.2f}%)')

In [None]:
# Delete tail: rank TW institutions by their number of affiliation rows
# ("strength"), largest first.
df_strengths_3authors_TW = (
    works_authors_aff_3authors_TW
    .groupby('institution_id')
    .work_id.count()
    .to_frame()
    .sort_values(by='work_id', ascending=False)
    .reset_index()
    .rename(columns={'work_id': 'strength'})
)
df_strengths_3authors_TW['institution_id'] = df_strengths_3authors_TW['institution_id'].astype(int)

In [None]:
# For several strength thresholds n, report how many TW institutions and how
# many works remain when institutions with strength < n are dropped.
#
# The pickle is read once instead of once per threshold: nothing in the loop
# mutates the loaded tables, so re-reading them was loop-invariant I/O.
my_file = "dfs_3authors.pickle"
with open(os.path.join(path_3authors, my_file), "rb") as fp:
    # NOTE(review): the original unpack target listed `works_3authors` twice,
    # i.e. the first pickled element was overwritten by the second; that
    # binding is preserved here -- confirm it is intended.
    _discarded, _works_3authors_all, works_authors_aff_3authors = pickle.load(fp)
_n_works_all = len(set(works_authors_aff_3authors.work_id))

for n in [20, 50, 70, 100]:
    # Institutions kept at this threshold, and everything else.
    I = set(df_strengths_3authors_TW[df_strengths_3authors_TW.strength >= n].institution_id)
    I_compl = inst_set_whole_3authors - I
    # Works with at least one affiliation row from a dropped institution.
    works_outside = set(
        works_authors_aff_3authors[works_authors_aff_3authors.institution_id.isin(I_compl)].work_id
    )
    # Always filter the full table, so each threshold starts from the same
    # state (as the per-iteration reload did before).
    works_3authors = _works_3authors_all[~_works_3authors_all.work_id.isin(works_outside)]
    works_authors_aff_3authors_new = works_authors_aff_3authors[
        ~works_authors_aff_3authors.work_id.isin(works_outside)
    ]
    _n_works_kept = len(set(works_authors_aff_3authors_new.work_id))
    print(f'3 authors - tail {n}. TW {len(I)} institutions ({(len(I)/len(inst_set_TW_3authors))*100:.2f}%), {_n_works_kept} works ({(_n_works_kept/_n_works_all)*100:.2f}%)')

## Edges - all works 

In [None]:
# Output directory for the all-works model artefacts.
path_allauthors = Path('Model_allworks')
# Create it in one step; exist_ok avoids the check-then-create race of an
# `os.path.exists` test followed by `os.makedirs`.
path_allauthors.mkdir(parents=True, exist_ok=True)

### Tables

In [None]:
# Load the raw tables, keep multi-author works in 2000-01-01..2023-12-01,
# and cache the result as a pickle.
works = read_parquet(basepath / 'works')
works_authors_aff = read_parquet(basepath / 'works_authors_aff')

works = works.loc['2000-01-01':'2023-12-01']
works = works[works.num_authors>1]

works_allauthors = set(works.work_id)
# NOTE(review): `works` is already filtered at this point, so the ratio is
# distinct work_ids over remaining rows -- confirm that is the intended figure.
print(f'{len(works_allauthors)} ({(len(works_allauthors)/len(works))*100:.2f}%) works all authors')

# works_allauthors was derived from `works` itself, so only the affiliation
# table can actually lose rows here.
works = works[works.work_id.isin(works_allauthors)]
works_authors_aff = works_authors_aff[works_authors_aff.work_id.isin(works_allauthors)]

my_file = "dfs_allauthors.pickle"
# Context manager so the file handle is flushed and closed deterministically
# (the bare `open(...)` passed to pickle.dump was never closed).
with open(os.path.join(path_allauthors, my_file), 'wb') as fp:
    pickle.dump([works_allauthors,works,works_authors_aff], fp)

In [None]:
# INITIALIZATION (preferential attachment) -- time window (TW): 2000-2009
my_file = "dfs_allauthors.pickle"
with open(os.path.join(path_allauthors, my_file), "rb") as fp:
    works_allauthors, works, works_authors_aff = pickle.load(fp)

_first_date, _last_date = '2000-01-01', '2023-12-01'
works = works.loc[_first_date:_last_date]
works_authors_aff = works_authors_aff.loc[_first_date:_last_date]

# Number of distinct works per publication_date_1.
N_dict = (
    works_authors_aff
    .groupby('publication_date_1')
    .work_id.nunique()
    .to_frame()
    .to_dict()['work_id']
)
# Chronologically sorted dates, formatted as 'YYYY-MM-DD' strings.
months_list = [month.strftime('%Y-%m-%d') for month in sorted(N_dict)]

# The TW spans the first `end_index` entries of months_list.
start_index, end_index = 0, 120
start_month = months_list[start_index]
end_month = months_list[end_index - 1]

df_TW = works_authors_aff.loc[start_month:end_month]
inst_set = set(df_TW.institution_id)
I = len(inst_set)
print(f'TW from {start_month} to {end_month} : {I} institutions')

In [None]:
# Initial strengths: consider only institutions in the TW sample.
# Count the number of unique institutions per paper.
df_TW = df_TW.drop_duplicates(['work_id','institution_id'])
df_TW['num_affs'] = df_TW.groupby('work_id')['institution_id'].transform('size')
print(f'{min(df_TW.num_affs)}-{max(df_TW.num_affs)} min-max number (unique) affiliations per work')
# 2/(k(k-1)) is 1 over the number of unordered institution pairs of a work
# with k distinct affiliations.
df_TW['weight'] = 2 / ( df_TW['num_affs']*(df_TW['num_affs']-1) )
df_TW.loc[df_TW.num_affs==1,'weight'] = 1 #one affiliation (the formula above divides by zero there)

# Multi-institution works go through make_institution_graph; single-institution
# works are added afterwards as weighted self-loops.
df_TW_noloops = df_TW[df_TW.num_affs>1]
# Explicit copy: the columns added below belong to this frame only, which
# also avoids pandas' chained-assignment warning path.
df_TW_loops = df_TW[df_TW.num_affs==1].copy()
df_TW_loops['institution_id2'] = df_TW_loops['institution_id']
# Total self-loop weight per institution, then one row per institution.
df_TW_loops['weight'] = df_TW_loops[['institution_id','institution_id2','weight']].groupby(['institution_id']).weight.transform('sum')
df_TW_loops = df_TW_loops.drop_duplicates('institution_id')
I_graph = make_institution_graph(df_TW_noloops)
I_graph.add_weighted_edges_from([tuple(r) for r in df_TW_loops[['institution_id','institution_id2','weight']].to_numpy()])

# Strength = weighted degree of each node in the institution graph.
df_ = pd.DataFrame.from_dict(dict(I_graph.degree(weight='weight')),orient='index').reset_index().rename(columns={'index':'institution_id',0:'strength'})
df_['institution_id'] = df_['institution_id'].astype(int)
df = df_.sort_values(by='strength',ascending=False)
inst_set = set(df.institution_id)
I = len(inst_set)
print(f'TW from {start_month} to {end_month} : {I} institutions')

my_file = "df_strengths0.csv"
df.to_csv(os.path.join(path_allauthors, my_file),index=False)

my_file = "inst_set.pickle"
# Context manager so the file handle is flushed and closed deterministically
# (the bare `open(...)` passed to pickle.dump was never closed).
with open(os.path.join(path_allauthors, my_file), 'wb') as fp:
    pickle.dump(inst_set, fp)

In [None]:
# Institution-to-institution distances, restricted to pairs whose two
# endpoints are both in inst_set, saved for the model.
I_dist = (
    read_parquet(basepath / 'I_dist_threshold')
    [['source', 'target', 'dist']]
    .reset_index(drop=True)
)
_both_in_set = I_dist['source'].isin(inst_set) & I_dist['target'].isin(inst_set)
# NOTE(review): reset_index() without drop=True keeps the previous row labels
# as an 'index' column, which therefore ends up in the CSV -- confirm wanted.
I_dist = I_dist[_both_in_set].reset_index()
my_file = "I_dist_model.csv"
I_dist.to_csv(os.path.join(path_allauthors, my_file), index=False)

In [None]:
# Team distances for all multi-author works:
# per-edge table -> mean per work -> mean per publication_date_1.
_work_dates = works[['work_id']].reset_index()  # work_id plus the index of `works` as columns
work_authors_edges_df_dist = (
    _work_dates
    .merge(
        read_parquet(Path('./TeamDistance') / 'work_authors_edges_df_dist'),
        on='work_id',
    )
    .loc[lambda edges: edges.work_id.isin(works_allauthors)]
)
work_edges_dist_mean = (
    work_authors_edges_df_dist
    .groupby('work_id').dist.mean()
    .reset_index()
    .merge(_work_dates, on='work_id')
)
work_edges_dist_mean_monthly = (
    work_edges_dist_mean
    .groupby('publication_date_1').dist.mean()
    .reset_index()
)
# Only the per-work table is written here.
my_file = "work_edges_dist_mean.csv"
work_edges_dist_mean.to_csv(os.path.join(path_allauthors, my_file), index=False)

In [None]:
# Write the per-date averages of the all-works mean team distance to disk.
my_file = "work_edges_dist_mean_monthly.csv"
work_edges_dist_mean_monthly.to_csv(
    os.path.join(path_allauthors, my_file),
    index=False,
)

In [None]:
# All multi-author works: build the df_data1 table and save it.
df_data1 = df_data1_(work_authors_edges_df_dist)
# Parse the whole column in one vectorised call rather than element by
# element, the same way the work_edges_dist_*_monthly_ loaders parse it.
df_data1['publication_date_1'] = pd.to_datetime(df_data1['publication_date_1'])
my_file = "df_data1.csv"
df_data1.to_csv(os.path.join(path_allauthors, my_file), index=False)