# Model - edges - all works

In [None]:
import pandas as pd 
from pathlib import Path
import networkx as nx
import igraph as ig
import pickle
import numpy as np
from scipy.sparse import csr_matrix
from scipy.spatial import distance
import seaborn as sns
from time import time
from tqdm.auto import tqdm
import random 
import os
from itertools import chain, combinations
import itertools
import collections

import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.io as pio
# Global plotting configuration for the whole notebook:
# matplotlib/seaborn figures use a large font on a dark background, while
# pandas' own `.plot` accessor is routed to plotly with a dark template.
plt.rcParams.update({'font.size': 22})
sns.set(style="ticks", context="talk")
plt.style.use("dark_background")
pd.options.plotting.backend = 'plotly'
pio.templates.default = 'plotly_dark+presentation'
                    
import numpy as np
from scipy.optimize import minimize

tqdm.pandas()

import matplotlib.dates as dates
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
from scipy import optimize
import numpy.polynomial.polynomial as npoly

def form(x,pos):
    """Format a tick value with a K/M suffix.

    Written to be wrapped in ``matplotlib.ticker.FuncFormatter``; ``pos`` is
    the tick position supplied by matplotlib and is not used.

    Values below 1e3 are printed with three decimals, values below 1e6 are
    printed in thousands with one decimal and a ``K`` suffix, anything larger
    is printed in millions with one decimal and an ``M`` suffix.
    """
    # Guard clauses, smallest range first.
    if x < 1e3:
        return '%1.3f' % (x)
    if x < 1e6:
        scaled = x * 1e-3
        return '%1.1fK' % scaled
    scaled = x * 1e-6
    return '%1.1fM' % scaled
formatter = FuncFormatter(form)

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides library warnings raised by the pandas
# column assignments further down in this notebook.
warnings.filterwarnings("ignore")

def read_parquet(name, **args):
    """Read a parquet table with the fastparquet engine and report timing.

    Args:
        name: location of the parquet table, either a ``str`` or a
            ``pathlib.Path``.
        **args: forwarded unchanged to ``pandas.read_parquet``.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    # Accept plain strings as well as Path objects: the summary line below
    # needs ``.stem``, which a ``str`` does not have.
    path = Path(name)
    print(f'Reading {name!r}')
    tic = time()
    df = pd.read_parquet(path, engine='fastparquet', **args)
    toc = time()

    # De-duplication is disabled, so no rows are ever dropped; the count is
    # kept in the message so the printed format stays the same.
    n_duplicates = 0
    print(f'Read {len(df):,} rows from {path.stem!r} in {toc-tic:.2f} sec. {n_duplicates:,} duplicates.')
    return df

In [None]:
# Input parquet tables are read from `basepath`; intermediate CSV/pickle files
# are read from, and results written to, `my_path_`.
basepath = Path('./Tables_final')
my_path_ = Path('./Model_allworks')
# Create the working directory (and any missing parents) on first use.
if not my_path_.exists():
    my_path_.mkdir(parents=True)

## Model 2 variables

B_{ij} = beta * ((s_i*s_j)^a/(d_{ij}+c)^alpha) if i!=j </br>
B_{ii} = gamma * s_i (self-loop, i = j) </br>

P_{ij} = B_{ij}/N(alpha,beta,gamma)</br>
P_{ii} = B_{ii}/N(alpha,beta,gamma)</br>
with N(alpha,beta,gamma) = sum_{i} B_{ii} + sum_{(i,j)} B_{ij}</br>

Parameters: a=1/2, c=100 </br>
Variables: alpha>=0, beta>=0, gamma>=0 </br>

In [None]:
def make_collaboration_graph(works_authors_rows):
    """Build the author collaboration graph from work/author rows.

    ``works_authors_rows`` must expose ``work_id`` and ``author_id`` columns.
    A work-author graph is built from the rows and then projected onto the
    author ids with ``nx.bipartite.weighted_projected_graph``; the projected
    graph is returned.
    """
    author_nodes = set(works_authors_rows.author_id)
    work_author_graph = nx.from_pandas_edgelist(
        works_authors_rows, source='work_id', target='author_id'
    )
    return nx.bipartite.weighted_projected_graph(work_author_graph, nodes=author_nodes)

def strength_update(df_intra,df_inter,W,a,c,params):
    """Return institution strengths after adding W expected new edges.

    Every institution's strength is increased by its expected share of ``W``
    edges drawn with probability proportional to
    ``beta * m_prod / (dist + c)**alpha`` for inter-institution pairs and
    ``gamma * m_source`` for self-loops. An inter edge adds to both of its
    endpoints; a self-loop adds twice to its institution.

    Unlike the previous version, neither input frame is modified (the old
    code wrote an ``'a'`` column into the caller's ``df_intra``).

    Args:
        df_intra: self-loop rows with columns ``source`` and ``m_source``.
        df_inter: pair rows with columns ``source``, ``target``, ``dist``,
            ``m_source``, ``m_target`` and ``m_prod``.
        W: number of edges to distribute.
        a: unused; kept so the signature matches the other model functions.
        c: offset added to the distance.
        params: sequence ``(alpha, beta, gamma)``.

    Returns:
        dict mapping institution id -> updated strength.
    """
    alpha, beta, gamma = params[0], params[1], params[2]

    inter_weight = beta * (df_inter['m_prod'] / ((df_inter['dist'] + c) ** alpha))
    intra_weight = gamma * df_intra['m_source']
    N = inter_weight.sum() + intra_weight.sum()

    # One contribution row per edge endpoint. Raw arrays are assigned so that
    # nothing depends on the (possibly non-unique) row index of the inputs.
    from_source = df_inter[['source', 'm_source']].assign(a=inter_weight.to_numpy())
    from_target = (
        df_inter[['target', 'm_target']]
        .rename(columns={'target': 'source', 'm_target': 'm_source'})
        .assign(a=inter_weight.to_numpy())
    )
    from_loops = df_intra[['source', 'm_source']].assign(a=(2 * intra_weight).to_numpy())

    contributions = pd.concat([from_source, from_target, from_loops])
    totals = contributions.groupby(['source', 'm_source'])['a'].sum().reset_index()
    totals['strength'] = totals['m_source'] + (W / N) * totals['a']
    return totals.set_index('source')['strength'].to_dict()

def model_function1(df_intra,df_inter,a,c,params):
    """Model-predicted fraction of intra-institution edges.

    ``params`` is ``(alpha, beta, gamma)``. The result is the total self-loop
    weight ``sum(gamma * m_source)`` divided by the normalisation
    ``sum(beta * m_prod / (dist + c)**alpha) + sum(gamma * m_source)``.
    ``a`` is accepted but not used here.
    """
    alpha, beta, gamma = params[0], params[1], params[2]
    inter_total = (beta * (df_inter['m_prod'] / ((df_inter['dist'] + c) ** alpha))).sum()
    intra_total = (gamma * df_intra['m_source']).sum()
    normalisation = inter_total + intra_total
    return intra_total / normalisation
    
def model_function2(df_intra,df_inter,a,c,params):
    """Model-predicted mean edge distance.

    ``params`` is ``(alpha, beta, gamma)``. Each inter-institution pair is
    weighted by ``beta * m_prod / (dist + c)**alpha``; the distance-weighted
    sum is divided by the full normalisation, which also contains the
    self-loop weights ``gamma * m_source``. ``a`` is accepted but not used.
    """
    alpha, beta, gamma = params[0], params[1], params[2]
    decay = (df_inter['dist'] + c) ** alpha
    normalisation = (beta * (df_inter['m_prod'] / decay)).sum() + (gamma * df_intra['m_source']).sum()
    distance_mass = df_inter['dist'] * ((beta * df_inter['m_prod']) / decay)
    return distance_mass.sum() / normalisation
    
def objective_function(params,a,c, x_data, y_data):
    """Sum of squared relative errors of the two model predictions.

    ``x_data`` holds ``[df_intra, df_inter]`` and ``y_data`` holds the two
    observed targets, in the order matched against ``model_function1`` and
    ``model_function2``.
    """
    df_intra, df_inter = x_data[0], x_data[1]
    target1, target2 = y_data[0], y_data[1]

    rel_err1 = (model_function1(df_intra, df_inter, a, c, params) - target1) / target1
    rel_err2 = (model_function2(df_intra, df_inter, a, c, params) - target2) / target2
    return rel_err1**2 + rel_err2**2

def model():
    """Fit (alpha, beta, gamma) month by month and write ``opt_df.csv``.

    The months are taken from ``works_authors_aff``; fitting starts at the
    month with index 120 and runs to the last month. For the first fitted
    month the optimisation is started from 10 points (one fixed, nine random
    with a fixed seed) and the successful run with the lowest objective is
    kept. For every later month the institution strengths are first advanced
    with ``strength_update`` using the previous month's parameters, and the
    optimisation is warm-started from those parameters.

    Inputs read from ``my_path_``: ``df_strengths0.csv``, ``inst_set.pickle``,
    ``df_data1.csv`` (column ``frac_intra``) and
    ``work_edges_dist_mean_monthly.csv`` (column ``dist``).
    Output: ``opt_df.csv`` in ``my_path_``, one row per fitted month.

    Raises:
        RuntimeError: if none of the multi-start optimisations for the first
            month reports success (previously this surfaced as an
            AttributeError on ``np.nan``).
    """
    # The 'works' table was previously loaded here only to build a set that
    # was never used; that load has been dropped.
    works_authors_aff = read_parquet(basepath / 'works_authors_aff')
    works_per_month = works_authors_aff.groupby('publication_date_1').work_id.nunique()
    months_list = [m.strftime('%Y-%m-%d') for m in sorted(works_per_month.index)]
    end_index = 120 #180
    first_month, last_month = months_list[end_index], months_list[-1]

    # Initial strengths and the set of institutions kept in the model.
    strengths0 = pd.read_csv(os.path.join(my_path_, "df_strengths0.csv"))
    inst_str_dict = strengths0.set_index('institution_id')['strength'].to_dict()
    # NOTE(review): pickle.load must only be used on files produced by this pipeline.
    with open(os.path.join(my_path_, "inst_set.pickle"),"rb") as fp:
        inst_set = pickle.load(fp)

    # Pairwise distances d_ij, restricted to the kept institutions.
    I_dist = read_parquet(basepath / 'I_dist_threshold')
    I_dist = I_dist[['source','target','dist']].reset_index(drop=True)
    I_dist = I_dist.query('source.isin(@inst_set) & target.isin(@inst_set)')

    # Observed monthly targets over the fitted period.
    monthly = pd.read_csv(os.path.join(my_path_, "df_data1.csv")).set_index('publication_date_1')
    monthly = monthly.loc[first_month:last_month]
    fit_months = list(monthly.index)
    observed_frac = list(monthly['frac_intra'])
    dist_monthly = pd.read_csv(os.path.join(my_path_, "work_edges_dist_mean_monthly.csv"))
    dist_monthly = dist_monthly.set_index('publication_date_1').loc[first_month:last_month]
    observed_dist = list(dist_monthly['dist'])

    c = 100
    a = 0.5

    def _edge_tables(strengths):
        """Attach strengths to the distance table and split self-loops from pairs."""
        df = I_dist.copy()
        df['m_source'] = df['source'].map(strengths)
        df['m_target'] = df['target'].map(strengths)
        df['m_prod'] = (df['m_source']*df['m_target'])**a
        return df[df.source == df.target], df[df.source != df.target]

    def _row(params, result):
        """Build the output record for one fitted month."""
        return {'optimized_alpha':params[0], 'optimized_beta':params[1], 'optimized_gamma':params[2],'beta/gamma':params[1]/params[2],'optimized_err':result.fun, 'success':result.success, 'message':result.message}

    bounds = ((0, np.inf), (0, np.inf), (0, np.inf))
    options = {'eps': 1e-10, 'ftol': 1e-15}

    df_intra, df_inter = _edge_tables(inst_str_dict)
    x_data = [df_intra,df_inter]
    y_data = np.array([observed_frac[0],observed_dist[0]])

    # Multi-start for the first month; the starting points need not be feasible.
    np.random.seed(0)
    initial_params_list = [[2.0,1.0,1.0]] + [list(np.concatenate([np.random.uniform(0, 5, 1),np.random.uniform(0, 1e3, 2)])) for _ in range(9)]
    best_result = None
    best_err = +np.inf
    for initial_params in tqdm(initial_params_list):
        result = minimize(objective_function, initial_params, args=(a,c,x_data,y_data), bounds=bounds, tol = 1e-10, options=options)
        print(result.fun,result.success,result.message)
        if result.success and result.fun < best_err:
            best_result = result
            best_err = result.fun

    if best_result is None:
        raise RuntimeError(
            f'No multi-start optimisation succeeded for the first month ({fit_months[0]}).'
        )

    opt_dict = {}
    params = best_result.x
    err = best_result.fun
    success = best_result.success
    message = best_result.message
    print(f'{0} {fit_months[0]} {params[0]:.5f} {params[1]:.5f} {params[2]:.5f} {params[1]/params[2]:.5f}  {err:.2e} {success} {message}')
    opt_dict[0] = _row(params, best_result)

    # Advance the strengths with the previous month's parameters, then refit.
    for i in tqdm(range(len(fit_months)-1)):
        W = int(1e5) #df_data3_list[i]
        inst_str_dict = strength_update(df_intra,df_inter,W,a,c,params)
        df_intra, df_inter = _edge_tables(inst_str_dict)

        x_data = [df_intra,df_inter]
        y_data = np.array([observed_frac[i+1],observed_dist[i+1]])

        result = minimize(objective_function, params, args=(a,c,x_data,y_data), bounds=bounds, tol = 1e-10, options=options)
        params = result.x
        err = result.fun
        success = result.success
        message = result.message
        print(f'{i+1} {fit_months[i+1]} {params[0]:.5f} {params[1]:.5f} {params[2]:.5f} {params[1]/params[2]:.5f} {err:.2e} {success} {message}')
        opt_dict[i+1] = _row(params, result)

    opt_df = pd.DataFrame.from_dict(opt_dict).T
    opt_df['month'] = fit_months
    opt_df.to_csv(os.path.join(my_path_, "opt_df.csv"),index=False)

In [None]:
# Run the 3-parameter monthly fit; this writes opt_df.csv into my_path_.
model()

In [None]:
def plot_model(df,ylabel,title,color,log=False):
    """Plot column ``ylabel`` of ``df`` against ``df['month']``.

    A red vertical line marks 2020-03-01. With ``log=True`` the y axis is
    logarithmic. ``title`` becomes the figure title and ``color`` the line
    colour. Nothing is returned; the figure is left to the active backend.
    """
    plt.style.use("dark_background")
    fig, ax = plt.subplots(figsize=(15, 5))
    # Pick the drawing routine once instead of duplicating the call.
    draw = ax.semilogy if log else ax.plot
    draw(list(df['month']), df[ylabel], "o-", color=color, markersize=3)
    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')
    plt.grid(True, linewidth=0.5)
    ax.ticklabel_format(axis='y')
    ax.set_xlabel('month',size=20)
    ax.set_title(title,size=30)
# Load the monthly fit results written by model() and plot the fitted alpha,
# the beta/gamma ratio and the final objective value over time.
my_file = "opt_df.csv" 
opt_df = pd.read_csv(os.path.join(my_path_, my_file))
# Months are stored as strings in the CSV; convert them for a date x axis.
opt_df['month'] = opt_df['month'].apply(pd.to_datetime)
plot_model(opt_df,'optimized_alpha','Optimized alpha','orange')
plot_model(opt_df,'beta/gamma','Optimized beta/gamma','red')
plot_model(opt_df,'optimized_err','Optimized error','blue')

### Simulations

In [None]:
def simulation():
    """Run 10 stochastic simulations of the fitted 3-parameter model.

    For every fitted month, W candidate edges are sampled: self-loops
    (intra-institution) with weight ``gamma * s_i`` and pairs
    (inter-institution) with weight
    ``beta * (s_i*s_j)**a / (d_ij + c)**alpha``, using that month's
    parameters from ``opt_df.csv``. The sampled intra fraction and mean
    distance are recorded, strengths are increased by the sampled counts, and
    each run is written to ``simulation_<s>.csv`` in ``my_path_``.
    Returns nothing.
    """
    c = 100
    a = 0.5 
    
    # Monthly fitted parameters produced by model().
    my_file = "opt_df.csv" 
    opt_df = pd.read_csv(os.path.join(my_path_, my_file))
    opt_df['month'] = opt_df['month'].apply(pd.to_datetime)
    
    # NOTE(review): works, works_all, start_month and end_month are not used
    # further in this function.
    works = read_parquet(basepath / 'works')
    works_authors_aff = read_parquet(basepath / 'works_authors_aff')
    works_all = set(works.work_id)
    # Sorted list of months present in the data, as 'YYYY-MM-DD' strings.
    N_dict = works_authors_aff.groupby('publication_date_1').work_id.nunique().to_frame().to_dict()['work_id']
    months_list = list(N_dict.keys())
    months_list.sort()
    months_list = [i.strftime('%Y-%m-%d') for i in months_list]
    start_index = 0
    end_index = 120 #180
    start_month = months_list[start_index]
    end_month = months_list[end_index-1]
    # Initial strengths; every run restarts from this table.
    my_file = "df_strengths0.csv"     
    df_0 = pd.read_csv(os.path.join(my_path_, my_file))
    my_file = "inst_set.pickle"
    with open(os.path.join(my_path_, my_file),"rb") as fp:
        inst_set = pickle.load(fp)       
    
    # Observed monthly data over the simulated period. Only df_data3 (its
    # length and its index) is used below; df_data1 and df_data2 are not.
    my_file = "df_data1.csv"    
    df_data1 = pd.read_csv(os.path.join(my_path_, my_file))
    df_data1 = df_data1.set_index('publication_date_1')
    df_data3 = df_data1
    df_data3 = df_data3[['total']]
    df_data3['total'] = df_data3['total'].astype(int)
    df_data3 = df_data3.loc[months_list[end_index]:months_list[-1]]
    df_data3_list = list(df_data3['total'])
    df_data1 = df_data1[['frac_intra']]
    df_data1 = df_data1.rename(columns={'frac_intra':'F_aff'})
    df_data1 = df_data1.loc[months_list[end_index]:months_list[-1]] #[months_list[end_index-1]:months_list[-1]]
    # each month: average distance
    my_file = "work_edges_dist_mean_monthly.csv"    
    df_data2 = pd.read_csv(os.path.join(my_path_, my_file))
    df_data2 = df_data2.set_index('publication_date_1').loc[months_list[end_index]:months_list[-1]]
    
    # Pairwise distances restricted to the kept institutions. The my_file
    # value assigned here is not used: the table is read via read_parquet.
    my_file = "I_dist_threshold.csv"
    I_dist = read_parquet(basepath / 'I_dist_threshold')
    I_dist = I_dist[['source','target','dist']].reset_index(drop=True)
    I_dist = I_dist.query('source.isin(@inst_set) & target.isin(@inst_set)')
    
    # Self-loops get row labels 0..len(df_intra)-1 and pairs continue after
    # them, so one integer identifies a candidate edge in the concatenated
    # probability vector built below.
    df_intra = I_dist[I_dist.source == I_dist.target].reset_index(drop=True)
    df_inter = I_dist[I_dist.source != I_dist.target].reset_index(drop=True)
    df_inter.index += len(df_intra)

    import random
    # Seeded once, before the run loop: the runs draw different samples, and
    # the sequence of draws starts from the same state on every call.
    random.seed(0)
    for s in tqdm(range(10)):

        # Each run restarts from the initial strengths.
        df_ = df_0

        fra_intra_list = []
        mean_dist_list = []
        for i in tqdm(range(len(df_data3_list))):

            # parameters fitted for month i
            alpha = list(opt_df['optimized_alpha'])[i]
            beta = list(opt_df['optimized_beta'])[i]
            gamma = list(opt_df['optimized_gamma'])[i]
            
            inst_str_dict = df_[['institution_id','strength']].set_index('institution_id').to_dict()['strength']

            # attach the current strengths to both edge tables
            df_intra['m_source'] = df_intra['source'].map(inst_str_dict)
            df_inter['m_source'] = df_inter['source'].map(inst_str_dict)
            df_inter['m_target'] = df_inter['target'].map(inst_str_dict)
            df_inter['m_prod'] = (df_inter['m_source']*df_inter['m_target'])**a         

            # unnormalised edge weights
            df_intra['p']= gamma*df_intra['m_source'] 
            df_inter['p'] = df_inter['m_prod'] * ( beta / ((df_inter['dist']+c)**alpha)) 

            # Normalise over intra followed by inter (same order as the row
            # labels) and draw W edge labels with replacement.
            hyperedges_probabilities = list(itertools.chain(df_intra['p'], df_inter['p']))
            hyperedges_probabilities = np.array(hyperedges_probabilities)/(df_intra['p'].sum()+df_inter['p'].sum())
            W = int(1e5) #df_data3_list[i] 
            hyperedges_model = random.choices(np.arange(0, len(hyperedges_probabilities)), weights=hyperedges_probabilities, k=W)

            # number of times each edge was drawn (0 for edges never drawn)
            counter = dict(collections.Counter(hyperedges_model))
            df_intra['count'] = df_intra.index.to_series().map(counter)
            df_intra['count'] = df_intra['count'].fillna(0)
            df_inter['count'] = df_inter.index.to_series().map(counter)
            df_inter['count'] = df_inter['count'].fillna(0)           
            
            # sampled fraction of intra-institution edges
            frac_intra = sum(df_intra['count'])/(df_intra['count'].sum()+df_inter['count'].sum())
            fra_intra_list.append(frac_intra)

            # Mean distance: the distance-weighted inter counts divided by W,
            # i.e. intra edges are counted in the denominator only.
            df_inter['mean_dist'] = df_inter['dist']*df_inter['count']
            mean_dist = df_inter.mean_dist.sum()/W
            mean_dist_list.append(mean_dist)

            # Strength update: a self-loop counts twice for its institution,
            # an inter edge counts once for each endpoint.
            df_intra['count'] = df_intra['count']*2 
            temp = pd.concat([
                df_intra[['source','count']].rename(columns={'source':'institution_id'}),
                df_inter[['source','count']].rename(columns={'source':'institution_id'}),
                df_inter[['target','count']].rename(columns={'target':'institution_id'})])
            temp = temp.groupby('institution_id')['count'].sum().to_frame().reset_index()
            # NOTE(review): df_old is not used afterwards.
            df_old = df_
            df_ = df_[['institution_id','strength']]
            df_ = df_.merge(temp,on='institution_id',how='left')
            df_['count'] = df_['count'].fillna(0)
            df_['strength'] = df_['strength'] + df_['count']
            df_ = df_[['institution_id','strength']]

        # One CSV per run; the row index is written as the first column.
        model_data = pd.DataFrame.from_dict({'fra_intra':fra_intra_list,'mean_dist':mean_dist_list})
        model_data['month'] = list(df_data3.index)
        my_file = "simulation_"+str(s)+".csv"  
        model_data.to_csv(os.path.join(my_path_, my_file))

In [None]:
# Run the 10 simulations; writes simulation_0.csv ... simulation_9.csv into my_path_.
simulation()

In [None]:
def plot_simulation():
    """Plot every simulation run against the observed monthly data.

    Reads ``simulation_0.csv`` ... ``simulation_9.csv`` from ``my_path_`` and
    draws, in this order, ten figures comparing the observed and simulated
    fraction of intra-institution collaborations, then ten figures comparing
    the observed and simulated mean distance. The observed series are
    selected from ``df_data1.csv`` and ``work_edges_dist_mean_monthly.csv``
    using the months of the first simulation file. Returns nothing.

    Each run is plotted against its own ``month`` column; the previous
    version reused, in the second loop, an x axis left over from the last
    iteration of the first loop.
    """
    n_runs = 10

    def _load_run(s):
        """Read one simulation CSV and parse its month column."""
        run = pd.read_csv(os.path.join(my_path_, "simulation_"+str(s)+".csv"))
        run['month'] = pd.to_datetime(run['month'])
        return run

    def _plot_pair(x, observed, simulated, title):
        """Draw observed vs. simulated values on a new figure."""
        plt.style.use("dark_background")
        fig, ax = plt.subplots(figsize=(15, 5))
        ax.plot(x, observed, "o-", markersize=3,label='data')
        ax.plot(x, simulated, "o-", markersize=3,label='model')
        ax.axvline(pd.Timestamp(2020, 3, 1),color='r')
        plt.grid(True, linewidth=0.5)
        ax.yaxis.set_major_formatter(formatter)
        ax.set_xlabel('month',size=20)
        ax.legend()
        ax.set_title(title,size=30)

    # Each file is read once and reused for both groups of figures.
    runs = [_load_run(s) for s in range(n_runs)]
    months_list = [m.strftime('%Y-%m-%d') for m in sorted(runs[0]['month'])]

    # Observed monthly fraction of intra-institution collaborations.
    df_data1 = pd.read_csv(os.path.join(my_path_, "df_data1.csv"))
    df_data1 = df_data1.set_index('publication_date_1')
    observed_frac = list(df_data1.loc[months_list, 'frac_intra'])

    # Observed monthly mean distance.
    df_data2 = pd.read_csv(os.path.join(my_path_, "work_edges_dist_mean_monthly.csv"))
    df_data2 = df_data2.set_index('publication_date_1')
    observed_dist = list(df_data2.loc[months_list, 'dist'])

    for run in runs:
        _plot_pair(list(run['month']), observed_frac, list(run['fra_intra']),
                   'Frac intra-insts collabs - with exact params')

    for run in runs:
        _plot_pair(list(run['month']), observed_dist, list(run['mean_dist']),
                   'Avg team dist - with exact params')

In [None]:
# Draw the data-vs-model comparison figures for the runs written by simulation().
plot_simulation()

## Model 1 variable

B_{ij} = beta * ((s_i*s_j)^(1/2)/(d_{ij}+c)^alpha) if i!=j </br>
B_{ii} = gamma * s_i (self-loop, i = j) </br>

P_{ij} = B_{ij}/N(alpha,beta,gamma)</br>
P_{ii} = B_{ii}/N(alpha,beta,gamma)</br>
with N(alpha,beta,gamma) = sum_{i} B_{ii} + sum_{(i,j)} B_{ij}</br>

Parameters: c=100 </br>
Variables: alpha>=0 </br>
Fixed: beta>=0, gamma>=0 (each set to the mean of its monthly optimized values from the previous model)

In [None]:
# beta and gamma for the 1-variable model: the means of the monthly values
# fitted by the previous model (read back from opt_df.csv).
my_file = "opt_df.csv" 
opt_df = pd.read_csv(os.path.join(my_path_, my_file))
opt_df['month'] = opt_df['month'].apply(pd.to_datetime)
beta = opt_df['optimized_beta'].mean()
gamma = opt_df['optimized_gamma'].mean()
print(f'beta {beta}, gamma {gamma}, beta/gamma {beta/gamma}')

In [None]:
def make_collaboration_graph(works_authors_rows):
    """Build the author collaboration graph from work/author rows.

    Re-definition of the function of the same name from the earlier cell,
    with the same behaviour: the rows (columns ``work_id`` and ``author_id``)
    are turned into a work-author graph, which is then projected onto the
    author ids with ``nx.bipartite.weighted_projected_graph``.
    """
    authors = set(works_authors_rows.author_id)
    two_mode_graph = nx.from_pandas_edgelist(
        works_authors_rows,
        source='work_id',
        target='author_id',
    )
    projected = nx.bipartite.weighted_projected_graph(two_mode_graph, nodes=authors)
    return projected

def strength_update(df_intra,df_inter,W,a,c,beta,gamma,params):
    """Return institution strengths after adding W expected new edges.

    Variant for the 1-variable model: ``beta`` and ``gamma`` are fixed and
    ``params`` is the distance exponent alpha. Every institution's strength
    is increased by its expected share of ``W`` edges drawn with probability
    proportional to ``beta * m_prod / (dist + c)**params`` for
    inter-institution pairs and ``gamma * m_source`` for self-loops. An inter
    edge adds to both of its endpoints; a self-loop adds twice.

    Unlike the previous version, neither input frame is modified (the old
    code wrote an ``'a'`` column into the caller's ``df_intra``).

    Args:
        df_intra: self-loop rows with columns ``source`` and ``m_source``.
        df_inter: pair rows with columns ``source``, ``target``, ``dist``,
            ``m_source``, ``m_target`` and ``m_prod``.
        W: number of edges to distribute.
        a: unused; kept so the signature matches the other model functions.
        c: offset added to the distance.
        beta: fixed weight of inter-institution pairs.
        gamma: fixed weight of self-loops.
        params: the exponent alpha, used directly as the power.

    Returns:
        dict mapping institution id -> updated strength.
    """
    inter_weight = beta * (df_inter['m_prod'] / ((df_inter['dist'] + c) ** params))
    intra_weight = gamma * df_intra['m_source']
    N = inter_weight.sum() + intra_weight.sum()

    # One contribution row per edge endpoint. Raw arrays are assigned so that
    # nothing depends on the (possibly non-unique) row index of the inputs.
    from_source = df_inter[['source', 'm_source']].assign(a=inter_weight.to_numpy())
    from_target = (
        df_inter[['target', 'm_target']]
        .rename(columns={'target': 'source', 'm_target': 'm_source'})
        .assign(a=inter_weight.to_numpy())
    )
    from_loops = df_intra[['source', 'm_source']].assign(a=(2 * intra_weight).to_numpy())

    contributions = pd.concat([from_source, from_target, from_loops])
    totals = contributions.groupby(['source', 'm_source'])['a'].sum().reset_index()
    totals['strength'] = totals['m_source'] + (W / N) * totals['a']
    return totals.set_index('source')['strength'].to_dict()

def model_function1(df_intra,df_inter,a,c,beta,gamma,params):
    """Model-predicted fraction of intra-institution edges (fixed beta, gamma).

    ``params`` is the distance exponent. The result is the total self-loop
    weight ``sum(gamma * m_source)`` divided by the normalisation
    ``sum(beta * m_prod / (dist + c)**params) + sum(gamma * m_source)``.
    ``a`` is accepted but not used here.
    """
    inter_total = (beta * (df_inter['m_prod'] / ((df_inter['dist'] + c) ** params))).sum()
    intra_total = (gamma * df_intra['m_source']).sum()
    normalisation = inter_total + intra_total
    return intra_total / normalisation
    
def model_function2(df_intra,df_inter,a,c,beta,gamma,params):
    """Model-predicted mean edge distance (fixed beta, gamma).

    ``params`` is the distance exponent. Each inter-institution pair is
    weighted by ``beta * m_prod / (dist + c)**params``; the distance-weighted
    sum is divided by the full normalisation, which also contains the
    self-loop weights ``gamma * m_source``. ``a`` is accepted but not used.
    """
    decay = (df_inter['dist'] + c) ** (params)
    normalisation = (beta * (df_inter['m_prod'] / ((df_inter['dist'] + c) ** params))).sum() + (gamma * df_intra['m_source']).sum()
    distance_mass = df_inter['dist'] * ((beta * df_inter['m_prod']) / decay)
    return distance_mass.sum() / normalisation
    
def objective_function(params,a,c,beta,gamma, x_data, y_data):
    """Sum of squared relative errors of the two model predictions.

    Variant with fixed ``beta`` and ``gamma``: ``params`` is the only free
    variable. ``x_data`` holds ``[df_intra, df_inter]`` and ``y_data`` holds
    the two observed targets, in the order matched against
    ``model_function1`` and ``model_function2``.
    """
    df_intra, df_inter = x_data[0], x_data[1]
    target1, target2 = y_data[0], y_data[1]

    rel_err1 = (model_function1(df_intra, df_inter, a, c, beta, gamma, params) - target1) / target1
    rel_err2 = (model_function2(df_intra, df_inter, a, c, beta, gamma, params) - target2) / target2
    return rel_err1**2 + rel_err2**2

def model_b():
    """Fit alpha month by month with beta and gamma fixed; write ``opt_df_b.csv``.

    ``beta`` and ``gamma`` are fixed to the means of the monthly values stored
    in ``opt_df.csv``. The months are taken from ``works_authors_aff``;
    fitting starts at the month with index 120 and runs to the last month.
    For the first fitted month the optimisation is started from 20 points
    (one fixed, nineteen random with a fixed seed) and the successful run
    with the lowest objective is kept. For every later month the institution
    strengths are first advanced with ``strength_update`` using the previous
    month's alpha, and the optimisation is warm-started from that alpha.

    Inputs read from ``my_path_``: ``df_strengths0.csv``, ``inst_set.pickle``,
    ``df_data1.csv`` (column ``frac_intra``),
    ``work_edges_dist_mean_monthly.csv`` (column ``dist``) and ``opt_df.csv``.
    Output: ``opt_df_b.csv`` in ``my_path_``, one row per fitted month.

    Raises:
        RuntimeError: if none of the multi-start optimisations for the first
            month reports success (previously this surfaced as an
            AttributeError on ``np.nan``).
    """
    # The 'works' table was previously loaded here only to build a set that
    # was never used; that load has been dropped.
    works_authors_aff = read_parquet(basepath / 'works_authors_aff')
    works_per_month = works_authors_aff.groupby('publication_date_1').work_id.nunique()
    months_list = [m.strftime('%Y-%m-%d') for m in sorted(works_per_month.index)]
    end_index = 120 #180
    first_month, last_month = months_list[end_index], months_list[-1]

    # Initial strengths and the set of institutions kept in the model.
    strengths0 = pd.read_csv(os.path.join(my_path_, "df_strengths0.csv"))
    inst_str_dict = strengths0.set_index('institution_id')['strength'].to_dict()
    # NOTE(review): pickle.load must only be used on files produced by this pipeline.
    with open(os.path.join(my_path_, "inst_set.pickle"),"rb") as fp:
        inst_set = pickle.load(fp)

    # Pairwise distances d_ij, restricted to the kept institutions.
    I_dist = read_parquet(basepath / 'I_dist_threshold')
    I_dist = I_dist[['source','target','dist']].reset_index(drop=True)
    I_dist = I_dist.query('source.isin(@inst_set) & target.isin(@inst_set)')

    # Observed monthly targets over the fitted period.
    monthly = pd.read_csv(os.path.join(my_path_, "df_data1.csv")).set_index('publication_date_1')
    monthly = monthly.loc[first_month:last_month]
    fit_months = list(monthly.index)
    observed_frac = list(monthly['frac_intra'])
    dist_monthly = pd.read_csv(os.path.join(my_path_, "work_edges_dist_mean_monthly.csv"))
    dist_monthly = dist_monthly.set_index('publication_date_1').loc[first_month:last_month]
    observed_dist = list(dist_monthly['dist'])

    c = 100
    a = 0.5

    def _edge_tables(strengths):
        """Attach strengths to the distance table and split self-loops from pairs."""
        df = I_dist.copy()
        df['m_source'] = df['source'].map(strengths)
        df['m_target'] = df['target'].map(strengths)
        df['m_prod'] = (df['m_source']*df['m_target'])**a
        return df[df.source == df.target], df[df.source != df.target]

    df_intra, df_inter = _edge_tables(inst_str_dict)
    x_data = [df_intra,df_inter]
    y_data = np.array([observed_frac[0],observed_dist[0]])

    # beta and gamma are fixed to the means of the previous model's monthly fits.
    opt_df = pd.read_csv(os.path.join(my_path_, "opt_df.csv"))
    beta = opt_df['optimized_beta'].mean()
    gamma = opt_df['optimized_gamma'].mean()

    bounds = [(0, np.inf)]
    options = {'eps': 1e-10, 'ftol': 1e-15}

    # Multi-start for the first month; the starting points need not be feasible.
    np.random.seed(0)
    initial_params_list = [2.0] + [np.random.uniform(0, 10, 1)[0] for _ in range(19)]
    best_result = None
    best_err = +np.inf
    for initial_params in tqdm(initial_params_list):
        result = minimize(objective_function, initial_params, args=(a,c,beta,gamma,x_data,y_data), bounds=bounds, tol = 1e-10, options=options)
        print(result.fun,result.success,result.message)
        if result.success and result.fun < best_err:
            best_result = result
            best_err = result.fun

    if best_result is None:
        raise RuntimeError(
            f'No multi-start optimisation succeeded for the first month ({fit_months[0]}).'
        )

    opt_dict = {}
    params = best_result.x
    err = best_result.fun
    success = best_result.success
    message = best_result.message
    print(f'{0} {fit_months[0]} {params[0]:.5f} {err:.2e} {success} {message}')
    opt_dict[0] = {'optimized_alpha':params[0], 'optimized_err':err, 'success':success, 'message':message}

    # Advance the strengths with the previous month's alpha, then refit.
    for i in tqdm(range(len(fit_months)-1)):
        W = int(1e5) #df_data3_list[i]
        inst_str_dict = strength_update(df_intra,df_inter,W,a,c,beta,gamma,params)
        df_intra, df_inter = _edge_tables(inst_str_dict)

        x_data = [df_intra,df_inter]
        y_data = np.array([observed_frac[i+1],observed_dist[i+1]])

        result = minimize(objective_function, params, args=(a,c,beta,gamma,x_data,y_data), bounds=bounds, tol = 1e-10, options=options)
        params = result.x
        err = result.fun
        success = result.success
        message = result.message
        print(f'{i+1} {fit_months[i+1]} {params[0]:.5f} {err:.2e} {success} {message}')
        opt_dict[i+1] = {'optimized_alpha':params[0],'optimized_err':err, 'success':success,'message':message}

    opt_df = pd.DataFrame.from_dict(opt_dict).T
    opt_df['month'] = fit_months
    opt_df.to_csv(os.path.join(my_path_, "opt_df_b.csv"),index=False)

In [None]:
# Run the 1-variable monthly fit; this writes opt_df_b.csv into my_path_.
model_b()

In [None]:
def plot_model_b(df,ylabel,title,color,log=False):
    """Plot column ``ylabel`` of ``df`` against ``df['month']``.

    A red vertical line marks 2020-03-01. With ``log=True`` the y axis is
    logarithmic. ``title`` becomes the figure title and ``color`` the line
    colour. Nothing is returned; the figure is left to the active backend.
    """
    plt.style.use("dark_background")
    fig, ax = plt.subplots(figsize=(15, 5))
    months = list(df['month'])
    values = df[ylabel]
    # Same marker/line style on either axis scale.
    plotter = ax.semilogy if log else ax.plot
    plotter(months, values, "o-", color=color, markersize=3)
    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')
    plt.grid(True, linewidth=0.5)
    ax.ticklabel_format(axis='y')
    ax.set_xlabel('month',size=20)
    ax.set_title(title,size=30)
# Load the 1-variable fit results written by model_b() and plot the fitted
# alpha and the final objective value over time.
my_file = "opt_df_b.csv" 
opt_df = pd.read_csv(os.path.join(my_path_, my_file))
# Months are stored as strings in the CSV; convert them for a date x axis.
opt_df['month'] = opt_df['month'].apply(pd.to_datetime)
plot_model_b(opt_df,'optimized_alpha','Optimized alpha','orange')
plot_model_b(opt_df,'optimized_err','Optimized error','blue')

### Simulations

In [None]:
def simulation_b():
    """Run 10 stochastic simulations of the 1-variable model.

    ``beta`` and ``gamma`` are the means of the monthly values in
    ``opt_df.csv``; alpha is taken per month from ``opt_df_b.csv``. For every
    fitted month, W candidate edges are sampled: self-loops
    (intra-institution) with weight ``gamma * s_i`` and pairs
    (inter-institution) with weight
    ``beta * (s_i*s_j)**a / (d_ij + c)**alpha``. The sampled intra fraction
    and mean distance are recorded, strengths are increased by the sampled
    counts, and each run is written to ``simulation_<s>_b.csv`` in
    ``my_path_``. Returns nothing.
    """
    c = 100 
    a = 0.5 
    
    # Fixed beta and gamma: means over the previous model's monthly fits.
    my_file = "opt_df.csv" 
    opt_df = pd.read_csv(os.path.join(my_path_, my_file))
    opt_df['month'] = opt_df['month'].apply(pd.to_datetime)
    beta = opt_df['optimized_beta'].mean()
    gamma = opt_df['optimized_gamma'].mean()
    
    # From here on opt_df holds the monthly alpha of the 1-variable fit.
    my_file = "opt_df_b.csv" 
    opt_df = pd.read_csv(os.path.join(my_path_, my_file))
    opt_df['month'] = opt_df['month'].apply(pd.to_datetime)
    
    works = read_parquet(basepath / 'works')
    works_authors_aff = read_parquet(basepath / 'works_authors_aff')
    works_all = set(works.work_id)
    # Keep only rows whose work appears in the works table before listing months.
    works_authors_aff = works_authors_aff[works_authors_aff.work_id.isin(works_all)]
    # Sorted list of months present in the data, as 'YYYY-MM-DD' strings.
    N_dict = works_authors_aff.groupby('publication_date_1').work_id.nunique().to_frame().to_dict()['work_id']
    months_list = list(N_dict.keys())
    months_list.sort()
    months_list = [i.strftime('%Y-%m-%d') for i in months_list]
    # NOTE(review): start_month and end_month are not used further.
    start_index = 0
    end_index = 120 #180
    start_month = months_list[start_index]
    end_month = months_list[end_index-1]
    # Initial strengths; every run restarts from this table.
    my_file = "df_strengths0.csv"     
    df_0 = pd.read_csv(os.path.join(my_path_, my_file))
    my_file = "inst_set.pickle"
    with open(os.path.join(my_path_, my_file),"rb") as fp:
        inst_set = pickle.load(fp)       
    
    # Observed monthly data over the simulated period. Only df_data3 (its
    # length and its index) is used below; df_data1 and df_data2 are not.
    my_file = "df_data1.csv"    
    df_data1 = pd.read_csv(os.path.join(my_path_, my_file))
    df_data1 = df_data1.set_index('publication_date_1')
    df_data3 = df_data1
    df_data3 = df_data3[['total']]
    df_data3['total'] = df_data3['total'].astype(int)
    df_data3 = df_data3.loc[months_list[end_index]:months_list[-1]]
    df_data3_list = list(df_data3['total'])
    df_data1 = df_data1[['frac_intra']]
    df_data1 = df_data1.rename(columns={'frac_intra':'F_aff'})
    df_data1 = df_data1.loc[months_list[end_index]:months_list[-1]] #[months_list[end_index-1]:months_list[-1]]
    # each month: average distance
    my_file = "work_edges_dist_mean_monthly.csv"    
    df_data2 = pd.read_csv(os.path.join(my_path_, my_file))
    df_data2 = df_data2.set_index('publication_date_1').loc[months_list[end_index]:months_list[-1]]
    
    # Pairwise distances restricted to the kept institutions. The my_file
    # value assigned here is not used: the table is read via read_parquet.
    my_file = "I_dist_threshold.csv"
    I_dist = read_parquet(basepath / 'I_dist_threshold')
    I_dist = I_dist[['source','target','dist']].reset_index(drop=True)
    I_dist = I_dist.query('source.isin(@inst_set) & target.isin(@inst_set)')
    
    # Self-loops get row labels 0..len(df_intra)-1 and pairs continue after
    # them, so one integer identifies a candidate edge in the concatenated
    # probability vector built below.
    df_intra = I_dist[I_dist.source == I_dist.target].reset_index(drop=True)
    df_inter = I_dist[I_dist.source != I_dist.target].reset_index(drop=True)
    df_inter.index += len(df_intra)

    import random
    # Seeded once, before the run loop: the runs draw different samples, and
    # the sequence of draws starts from the same state on every call.
    random.seed(0)
    for s in tqdm(range(10)):

        # Each run restarts from the initial strengths.
        df_ = df_0

        fra_intra_list = []
        mean_dist_list = []
        for i in tqdm(range(len(df_data3_list))):

            # alpha fitted for month i (beta and gamma stay fixed)
            alpha = list(opt_df['optimized_alpha'])[i]
            
            inst_str_dict = df_[['institution_id','strength']].set_index('institution_id').to_dict()['strength']

            # attach the current strengths to both edge tables
            df_intra['m_source'] = df_intra['source'].map(inst_str_dict)
            df_inter['m_source'] = df_inter['source'].map(inst_str_dict)
            df_inter['m_target'] = df_inter['target'].map(inst_str_dict)
            df_inter['m_prod'] = (df_inter['m_source']*df_inter['m_target'])**a         

            # unnormalised edge weights
            df_intra['p']= gamma*df_intra['m_source'] 
            df_inter['p'] = df_inter['m_prod'] * ( beta / ((df_inter['dist']+c)**alpha)) 

            # Normalise over intra followed by inter (same order as the row
            # labels) and draw W edge labels with replacement.
            hyperedges_probabilities = list(itertools.chain(df_intra['p'], df_inter['p']))
            hyperedges_probabilities = np.array(hyperedges_probabilities)/(df_intra['p'].sum()+df_inter['p'].sum())
            W = int(1e5) #df_data3_list[i] 
            hyperedges_model = random.choices(np.arange(0, len(hyperedges_probabilities)), weights=hyperedges_probabilities, k=W)

            # number of times each edge was drawn (0 for edges never drawn)
            counter = dict(collections.Counter(hyperedges_model))
            df_intra['count'] = df_intra.index.to_series().map(counter)
            df_intra['count'] = df_intra['count'].fillna(0)
            df_inter['count'] = df_inter.index.to_series().map(counter)
            df_inter['count'] = df_inter['count'].fillna(0)           
            
            # sampled fraction of intra-institution edges
            frac_intra = sum(df_intra['count'])/(df_intra['count'].sum()+df_inter['count'].sum())
            fra_intra_list.append(frac_intra)

            # Mean distance: the distance-weighted inter counts divided by W,
            # i.e. intra edges are counted in the denominator only.
            df_inter['mean_dist'] = df_inter['dist']*df_inter['count']
            mean_dist = df_inter.mean_dist.sum()/W
            mean_dist_list.append(mean_dist)

            # Strength update: a self-loop counts twice for its institution,
            # an inter edge counts once for each endpoint.
            df_intra['count'] = df_intra['count']*2 
            temp = pd.concat([
                df_intra[['source','count']].rename(columns={'source':'institution_id'}),
                df_inter[['source','count']].rename(columns={'source':'institution_id'}),
                df_inter[['target','count']].rename(columns={'target':'institution_id'})])
            temp = temp.groupby('institution_id')['count'].sum().to_frame().reset_index()
            # NOTE(review): df_old is not used afterwards.
            df_old = df_
            df_ = df_[['institution_id','strength']]
            df_ = df_.merge(temp,on='institution_id',how='left')
            df_['count'] = df_['count'].fillna(0)
            df_['strength'] = df_['strength'] + df_['count']
            df_ = df_[['institution_id','strength']]

        # One CSV per run; the row index is written as the first column.
        model_data = pd.DataFrame.from_dict({'fra_intra':fra_intra_list,'mean_dist':mean_dist_list})
        model_data['month'] = list(df_data3.index)
        my_file = "simulation_"+str(s)+"_b.csv" 
        model_data.to_csv(os.path.join(my_path_, my_file))

In [None]:
# Run the 10 simulations of the 1-variable model; writes simulation_0_b.csv ... simulation_9_b.csv into my_path_.
simulation_b()

In [None]:
def plot_simulation_b(n_simulations=10):
    """Plot every saved ``simulation_<s>_b.csv`` run against the empirical data.

    Two figures are drawn per run: first one "fraction of intra-institution
    collaborations" figure for each run, then one "average team distance"
    figure for each run. Every figure shows the empirical monthly series
    ('data') and the simulated one ('model'), plus a red vertical line at
    2020-03-01.

    Inputs are read from ``my_path_``:
      * ``simulation_<s>_b.csv`` -- columns 'month', 'fra_intra', 'mean_dist'
      * ``df_data1.csv`` -- columns 'publication_date_1', 'frac_intra'
      * ``work_edges_dist_mean_monthly.csv`` -- columns 'publication_date_1',
        'dist'

    Parameters
    ----------
    n_simulations : int, default 10
        Number of runs to plot; files for s = 0 .. n_simulations - 1 are read.

    Raises
    ------
    KeyError
        If a month of a simulation run is missing from an empirical series.
    """
    # Empirical series keyed by month. The keys are looked up below with
    # '%Y-%m-%d' strings, so the CSV index is assumed to use that format.
    frac_intra_data = (
        pd.read_csv(os.path.join(my_path_, "df_data1.csv"))
        .set_index('publication_date_1')['frac_intra']
    )
    mean_dist_data = (
        pd.read_csv(os.path.join(my_path_, "work_edges_dist_mean_monthly.csv"))
        .set_index('publication_date_1')['dist']
    )

    # Read every run once and reuse it for both figure families.
    simulations = [_load_simulation_b(s) for s in range(n_simulations)]

    plt.style.use("dark_background")
    for model_data in simulations:
        _plot_model_vs_data(
            model_data, 'fra_intra', frac_intra_data,
            'Frac intra-insts collabs - with exact params')
    for model_data in simulations:
        _plot_model_vs_data(
            model_data, 'mean_dist', mean_dist_data,
            'Avg team dist - with exact params')


def _load_simulation_b(s):
    """Read run ``s`` from ``simulation_<s>_b.csv`` and return it sorted by month."""
    my_file = "simulation_" + str(s) + "_b.csv"
    model_data = pd.read_csv(os.path.join(my_path_, my_file))
    model_data['month'] = pd.to_datetime(model_data['month'])
    # Chronological order keeps the x axis and both y series aligned.
    return model_data.sort_values('month').reset_index(drop=True)


def _plot_model_vs_data(model_data, model_column, data_series, title):
    """Draw one 'data' vs 'model' figure for a single simulation run."""
    months = list(model_data['month'])
    # Select the empirical values for exactly this run's months, so the
    # reference curve never depends on another run's (or loop's) state.
    month_keys = [month.strftime('%Y-%m-%d') for month in months]
    y_reference = list(data_series.loc[month_keys])
    y_model = list(model_data[model_column])

    fig, ax = plt.subplots(figsize=(15, 5))
    ax.plot(months, y_reference, "o-", markersize=3, label='data')
    ax.plot(months, y_model, "o-", markersize=3, label='model')
    ax.axvline(pd.Timestamp(2020, 3, 1), color='r')
    ax.grid(True, linewidth=0.5)
    ax.yaxis.set_major_formatter(formatter)
    ax.set_xlabel('month', size=20)
    ax.legend()
    ax.set_title(title, size=30)

In [None]:
# Draw the data-vs-model comparison figures from the saved simulation CSVs.
plot_simulation_b()