# Relations metrics

* Average Team Distance (ATD)
* Impact Factor (IF)

In [None]:
from pathlib import Path
from time import time
from tqdm.auto import tqdm
import pandas as pd 
import numpy as np
import os
import pickle
import networkx as nx
import random
from scipy import stats
tqdm.pandas()

import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
# Global plotting defaults shared by every cell below.
plt.rcParams.update({'font.size': 22})  # matplotlib base font size
# NOTE(review): sns.set(...) applies its own rc settings after the update above and
# may override the font size — confirm the intended order.
sns.set(style="ticks", context="talk")
plt.style.use("dark_background")  # dark matplotlib theme
pd.options.plotting.backend = 'plotly'  # pandas .plot() calls are routed to plotly
pio.templates.default = 'plotly_dark+presentation'  # default template for plotly figures

import matplotlib.dates as dates
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
def form(x, pos):
    """Format a tick value as a short string.

    Values below 1e2 get two decimals, values below 1e3 get three decimals,
    values below 1e6 are shown in thousands with a ``K`` suffix and anything
    else in millions with an ``M`` suffix.

    ``pos`` is accepted because matplotlib's ``FuncFormatter`` passes it; it
    is not used.
    """
    # (exclusive upper bound, format template, scale applied before formatting)
    tiers = (
        (1e2, '%1.2f', 1),
        (1e3, '%1.3f', 1),
        (1e6, '%1.1fK', 1e-3),
    )
    for upper_bound, template, scale in tiers:
        if x < upper_bound:
            return template % (x * scale)
    return '%1.1fM' % (x * 1e-6)
# Tick formatter built from form(); plot_ installs it on the y axis.
formatter = FuncFormatter(form)

def plot_(df_,x_column,y_column,x_label,title):
    """Draw ``y_column`` against ``x_column`` of ``df_`` as a cyan line with markers.

    A red vertical line is drawn at 2020-03-01, the y axis uses the
    module-level ``formatter``, and ``x_label`` / ``title`` label the figure.
    Nothing is returned and the figure is not shown explicitly.
    """
    plt.style.use("dark_background")
    figure, axes = plt.subplots(figsize=(15, 5))

    axes.plot(list(df_[x_column]), df_[y_column], "co-", markersize=6, label='dataset')
    # Reference marker on the time axis.
    axes.axvline(pd.Timestamp(2020, 3, 1), color='r')

    plt.grid(True, linewidth=0.5)
    axes.yaxis.set_major_formatter(formatter)

    axes.set_xlabel(x_label, size=20)
    axes.set_title(title, size=30)

def read_parquet(name, **args):
    """Read a parquet file with the fastparquet engine and report timing.

    Parameters
    ----------
    name : str or path-like
        Location of the parquet file. Plain strings are accepted as well as
        ``Path`` objects.
    **args
        Extra keyword arguments forwarded to ``pandas.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The loaded table, unmodified (no de-duplication is performed).
    """
    print(f'Reading {name!r}')
    tic = time()
    df = pd.read_parquet(name, engine='fastparquet', **args)
    toc = time()  # stop the clock right after the read so only I/O is timed

    # Path() is used only to derive a short display name; this also makes the
    # summary line work when `name` is a plain string.
    stem = Path(name).stem
    # The earlier message reported a duplicate count that was always 0 because
    # nothing was de-duplicated; the misleading figure is no longer printed.
    print(f'Read {len(df):,} rows from {stem!r} in {toc-tic:.2f} sec.')
    return df

In [None]:
# Directory for this notebook (presumably where its outputs are saved — it is
# not used elsewhere in the visible cells).
# NOTE(review): 'Raleations' looks like a misspelling of 'Relations'; the name
# is left unchanged because other code may rely on this exact path.
my_path_ = Path('./Raleations_metrics')
# Create in one step instead of check-then-create; a no-op if it already exists.
my_path_.mkdir(parents=True, exist_ok=True)

In [None]:
def _binned_mean_sem(x, y, edges):
    """Summarise ``y`` inside consecutive bins of ``x``.

    Bins are half-open ``[edges[i], edges[i + 1])`` except the last one, which
    takes every value at or above its lower edge so the maximum of ``x`` is
    not dropped. Empty bins are skipped.

    Returns three arrays with one entry per non-empty bin: mean of ``x``,
    mean of ``y`` and standard error of the mean of ``y`` (NaN when the bin
    holds a single value).
    """
    x_means, y_means, y_sems = [], [], []
    n_bins = len(edges) - 1
    for i in range(n_bins):
        in_bin = x >= edges[i]
        if i < n_bins - 1:
            in_bin &= x < edges[i + 1]
        if not in_bin.any():
            continue
        y_bin = y[in_bin]
        x_means.append(x[in_bin].mean())
        y_means.append(y_bin.mean())
        # sem needs at least two observations; report NaN otherwise.
        y_sems.append(stats.sem(y_bin) if y_bin.size > 1 else np.nan)
    return np.array(x_means), np.array(y_means), np.array(y_sems)


def mean_scatter_plot_multi(df,columnx,columny,labelx,labely,logx=False,logy=False,limitxl=False,limitxu=False,limityl=False,limityu=False,geomspace=False):
    """Plot the binned mean of ``columny`` against ``columnx``, one series per period.

    For every period id in the module-level ``periods_list`` the rows of
    ``df`` with that ``period`` are split into bins along ``columnx``; the
    per-bin mean of ``columny`` is drawn with its standard error as error
    bars. Colours, markers and legend labels come from the module-level
    ``color_dict``, ``marker_dict`` and ``periods_labels``.

    Before plotting, the mean and standard error of ``columny`` over the rows
    where ``columnx == 0`` are printed for each period.

    Parameters
    ----------
    df : DataFrame with columns ``columnx``, ``columny`` and ``period``.
    columnx, columny : names of the x and y columns.
    labelx, labely : axis labels.
    logx, logy : use a logarithmic scale on the respective axis.
    limitxl, limitxu / limityl, limityu : axis limits. They are applied only
        when the upper limit is given (anything other than ``False``); the
        lower limit is passed through unchanged (a lower limit left at
        ``False`` is presumably read as 0 by matplotlib — confirm).
    geomspace : use geometrically spaced bin edges and drop rows with
        ``columnx <= 0``; otherwise edges are linearly spaced.
    """
    # Rows with x == 0 are summarised separately (they are removed from the
    # plot when geomspace is requested).
    zero_rows = df[df[columnx] == 0]
    for y in periods_list:
        zero_y = zero_rows.loc[zero_rows.period == y, columny]
        print(f'{periods_labels[y]}: {np.mean(zero_y):.2f} ({stats.sem(zero_y):.2f})')

    if geomspace:
        df = df[df[columnx] > 0]

    _, ax = plt.subplots()

    gridsize = 15  # number of bin edges, i.e. gridsize - 1 bins
    make_edges = np.geomspace if geomspace else np.linspace
    for y in periods_list:
        period_rows = df[df.period == y]
        if period_rows.empty:
            # Nothing to bin for this period; min()/max() would fail below.
            continue

        x1 = np.array(period_rows[columnx])
        y1 = np.array(period_rows[columny])
        xbins1 = make_edges(x1.min(), x1.max(), gridsize)

        xaxis_values1, mean_values1, sem_values1 = _binned_mean_sem(x1, y1, xbins1)

        ax.scatter(xaxis_values1, mean_values1, color=color_dict[y], marker=marker_dict[y], s=5, label=periods_labels[y])
        ax.errorbar(xaxis_values1, mean_values1, yerr=sem_values1, color=color_dict[y], marker=marker_dict[y], xerr=None, ls='none')

    ax.tick_params(axis='both', which='major', labelsize=15)
    ax.set_xlabel(labelx, size=20)
    ax.set_ylabel(labely, size=20)
    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')
    # Identity check so that an explicit upper limit of 0 is honoured
    # (0 == False in Python, so an equality test would ignore it).
    if limitxu is not False:
        ax.set_xlim([limitxl, limitxu])
    if limityu is not False:
        ax.set_ylim([limityl, limityu])
    ax.legend(bbox_to_anchor=(1.6, 0.9), prop={'size': 15}, markerscale=5)
    plt.show()

In [None]:
# Load the pickled citation-count table and expose its 'C1' column as 'IF'.
my_file = "cit_count_1year.pickle"
with open(os.path.join('./IF', my_file), "rb") as fp:
    IF_df = pickle.load(fp)
IF_df = IF_df.reset_index()
IF_df = IF_df.sort_values(by=['publication_date_1', 'work_id'])
IF_df = IF_df.rename(columns={'C1': 'IF'}).reset_index(drop=True)
# Keep publication dates from '2000' through '2023-12-01', both ends included.
IF_df = IF_df.loc[
    lambda d: (d.publication_date_1 >= '2000') & (d.publication_date_1 <= '2023-12-01')
]

In [None]:
# Load the team-distance table and expose its 'dist' column as 'ATD'.
my_file = "work_edges_dist_mean.csv"
ATD_df = (
    pd.read_csv(os.path.join('./TeamDistance', my_file))
    .sort_values(by=['publication_date_1', 'work_id'])
    .rename(columns={'dist': 'ATD'})
    .reset_index(drop=True)
)
# Dates are parsed after sorting, exactly as before.
ATD_df['publication_date_1'] = pd.to_datetime(ATD_df['publication_date_1'])
# Keep publication dates from '2000' through '2023-12-01', both ends included.
ATD_df = ATD_df.loc[
    lambda d: (d.publication_date_1 >= '2000') & (d.publication_date_1 <= '2023-12-01')
]

In [None]:
# Keep only the works that appear in both metric tables.
works_set = set(IF_df.work_id).intersection(ATD_df.work_id)
IF_df, ATD_df = (
    frame[frame.work_id.isin(works_set)] for frame in (IF_df, ATD_df)
)

In [None]:
# Inclusive (first_year, last_year) bounds of each analysis period.
periods_list_ = [
    (2000, 2009),
    (2010, 2014),
    (2015, 2019),
    (2020, 2020),
    (2021, 2023),
]
# Integer id of each period: its position in periods_list_.
periods_list = list(range(len(periods_list_)))
# Legend label per period id, e.g. '[2000,2009]'.
periods_labels = [f'[{first},{last}]' for first, last in periods_list_]
# Plot colour per period id.
color_dict = dict(zip(periods_list, ('yellow', 'magenta', 'orange', 'red', 'cyan')))
# Plot marker per period id (alternative distinct markers: ".", "p", "d", "^", "*").
marker_dict = {period_id: "." for period_id in periods_list}

In [None]:
def relations_metrics(df1,df2,metric1,metric2,limitxl=False,limitxu=False,limityl=False,limityu=False,logx=False,logy=False,geomspace=False):
    """Join two metric tables on ``work_id`` and plot ``metric2`` against ``metric1``.

    ``df1`` loses its ``publication_date_1`` column before the merge, so the
    dates come from ``df2``. Rows are indexed by publication year, cut to the
    span covered by the module-level ``periods_list_``, tagged with the id of
    the period their year falls in, and rows containing any missing value are
    dropped. The result is handed to ``mean_scatter_plot_multi`` with the
    metric names used as axis labels; the remaining arguments are forwarded
    unchanged.
    """
    merged = df1.drop(columns=['publication_date_1']).merge(df2, on='work_id')
    merged = merged.sort_values(by='publication_date_1')
    merged['publication_year'] = merged['publication_date_1'].dt.year
    merged = merged.set_index('publication_year')

    first_year = periods_list_[0][0]
    last_year = periods_list_[-1][1]
    merged = merged.loc[first_year:last_year]

    # Label-based slices on the year index are inclusive at both ends.
    for period_id, (start_year, end_year) in enumerate(periods_list_):
        merged.loc[start_year:end_year, 'period'] = period_id
    merged = merged.dropna()

    mean_scatter_plot_multi(
        merged, metric1, metric2, metric1, metric2,
        logx=logx, logy=logy,
        limitxl=limitxl, limitxu=limitxu,
        limityl=limityl, limityu=limityu,
        geomspace=geomspace,
    )

In [None]:
# ATD vs IF on linear axes: first with automatic limits, then with y limited to [1, 9].
relations_metrics(ATD_df, IF_df, 'ATD', 'IF')
relations_metrics(ATD_df, IF_df, 'ATD', 'IF', limityl=1, limityu=9)

In [None]:
# Same plots with a logarithmic x axis and geometrically spaced bins;
# the second call also limits y to [2, 8].
relations_metrics(ATD_df, IF_df, 'ATD', 'IF', logx=True, geomspace=True)
relations_metrics(ATD_df, IF_df, 'ATD', 'IF', limityl=2, limityu=8, logx=True, geomspace=True)

## COVID

In [None]:
# Remove every work whose id is in the pickled COVID id collection
# (presumably COVID-related preprints, going by the file name — confirm).
basepath = Path('./Tables_final')
my_file = 'preprint_id_set_COVID'
with (basepath / my_file).open("rb") as fp:
    preprint_id_set_COVID = pickle.load(fp)
works_set_noCOVID = works_set.difference(preprint_id_set_COVID)

IF_df = IF_df.loc[IF_df.work_id.isin(works_set_noCOVID)]
ATD_df = ATD_df.loc[ATD_df.work_id.isin(works_set_noCOVID)]
# NOTE(review): II_df and size_df are not defined anywhere in the visible
# notebook; these two lines raise NameError unless the tables were loaded
# elsewhere — confirm, or drop them.
II_df = II_df.loc[II_df.work_id.isin(works_set_noCOVID)]
size_df = size_df.loc[size_df.work_id.isin(works_set_noCOVID)]

In [None]:
# ATD vs IF after the COVID filter above, with y limited to [2, 7].
relations_metrics(ATD_df, IF_df, 'ATD', 'IF', limityl=2, limityu=7)