# Impact factor - IF

In [None]:
from pathlib import Path
from time import time
from tqdm.auto import tqdm
import pandas as pd 
import numpy as np
import os
import pickle
import networkx as nx
import random
tqdm.pandas()

import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
# Global plotting configuration shared by every cell below.
# NOTE(review): sns.set() runs after rcParams.update(), so it may override the
# font size set on the previous line - confirm the intended order.
plt.rcParams.update({'font.size': 22})
sns.set(style="ticks", context="talk")
plt.style.use("dark_background")
# Route DataFrame.plot() through plotly and pick the dark presentation template.
pd.options.plotting.backend = 'plotly'
pio.templates.default = 'plotly_dark+presentation'

import matplotlib.dates as dates
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
def form(x,pos):
    """Render a tick value as a short string with a magnitude suffix.

    Values below 1e2 get two decimals, values below 1e3 get three decimals,
    values below 1e6 are shown in thousands (``K``) and everything else in
    millions (``M``). ``pos`` is accepted for the matplotlib formatter
    signature and is not used.
    """
    # (exclusive upper bound, printf template, scale factor), smallest first.
    bands = (
        (1e2, '%1.2f', 1),
        (1e3, '%1.3f', 1),
        (1e6, '%1.1fK', 1e-3),
    )
    for upper, template, scale in bands:
        if x < upper:
            return template % (x * scale)
    # Anything that matched no band (including NaN) is reported in millions.
    return '%1.1fM' % (x * 1e-6)
# Wrap `form` in a FuncFormatter so the plotting helpers can install it as an axis tick formatter.
formatter = FuncFormatter(form)

def plot_(df_,x_column,y_column,x_label,title):
    """Plot one column of ``df_`` against another on a new dark-themed figure.

    Parameters
    ----------
    df_ : DataFrame holding both columns.
    x_column, y_column : names of the columns used for the x and y values.
    x_label : text of the x-axis label.
    title : figure title.

    The series is drawn as a cyan line with circle markers, a red vertical
    line marks 2020-03-01, and the y ticks use the module-level ``formatter``.
    Nothing is returned.
    """
    plt.style.use("dark_background")
    fig, ax = plt.subplots(figsize=(15, 5))

    xs = list(df_[x_column])
    ys = df_[y_column]
    ax.plot(xs, ys, "co-", markersize=6,label='dataset')

    # Reference marker at 1 March 2020.
    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')

    plt.grid(True, linewidth=0.5)
    ax.yaxis.set_major_formatter(formatter)
    ax.set_xlabel(x_label,size=20)
    ax.set_title(title,size=30)
    
def read_parquet(name, **args):
    """Load a parquet dataset with the fastparquet engine and report the timing.

    Parameters
    ----------
    name : str or path-like
        Location of the parquet data. Plain strings are accepted as well as
        ``Path`` objects.
    **args
        Extra keyword arguments forwarded unchanged to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The table exactly as read; no rows are dropped.
    """
    # Coerce once so ``.stem`` below works even when a plain string is passed.
    path = Path(name)
    print(f'Reading {name!r}')
    tic = time()
    df = pd.read_parquet(path, engine='fastparquet', **args)
    toc = time()

    # No de-duplication is performed here, so no duplicate count is reported
    # (the previous message always printed "0 duplicates").
    print(f'Read {len(df):,} rows from {path.stem!r} in {toc-tic:.2f} sec.')
    return df

In [None]:
# Input tables live in ./Tables_final; everything this notebook writes goes to ./IF.
basepath = Path('./Tables_final')
my_path_ = Path('./IF')
# Create the output folder on first run.
if not my_path_.exists():
    my_path_.mkdir(parents=True)

In [None]:
# Citation table: rows pairing a work_id with a referenced_work_id.
works_referenced_works = read_parquet(basepath / 'works_referenced_works')

# Collection of work ids to include even when they have no citations.
my_file = 'preprint_id_set.pickle'
with (basepath / my_file).open("rb") as fp:
    preprint_id_set = pickle.load(fp)

# Mapping work_id -> publication month, used to bucket works by month.
my_file = "works_month_dict.pickle"
with (basepath / my_file).open("rb") as fp:
    works_month_dict = pickle.load(fp)

In [None]:
# Boolean flags marking citation rows whose publication-date gap is at most 1 / at most 2
# (presumably measured in years - TODO confirm the unit of diff_publication_date_1).
works_referenced_works['Y1'] = works_referenced_works['diff_publication_date_1'].le(1)
works_referenced_works['Y2'] = works_referenced_works['diff_publication_date_1'].le(2)

In [None]:
def IF(works_referenced_works,preprint_id_set,works_month_dict,my_path_,Y):
    """Compute, save and plot the monthly mean citation count for window ``Y``.

    Parameters
    ----------
    works_referenced_works : DataFrame
        Citation table with columns ``work_id``, ``referenced_work_id`` and a
        flag column named ``'Y' + str(Y)``.
    preprint_id_set : set
        Work ids that must appear in the result; those without any flagged
        citation are added with a count of 0.
    works_month_dict : dict
        Maps a work id to its ``publication_date_1`` value.
    my_path_ : path-like
        Folder that receives ``cit_count_<Y>year.pickle``.
    Y : int
        Citation window; only 1 and 2 apply an upper date cutoff.

    Returns
    -------
    DataFrame
        One row per ``publication_date_1`` with columns ``mean``, ``std`` and
        ``sem`` of the per-work citation count.

    NOTE(review): the cutoffs here are inclusive ('2023-01-01' / '2022-01-01'),
    whereas IF_COVID uses exclusive cutoffs one year later - confirm which is
    intended.
    """
    count_col = 'C'+str(Y)

    # Keep only the citation rows flagged as falling inside the window.
    wrw = works_referenced_works[works_referenced_works['Y'+str(Y)]==1]
    wrw = wrw[['work_id','referenced_work_id']]

    # Citations received per referenced work, re-keyed as work_id.
    cit_count = wrw.groupby('referenced_work_id').work_id.count().to_frame()
    cit_count = cit_count.reset_index(drop=False).rename(
        columns={'work_id':count_col, 'referenced_work_id':'work_id'})

    #add works zero citations
    preprint_nocit = list(preprint_id_set - set(cit_count.work_id))
    zero_rows = pd.DataFrame({'work_id':preprint_nocit, count_col:[0]*len(preprint_nocit)})
    cit_count = pd.concat([cit_count, zero_rows])

    cit_count['publication_date_1'] = cit_count['work_id'].map(works_month_dict)
    cit_count = cit_count.set_index('publication_date_1')

    # Persist the unfiltered counts; the context manager closes the file
    # deterministically instead of leaking the handle.
    my_file = "cit_count_"+str(Y)+"year.pickle"
    with open(os.path.join(my_path_, my_file), 'wb') as fp:
        pickle.dump(cit_count, fp)

    cit_count = cit_count.reset_index()
    cit_count = cit_count[cit_count.publication_date_1>='2000']
    if Y==1:
        cit_count = cit_count[cit_count.publication_date_1<='2023-01-01']
    elif Y==2:
        cit_count = cit_count[cit_count.publication_date_1<='2022-01-01']

    cit_count_stat = cit_count.groupby('publication_date_1')[count_col].agg(['mean', 'std','sem']).reset_index()

    plt.style.use("dark_background")
    fig, ax = plt.subplots(figsize=(15, 5))
    x_data = list(cit_count_stat['publication_date_1'])
    y_mean = cit_count_stat['mean']
    ax.plot(x_data, y_mean, "o", color = 'orange',markersize=3)
    # Reference marker at 1 March 2020.
    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')
    plt.grid(True, linewidth=0.5)
    ax.set_xlabel('month',size=20)
    ax.set_title('Monthly Average Impact Factor ('+str(Y)+' years)',size=30)

    return cit_count_stat

In [None]:
# Monthly statistics for the 1-year citation window ...
cit_count_stat_1 = IF(works_referenced_works, preprint_id_set, works_month_dict, my_path_, Y=1)
# ... and for the 2-year citation window.
cit_count_stat_2 = IF(works_referenced_works, preprint_id_set, works_month_dict, my_path_, Y=2)

## COVID

In [None]:
# Work ids classified as COVID-related.
my_file = 'preprint_id_set_COVID'
with (basepath / my_file).open("rb") as fp:
    preprint_id_set_COVID = pickle.load(fp)

# Work ids classified as not COVID-related.
my_file = 'preprint_id_set_noCOVID'
with (basepath / my_file).open("rb") as fp:
    preprint_id_set_noCOVID = pickle.load(fp)

In [None]:
def IF_COVID(Y,preprint_id_set_COVID,preprint_id_set_noCOVID):
    """Compare monthly mean citation counts of COVID and non-COVID works.

    Reads ``cit_count_<Y>year.pickle`` from the module-level ``my_path_``
    folder, keeps works with ``publication_date_1 >= '2000'`` and draws two
    figures: non-COVID vs. overall monthly means (with a +/- sem band), and,
    on a log axis, the COVID monthly mean together with the number of COVID
    works per month from 2020-01-01 onwards.

    Parameters
    ----------
    Y : int
        Citation window. 1 and 2 apply an exclusive upper date cutoff
        ('2024-01-01' and '2023-01-01'); other values apply none.
    preprint_id_set_COVID, preprint_id_set_noCOVID : set
        Work ids belonging to each group.

    Returns
    -------
    (DataFrame, DataFrame)
        COVID statistics (from 2020-01-01, with an extra ``work_id_count``
        column) and non-COVID statistics, one row per month.
    """
    count_col = 'C'+str(Y)

    my_file = "cit_count_"+str(Y)+"year.pickle"
    with open(os.path.join(my_path_, my_file),"rb") as fp:
        cit_count = pickle.load(fp)
    cit_count = cit_count.reset_index()
    cit_count = cit_count[cit_count.publication_date_1>='2000']

    cit_count_COVID = cit_count[cit_count.work_id.isin(preprint_id_set_COVID)]
    cit_count_noCOVID = cit_count[cit_count.work_id.isin(preprint_id_set_noCOVID)]

    # Exclusive upper bound on the month, per citation window.
    cutoff = {1: '2024-01-01', 2: '2023-01-01'}.get(Y)

    def monthly_stats(frame):
        # mean / std / sem of the citation count per month, trimmed at the cutoff.
        stats = frame.groupby('publication_date_1')[count_col].agg(['mean', 'std','sem']).reset_index()
        if cutoff is not None:
            stats = stats[stats.publication_date_1<cutoff]
        return stats

    cit_count_stat = monthly_stats(cit_count)
    cit_count_stat_COVID = monthly_stats(cit_count_COVID)
    cit_count_stat_noCOVID = monthly_stats(cit_count_noCOVID)

    plt.style.use("dark_background")
    fig, ax = plt.subplots(figsize=(15, 5))
    # Each curve is drawn against its OWN months: the groups need not cover
    # the same set of months, so sharing one x vector can misalign the curves
    # or fail on a length mismatch.
    curves = (
        (cit_count_stat_noCOVID, 'green', 'non-COVID papers'),
        (cit_count_stat, 'orange', 'overall papers'),
    )
    for stats, colour, label in curves:
        x_data = list(stats['publication_date_1'])
        y_mean = stats['mean']
        y_sem = stats['sem']
        ax.plot(x_data, y_mean, "o-", color = colour,markersize=3,label=label)
        ax.fill_between(x_data, y_mean - y_sem, y_mean + y_sem, color = colour,alpha=0.4)
    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')
    plt.grid(True, linewidth=0.5)
    ax.set_xlabel('month',size=20)
    ax.legend()
    ax.set_title('Monthly Average Impact Factor ( '+str(Y)+' years)',size=30)

    # COVID works: restrict to 2020 onwards and attach the number of works per month.
    covid_start = '2020-01-01'
    cit_count_stat_COVID = cit_count_stat_COVID[cit_count_stat_COVID.publication_date_1>=covid_start]
    works_per_month = (
        cit_count_COVID[cit_count_COVID.publication_date_1>=covid_start]
        .groupby('publication_date_1').work_id.count()
        .to_frame().reset_index()
        .rename(columns={'work_id':'work_id_count'})
    )
    cit_count_stat_COVID = cit_count_stat_COVID.merge(works_per_month, on='publication_date_1')

    plt.style.use("dark_background")
    fig, ax = plt.subplots(figsize=(15, 5))
    x_data = list(cit_count_stat_COVID['publication_date_1'])
    y_mean = cit_count_stat_COVID['mean']
    y_sem = cit_count_stat_COVID['sem']
    ax.plot(x_data, y_mean, "o-", color = 'red',markersize=3,label='IF (log)')
    ax.fill_between(x_data, y_mean - y_sem, y_mean + y_sem, color = 'red',alpha=0.4)
    ax.plot(x_data, cit_count_stat_COVID['work_id_count'], "o-", color = 'blue',markersize=3,label='COVID papers')
    ax.set_yscale('log')
    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')
    plt.grid(True, linewidth=0.5)
    ax.set_xlabel('month',size=20)
    ax.legend()
    ax.set_title('Monthly Average Impact Factor ( '+str(Y)+' years) - COVID',size=30)

    return cit_count_stat_COVID,cit_count_stat_noCOVID

In [None]:
# COVID / non-COVID monthly statistics for the 1-year window.
cit_count_stat_COVID_1, cit_count_stat_noCOVID_1 = IF_COVID(
    Y=1,
    preprint_id_set_COVID=preprint_id_set_COVID,
    preprint_id_set_noCOVID=preprint_id_set_noCOVID,
)

In [None]:
# COVID / non-COVID monthly statistics for the 2-year window.
cit_count_stat_COVID_2, cit_count_stat_noCOVID_2 = IF_COVID(
    Y=2,
    preprint_id_set_COVID=preprint_id_set_COVID,
    preprint_id_set_noCOVID=preprint_id_set_noCOVID,
)

In [None]:
# Persist the COVID / non-COVID monthly statistics for both citation windows.
# Each file is written inside a context manager so its handle is closed
# (and the data flushed) as soon as the dump finishes.
for my_file, table in (
    ("cit_count_stat_COVID_1.pickle", cit_count_stat_COVID_1),
    ("cit_count_stat_COVID_2.pickle", cit_count_stat_COVID_2),
    ("cit_count_stat_noCOVID_1.pickle", cit_count_stat_noCOVID_1),
    ("cit_count_stat_noCOVID_2.pickle", cit_count_stat_noCOVID_2),
):
    with open(os.path.join(my_path_, my_file), 'wb') as fp:
        pickle.dump(table, fp)

In [None]:
from scipy import optimize
import numpy.polynomial.polynomial as npoly
def plot_fit_rolling_breakpoints(df,x_column,x_label,title,window_size,num_breakpoints,ff):
    """Plot a rolling mean of ``df[x_column]`` with a piecewise-linear fit.

    The breakpoint positions are found by exhaustive grid search
    (``optimize.brute`` with ``finish=None``) over every index combination,
    minimising the summed squared error of independent degree-1 fits on each
    segment.

    Parameters
    ----------
    df : DataFrame
        Must contain ``publication_date_1`` and the column ``x_column``.
    x_column : str
        Column whose rolling mean is plotted and fitted.
    x_label : str
        Label of the x axis.
    title : str
        Figure title.
    window_size : int
        Rolling-mean window; the first ``window_size - 1`` points are dropped.
    num_breakpoints : int
        Number of breakpoints to search for.
    ff : int
        Legend precision switch: 1 prints the slope with 2 decimals, any
        other value with 5 decimals.

    Nothing is returned; the figure is drawn and one summary line per fitted
    segment is printed.
    """

    def fit_segments(breakpoints, x, y):
        # Split x / y at the (sorted, integer) breakpoints and fit a line to
        # every segment holding at least two points; shorter ones are skipped.
        cuts = tuple(map(int, sorted(breakpoints)))
        segments = []
        for xi, yi in zip(np.split(x, cuts), np.split(y, cuts)):
            if len(xi) < 2:
                continue
            line = npoly.Polynomial(npoly.polyfit(xi, yi, 1))
            segments.append([line, xi, yi])
        return segments

    def total_squared_error(breakpoints, x, y, fcache):
        # Objective for the grid search. Results are memoised per breakpoint
        # set, since permutations of the same indices describe the same split.
        key = tuple(map(int, sorted(breakpoints)))
        if key not in fcache:
            total_error = 0
            for line, xi, yi in fit_segments(key, x, y):
                total_error += ((line(xi) - yi)**2).sum()
            fcache[key] = total_error
        return fcache[key]

    fig, ax = plt.subplots(figsize=(15, 5))
    x_dates = list(df['publication_date_1'])
    y_data = df[x_column].rolling(window=window_size).mean()[window_size-1:]
    x_data = x_dates[window_size-1:]

    ax.plot(x_data, y_data, "o", markersize=6, color='orange')

    plt.grid(True, linewidth=0.5)
    ax.yaxis.set_major_formatter(formatter)

    ax.set_xlabel(x_label,size=20)
    ax.set_title(title,size=30)

    # Tick labels as abbreviated month + two-digit year.
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %y'))

    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')

    # Fit on matplotlib date numbers; y is converted to a plain float array so
    # np.split receives an ndarray rather than a pandas Series.
    x_num = dates.date2num(x_data)
    y_num = np.asarray(y_data, dtype=float)
    breakpoints = optimize.brute(
        total_squared_error,
        [slice(1, len(x_data), 1)]*num_breakpoints,
        args=(x_num, y_num, {}),
        finish=None,
    )
    if num_breakpoints==1:
        # With a single search dimension brute returns a scalar.
        breakpoints = [breakpoints]

    for line, xi, _ in fit_segments(breakpoints, x_num, y_num):
        x_interval = np.array([xi.min(), xi.max()])
        coef = line.convert().coef
        b = coef[0]
        a = coef[1]
        if ff==1:
            template = 'y = {:.2f} x + {:.0f}' if b>0 else 'y = {:.2f} x - {:.0f}'
        else:
            template = 'y = {:.5f} x + {:.0f}' if b>0 else 'y = {:.5f} x - {:.0f}'
        ll = template.format(a, b if b>0 else abs(b))
        ax.plot(x_interval, line(x_interval), 'go-',label=ll,linewidth=2, markersize=7)

        # Report the segment end points, fitted values, coefficients and dates.
        first_idx = np.where(x_num==x_interval[0])[0][0]
        last_idx = np.where(x_num==x_interval[1])[0][0]
        print(x_interval, line(x_interval), (a,b), x_data[first_idx], x_data[last_idx])

    ax.legend()
    ax.set_title(title,size=30)
    plt.style.use("dark_background")

In [None]:
# Two-breakpoint piecewise-linear fit of the 1-year non-COVID monthly mean.
# window_size = 1 means the rolling mean leaves the series unsmoothed.
df = cit_count_stat_noCOVID_1
x_column, x_label = 'mean', 'IF'
title = 'Monthly Average Impact Factor ( '+str(1)+' years) - no COVID'
window_size, num_breakpoints, ff = 1, 2, 3
plot_fit_rolling_breakpoints(df, x_column, x_label, title, window_size, num_breakpoints, ff)

In [None]:
# Single-breakpoint piecewise-linear fit of the 1-year non-COVID monthly mean.
df = cit_count_stat_noCOVID_1
x_column, x_label = 'mean', 'IF'
title = 'Monthly Average Impact Factor ( '+str(1)+' years) - no COVID'
window_size, num_breakpoints, ff = 1, 1, 3
plot_fit_rolling_breakpoints(df, x_column, x_label, title, window_size, num_breakpoints, ff)

In [None]:
# Single-breakpoint piecewise-linear fit of the 1-year monthly mean over all works.
df = cit_count_stat_1
x_column, x_label = 'mean', 'IF'
title = 'Monthly Average Impact Factor ( '+str(1)+' years)'
window_size, num_breakpoints, ff = 1, 1, 3
plot_fit_rolling_breakpoints(df, x_column, x_label, title, window_size, num_breakpoints, ff)

## Categories

In [None]:
# Category -> work ids mapping (read from ./Tables, not from basepath).
my_file = "preprint_dict.pickle"
with open(Path('./Tables') / my_file, "rb") as fp:
    preprint_dict = pickle.load(fp)

# Flatten the mapping into one (category, work_id) row per pair.
rows = [
    (category, work_id)
    for category, work_ids in preprint_dict.items()
    for work_id in work_ids
]
preprint_df = pd.DataFrame(rows, columns=['tax_name', 'work_id'])

# Category names ordered from the most to the fewest works.
preprint_categories_list = list(
    preprint_df.groupby('tax_name').work_id.count()
    .to_frame()
    .sort_values(by='work_id',ascending=False)
    .index
)

In [None]:
def IF_cat(Y,preprint_df,preprint_categories_list):
    """Plot and collect the monthly mean citation count for every category.

    Reads ``cit_count_<Y>year.pickle`` from the module-level ``my_path_``
    folder, joins it with ``preprint_df`` on ``work_id``, keeps works with
    ``publication_date_1 >= '2000'`` and, per category, draws the monthly
    mean and saves the figure as a PNG under ``./IF/IF_categories``.

    Parameters
    ----------
    Y : int
        Citation window selecting both the pickle file and the ``C<Y>`` column.
    preprint_df : DataFrame
        Columns ``tax_name`` (category) and ``work_id``.
    preprint_categories_list : list
        Categories to process, in plotting order.

    Returns
    -------
    DataFrame
        Per-category monthly ``mean`` / ``std`` / ``sem`` stacked row-wise,
        with a ``category`` column; empty when no category is given.
    """
    count_col = 'C'+str(Y)

    my_file = "cit_count_"+str(Y)+"year.pickle"
    with open(os.path.join(my_path_, my_file),"rb") as fp:
        cit_count = pickle.load(fp)
    cit_count = cit_count.reset_index().merge(preprint_df,on='work_id')
    cit_count = cit_count[cit_count.publication_date_1>='2000']

    # Nothing else creates this folder, so make sure it exists before saving.
    out_dir = Path('./IF/IF_categories')
    os.makedirs(out_dir, exist_ok=True)

    per_category = []
    for k in preprint_categories_list:
        cit_count_k = cit_count[cit_count.tax_name==k]
        cit_count_stat = cit_count_k.groupby('publication_date_1')[count_col].agg(['mean', 'std','sem']).reset_index()

        plt.style.use("dark_background")
        fig, ax = plt.subplots(figsize=(15, 5))
        x_data = list(cit_count_stat['publication_date_1'])
        y_mean = cit_count_stat['mean']
        ax.plot(x_data, y_mean, "o", color = 'green',markersize=5)
        ax.axvline(pd.Timestamp(2020, 3, 1),color='r')
        plt.grid(True, linewidth=0.5)
        ax.yaxis.set_major_formatter(formatter)

        ax.set_xlabel('month',size=20)
        title = 'Monthly Average Impact Factor ( '+str(Y)+' years) - '+k
        ax.set_title(title,size=30)
        # NOTE(review): the category name goes into the file name verbatim;
        # a name containing a path separator would break this - confirm.
        plt.savefig(os.path.join(out_dir, title+'.png'), bbox_inches='tight', pad_inches=0.02)

        cit_count_stat.insert(1, 'category', k)
        per_category.append(cit_count_stat)

    # Concatenate once at the end instead of growing a frame inside the loop.
    if not per_category:
        return pd.DataFrame()
    return pd.concat(per_category)

In [None]:
# Per-category monthly statistics for the 1-year window.
cit_count_stat_cat_1 = IF_cat(
    Y=1,
    preprint_df=preprint_df,
    preprint_categories_list=preprint_categories_list,
)

In [None]:
# Per-category monthly statistics for the 2-year window.
cit_count_stat_cat_2 = IF_cat(
    Y=2,
    preprint_df=preprint_df,
    preprint_categories_list=preprint_categories_list,
)

In [None]:
# Persist the 1-year per-category statistics; the context manager closes the
# file handle as soon as the dump finishes.
my_file = "cit_count_stat_cat_1.pickle"
with open(os.path.join(my_path_, my_file), 'wb') as fp:
    pickle.dump(cit_count_stat_cat_1, fp)