# Impact factor - IF

In [None]:
from pathlib import Path
from time import time
from tqdm.auto import tqdm
import pandas as pd 
import numpy as np
import os
import pickle
import networkx as nx
import random
tqdm.pandas()

import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
plt.rcParams.update({'font.size': 22})
sns.set(style="ticks", context="talk")
plt.style.use("dark_background")
pd.options.plotting.backend = 'plotly'
pio.templates.default = 'plotly_dark+presentation'

import matplotlib.dates as dates
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
def form(x,pos):
    """Render a tick value as a short text label.

    Values below 100 are shown with two decimals, values below 1000 with
    three decimals, values below one million in thousands with a ``K``
    suffix, and everything else in millions with an ``M`` suffix.

    ``pos`` is accepted because tick-formatter callbacks are invoked as
    ``(value, position)``; it is not used.
    """
    # (exclusive upper bound, format template, scale) tried in order;
    # the first band whose bound exceeds x wins.
    bands = (
        (1e2, '%1.2f', 1),
        (1e3, '%1.3f', 1),
        (1e6, '%1.1fK', 1e-3),
    )
    for upper, template, scale in bands:
        if x < upper:
            return template % (x * scale)
    # Nothing matched (including values that compare False everywhere).
    return '%1.1fM' % (x * 1e-6)
# Module-level tick formatter wrapping form(); plot_ installs it on the y axis.
formatter = FuncFormatter(form)

def plot_(df_,x_column,y_column,x_label,title):
    """Plot ``df_[y_column]`` against ``df_[x_column]`` on a new 15x5 figure.

    The series is drawn as cyan circles joined by a line, a red vertical
    marker is placed at 2020-03-01, the y axis uses the module-level
    ``formatter`` and a thin grid is switched on.  Returns nothing.

    Parameters
    ----------
    df_ : DataFrame-like supporting ``df_[column]`` lookups.
    x_column, y_column : column keys for the x and y values.
    x_label : text for the x-axis label.
    title : text for the figure title.
    """
    plt.style.use("dark_background")
    _, axis = plt.subplots(figsize=(15, 5))

    xs = list(df_[x_column])
    ys = df_[y_column]
    axis.plot(xs, ys, "co-", markersize=6, label='dataset')

    # Fixed reference date drawn on every plot.
    axis.axvline(pd.Timestamp(2020, 3, 1), color='r')

    plt.grid(True, linewidth=0.5)
    axis.yaxis.set_major_formatter(formatter)

    axis.set_xlabel(x_label, size=20)
    axis.set_title(title, size=30)
    
def read_parquet(name, **args):
    """Load a parquet file with the fastparquet engine and print a timing summary.

    Parameters
    ----------
    name : str or os.PathLike
        Location of the parquet data; passed to ``pd.read_parquet`` unchanged.
    **args
        Extra keyword arguments forwarded to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The table that was read.
    """
    print(f'Reading {name!r}')
    tic = time()
    df = pd.read_parquet(name, engine='fastparquet', **args)
    toc = time()

    # Only the summary line needs a Path.  Coercing here lets plain-string
    # names work too; ``name.stem`` used to raise AttributeError for a str,
    # and only after the (potentially slow) read had finished.
    stem = Path(name).stem

    # No de-duplication happens in this function, so the reported duplicate
    # count is always 0.  The field is kept so the log format is unchanged.
    before = after = len(df)

    print(f'Read {len(df):,} rows from {stem!r} in {toc-tic:.2f} sec. {before-after:,} duplicates.')
    return df

In [None]:
# Input tables and pickles are read from ``basepath``; everything this
# notebook produces is written under ``my_path_``.
basepath = Path('Tables_final')
my_path_ = Path('IF')
# exist_ok makes the creation idempotent without a separate existence check,
# which could race with another process creating the same directory.
my_path_.mkdir(parents=True, exist_ok=True)

In [None]:
# Citation table: later cells read its work_id, referenced_work_id and
# diff_publication_date_1 columns.
works_referenced_works = read_parquet(basepath / 'works_referenced_works')

# Pickled collection of work ids (used with set difference further down).
my_file = 'preprint_id_set.pickle'
with (basepath / my_file).open("rb") as fp:
    preprint_id_set = pickle.load(fp)

# Pickled lookup used to map a work id to its publication_date_1 value.
my_file = "works_month_dict.pickle"
with (basepath / my_file).open("rb") as fp:
    works_month_dict = pickle.load(fp)

In [None]:
# Add boolean flag columns: Y1 / Y2 are True where diff_publication_date_1
# is at most 1 / 2.
# NOTE(review): presumably diff_publication_date_1 is the gap in years between
# the citing and the cited work - confirm against the table definition.
_pub_gap = works_referenced_works['diff_publication_date_1']
works_referenced_works['Y1'] = _pub_gap.le(1)
works_referenced_works['Y2'] = _pub_gap.le(2)

In [None]:
def IF(works_referenced_works,preprint_id_set,works_month_dict,my_path_,Y):
    """Count citations flagged for window ``Y`` per work, save them, and plot the monthly mean.

    Steps: keep the rows of ``works_referenced_works`` whose ``'Y' + str(Y)``
    flag is set, count rows per ``referenced_work_id``, add a zero count for
    every id of ``preprint_id_set`` that was never referenced, attach the
    publication month, pickle the per-work table, then aggregate per month
    and draw the monthly mean.

    Parameters
    ----------
    works_referenced_works : pandas.DataFrame
        Must provide ``work_id``, ``referenced_work_id`` and the flag column
        ``'Y' + str(Y)``.
    preprint_id_set : set
        Work ids that must be present in the result (zero-filled if uncited).
    works_month_dict : mapping
        Work id -> value stored in ``publication_date_1``.
    my_path_ : str or os.PathLike
        Directory that receives ``cit_count_<Y>year.pickle``.
    Y : int
        Citation window.  An upper date cutoff is applied only for 1 and 2.

    Returns
    -------
    pandas.DataFrame
        One row per ``publication_date_1`` with ``mean``, ``std`` and ``sem``
        of the per-work counts.

    Side effects
    ------------
    Writes the per-work table (indexed by ``publication_date_1``) to disk and
    creates a matplotlib figure.
    """
    flag_col = 'Y' + str(Y)
    count_col = 'C' + str(Y)

    wrw = works_referenced_works[works_referenced_works[flag_col]==1]
    wrw = wrw[['work_id','referenced_work_id']]

    # Citations received per referenced work; the cited id becomes ``work_id``.
    # NOTE(review): every referenced_work_id is kept, not only ids contained in
    # preprint_id_set - confirm the input table is already restricted.
    cit_count = (
        wrw.groupby('referenced_work_id').work_id.count().to_frame()
        .reset_index(drop=False)
        .rename(columns={'work_id': count_col, 'referenced_work_id': 'work_id'})
    )

    #add works zero citations
    preprint_nocit = list(preprint_id_set - set(cit_count.work_id))
    if preprint_nocit:
        # Skipped when empty so an all-empty frame cannot disturb the count dtype.
        zero_rows = pd.DataFrame.from_dict(
            {'work_id': preprint_nocit, count_col: [0]*len(preprint_nocit)}
        )
        cit_count = pd.concat([cit_count, zero_rows])

    cit_count['publication_date_1'] = cit_count['work_id'].map(works_month_dict)
    cit_count = cit_count.set_index('publication_date_1')

    my_file = "cit_count_"+str(Y)+"year.pickle"
    # Context manager guarantees the handle is flushed and closed.
    with open(os.path.join(my_path_, my_file), 'wb') as fp:
        pickle.dump(cit_count, fp)

    cit_count = cit_count.reset_index()
    cit_count = cit_count[cit_count.publication_date_1>='2000']
    # Latest month kept per window (inclusive).
    # NOTE(review): IF_COVID uses different, exclusive cutoffs
    # (< 2024-01-01 / < 2023-01-01) - confirm which is intended.
    cutoff = {1: '2023-01-01', 2: '2022-01-01'}.get(Y)
    if cutoff is not None:
        cit_count = cit_count[cit_count.publication_date_1<=cutoff]

    cit_count_stat = cit_count.groupby('publication_date_1')[count_col].agg(['mean', 'std','sem']).reset_index()

    plt.style.use("dark_background")
    fig, ax = plt.subplots(figsize=(15, 5))
    x_data = list(cit_count_stat['publication_date_1'])
    y_mean = cit_count_stat['mean']
    # Only the mean is drawn; the SEM column is returned but not plotted.
    ax.plot(x_data, y_mean, "o", color = 'orange',markersize=3)
    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')
    plt.grid(True, linewidth=0.5)
    ax.set_xlabel('month',size=20)
    ax.set_title('Monthly Average Impact Factor ('+str(Y)+' years)',size=30)

    return cit_count_stat

In [None]:
# Run the analysis once per citation window (1 year, then 2 years).
cit_count_stat_1 = IF(
    works_referenced_works, preprint_id_set, works_month_dict, my_path_, Y=1,
)
cit_count_stat_2 = IF(
    works_referenced_works, preprint_id_set, works_month_dict, my_path_, Y=2,
)

## COVID

In [None]:
# Load the two pickled id collections used to split works into COVID and
# non-COVID groups.
# NOTE(review): unlike the other pickles read from basepath, these file names
# carry no ".pickle" suffix - confirm that matches the files on disk.
my_file = 'preprint_id_set_COVID'
with (basepath / my_file).open("rb") as fp:
    preprint_id_set_COVID = pickle.load(fp)

my_file = 'preprint_id_set_noCOVID'
with (basepath / my_file).open("rb") as fp:
    preprint_id_set_noCOVID = pickle.load(fp)

In [None]:
def _monthly_stats(frame, column, cutoff=None):
    # Per-month mean/std/sem of ``column``; keeps only months strictly before ``cutoff`` when given.
    stats = frame.groupby('publication_date_1')[column].agg(['mean', 'std','sem']).reset_index()
    if cutoff is not None:
        stats = stats[stats.publication_date_1<cutoff]
    return stats


def IF_COVID(Y,preprint_id_set_COVID,preprint_id_set_noCOVID):
    """Split the saved per-work citation counts into COVID / non-COVID and plot monthly means.

    Reads ``cit_count_<Y>year.pickle`` from the module-level ``my_path_``
    directory, keeps rows with ``publication_date_1 >= '2000'``, and computes
    per-month ``mean``/``std``/``sem`` of column ``'C' + str(Y)`` for all
    works, for works in ``preprint_id_set_COVID`` and for works in
    ``preprint_id_set_noCOVID``.  Two figures are drawn: non-COVID versus
    overall, and (from 2020-01-01 on, log scale) the COVID mean together with
    the monthly number of COVID works.

    Parameters
    ----------
    Y : int
        Citation window; selects the pickle and the count column.  Months are
        cut at ``< '2024-01-01'`` for 1 and ``< '2023-01-01'`` for 2; other
        values get no upper cutoff.
    preprint_id_set_COVID, preprint_id_set_noCOVID : set
        Work ids belonging to each group.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(cit_count_stat_COVID, cit_count_stat_noCOVID)``.  The COVID frame is
        restricted to months from 2020-01-01 and carries an extra
        ``work_id_count`` column.
    """
    count_col = 'C'+str(Y)

    my_file = "cit_count_"+str(Y)+"year.pickle"
    with open(os.path.join(my_path_, my_file),"rb") as fp:
        cit_count = pickle.load(fp)
    cit_count = cit_count.reset_index()
    cit_count = cit_count[cit_count.publication_date_1>='2000']

    cit_count_COVID = cit_count[cit_count.work_id.isin(preprint_id_set_COVID)]
    cit_count_noCOVID = cit_count[cit_count.work_id.isin(preprint_id_set_noCOVID)]

    # Exclusive upper bound on the month, per window.
    cutoff = {1: '2024-01-01', 2: '2023-01-01'}.get(Y)
    cit_count_stat = _monthly_stats(cit_count, count_col, cutoff)
    cit_count_stat_COVID = _monthly_stats(cit_count_COVID, count_col, cutoff)
    cit_count_stat_noCOVID = _monthly_stats(cit_count_noCOVID, count_col, cutoff)

    # Figure 1: non-COVID versus overall.
    plt.style.use("dark_background")
    fig, ax = plt.subplots(figsize=(15, 5))
    # Each curve is plotted against the months of its OWN frame.  Using the
    # overall months for the non-COVID curve misaligns or fails whenever the
    # two frames do not cover exactly the same months.
    x_noCOVID = list(cit_count_stat_noCOVID['publication_date_1'])
    y_mean = cit_count_stat_noCOVID['mean']
    y_sem = cit_count_stat_noCOVID['sem']
    ax.plot(x_noCOVID, y_mean, "o-", color = 'green',markersize=3,label='non-COVID papers')
    ax.fill_between(x_noCOVID, y_mean - y_sem, y_mean + y_sem, color = 'green',alpha=0.4)
    x_data = list(cit_count_stat['publication_date_1'])
    y_mean2 = cit_count_stat['mean']
    y_sem2 = cit_count_stat['sem']
    ax.plot(x_data, y_mean2, "o-", color = 'orange',markersize=3,label='overall papers')
    ax.fill_between(x_data, y_mean2 - y_sem2, y_mean2 + y_sem2, color = 'orange',alpha=0.4)
    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')
    plt.grid(True, linewidth=0.5)
    ax.set_xlabel('month',size=20)
    ax.legend()
    ax.set_title('Monthly Average Impact Factor ( '+str(Y)+' years)',size=30)

    # Monthly number of COVID works from 2020-01-01, joined onto the stats.
    covid_recent = cit_count_COVID[cit_count_COVID.publication_date_1>='2020-01-01']
    covid_monthly_n = (
        covid_recent.groupby('publication_date_1').work_id.count()
        .to_frame().reset_index().rename(columns={'work_id':'work_id_count'})
    )
    cit_count_stat_COVID = cit_count_stat_COVID[cit_count_stat_COVID.publication_date_1>='2020-01-01']
    cit_count_stat_COVID = cit_count_stat_COVID.merge(covid_monthly_n, on='publication_date_1')

    # Figure 2: COVID mean and COVID work count on a shared log axis.
    plt.style.use("dark_background")
    fig, ax = plt.subplots(figsize=(15, 5))
    x_data = list(cit_count_stat_COVID['publication_date_1'])
    y_mean = cit_count_stat_COVID['mean']
    y_sem = cit_count_stat_COVID['sem']
    ax.plot(x_data, y_mean, "o-", color = 'red',markersize=3,label='IF (log)')
    ax.fill_between(x_data, y_mean - y_sem, y_mean + y_sem, color = 'red',alpha=0.4)
    y_mean2 = cit_count_stat_COVID['work_id_count']
    ax.plot(x_data, y_mean2, "o-", color = 'blue',markersize=3,label='COVID papers')
    ax.set_yscale('log')
    ax.axvline(pd.Timestamp(2020, 3, 1),color='r')
    plt.grid(True, linewidth=0.5)
    ax.set_xlabel('month',size=20)
    ax.legend()
    ax.set_title('Monthly Average Impact Factor ( '+str(Y)+' years) - COVID',size=30)

    return cit_count_stat_COVID,cit_count_stat_noCOVID

In [None]:
# 1-year window: COVID / non-COVID monthly statistics.
cit_count_stat_COVID_1, cit_count_stat_noCOVID_1 = IF_COVID(
    Y=1,
    preprint_id_set_COVID=preprint_id_set_COVID,
    preprint_id_set_noCOVID=preprint_id_set_noCOVID,
)

In [None]:
# 2-year window: COVID / non-COVID monthly statistics.
cit_count_stat_COVID_2, cit_count_stat_noCOVID_2 = IF_COVID(
    Y=2,
    preprint_id_set_COVID=preprint_id_set_COVID,
    preprint_id_set_noCOVID=preprint_id_set_noCOVID,
)

In [None]:
# Persist the four monthly-statistics tables under my_path_.  Each file is
# written inside a ``with`` block so its handle is flushed and closed, rather
# than being left open as with a bare ``open(...)`` passed to pickle.dump.
for my_file, stats in (
    ("cit_count_stat_COVID_1.pickle", cit_count_stat_COVID_1),
    ("cit_count_stat_COVID_2.pickle", cit_count_stat_COVID_2),
    ("cit_count_stat_noCOVID_1.pickle", cit_count_stat_noCOVID_1),
    ("cit_count_stat_noCOVID_2.pickle", cit_count_stat_noCOVID_2),
):
    with open(os.path.join(my_path_, my_file), 'wb') as fp:
        pickle.dump(stats, fp)