In [2]:
#heterogeneous quality + heterogeneous rhetorical value full model

import random
import numpy as np
# One fixed seed drives both random number generators used by this script
# (Python's `random` module and NumPy's global generator).
# The same seed is used across the six models so that results are exactly
# reproducible and observed differences between models are due to the proposed
# mechanisms, not to random variation between comparison experiments.
# For average effects / error bands, average across a large number of
# fixed-seed runs.
SEED = 100000
random.seed(SEED)
np.random.seed(SEED)

import scipy.stats as stats
from scipy.stats import pearsonr
from scipy.stats import norm

#helper function for calculating Gini coefficient
def gini_coefficient(x):
    """Compute the Gini coefficient of a one-dimensional collection of values.

    Uses the pairwise-difference form
    ``sum_{i<j} |x_i - x_j| / (n**2 * mean(x))``.

    Args:
        x: Array-like of numbers (NumPy array, list, tuple, ...).

    Returns:
        The Gini coefficient. ``0.0`` when every value is zero (there is
        nothing to distribute unequally) and ``nan`` for empty input, in both
        cases without triggering a division warning.
    """
    # Accept plain Python sequences as well as arrays: the slicing arithmetic
    # below needs NumPy semantics.
    values = np.asarray(x, dtype=float)
    n = len(values)
    if n == 0:
        return float("nan")
    if not np.any(values):
        # All zeros: the mean is 0, so the formula would evaluate 0 / 0.
        return 0.0

    diffsum = 0.0
    # Each unordered pair (i, j) with i < j is visited exactly once.
    for i, xi in enumerate(values[:-1], 1):
        diffsum += np.sum(np.abs(xi - values[i:]))
    return diffsum / (n**2 * np.mean(values))

#helper function for getting indices of the top N values of a list (which to read/cite)
def f(a, N):
    """Return the positions of the N largest entries of ``a``, largest first.

    If ``a`` has fewer than N entries, all positions are returned.
    """
    ascending = np.argsort(a)      # positions ordered from smallest to largest value
    descending = ascending[::-1]   # flip so the largest value comes first
    return descending[:N]

# ---------------------------------------------------------------------------
# Simulation parameters
# ---------------------------------------------------------------------------
tmax = 1000  # number of time steps

# Key variables of interest. Each pair is (start value, end value of the sweep);
# set both to the same number to hold the variable fixed.
num, nummax = 600, 600          # paper population / varying field size
reference, refmax = 20, 100     # expected reference size / varying reference size
reading, readingmax = 120, 120  # reading size / varying reading size

# Robustness-tested settings
noise = 0.05  # see Appendix 1.3
fit = 0.1     # see Appendix 1.4
shape = 6     # shape for value distribution, see Appendix 1.1

# ---------------------------------------------------------------------------
# Result containers
# ---------------------------------------------------------------------------
listcorr = []  # correlation (citation-quality)
listgini = []  # gini

# Figure 2: in what way are top-quality / mid-quality papers cited
# (substantively or rhetorically)?
#   list1: top papers cited substantively
#   list2: top papers cited rhetorically
#   list3: mid papers cited substantively
#   list4: mid papers cited rhetorically
list1, list2, list3, list4 = [], [], [], []

# ------------------------------------------------------------------------------
# Simulation sweep: for every field size and every reference-list size, let
# `tmax` readers join one by one. Each reader reads the papers with the highest
# perceived quality, cites substantively those whose quality-in-her-eyes
# exceeds her threshold, and fills any remaining reference slots by overall
# rhetorical value. One citation-quality correlation is recorded per
# (field size, reference size) pair in `listcorr`.
# ------------------------------------------------------------------------------

# The loop variables reuse the names `num` and `reference` (the rest of the
# script reads them), so the configured start values are captured first.
# Without this, the inner sweep would start at `refmax` for every field size
# after the first one.
num_start = num
reference_start = reference

weight = 0.001  # the weight of citation count on perceived quality
weightq = 0.3   # the signal-based gain in rhetoric value
# tested for robustness (see Appendix 1.5: reinforcing process)

# Upper bound of the overall rhetorical value:
# maximum underlying rhetorical value (1) + weightq * perceived quality,
# where perceived quality is at most 1 (quality + fit + error) + 1 (citation premium).
# See table 1 in the original paper.
rhe_max = 1 + weightq + weightq

for num in range(num_start, nummax + 1):  # varying field size
    normative = np.random.beta(1, shape, size=num)  # quality distribution
    qrank = list(np.argsort(normative)[::-1])       # paper indices, best quality first

    for reference in range(reference_start, refmax + 1):  # varying reference size
        # To sweep reading size instead, capture `reading_start = reading`
        # before the loops and use:
        # for reading in range(reading_start, readingmax + 1):

        # Quality groups, used only by the optional figure-2 experiment below.
        top1q = qrank[0:40]    # top 40 quality (high quality)
        top2q = qrank[40:150]  # top 40-150 quality (mid-to-high quality) /600 in total

        cite_population = [0] * num  # citation count per paper, initially 0
        chunk = []                   # citation churn: papers cited in each round

        for t in range(1, tmax + 1):  # a reader joins
            # Heterogeneous reader: she has her own threshold / fit / error /
            # underlying rhetorical value, all drawn inside the loop.
            # The order of the random draws is kept fixed so that seeded runs
            # stay reproducible.
            threshold = random.uniform(0, 1)
            # normal distribution of threshold for robustness
            # we tested robustness (see Appendix 1.2)
            # mu, sigma = 0.5, 0.2  # for high peak
            # lower, upper = 0, 1
            # X1 = stats.truncnorm(
            #     (lower - mu) / sigma, (upper - mu) / sigma, loc=mu, scale=sigma)
            # threshold1 = X1.rvs(1)
            # threshold = threshold1[0]

            # underlying rhetorical value is heterogeneous
            base_rhe = np.random.beta(1, shape, size=num)

            # perception error; tested for robustness (see Appendix 1.3)
            noise_list = np.random.normal(0, noise, num)

            # fit - heterogeneous; tested for robustness (see Appendix 1.4)
            fit_list = np.array([random.uniform(-fit, fit) for _ in range(num)])

            cites = np.asarray(cite_population)

            # Perceived quality signal, truncated to [0, 2]:
            # quality + fit + error <= 1, citation premium (weight * citations) <= 1.
            signal = np.clip(
                normative + fit_list + weight * cites + noise_list, 0, 2)

            # reading size: read the papers with the highest perceived quality
            reading_index = np.argsort(signal)[-reading:][::-1]

            # Overall rhetorical value ==
            #   underlying rhetorical value
            #   + weightq * (quality + fit + citation premium)
            # For unread papers the perception error is still present, so
            # weightq * error is added as well; for read papers it is not.
            rhe_list = (base_rhe + weightq * fit_list + weightq * normative
                        + weightq * weight * cites)
            unread_mask = np.ones(num, dtype=bool)
            unread_mask[reading_index] = False
            rhe_list[unread_mask] += weightq * noise_list[unread_mask]

            # truncated overall rhetorical value: min == 0, max == rhe_max
            rhe_list = np.clip(rhe_list, 0, rhe_max)

            # After reading, the error disappears, so quality in her eyes ==
            # quality + fit, truncated to [0, 1]. Aligned with reading_index.
            norm_list = np.clip(
                normative[reading_index] + fit_list[reading_index], 0, 1)

            # which read papers are beyond the substantive citing threshold?
            over_threshold = list(reading_index[norm_list > threshold])

            normative_cite = []   # substantive citation list
            rhetorical_cite = []  # rhetorical citation list
            overlap = []          # cited both substantively and rhetorically

            if len(over_threshold) >= reference:
                # Enough good papers for substantive citing: all cites are
                # substantive (the best ones within the citing budget).
                normative_cite = [reading_index[pos]
                                  for pos in f(norm_list, reference)]
            else:
                # Insufficient good papers: first cite all of the good ones
                # substantively ...
                normative_cite = over_threshold.copy()
                # ... then fill every remaining slot by overall rhetorical value.
                rhetoric_no = reference - len(over_threshold)

                # Rank paper indices (not values) by rhetorical value, highest
                # first. Ranking indices keeps tied papers distinct; the stable
                # sort orders ties by ascending paper index.
                ranked = np.argsort(-rhe_list, kind="stable")
                for paper in ranked:
                    if len(rhetorical_cite) >= rhetoric_no:
                        break
                    if paper in normative_cite:
                        # A paper cited substantively that is also found
                        # rhetorically useful moves to "overlap" (allowed, as
                        # in the real world) and does not use up a slot.
                        normative_cite.remove(paper)
                        overlap.append(paper)
                    else:
                        rhetorical_cite.append(paper)

            # update citation counts -- and thereby the impact citations induce
            overall_cite = normative_cite + rhetorical_cite + overlap
            for paper in overall_cite:
                cite_population[paper] += 1

            # churn: which papers get cited this round?
            chunk.append(overall_cite)

            # here, experiments ------------------------------------------------
            # figure 2: how are two groups (high quality VS mid quality) of
            # papers cited in different models?
            # top1 = 0
            # for i in range(40):
            #     if ((top1q[i] in (normative_cite + overlap)) == True):
            #         top1 = top1 + 1
            # list1.append(top1 / reference)
            # top2 = 0
            # for i in range(40):
            #     if ((top1q[i] in (rhetorical_cite + overlap)) == True):
            #         top2 = top2 + 1
            # list2.append(top2 / reference)
            # top3 = 0
            # for i in range(110):
            #     if ((top2q[i] in (normative_cite + overlap)) == True):
            #         top3 = top3 + 1
            # list3.append(top3 / reference)
            # top4 = 0
            # for i in range(110):
            #     if ((top2q[i] in (rhetorical_cite + overlap)) == True):
            #         top4 = top4 + 1
            # list4.append(top4 / reference)

        # print(*cite_population, sep='\n')  # every round the citation count distribution

        # listgini.append(gini_coefficient(np.array(cite_population)))  # gini coefficient

        corr1, _ = stats.pearsonr(normative, cite_population)
        listcorr.append(corr1)  # citation-quality correlation, example output

        # churn, which is the newly cited papers compared to the last round
        # dnew = []
        # for i in range(len(chunk) - 1):
        #     a = 0
        #     for j in range(len(chunk[i + 1])):
        #         if ((chunk[i + 1][j] in chunk[i]) == False):
        #             a = a + 1
        #     dnew.append(a)
        # print(np.mean(dnew))

# Optional dumps for the figure-2 series and the Gini coefficients; they are
# only filled when the matching experiment code in the simulation is enabled.
# for series in (list1, list2, list3, list4):
#     print(series)
# print(listgini)

# Report one citation-quality correlation per line.
print("example output: Correlation (citation-quality)")
print("\n".join(str(corr) for corr in listcorr))
print("the full model's correlation is higher than either null models")

example output: Correlation (citation-quality)
0.6836293522666834
0.696374430127724
0.7013635474254413
0.7059455212382381
0.715919561504083
0.7221730505784769
0.7268020772055044
0.7349468471921673
0.741802858120647
0.7462776172783917
0.7526300688954846
0.7608874242970943
0.7614715822026559
0.7677739619582702
0.7730344947412569
0.7781479266672098
0.7813029530685538
0.7863997064037257
0.7919528445944846
0.7948991690441161
0.8010161863744167
0.8044901530268589
0.811284593242172
0.8098364572980024
0.8143138512026423
0.8184007038207295
0.8159597225504489
0.8245640009852617
0.8280352757427344
0.8320036140699847
0.8330913665122204
0.8354360544352559
0.8387071981454703
0.8438777865094035
0.846125881283999
0.8501523013890638
0.8547232109288513
0.8536824553499324
0.8557236460252265
0.861086974934922
0.8637619377034851
0.8655696353363695
0.8697167921903648
0.8715036279884668
0.8736914917481844
0.8748366326936343
0.8777770115014641
0.8802989699019195
0.8800265597186165
0.8855013841430374
0.8883757