Written by Saraswat Bhattacharyya.

Big To-Do list for Backend:

> Find better metrics for Category Score, Final Evaluation and Preference Update algorithm for recommendation weights

> Write better code to identify the person (e.g. middle names, initials, special characters, orcid identifiers)

Optimizations / Improvements suggested:
> Theoretical Physics database in readable format

> Better way to identify people on ArXiv.

> Make recommendation algorithm less ad-hoc

> Front end

### EXTRACT AUTHOR DETAILS FROM ARXIV API

In [None]:
import urllib, urllib.request
from bs4 import BeautifulSoup as bs
from collections import Counter

import pandas as pd
import numpy as np

To construct a query, refer to https://arxiv.org/help/api/user-manual#query_details


In [None]:
def find_person(firstname, lastname, max_results=50):
  """Query the arXiv API for papers whose author matches the given name.

  The search query is ``au:<firstname> AND <lastname>``.

  Args:
    firstname: first name of the author.
    lastname: last name of the author.
    max_results: maximum number of entries requested from the API.

  Returns:
    The API response parsed by Beautiful Soup with the "lxml-xml" parser,
    or None (after printing a message) when the request or parsing fails.
  """
  import urllib.parse

  # Percent-encode the name parts so that accents, apostrophes or stray
  # spaces cannot produce an invalid URL.
  query = 'au:' + urllib.parse.quote(firstname) + '+AND+' + urllib.parse.quote(lastname)
  url = 'http://export.arxiv.org/api/query?search_query=' + query + '&start=0&max_results=' + str(max_results)
  try:
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(url) as data:
      return bs(data.read().decode('utf-8'), "lxml-xml")
  except Exception:
    # Best effort: callers treat None as "no data for this person".
    print('Person '+firstname+ ' ' + lastname+' not found.')
    return None

We use Beautiful Soup to extract data. For some basic ideas about the syntax of Beautiful Soup, we refer to https://linuxhint.com/parse_xml_python_beautifulsoup/

### FIND ArXiv CATEGORY LIST FOR AUTHOR

We simply use a Python Counter object to count the number of occurrences of each category; this gives the level-0 categorization.

In [None]:
def find_titles(bs_content):
  """Count how often each arXiv category term appears in an API response.

  Args:
    bs_content: parsed API response exposing ``find_all``; every
      ``category`` element of every ``entry`` contributes its ``term``.
      May be None (what find_person returns on failure).

  Returns:
    A Counter mapping category term -> number of occurrences; empty when
    bs_content is None.
  """
  if bs_content is None:
    return Counter()

  return Counter(
      cat["term"]
      for entry in bs_content.find_all("entry")
      for cat in entry.find_all("category"))

### FIND DISTANCE BETWEEN PERSON 'A' AND PERSON 'B' BY EXPANDING CO-AUTHOR LIST

We implement this technique only for connections <= 'depth' in size. They say everyone is connected to everyone through a maximum of 6 people, so 4 isn't too small :) . We expand the author list of both A's and B's papers to the top 10 people, and then expand their author lists to the top 10 people. We put these in a list so that we avoid overlaps (this is very basic at the moment).

In [None]:
def _name_tokens(author_name):
  #split a raw name into whitespace-separated tokens, dropping newlines and dots
  return author_name.replace('\n', '').replace('.', '').split()

def clean_name(author_name):
  """Return the name as "Firstname Lastname", deleting everything else.

  Middle names and initials are dropped and dots are removed. A name with
  a single token is returned as that token; an empty name gives ''.
  """
  tokens = _name_tokens(author_name)
  if len(tokens) < 2:
    return ' '.join(tokens)
  return tokens[0] + ' ' + tokens[-1]

def firstlastname(author_name):
  """Split a name into (first token, last token).

  Dots and newlines are removed first, and surrounding or repeated
  whitespace is ignored. A single-token name yields that token for both
  parts (previously its last character was cut off); an empty name yields
  ('', '').
  """
  tokens = _name_tokens(author_name)
  if not tokens:
    return '', ''
  return tokens[0], tokens[-1]

def find_coauthors(bs_content):
  """Tally the cleaned author names over all entries of an API response.

  Entries listing more than 10 authors are ignored. Returns an empty
  Counter when bs_content is None.
  """
  if bs_content is None:
    return Counter()

  tally = Counter()
  for paper in bs_content.find_all("entry"):
    authors = paper.find_all("author")
    if len(authors) <= 10:
      tally.update(clean_name(person.get_text()) for person in authors)

  return tally

def findPageCoauthsFromName(author_name):
  """Look an author up on arXiv and tally their co-authors.

  Returns:
    (page, coauthors): the result of find_person for the author and the
    Counter produced by find_coauthors for that page. If either step
    raises, ("", Counter()) is returned instead.
  """
  fname, lname = firstlastname(author_name)
  try:
    author_page = find_person(fname, lname)
    return author_page, find_coauthors(author_page)
  except Exception:
    # Narrowed from a bare except so that Ctrl-C still interrupts a long run.
    return "", Counter()

In [None]:
def lookForConnection(personA, personB, max_depth = 2, debug = 0):
  """Search for a co-authorship chain linking personA and personB.

  Both co-author tallies are expanded alternately. At each depth the
  10*depth most frequent names of each side are examined (the most frequent
  one is skipped; presumably it is the person themself). A name already
  processed on the other side is a connection; any other unseen name has its
  own co-authors fetched and merged into that side's tally.

  Args:
    personA, personB: author names; compared in "Firstname Lastname" form.
    max_depth: largest depth that is tried.
    debug: unused, kept for call compatibility.

  Returns:
    (True, depth) when a connection is found: depth 0 means both names
    refer to the same person, depth 1 means direct co-authors.
    (False, max_depth) otherwise.
  """
  personA_FN, personA_LN = firstlastname(personA)
  personB_FN, personB_LN = firstlastname(personB)
  A_coauth_list = find_coauthors(find_person(personA_FN, personA_LN))
  B_coauth_list = find_coauthors(find_person(personB_FN, personB_LN))

  # Co-author names come out of clean_name, so the starting names must be
  # normalised the same way or e.g. "Jennifer M. Smillie" never matches.
  A_seen = {clean_name(personA)}
  B_seen = {clean_name(personB)}

  if A_seen == B_seen:
    #A and B are the same person
    return True, 0

  for depth in range(1, max_depth + 1):
    # Same procedure for both sides: first A against B's seen set, then B against A's.
    for own_coauths, own_seen, other_seen in ((A_coauth_list, A_seen, B_seen),
                                              (B_coauth_list, B_seen, A_seen)):
      # most_common() is materialised before the loop, so the update() calls
      # below only affect later passes.
      for auth1, n in own_coauths.most_common(10*depth)[1:]:
        if auth1 in other_seen:
          #match has been found
          print("Connection found at depth = " + str(depth) + " through " + auth1)
          return True, depth

        if auth1 not in own_seen:
          auth1_F, auth1_L = firstlastname(auth1)
          own_coauths.update(find_coauthors(find_person(auth1_F, auth1_L, max_results = 10)))
          own_seen.add(auth1)

  return False, max_depth

In [None]:
def quickConnectionSearch(userA, seminarB, debug=1):
    """Return how many distinct co-author names userA and seminarB share.

    Both arguments must expose a ``coauths`` Counter. With debug == 1 the two
    Counters and their intersection are printed.
    """
    shared = userA.coauths & seminarB.coauths

    if debug == 1:
      for shown in (userA.coauths, seminarB.coauths, shared):
        print(shown)

    # One point per name present in both tallies.
    return len(shared)

### CREATING THE USER CLASS AND CATEGORY LISTS

We start off with defining a structure which contains every single arxiv category. We will add the preference for each category as a feature vector for each person, in the recommender algorithm. We organize this in lists, with separate preferences for subcategory and full category.

Full category preference = average of subcategory

##### ARXIV CATEGORIES

In [None]:
def find_bigcat(full_category):
  """Return the part of a category name before its first '.'.

  A name without a dot is returned unchanged.
  """
  archive, _dot, _rest = full_category.partition('.')
  return archive

In [None]:
#added from here: https://arxiv.org/category_taxonomy
#
# big_cats lists the top-level archives. sub_cats[i] holds the full category
# names belonging to big_cats[i]; an archive without subdivisions is stored as
# a one-element list containing its own name.

big_cats = ['cs', 'econ', 'eess', 'math', 'astro-ph', 'cond-mat', 'gr-qc', 'hep-ex', \
            'hep-lat', 'hep-ph', 'hep-th', 'math-ph', 'nlin', 'nucl-ex', 'nucl-th', \
            'physics', 'quant-ph', 'q-bio', 'q-fin', 'stat']

n_cats = len(big_cats)
sub_cats = [None]*n_cats

#typical format for adding lists to the big list
#(cs.OS was missing from this list)
cs_list = ['cs.AI', 'cs.AR', 'cs.CC', 'cs.CE', 'cs.CG', 'cs.CL', 'cs.CR', 'cs.CV', 'cs.CY', \
          'cs.DB', 'cs.DC', 'cs.DL', 'cs.DM', 'cs.DS', 'cs.ET', 'cs.FL', 'cs.GL', 'cs.GR', 'cs.GT', \
          'cs.HC', 'cs.IR', 'cs.IT', 'cs.LG', 'cs.LO', 'cs.MA', 'cs.MM', 'cs.MS', 'cs.NA', 'cs.NE', \
          'cs.NI', 'cs.OH', 'cs.OS', 'cs.PF', 'cs.PL', 'cs.RO', 'cs.SC', 'cs.SD', 'cs.SE', 'cs.SI', 'cs.SY']
cs_idx = big_cats.index(find_bigcat(cs_list[0]))
sub_cats[cs_idx] = cs_list

#next
econ_list = ['econ.EM', 'econ.GN', 'econ.TH']
econ_idx = big_cats.index(find_bigcat(econ_list[0]))
sub_cats[econ_idx] = econ_list

#next
eess_list = ['eess.AS', 'eess.IV', 'eess.SP', 'eess.SY']
eess_idx = big_cats.index(find_bigcat(eess_list[0]))
sub_cats[eess_idx] = eess_list

#next
math_list = ['math.AC', 'math.AG', 'math.AP', 'math.AT', 'math.CA', 'math.CO', 'math.CT', 'math.CV', \
             'math.DG', 'math.DS', 'math.FA', 'math.GM', 'math.GN', 'math.GR', 'math.GT', 'math.HO', \
             'math.IT', 'math.KT', 'math.LO', 'math.MG', 'math.MP', 'math.NA', 'math.NT', 'math.OA', \
             'math.OC', 'math.PR', 'math.QA', 'math.RA', 'math.RT', 'math.SG', 'math.SP', 'math.ST']
math_idx = big_cats.index(find_bigcat(math_list[0]))
sub_cats[math_idx] = math_list

#next
astro_list = ['astro-ph.CO', 'astro-ph.EP', 'astro-ph.GA', 'astro-ph.HE', 'astro-ph.IM', 'astro-ph.SR']
astro_idx = big_cats.index(find_bigcat(astro_list[0]))
sub_cats[astro_idx] = astro_list

#another one
cond_mat_list = ['cond-mat.dis-nn', 'cond-mat.mes-hall', 'cond-mat.mtrl-sci', \
                 'cond-mat.other', 'cond-mat.quant-gas', 'cond-mat.soft', \
                 'cond-mat.stat-mech', 'cond-mat.str-el', 'cond-mat.supr-con']
cond_mat_idx = big_cats.index(find_bigcat(cond_mat_list[0]))
sub_cats[cond_mat_idx] = cond_mat_list

gr_list = ['gr-qc']
gr_idx = big_cats.index(find_bigcat(gr_list[0]))
sub_cats[gr_idx] = gr_list

hep_ex_list = ['hep-ex']
hep_ex_idx = big_cats.index(find_bigcat(hep_ex_list[0]))
sub_cats[hep_ex_idx] = hep_ex_list

hep_lat_list = ['hep-lat']
hep_lat_idx = big_cats.index(find_bigcat(hep_lat_list[0]))
sub_cats[hep_lat_idx] = hep_lat_list

hep_ph_list = ['hep-ph']
hep_ph_idx = big_cats.index(find_bigcat(hep_ph_list[0]))
sub_cats[hep_ph_idx] = hep_ph_list

hep_th_list = ['hep-th']
hep_th_idx = big_cats.index(find_bigcat(hep_th_list[0]))
sub_cats[hep_th_idx] = hep_th_list

math_ph_list = ['math-ph']
math_ph_idx = big_cats.index(find_bigcat(math_ph_list[0]))
sub_cats[math_ph_idx] = math_ph_list

#another one
nlin_list = ['nlin.AO', 'nlin.CD', 'nlin.CG', 'nlin.PS', 'nlin.SI']
nlin_idx = big_cats.index(find_bigcat(nlin_list[0]))
sub_cats[nlin_idx] = nlin_list

nucl_ex_list = ['nucl-ex']
nucl_ex_idx = big_cats.index(find_bigcat(nucl_ex_list[0]))
sub_cats[nucl_ex_idx] = nucl_ex_list

nucl_th_list = ['nucl-th']
nucl_th_idx = big_cats.index(find_bigcat(nucl_th_list[0]))
sub_cats[nucl_th_idx] = nucl_th_list

phys_list = ['physics.acc-ph', 'physics.ao-ph', 'physics.app-ph', 'physics.atm-clus', \
             'physics.atom-ph', 'physics.bio-ph', 'physics.chem-ph', 'physics.class-ph', \
             'physics.comp-ph', 'physics.data-an', 'physics.ed-ph', 'physics.flu-dyn', \
             'physics.gen-ph', 'physics.geo-ph', 'physics.hist-ph', 'physics.ins-det', \
             'physics.med-ph', 'physics.optics', 'physics.plasm-ph', 'physics.pop-ph', \
             'physics.soc-ph', 'physics.space-ph']
phys_idx = big_cats.index(find_bigcat(phys_list[0]))
sub_cats[phys_idx] = phys_list

quant_ph_list = ['quant-ph']
quant_ph_idx = big_cats.index(find_bigcat(quant_ph_list[0]))
sub_cats[quant_ph_idx] = quant_ph_list

q_bio_list = ['q-bio.BM', 'q-bio.CB', 'q-bio.GN', 'q-bio.MN', 'q-bio.NC', 'q-bio.OT', 'q-bio.PE', \
              'q-bio.QM', 'q-bio.SC', 'q-bio.TO']
q_bio_idx = big_cats.index(find_bigcat(q_bio_list[0]))
sub_cats[q_bio_idx] = q_bio_list

#(a duplicated 'q-fin.EC' entry was removed; it created a slot that list.index() could never reach)
q_fin_list = ['q-fin.CP', 'q-fin.EC', 'q-fin.GN', 'q-fin.MF', 'q-fin.PM', 'q-fin.PR', \
              'q-fin.RM', 'q-fin.ST', 'q-fin.TR']
q_fin_idx = big_cats.index(find_bigcat(q_fin_list[0]))
sub_cats[q_fin_idx] = q_fin_list

stat_list = ['stat.AP', 'stat.CO', 'stat.ME', 'stat.ML', 'stat.OT', 'stat.TH']
stat_idx = big_cats.index(find_bigcat(stat_list[0]))
sub_cats[stat_idx] = stat_list

#what sub_cats looks like
#sub_cats

In [None]:
#flatten the subcats list to form the full index list, replacing empty indices with the title label
# full_flat_list : every category name, grouped archive by archive
# cat_label_list : for each flat entry, the index of its archive in big_cats
# cat_start_idx / cat_end_idx : flat range [start, end) occupied by each archive
full_flat_list = []
cat_label_list = []
cat_start_idx = np.zeros(n_cats) #inclusive
cat_end_idx = np.zeros(n_cats) #exclusive

for cat_no, (archive, members) in enumerate(zip(big_cats, sub_cats)):
  names = [archive] if members is None else members
  cat_start_idx[cat_no] = len(full_flat_list)
  full_flat_list.extend(names)
  cat_label_list.extend([cat_no]*len(names))
  cat_end_idx[cat_no] = len(full_flat_list)

n_fullsubcats = len(full_flat_list)

#full_flat_list
#cat_label_list

#### USERS

In [None]:
class EndUser:
  """A user of the recommender: name, category preferences, keywords, co-authors.

  Attributes:
    cat_list: one preference value per entry of big_cats.
    subcat_list: one preference value per entry of full_flat_list.
    fc_volN, sc_volN: per-category counters that damp later preference updates.
    keywordList: keywords matched against seminar titles and abstracts.
    page, coauths: result of findPageCoauthsFromName for the user's name.
  """

  def __init__(self, name, id, big_cat_pref, lil_cat_pref):
    """Create a user and look the name up via findPageCoauthsFromName.

    big_cat_pref / lil_cat_pref are either a single category name or a list
    of names; listed categories start with preference 1, all others with 0.
    Names that are not known are reported with a printed message.
    """
    self.name = name
    self.id = id

    self.page, self.coauths = findPageCoauthsFromName(self.name)

    #volatility parameters: count previous instances seen
    self.fc_volN = [1]*n_cats
    self.sc_volN = [1]*n_fullsubcats

    self.cat_list = self._initialPrefs(big_cat_pref, big_cats)
    self.subcat_list = self._initialPrefs(lil_cat_pref, full_flat_list)

    self.keywordList = []

  @staticmethod
  def _initialPrefs(prefs, known_labels):
    #build a 0/1 vector over known_labels with the requested labels set to 1
    if not isinstance(prefs, list):
      prefs = [prefs]
    vector = [0]*len(known_labels)
    for pref in prefs:
      try:
        vector[known_labels.index(pref)] = 1
      except ValueError:
        print(str(pref)+' not found')
    return vector

  def addKeywords(self, addKeywordList):
    """Append a single keyword to the user's keyword list."""
    self.keywordList.append(addKeywordList)

  def deleteKeywords(self, deleteKeywordList):
    """Remove a single keyword; print a message if it is not in the list."""
    try:
      # list has no .delete(); the old call always failed and nothing was removed.
      self.keywordList.remove(deleteKeywordList)
    except ValueError:
      print("Keyword " + str(deleteKeywordList) + " not found")

  def addMultipleKeywords(self, multipleKeywords):
    """Append every keyword of an iterable."""
    for keyword in multipleKeywords:
      self.addKeywords(keyword)

  def deleteMultipleKeywords(self, multipleKeywords):
    """Remove every keyword of an iterable."""
    for keyword in multipleKeywords:
      self.deleteKeywords(keyword)

  def evaluateAuthor(self, randomAuthor):
    """Score an author by name: 5*connection score + both category scores."""
    fullCatScore, subCatScore = self.calcCatScore(randomAuthor)

    isConnection, connDepth = lookForConnection(self.name, randomAuthor, 2)
    # lookForConnection reports depth 0 for identical names; clamp to 1 so
    # that case scores like a direct connection instead of dividing by zero.
    connectionScore = 1.0/max(connDepth, 1) if isConnection else 0

    #construct a score, currently arbitrary
    return 5*connectionScore + fullCatScore + subCatScore

  def evaluateSeminar(self, randomSeminar, debug = 0):
    """Score a seminar: 5*shared co-authors + category scores + keyword score.

    With debug == 1 the individual components are printed on one line.
    """
    fullCatScore, subCatScore = self.calcCatScore_seminar(randomSeminar)
    coauthsScore = quickConnectionSearch(self, randomSeminar, debug)
    keywordScore = self.calcKeywordScore(randomSeminar)
    totalScore = 5*coauthsScore + fullCatScore + subCatScore + keywordScore

    if (debug == 1):
      print(str(randomSeminar.author_name)+" | "+str(round(fullCatScore, 2))+ " | " +
            str(round(subCatScore, 2)) + " | " + str(round(coauthsScore, 2)) + " | " +
            str(round(keywordScore, 2)) + " | " + str(round(totalScore, 2)) + " | "+ str(randomSeminar.title) )

    return totalScore

  def updatePrefs(self, randomAuthor, userInput):
    """
    Update the end user preference values by using the scoring mechanism.
    d (User preference) = (Userinput-3)/5.0*(1/sqrt(volN))*(difference in pref)
    """
    bigCatUpdate, subCatUpdate = self.updateCatScore(randomAuthor, userInput)

    for i, delta in enumerate(bigCatUpdate):
      self.cat_list[i] = self.cat_list[i] + delta
    for i, delta in enumerate(subCatUpdate):
      self.subcat_list[i] = self.subcat_list[i] + delta

  @staticmethod
  def _authorCategories(author_name, top_n):
    #the top_n most frequent (category, count) pairs of an author; [] if the lookup failed
    Fname, Lname = firstlastname(author_name)
    bs_content = find_person(Fname, Lname, 100)
    if bs_content is None:
      return []
    return find_titles(bs_content).most_common(top_n)

  @staticmethod
  def _categoryVectors(categoryCounter, midpoint):
    #turn (category, count) pairs into sigmoid-weighted vectors plus their maxima
    if hasattr(categoryCounter, 'items'):
      # accept a Counter/dict as well as a list of (category, count) pairs
      categoryCounter = categoryCounter.items()

    subcat_tmp = [0]*n_fullsubcats
    bigcat_tmp = [0]*n_cats
    val_max = 0.00001 #avoiding divide by 0 errors
    bigval_max = 0.0001

    for key, val in categoryCounter:
      try:
        bigcat_key = big_cats.index(find_bigcat(key))
        subcat_key = full_flat_list.index(key)
      except ValueError:
        # unknown label: contributes nothing to either vector
        print(str(key) + ' not found in list')
        continue

      sigmoid_val = 1.0/(1+np.exp(-(val-midpoint)))
      subcat_tmp[subcat_key] = sigmoid_val
      bigcat_tmp[bigcat_key] = bigcat_tmp[bigcat_key] + sigmoid_val

      val_max = max(val_max, sigmoid_val)
      bigval_max = max(bigval_max, bigcat_tmp[bigcat_key])

    return subcat_tmp, bigcat_tmp, val_max, bigval_max

  def updateCatScore(self, randomAuthor, userInput):
    """Compute the preference changes caused by rating an author.

    For every category the author has published in, the change is
    (normalised author weight - current preference) * (userInput - 3)/5
    / sqrt(volN), and that category's volN counter is incremented.

    Returns:
      (fullCatScore, subCatScore): lists of changes aligned with cat_list
      and subcat_list. All zeros when the author has no usable categories.
    """
    categoryCounter = self._authorCategories(randomAuthor, 10)
    subcat_tmp, bigcat_tmp, val_max, bigval_max = self._categoryVectors(categoryCounter, 4)

    fullCatScore = [0]*n_cats
    subCatScore = [0]*n_fullsubcats

    #scoring with the sigmoid value
    if (val_max >= 0.05):
      for i in range(n_fullsubcats):
        if (subcat_tmp[i] > 0):
          subCatScore[i] = (subcat_tmp[i]/val_max-self.subcat_list[i])*(userInput - 3.0)/5.0/np.sqrt(self.sc_volN[i])
          self.sc_volN[i] = self.sc_volN[i] + 1

      for i in range(n_cats):
        if (bigcat_tmp[i] > 0):
          # Big-category weights are sums over subcategories, so they are
          # normalised by their own maximum (as calcCatScore_subcat does),
          # not by the subcategory maximum.
          fullCatScore[i] = (bigcat_tmp[i]/bigval_max-self.cat_list[i])*(userInput - 3.0)/5.0/np.sqrt(self.fc_volN[i])
          self.fc_volN[i] = self.fc_volN[i] + 1

    return fullCatScore, subCatScore

  def calcCatScore_subcat(self, subcatsCounter):
    """
    Calculate the category score of the given subcats with the current EndUser object.
    Defined as subcat category .* EndUser preferences.

    subcatsCounter is a list of (category, count) pairs (e.g. the output of
    Counter.most_common) or a Counter/dict. Returns (bigscore, score).
    """
    subcat_tmp, bigcat_tmp, val_max, bigval_max = self._categoryVectors(subcatsCounter, 5)

    #scoring with the sigmoid value
    bigscore = 0
    score = 0
    if (val_max >= 0.05):
      for i in range(n_fullsubcats):
        score = score + subcat_tmp[i]/val_max*self.subcat_list[i]

      for i in range(n_cats):
        bigscore = bigscore + bigcat_tmp[i]/bigval_max*self.cat_list[i]
    else:
      for i in range(n_fullsubcats):
        score = score + subcat_tmp[i]*self.subcat_list[i]

      for i in range(n_cats):
        bigscore = bigscore + bigcat_tmp[i]*self.cat_list[i]

    return bigscore, score

  def calcCatScore(self, randomAuthor):
    """
    Calculate the category score of the author with the current EndUser object.
    Defined as Author category .* EndUser preferences.
    """
    return self.calcCatScore_subcat(self._authorCategories(randomAuthor, 10))

  def calcCatScore_seminar(self, randomSeminar):
    """Category score of a seminar, based on its cat_lists attribute."""
    return self.calcCatScore_subcat(randomSeminar.cat_lists)

  def calcKeywordScore(self, randomSeminar):
    """Count the user's keywords that occur in the seminar title or abstract.

    The comparison ignores case on both sides.
    """
    lowerTitle = randomSeminar.title.lower()
    lowerAbstract = randomSeminar.abstract.lower()
    keywordScore = 0
    for keyword in self.keywordList:
      lowerKeyword = str(keyword).lower()
      if lowerKeyword in lowerAbstract or lowerKeyword in lowerTitle:
        keywordScore += 1
    return keywordScore

  #helper functions
  def __str__(self):
    """One-line summary: name, big categories with preference >= 0.70, keywords."""
    categories = "".join(
        str(big_cats[i]) + " | "
        for i in range(len(big_cats))
        if self.cat_list[i] >= 0.70) #distinction
    keywords = "".join(str(keyword) + " | " for keyword in self.keywordList)
    return "User = " + self.name + " ||| Categories: " + categories + "||| Keywords: " + keywords

  def printKeywords(self):
    """Print all keywords on one line, each followed by '| '."""
    print("".join(str(keyword) + "| " for keyword in self.keywordList))

#### SEMINARS

In [None]:
class Seminar:
  """A seminar talk together with the speaker's arXiv categories and co-authors.

  Attributes:
    cat_lists: list of (category, count) pairs, at most 15, most frequent first.
    cat_confidence: 0.70 when the categories come from the speaker's papers
      only, 0.95 when categories were supplied via user_catLists.
    page, coauths: result of findPageCoauthsFromName for the speaker.
  """

  def __init__(self, title, id, date, author_name, author_affiliation="", abstract="", user_catLists = ""):
    """Create a seminar and look up the speaker's categories and co-authors.

    user_catLists may be "" (nothing supplied), a single category name, or
    an iterable of category names; each supplied category gets 20 added to
    its count on top of the counts derived from the speaker's papers.
    """
    self.title = title
    self.id = id
    self.date = date

    self.author_name = author_name
    self.page, self.coauths = findPageCoauthsFromName(self.author_name)

    self.affiliation = author_affiliation
    self.abstract = abstract

    self.cat_confidence = 0.0
    self.cat_lists = []
    self.evaluateCatScore()
    if (user_catLists == ""):
      self.cat_confidence = 0.70
    else:
      self.cat_confidence = 0.95
      if isinstance(user_catLists, str):
        # a bare string would otherwise be counted character by character
        user_catLists = [user_catLists]
      # evaluateCatScore leaves a list of pairs, so the boost is applied on a
      # Counter and converted back (indexing the list by name raised TypeError).
      boosted = Counter(dict(self.cat_lists))
      for k in Counter(user_catLists):
        boosted[k] += 20
      self.cat_lists = boosted.most_common(15)

  def evaluateCatScore(self):
    """Set cat_lists to the speaker's 15 most frequent categories ([] if the lookup failed)."""
    Fname, Lname = firstlastname(self.author_name)
    bs_content = find_person(Fname, Lname, 100)
    if bs_content is None:
      self.cat_lists = []
    else:
      self.cat_lists = find_titles(bs_content).most_common(15)

  def evaluateUserOld(self, randomUser):
    """
    Evaluates how a user would like the given seminar, using the user's CatScore Function
    """
    fullCatScore, subCatScore = randomUser.calcCatScore_subcat(self.cat_lists)
    isConnection, connDepth = lookForConnection(self.author_name, randomUser.name, 1)
    connectionScore = 0
    if (isConnection):
      # depth 0 is reported when speaker and user have the same name; clamp to avoid dividing by zero
      connectionScore = 1.0/max(connDepth, 1)

    #construct a score, currently arbitrary
    return 5*connectionScore + fullCatScore + subCatScore

  def evaluateUser(self, randomUser, debug=0):
    """Return randomUser.evaluateSeminar(self, debug)."""
    return randomUser.evaluateSeminar(self, debug)

  def updatePrefs(self, randomUser, userInput):
    """
    Does nothing right now. Update the seminar rating based on the user's rating and preferences
    This is where the cat_confidence parameter might be used
    """

    pass

  ## helper functions
  def __str__(self):
    """Return "title | speaker | date", the date formatted with strftime('%c')."""
    seminarString = self.title + " | " + self.author_name + " | " + self.date.strftime('%c')
    return seminarString

  def compareSeminar(self, testSeminar):
    """
    Compare seminars to avoid overlapping from multiple sources

    Two seminars are considered the same when the speakers' last names and
    the dates are both equal.
    """
    Fname1, Lname1 = firstlastname(self.author_name)
    Fname2, Lname2 = firstlastname(testSeminar.author_name)
    return bool(Lname1 == Lname2 and self.date == testSeminar.date)

### EXTRACTING DATA FROM INSPIRE

In [None]:
import json
import requests
import datetime

#### INSPIRE

The HEP server

In [None]:
# INSPIRE seminars API query: page 1 with 25 results, sorted by "datedesc",
# restricted to the series named "Oxford Theoretical Particle Physics seminar".
urltest = "https://inspirehep.net/api/seminars?sort=datedesc&size=25&page=1&start_date=all&q=series.name%3A\"Oxford%20Theoretical%20Particle%20Physics%20seminar\""
urltest

'https://inspirehep.net/api/seminars?sort=datedesc&size=25&page=1&start_date=all&q=series.name%3A"Oxford%20Theoretical%20Particle%20Physics%20seminar"'

In [None]:
import requests
# Fetch the query result and decode the JSON body into PTfile.
var=requests.get(urltest)
PTfile=json.loads(var.content)

In [None]:
def parseInspireName(strName):
  """Convert "Lastname, Firstname" into "Firstname Lastname".

  The name is split at the first comma and both parts are stripped, so the
  space after the comma is optional. A name without a comma is returned
  stripped but otherwise unchanged.
  """
  lastName, comma, firstName = strName.partition(',')
  if not comma:
    return strName.strip()
  authorName = firstName.strip() + ' ' + lastName.strip()
  return authorName.strip()

### MAIN CODE

In [None]:
# Notebook-wide registries filled by createNewSeminar / createNewUser.
# The counters supply the id of the next object to be created.
seminar_list, user_list = [], []
seminarCount, userCount = 0, 0

In [None]:
def createNewSeminar(title, date, author_name, author_affiliation="", abstract="", user_catLists = ""):
  """Build a Seminar with the next free id and register it in seminar_list.

  The abstract is stored lower-cased.
  """
  global seminarCount
  #code to check duplicates will go here first
  seminar_list.append(
      Seminar(title, seminarCount, date, author_name, author_affiliation, abstract.lower(), user_catLists))
  seminarCount += 1

def createNewUser(name, big_cat_pref, lil_cat_pref):
  """Build an EndUser with the next free id and register it in user_list."""
  global userCount
  #code to check for duplicates will go here
  user_list.append(EndUser(name, userCount, big_cat_pref, lil_cat_pref))
  userCount += 1

In [None]:
def loadINSPIRESeminars():
  """Download one page of INSPIRE seminars and register each via createNewSeminar.

  For every hit the title, start time, first speaker and that speaker's
  first affiliation are used. Hits without any speaker are skipped; a
  speaker without affiliation gets an empty affiliation string.
  """
  inspireURL =  "https://inspirehep.net/api/seminars?sort=datedesc&size=25&page=1&start_date=all&q=series.name%3A\"Oxford%20Theoretical%20Particle%20Physics%20seminar\""
  # Request the URL defined here; the notebook-global `urltest` was used before.
  var = requests.get(inspireURL)
  PTfile = json.loads(var.content)

  for seminarEvent in PTfile['hits']['hits']:
    metadata = seminarEvent['metadata']
    speakers = metadata.get('speakers') or []
    if not speakers:
      continue
    speaker = speakers[0]

    semTitle = metadata['title']['title']
    semDate = datetime.datetime.fromisoformat(metadata['start_datetime'])
    semAuthor = parseInspireName(speaker['name'])
    affiliations = speaker.get('affiliations') or [{}]
    semAffil = affiliations[0].get('value', "")
    createNewSeminar(semTitle, semDate, semAuthor, semAffil)

### DEMONSTRATION

First, load the current list of seminars from INSPIRE

In [None]:
loadINSPIRESeminars()

In [None]:
for seminar in seminar_list:
    print(str(seminar))

Kaon Decays as a Probe for New Physics | Stefan Schacht | Thu Jun 15 15:00:00 2023
High Energy Resummation for Jet Processes at the LHC | Jennifer M. Smillie | Thu Jun  8 15:00:00 2023
Extended Path Intensity Correlation | Ken Van Tilburg | Thu Jun  1 15:00:00 2023
Topological Aspects of String Phenomenology | Jonathan J. Heckman | Thu May 25 15:00:00 2023
Fits of alpha_s using power corrections in the 3-jet region | Giulia Zanderighi | Thu May 18 15:00:00 2023
Precision LHC predictions for top and dark matter | Jonas M. Lindert | Thu May 11 15:00:00 2023
Effective field theory for cosmological phase transitions | Oliver Gould | Thu May  4 15:00:00 2023
Dark Energy with a little help from its friends | S.L. Parameswaran | Thu Apr 27 15:00:00 2023
Chaotic spin-chains in AdS/CFT | Anne Spiering | Thu Apr 20 15:00:00 2023
Hadronic vacuum-polarisation of the photon: lattice SM predictions, experimental data and the muon g-2 puzzle | Roberto Frezzotti | Thu Mar  9 16:00:00 2023
Logarithmic 

Create a user "Andrei" with preferred arXiv categories and keywords

In [None]:
andrei = EndUser('Andrei Constantin', 0, ['hep-th', 'hep-ph'], ['hep-th', 'hep-ph'])
#andrei.addKeywords("string")
andrei.addMultipleKeywords(["string", "phenomenology", "topological"])
#andrei.printKeywords()
print(andrei)

User = Andrei Constantin ||| Categories: hep-ph | hep-th | ||| Keywords: string | phenomenology | topological | 


Score the entire seminar list for a user using pre-defined weights and output

In [None]:
def scoreSeminarsForUser(randomUser):
    """Score every seminar in seminar_list for the given user.

    Returns an array with one row [seminar id, score rounded to 2 decimals]
    per seminar, ordered by descending score.
    """
    recommendationList = np.zeros((len(seminar_list), 2))
    for row, testSeminar in zip(recommendationList, seminar_list):
        row[0] = testSeminar.id
        row[1] = round(testSeminar.evaluateUser(randomUser), 2)

    # ascending argsort read back to front gives the descending order
    descending = np.argsort(recommendationList[:, 1], axis=None)[::-1]
    recommendationList[:, :] = recommendationList[descending, :]
    return recommendationList

def printReccommendationList(randomUser, reccList):
    """Print a score table for the user.

    reccList holds rows of [seminar id, score]; the id is used as an index
    into seminar_list.
    """
    print("Recommendations for " + str(randomUser.name))
    print("Score \t || \t Seminar ")
    for row in range(len(reccList)):
        shownScore = str(reccList[row, 1])
        shownSeminar = str(seminar_list[int(reccList[row, 0])])
        print(shownScore + " \t || \t " + shownSeminar)
    print("_________________________________________________")


In [None]:
recListAndrei = scoreSeminarsForUser(andrei)
#recListAndrei

astro-ph not found in list
14H40 (Primary) 14K30, 14H20 (Secondary) not found in list
34C07, 34C25, 16T05, 16T30 not found in list
13F60, 20F36 (Primary) 16G20 (Secondary) not found in list
31C05, 60G50, 60J10, 82C24, 28A80 not found in list
astro-ph not found in list
cond-mat not found in list
53C25, 53D35, 57R17, 14J25 not found in list
53C25, 53D35, 57R17 not found in list
53D05, 57R17, 57R18 not found in list
astro-ph not found in list
astro-ph not found in list
astro-ph not found in list


In [None]:
printReccommendationList(andrei, recListAndrei)

Recommendations for Andrei Constantin
Score 	 || 	 Seminar 
30.0 	 || 	 UV/IR Mixing, EFTs, Hidden Cancellations, and Origami:    Calculating the Higgs Mass and Gauge-Coupling Running in String Theory | Steven Adam Abel | Thu Feb 16 16:00:00 2023
11.6 	 || 	 Topological Aspects of String Phenomenology | Jonathan J. Heckman | Thu May 25 15:00:00 2023
8.92 	 || 	 The tameness of quantum field theory | Thomas W. Grimm | Thu Nov 17 16:00:00 2022
7.96 	 || 	 The extent of anisotropy of our Universe | Roya Mohayaee | Thu Dec  1 14:30:00 2022
7.67 	 || 	 POSTPONED to 25th May | David J.E. Marsh | Thu Feb  9 16:00:00 2023
4.93 	 || 	 Index theorems, Generalized Hall current for topological insulators and superconductors | Srimoyee Sen | Thu Feb 23 16:00:00 2023
3.96 	 || 	 Hadronic vacuum-polarisation of the photon: lattice SM predictions, experimental data and the muon g-2 puzzle | Roberto Frezzotti | Thu Mar  9 16:00:00 2023
3.82 	 || 	 Effective field theory for cosmological phase transitio

Creating a new user "Liam" and showing that feedback on seminars changes recommendation score

In [None]:
liam = EndUser('Liam Ruske', 0, 'cond-mat', ['cond-mat.soft', 'cond-mat.stat-mech'])
print(liam)

User = Liam Ruske ||| Categories: cond-mat | ||| Keywords: 


In [None]:
liam.evaluateAuthor('Julia Yeomans')

Connection found at depth = 1 through Julia Yeomans


7.997527376843365

In [None]:
liam.updatePrefs('Julia Yeomans',5)
liam.evaluateAuthor('Julia Yeomans')

Connection found at depth = 1 through Julia Yeomans


10.198166273987894

In [None]:
liam.updatePrefs('Julia Yeomans',1)
liam.updatePrefs('Julia Yeomans',1)
liam.updatePrefs('Julia Yeomans',1)
liam.updatePrefs('Julia Yeomans',1)
liam.evaluateAuthor('Julia Yeomans')

Connection found at depth = 1 through Julia Yeomans


6.125123023238223

And showing that the whole INSPIRE list would get quite low scores for Liam, whereas it gets filtered for Andrei

In [None]:
recListLiam = scoreSeminarsForUser(liam)

astro-ph not found in list
14H40 (Primary) 14K30, 14H20 (Secondary) not found in list
34C07, 34C25, 16T05, 16T30 not found in list
13F60, 20F36 (Primary) 16G20 (Secondary) not found in list
31C05, 60G50, 60J10, 82C24, 28A80 not found in list
astro-ph not found in list
cond-mat not found in list
53C25, 53D35, 57R17, 14J25 not found in list
53C25, 53D35, 57R17 not found in list
53D05, 57R17, 57R18 not found in list
astro-ph not found in list
astro-ph not found in list
astro-ph not found in list


In [None]:
printReccommendationList(liam, recListLiam)

Recommendations for Liam Ruske
Score 	 || 	 Seminar 
0.41 	 || 	 Index theorems, Generalized Hall current for topological insulators and superconductors | Srimoyee Sen | Thu Feb 23 16:00:00 2023
0.33 	 || 	 Topological Aspects of String Phenomenology | Jonathan J. Heckman | Thu May 25 15:00:00 2023
0.26 	 || 	 Faces in the Forest: Feynman Integrals and the Method of Regions | Stephen Philip Jones | Thu Oct 27 15:00:00 2022
0.05 	 || 	 The tameness of quantum field theory | Thomas W. Grimm | Thu Nov 17 16:00:00 2022
0.04 	 || 	 The extent of anisotropy of our Universe | Roya Mohayaee | Thu Dec  1 14:30:00 2022
0.04 	 || 	 Random Matrix Spectroscopy and Quantum Gravity | Clifford Victor Johnson | Thu Jan 12 16:42:00 2023
0.03 	 || 	 Effective field theory for cosmological phase transitions | Oliver Gould | Thu May  4 15:00:00 2023
0.02 	 || 	 Evidence and implications of intrinsic charm in the proton | Juan Rojo | Thu Nov 24 16:00:00 2022
0.01 	 || 	 High Energy Resummation for Jet Proce

And the effect of adding a keyword is demonstrated here

In [None]:
liam.addKeywords("string")
recListLiam = scoreSeminarsForUser(liam)
printReccommendationList(liam, recListLiam)

astro-ph not found in list
14H40 (Primary) 14K30, 14H20 (Secondary) not found in list
34C07, 34C25, 16T05, 16T30 not found in list
13F60, 20F36 (Primary) 16G20 (Secondary) not found in list
31C05, 60G50, 60J10, 82C24, 28A80 not found in list
astro-ph not found in list
cond-mat not found in list
53C25, 53D35, 57R17, 14J25 not found in list
53C25, 53D35, 57R17 not found in list
53D05, 57R17, 57R18 not found in list
astro-ph not found in list
astro-ph not found in list
astro-ph not found in list
Recommendations for Liam Ruske
Score 	 || 	 Seminar 
1.33 	 || 	 Topological Aspects of String Phenomenology | Jonathan J. Heckman | Thu May 25 15:00:00 2023
1.01 	 || 	 Off Shell String Theory | Aron C. Wall | Tue Feb 14 13:00:00 2023
0.91 	 || 	 UV/IR Mixing, EFTs, Hidden Cancellations, and Origami:    Calculating the Higgs Mass and Gauge-Coupling Running in String Theory | Steven Adam Abel | Thu Feb 16 16:00:00 2023
0.41 	 || 	 Index theorems, Generalized Hall current for topological insulators

### TESTING SECTIONS THROWN HERE

In [None]:
bs_content = find_person('Andrei', 'Constantin', 100)
find_titles(bs_content).most_common(6)

[('hep-th', 35),
 ('math.AG', 11),
 ('cs.AI', 2),
 ('math.AP', 2),
 ('35Q35', 2),
 ('hep-ph', 2)]

In [None]:
coauthors = find_coauthors(bs_content)
coauthors.most_common(10)

[('Andrei Constantin', 35),
 ('Andre Lukas', 26),
 ('Callum Brodie', 9),
 ('Philip Candelas', 6),
 ('Evgeny Buchbinder', 6),
 ('Thomas Harvey', 6),
 ('James Gray', 4),
 ('Fabian Ruehle', 4),
 ('Rehan Deen', 3),
 ('Steven Abel', 3)]

In [None]:
Fname, Lname = firstlastname('Julia Yeomans')
bs_content = find_person(Fname, Lname, 100)
find_titles(bs_content).most_common(6)

[('cond-mat.soft', 89),
 ('physics.flu-dyn', 35),
 ('physics.bio-ph', 34),
 ('cond-mat.stat-mech', 11),
 ('cond-mat.mtrl-sci', 3),
 ('cond-mat.mes-hall', 2)]

For the recommender system, we could write some code which might convert the above values to a value between 0 to 1 for the recommender. Not a good idea to use a percentage, maybe a tanh like function that saturates for n>=10, and rescales everything so that the largest number is 1 at the end?

In [None]:
#bs_content

In [None]:
# Try to link two people through their co-author lists.
# NOTE(review): the third argument is presumably the maximum search depth —
# confirm against the definition of lookForConnection.
lookForConnection('Prateek Sarkar', 'Claire Gwenlan', 3)

In [None]:
#to show the contents of extracted file, uncomment below
#print(PTfile)

In [None]:
#for speaker in PTfile['hits']['hits']:
#  print(speaker['metadata']['speakers'][0]['name'])

#PTfile['hits']['hits'][0]['metadata']['speakers'][0]['name']

In [None]:
#sd = PTfile['hits']['hits'][0]['metadata']
#sd

In [None]:
# Debug dump: sub_cats, then the start and end indices for each category,
# one object per line. (Seems unnecessary now.)
print(sub_cats, cat_start_idx, cat_end_idx, sep='\n')

In [None]:
# Show the string form of the `liam` user object (defined elsewhere).
print(liam)

In [None]:
# Parse the ISO-format start time stored under 'start_datetime' and render it
# with '%c' (the locale's date-and-time representation).
# NOTE(review): in the visible part of the notebook `sd` is only assigned in
# a commented-out scratch cell, so it must be defined before running this.
sdddd = datetime.datetime.fromisoformat(sd['start_datetime'])
sdddd.strftime('%c')

In [None]:
# Scratch check: run evaluateAuthor on `liam` for a single author name and
# display whatever it returns.
liam.evaluateAuthor('Andrei Constantin')

In [None]:
# Display the name attribute stored on the `liam` object.
liam.name

In [None]:
# Reorder a "Last, First" string into "First Last".
# NOTE: the slicing assumes the separator is exactly a comma followed by one
# space, as in the sample below.
strName = "Sen, Srimoyee"
commaIndex = strName.find(',')
lastName, firstName = strName[:commaIndex], strName[commaIndex + 2:]
authorName = ' '.join((firstName, lastName))


In [None]:
# Build a throwaway Seminar instance for the checks in the following cells.
# NOTE(review): the meaning of the three positional arguments is set by
# Seminar.__init__, which is defined elsewhere — confirm before reusing.
seminar_test = Seminar("HelloWorld", 1, "Saraswat Bhattacharyya")

In [None]:
# Display the cat_lists attribute of the test seminar.
seminar_test.cat_lists

In [None]:
# Score the test seminar for the user `liam` and display the result.
seminar_test.evaluateUser(liam)

In [None]:
# Display what calcCatScore_seminar returns for the test seminar
# (presumably the category-score part of the evaluation — confirm).
liam.calcCatScore_seminar(seminar_test)

The idea is as follows:


1.   Count keywords from all the different arXiv categories. Keep them in an adjacency matrix
2.   Use dimensionality reduction to reduce number of features for easier training
3.   Count the value of dimension-reduced keywords in the abstract of the talk. This is the parameter value for the recommender system.
4.   Figure out a way to allow this thing to update with time (eventually)



In [None]:
# Score every seminar in seminar_list for the user `andrei`.
# temp_score_list[i] is the score of seminar_list[i].
temp_score_list = []
#print("Speaker Name | Full-category score | Sub-category score | Co-author score | Keyword score | Total score")
for seminar in seminar_list:
    temp_score = seminar.evaluateUser(andrei, debug=0)
    temp_score_list.append(temp_score)
    #print(str(seminar) + "  || SCORE - " + str(temp_score))

# Pair each seminar with its own score as [seminar, score].
# The previous `seminar_list + temp_score_list` concatenated the two lists,
# giving all seminars followed by all scores with no link between them.
scored_seminar_list = [
    [sem, score] for sem, score in zip(seminar_list, temp_score_list)
]
#for scored in scored_seminar_list:
#    print(scored)

astro-ph not found in list
14H40 (Primary) 14K30, 14H20 (Secondary) not found in list
34C07, 34C25, 16T05, 16T30 not found in list
13F60, 20F36 (Primary) 16G20 (Secondary) not found in list
31C05, 60G50, 60J10, 82C24, 28A80 not found in list
astro-ph not found in list
cond-mat not found in list
53C25, 53D35, 57R17, 14J25 not found in list
53C25, 53D35, 57R17 not found in list
53D05, 57R17, 57R18 not found in list
astro-ph not found in list
astro-ph not found in list
astro-ph not found in list


### JUNK THROWN HERE

#### OX PHYSICS

In [None]:
 # Build and echo the URL of page 1 of the Oxford Physics all-events API.
 urlstub = "https://web01-dev.physics.ox.ac.uk/api/all-events/v1?page="
 page1_url = f"{urlstub}1"
 print(page1_url)

https://web01-dev.physics.ox.ac.uk/api/all-events/v1?page=1


In [None]:
# Open page 1 with urllib and print the response object itself (not its body).
# NOTE(review): the recorded output of this cell is a URLError, and the
# response is never closed here.
data = urllib.request.urlopen(page1_url)
print(data)

URLError: ignored

In [None]:
# Download page 1 of the events API and decode its JSON payload.
# The timeout stops the cell from hanging forever on an unreachable host.
# raise_for_status() reports HTTP errors directly instead of letting them
# surface later as a confusing JSON decode failure on an error page.
page_request = requests.get(page1_url, timeout=30)
page_request.raise_for_status()
page_file = json.loads(page_request.content)
print(page_file)

In [None]:
# List the position and speakers of every event that has a non-empty
# 'field_speakers' value.
for element_index, element in enumerate(page_file):
    if element['field_speakers'] != '':
        print(f"{element_index} {element['field_speakers']!s}")


In [None]:
# Inspect the second event record of the downloaded page.
page_file[1]

In [None]:
# Display recListAndrei (defined elsewhere in the notebook).
recListAndrei

In [None]:
def myFunc(e):
    """Sort key: return the last item of the sequence ``e``."""
    last_position = -1
    return e[last_position]

# Sort new_list in place by the last item of each entry, largest first,
# then display the sorted list.
new_list.sort(reverse=True, key=myFunc)
new_list

In [None]:
# Print only the first item of every entry in new_list, one per line.
for element in new_list:
    print(element[0])

In [None]:
# Display the string form of the fourth seminar in seminar_list.
str(seminar_list[3])

In [None]:
# Run the keyword scoring for one seminar (the return value is discarded),
# then keep a reference to the user's keywordList and pick an arbitrary
# seminar to use in the connection-search check.
# NOTE(review): calcKeywordScore presumably populates andrei.keywordList as a
# side effect — confirm against its definition.
andrei.calcKeywordScore(seminar_list[3])
keywordList = andrei.keywordList
randomSeminar = seminar_list[15]

In [None]:
# Run quickConnectionSearch for the user and the chosen seminar with debug
# enabled (debug=1).
quickConnectionSearch(andrei, randomSeminar, debug=1)

In [None]:
# Store the result of findPageCoauthsFromName for one author name.
# NOTE(review): the return type is not visible from this cell.
cc = findPageCoauthsFromName('Andrei Constantin')

In [None]:
# Pair every seminar with its score as [seminar, score], then display the
# result.
# NOTE: zip stops at the shorter list. temp_score_list is expected to hold
# one score per entry of seminar_list — confirm both are current before
# running this cell.
new_list = [
    [sem, score] for sem, score in zip(seminar_list, temp_score_list)
]
new_list