This search engine allows the user to input a series of keywords as well as restrictions on allowed authors, venues, and years, and then returns the most common words associated with these searches. 

# Don't worry about it

In [0]:
!pip install nltk



In [0]:
import requests
from lxml import html
from functools import reduce
from collections import Counter
import string
import numpy
import itertools
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
# returns an xml of coauthors
# https://dblp.uni-trier.de/pers/xc/urlpt.xml
# for example: urlpt = l/Ley:Michael

In [0]:
def getCoauthors(author):
  url = "https://dblp.uni-trier.de/pers/xc/" + str(author) + ".xml"
  page = requests.get(url)
  tree = html.fromstring(page.content)
  coauthor_names = []
  for i in range(15):
    coauthor_names.append(tree.xpath('//author[@count="' + str(i) + '"]/text()'))
  return coauthor_names
  #with open(url, 'wb') as f: 
   #     f.write(resp.content)

In [0]:
# returns an xml of a publication's info
# https://dblp.uni-trier.de/rec/xml/key.xml
# for example: key = journals/sigmod/Ley00

In [0]:
# returns an xml of an author's works
# https://dblp.uni-trier.de/pers/xk/urlpt.xml
# for example: urlpt = l/Ley:Michael

In [0]:
# https://dblp.uni-trier.de/search/publ/api?q=big%20data&h=1000&format=xml

In [0]:
def getSearchResults(stringSet=None, peopleRestriction=None, venueRestriction=None, yearRestriction=None):
  listsofauthors = []
  for string in stringSet:
    string.replace(" ", "%20")
    url = "https://dblp.uni-trier.de/search/publ/api?q=" + str(string) + "&h=1000&format=xml"
    page = requests.get(url)
    tree = html.fromstring(page.content)
    author_names = tree.xpath('//author/text()')
    listsofauthors.append(author_names)
      #with open(url, 'wb') as f: 
      #     f.write(resp.content)
  return listsofauthors

## The important ones

In [0]:
# i'm getSearchResults but stronger
def getYourChoice(what, stringSet=[' '], peopleRestrictions = ['blank'], venueRestrictions = ['blank'], yearRestrictions = ['blank']):
  a = [stringSet, peopleRestrictions, venueRestrictions, yearRestrictions]
  b = list(itertools.product(*a))
  lists = []

  for i in range(len(b)):
    string = ""
    
    keyword = b[i][0]
    keyword.replace(" ", "%20")
    string += keyword

    person = b[i][1]
    if person != 'blank':
      person.replace(" ", "_")
      string += "%20author%3A" + person + "%3A"

    venue = b[i][2]
    if venue != 'blank':
      venue.replace(" ", "_")
      string += "%20venue%3A" + venue + "%3A"

    year = b[i][3]
    if year != 'blank':
      string += "%20year%3A" + year + "%3A"

    url = "https://dblp.uni-trier.de/search/publ/api?q=" + str(string) + "&h=1000&format=xml"
    page = requests.get(url)
    tree = html.fromstring(page.content)
    stuff = tree.xpath('//' + what + '/text()')
    lists.append(stuff)
      #with open(url, 'wb') as f: 
      #     f.write(resp.content)
  return lists

In [0]:
def userInput():
  stringSet = []
  authorSet = []
  venueSet = []
  yearSet = []

  print("Enter keywords to search across: ")
  while(True):
    s = input("Keyword: ")
    if len(s) > 0:
      stringSet.append(s)
    else:
      break

  print("Enter author restrictions: ")
  while(True):
    s = input("Author: ")
    if len(s) > 0:
      authorSet.append(s)
    else:
      break

  print("Enter venue restrictions: ")
  while(True):
    s = input("Venue: ")
    if len(s) > 0:
      venueSet.append(s)
    else:
      break

  print("Enter year restrictions: ")
  while(True):
    s = input("Year: ")
    if len(s) > 0:
      yearSet.append(s)
    else:
      break

  return stringSet, authorSet, venueSet, yearSet

In [0]:
def intersections(lists):
  temp = Counter(x for xs in lists for x in xs)
  most_common = temp.most_common()
  in_all = list(reduce(lambda i, j: i & j, (set(x) for x in lists)))
  both = []
  for i in range(len(most_common)):
    name = str(most_common[i][0])
    if (str(name) in in_all):
      both.append(name)
  return most_common

In [0]:
def executeSearch():
  stringSet = userInput()
  print()
  print("These authors are all related to", str(stringSet), ":")
  authors = getSearchResults(stringSet)
  common = intersections(authors)
  print(common)
  return common

In [0]:
# executeSearch()

In [0]:
def strongSearch(what): 
  stringSet, authorSet, venueSet, yearSet = userInput()
  print()
  if len(stringSet) == 0:
    stringSet = [' ']
  if len(authorSet) == 0:
    authorSet = ['blank']
  if len(venueSet) == 0:
    venueSet = ['blank']
  if len(yearSet) == 0:
    yearSet = ['blank']

  authors = getYourChoice(what, stringSet, authorSet, venueSet, yearSet)
  common = intersections(authors)
  #print(common)
  return common

In [0]:
# l = strongSearch('title')

In [0]:
import re

def getPopularWords(l): 
  titles = []
  words = []
  for i in range(len(l)):
    title = l[i][0]
    titles.append(title)
    res = re.findall(r'\w+', title)
    for word in res:
      words.append(word.lower())
  temp = Counter(words)
  return temp.most_common()

In [0]:
def wordCountSearch(what = 'title'):
  l = strongSearch(what)
  searchResult = getPopularWords(l)
  stopWords = set(stopwords.words('english'))
  filteredResult = [word for word in searchResult if not word[0] in stopWords]
  return filteredResult

# Implementing wordCountSearch

In [0]:
searchResult = wordCountSearch()

Enter keywords to search across: 
Keyword: data sciencce
Keyword: machine learning
Keyword: 
Enter author restrictions: 
Author: 
Enter venue restrictions: 
Venue: 
Enter year restrictions: 
Year: 2017
Year: 2018
Year: 



In [0]:
print(searchResult)



In [0]:
dataAuthors = strongSearch('author')

Enter keywords to search across: 
Keyword: data science
Keyword: 
Enter author restrictions: 
Author: 
Enter venue restrictions: 
Venue: 
Enter year restrictions: 
Year: 



In [0]:
bioAuthors = strongSearch('author')

Enter keywords to search across: 
Keyword: biology
Keyword: 
Enter author restrictions: 
Author: 
Enter venue restrictions: 
Venue: 
Enter year restrictions: 
Year: 



In [0]:
print(len(dataAuthors))
print(len(bioAuthors))


def authorIntersection(firstList, secondList):
  secondListNames = [author[0] for author in secondList]
  firstListNames = [author[0] for author in firstList]
  intersectedList = list( set(firstListNames) & set(secondListNames))
  return intersectedList


attempt = authorIntersection(dataAuthors, bioAuthors)
print(len(attempt))
print(attempt)

2937
3682
36
['Yang Liu', 'Chao Sun', 'Reinhard Schneider 0002', 'Emanuela Merelli', 'Robert Hoehndorf', 'Bindu Nanduri', 'Wei Yang', 'Clemens Beckstein', 'Plamen Angelov 0001', 'Chao Fang', 'Yaochu Jin', 'Xiaodong Wang', 'Xuequn Shang', 'Qi Li', 'Xiaofeng Song', 'Peter M. A. Sloot', 'Yang Zhang', 'Yu Wang', 'Allyson L. Lister', 'Yuhui Shi', 'Boris G. Mirkin', 'Ying Li', 'Peter V. Coveney', 'Peter Dittrich', 'Yue Zhang', 'Yong Liu', 'Xiaohua Hu', 'Chao Wang', 'Min Li 0007', 'Nicolas Férey', 'Zhongming Zhao', 'Bud Mishra', 'Ying Tan 0002', 'Min Zeng', 'Lu Zhang', 'Michel Dumontier']


In [0]:
dataWords = wordCountSearch()
bioWords = wordCountSearch()


Enter keywords to search across: 
Keyword: data science
Keyword: 
Enter author restrictions: 
Author: 
Enter venue restrictions: 
Venue: 
Enter year restrictions: 
Year: 

Enter keywords to search across: 
Keyword: biology
Keyword: 
Enter author restrictions: 
Author: 
Enter venue restrictions: 
Venue: 
Enter year restrictions: 
Year: 



In [0]:

def searchCorrelation(firstDict, secondDict):
  totalWords = sum(firstDict.values()) + sum(secondDict.values())
  correlationList = []
  for word1, num1 in firstDict.items():
    for word2, num2 in secondDict.items():
      if word1 == word2:
        correlationList.append([word1, num1 + num2, (num1 + num2) / totalWords])
  return correlationList


dataWords = dict(dataWords)
bioWords = dict(bioWords)

dictDataAuthors = dict(dataAuthors)
dictBioAuthors = dict(bioAuthors)

dataBioAuthorCorr = searchCorrelation(dictDataAuthors, dictBioAuthors)

totalWords = sum(dataWords.values()) + sum(bioWords.values())
dataBioCorr = searchCorrelation(dataWords, bioWords)



In [0]:
print(dataBioCorr)
print(dataBioAuthorCorr)
print(totalWords)

keys = set(dataWords.keys()) | set(bioWords.keys())
dataBioCorrTotal = numpy.corrcoef(
    [dataWords.get(x, 0) for x in keys],
    [bioWords.get(x, 0) for x in keys])[0, 1]
print(dataBioCorrTotal)

authorKeys = set(dictDataAuthors.keys()) | set(dictBioAuthors.keys())
dataBioCorrTotalAuthors = numpy.corrcoef(
    [dictDataAuthors.get(x, 0) for x in authorKeys],
    [dictBioAuthors.get(x, 0) for x in authorKeys])[0, 1]

print(dataBioCorrTotalAuthors)





[['data', 1424, 0.06968777527649996], ['science', 552, 0.02701380052853088], ['big', 152, 0.007438582754233141], ['international', 139, 0.006802388176568464], ['conference', 135, 0.006606635998825487], ['systems', 316, 0.015464422041695213], ['mining', 89, 0.004355485954781247], ['proceedings', 81, 0.003963981599295292], ['database', 91, 0.004453362043652736], ['management', 75, 0.003670353332680826], ['based', 177, 0.00866203386512675], ['sciences', 72, 0.003523539199373593], ['knowledge', 71, 0.0034746011549378485], ['applications', 84, 0.004110795732602525], ['research', 86, 0.004208671821474014], ['databases', 65, 0.0031809728883233825], ['analysis', 132, 0.0064598218655182536], ['large', 58, 0.0028384065772731723], ['scale', 60, 0.002936282666144661], ['life', 53, 0.0025937163550944506], ['transactions', 60, 0.002936282666144661], ['open', 51, 0.0024958402662229617], ['2018', 51, 0.0024958402662229617], ['papers', 51, 0.0024958402662229617], ['centered', 41, 0.0020064598218655183]

In [45]:
compsciAuthors = strongSearch('author')
compsciWords = wordCountSearch()

Enter keywords to search across: 
Keyword: data science
Keyword: 
Enter author restrictions: 
Author: 
Enter venue restrictions: 
Venue: 
Enter year restrictions: 
Year: 

Enter keywords to search across: 
Keyword: asdf
Keyword: 
Enter author restrictions: 
Author: 
Enter venue restrictions: 
Venue: 
Year: 
Enter year restrictions: 



In [23]:
compsciAuthors = dict(compsciAuthors)
compsciWords = dict(compsciWords)

dataCSCorr = searchCorrelation(compsciWords, dataWords)

dataCompSciCorrTotal = numpy.corrcoef(
    [dataWords.get(x, 0) for x in keys],
    [compsciWords.get(x, 0) for x in keys])[0, 1]
print(dataCompSciCorrTotal)

dataCSAuthorCorr = searchCorrelation(compsciAuthors, dictDataAuthors)
print(dataCSAuthorCorr)

0.27584726879933674
[['Jack J. Dongarra', 25, 0.004066363044892648], ['Peter M. A. Sloot', 20, 0.0032530904359141183], ['Zeguang Lu', 12, 0.001951854261548471], ['Vassil N. Alexandrov', 7, 0.0011385816525699415], ['Jaroslav Pokorný', 6, 0.0009759271307742355], ['Roger Y. Lee', 5, 0.0008132726089785296], ['Andreas Holzinger', 9, 0.0014638906961613532], ['Albert Y. Zomaya', 5, 0.0008132726089785296], ['Julius Stuller', 4, 0.0006506180871828237], ['Alexandru Telea', 4, 0.0006506180871828237], ['Weipeng Jing', 6, 0.0009759271307742355], ['Xianhua Song', 6, 0.0009759271307742355], ['Hongzhi Wang', 6, 0.0009759271307742355], ['Gerhard Weikum', 4, 0.0006506180871828237], ['Barbara Catania', 3, 0.00048796356538711777], ['A Min Tjoa', 3, 0.00048796356538711777], ['Georg Gottlob', 4, 0.0006506180871828237], ['Krzysztof Grochla', 3, 0.00048796356538711777], ['Qinglei Zhou', 4, 0.0006506180871828237], ['Yan Wang 0031', 4, 0.0006506180871828237], ['Wei Xie 0004', 4, 0.0006506180871828237], ['Beiji 

In [0]:

dataBioAuthorCorr
dataBioCorr
dataCSAuthorCorr
dataCSCorr

import pandas as pd
df=pd.DataFrame.from_dict(dic,orient='index',columns=['words'])

from google.colab import files
df.to_csv('sarah_yurick.csv') 
files.download('sarah_yurick.csv')


In [0]:
import pandas as pd
from google.colab import files

dataBioAuthorCorrdf = pd.DataFrame.from_records(dataBioAuthorCorr)
dataBioAuthorCorrdf.columns = ['Name', 'Comparison', 'Score']
dataBioAuthorCorrdf.to_csv('dataBioAuthorCorrdf.csv') 
files.download('dataBioAuthorCorrdf.csv')

dataBioCorrdf = pd.DataFrame.from_records(dataBioCorr)
dataBioCorrdf.columns = ['Name', 'Comparison', 'Score']
dataBioCorrdf.to_csv('dataBioCorrdf.csv') 
files.download('dataBioCorrdf.csv')

dataCSAuthorCorrdf = pd.DataFrame.from_records(dataCSAuthorCorr)
dataCSAuthorCorrdf.columns = ['Name', 'Comparison', 'Score']
dataCSAuthorCorrdf.to_csv('dataCSAuthorCorrdf.csv') 
files.download('dataCSAuthorCorrdf.csv')

dataCSCorrdf = pd.DataFrame.from_records(dataCSCorr)
dataCSCorrdf.columns = ['Name', 'Comparison', 'Score']
dataCSCorrdf.to_csv('dataCSCorrdf.csv') 
files.download('dataCSCorrdf.csv')


In [0]:
dataCSCorrdf = pd.DataFrame.from_records(dataCSCorr)
dataCSCorrdf.columns = ['Name', 'Comparison', 'Score']
df.to_csv('dataCSCorrdf.csv') 
files.download('dataCSCorrdf.csv')