# Self study 2


In this self-study we build an index that supports Boolean search over the web pages that you crawl with the crawler from the 1st self study. You can continue to just extract the titles of the web-pages you crawl, or you can be more adventurous and look at the whole text that you get from the .get_text() method of a BeautifulSoup parser. In either case, the collection of texts from the crawled web-pages is you corpus. You should then:

- construct the vocabulary of terms for your corpus
- build an 'inverted' index for your vocabulary
- implement Boolean search for your index (perhaps only for a limited set of Boolean queries)

In [6]:
# Some things already used in self study 1:
import requests
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
from nltk.stem.snowball import SnowballStemmer
import string


A useful resource is the nltk natural language processing package:
https://www.nltk.org/
which provides methods for tokenization, stemming, and much more (the 'punkt' package is needed for tokenization):

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sebas\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

Now let's use the title string of the AAU homepage as an example:

In [4]:
r=requests.get('https://www.aau.dk/')
r_parse = BeautifulSoup(r.text, 'html.parser')
string=r_parse.find('title').string
print(string)

AAU - Viden for verden - Aalborg Universitet


We can tokenize:

In [7]:
tokens=nltk.word_tokenize(string)
for t in tokens:
    print(t)

AAU
-
Viden
for
verden
-
Aalborg
Universitet


And we can stem:

In [8]:
ps=nltk.PorterStemmer()
for t in tokens:
    print(ps.stem(t))



aau
-
viden
for
verden
-
aalborg
universitet


For Danish language the Porter stemmer will not be terribly useful! There is also a Danish option:

In [9]:
from nltk.stem.snowball import SnowballStemmer

dstemmer=SnowballStemmer("danish")

In [12]:
for t in tokens:
    print(dstemmer.stem(t))


aau
-
vid
for
verd
-
aalborg
universit


In [13]:
titles = []
corpus = [] #collection of documents
postings = []

numOfArticles = 10

dstemmer=SnowballStemmer("danish")

#construct the vocabulary of terms for your corpus (corpus)
#build an 'inverted' index for your vocabulary (postings)

# web crawls on the given link
def getTitles(link):
    titles = []

    rp=RobotFileParser()
    rp.set_url(link)
    rp.read()
    r=requests.get(link)

    r_parse = BeautifulSoup(r.text, 'html.parser')
    r_parse.find('title').string
    for i, a in enumerate(r_parse.find_all('a')):
        if(i == numOfArticles+1): break
        _link = a['href']
        if(_link == '#main'):   continue
        if(_link[0] == '/'):    _link = link+_link[1:]
        titles.append(_getTitles(_link))
    return titles

# takes the found links in the crawl and finds the title of them
def _getTitles(link):
    r=requests.get(link)
    r_parse = BeautifulSoup(r.text, 'html.parser')
    return r_parse.find('title').string

def remove_non_ascii(a_str):
    ascii_chars = set(string.printable)

    return ''.join(
        filter(lambda x: x in ascii_chars, a_str)
    )

def tokenizeAndStemTitles(titles):
    _invertedIndex = [] # [{'vocabulary': 'aau', 'postings': [0, 1, 4]},...]
    tokens = []

    for i, title in enumerate(titles):
        #first tokenize the document
        _tokens=nltk.word_tokenize(title)

        #then stem the tokens and remove unwanted characters.
        ps=nltk.PorterStemmer()
        for t in _tokens:
            s = ps.stem(t)
            s = remove_non_ascii(s)
            s = s.replace("-", "")
            if(s == ''): continue
            if s not in tokens: tokens.append(s)

            flag = 0
            for el in _invertedIndex:
                if el['vocabulary'] == s:
                    el['postings'].append(i) #add index to posting
                    flag = 1

            #if flag is 0, the only occurrence of the word is in the current document
            if flag == 0: _invertedIndex.append(dict(vocabulary=s, postings=[i]))

    return tokens, _invertedIndex

titles = getTitles('https://www.aau.dk/')

corpus, invertedIndex = tokenizeAndStemTitles(titles)
print("corpus:", corpus)
print("inverted matrix:", invertedIndex)

#implement Boolean search for your index (perhaps only for a limited set of Boolean queries)
validQueryOperators = ["AND", "OR"]
query = "aau AND aalborg OR viden AND verden OR og AND sidefag"

def getPosting(invertedIndex, wd):
    for dict in invertedIndex:
        if(dict['vocabulary'] == wd):
            return dict['postings']
    return False

def booleanSearch(query, validQueryOperators, corpus):
    t = [x for x in query.split(" ")]

    for wd in t:
        if wd not in corpus and wd not in validQueryOperators:
            print(wd, "was not in the corpus. Try again.")
            return False

    print(t)

    t = merge(mergeAND, t, "AND")
    t = merge(mergeOR, t, "OR")
    
    return sorted(t[0]) #from [[1,0,3]] --> [1,0,3]

def merge(merge, t, boolOperator):
    end = False
    while not end:
        for i,wd in enumerate(t):
            if(wd == boolOperator):
                #take operator as it is, or get the posting from inverted index
                prev = t[i-1] if isinstance(t[i-1], list) else getPosting(invertedIndex, t[i-1])
                next = t[i+1] if isinstance(t[i+1], list) else getPosting(invertedIndex, t[i+1])
                t[i] = merge([prev,next])
                del t[i+1]
                del t[i-1]
                print(t)

        if boolOperator not in t:
            end = True
    return t

def mergeOR(postings):
    lstOr = []
    idx0 = 0
    idx1 = 0
    pst0 = postings[0]
    pst1 = postings[1]

    len0 = len(postings[0])
    len1 = len(postings[1])

    done=False

    while(not done):
        #break if max length for both is reached, otherwise append the rest of the other list
        if(idx0 > len0-1 and idx1 > len1-1):
            break
        elif(idx0 > len0-1):
            lstOr.append(pst1[idx1])
            idx1 += 1
            continue
        elif(idx1 > len1-1):
            lstOr.append(pst0[idx0])
            idx0 += 1
            continue

        if(pst0[idx0] > pst1[idx1]):
            lstOr.append(pst0[idx0])
            idx0 += 1
        elif(pst1[idx1] > pst0[idx0]):
            lstOr.append(pst1[idx1])
            idx1 += 1
        elif(pst0[idx0] == pst1[idx1]):
            lstOr.append(pst0[idx0])
            idx0 += 1
            idx1 += 1

    return lstOr

def mergeAND(postings):
    lstAnd = []
    idx0 = 0
    idx1 = 0
    pst0 = postings[0]
    pst1 = postings[1]

    len0 = len(postings[0])
    len1 = len(postings[1])

    smallest = len1 if len1 < len0 else len0
    
    done=False
    while(not done):
        if(idx0 > smallest-1 or idx1 > smallest-1):
            done = True
        elif(pst0[idx0] > pst1[idx1]):
            idx0 += 1
        elif(pst1[idx1] > pst0[idx0]):
            idx1 += 1
        elif(pst0[idx0] == pst1[idx1]):
            lstAnd.append(pst0[idx0])
            idx0 += 1
            idx1 += 1
    return lstAnd

print("query:", query)
print("---")
booleanSearch(query, validQueryOperators, corpus)
print("---")


corpus: ['aau', 'viden', 'for', 'verden', 'aalborg', 'universitet', 'universitetsuddannels', 'videregend', 'uddannels', 'p', 'kandidatuddannels', 'sidefag', 'og', 'tilvalgsfag', 'studieby', 'her', 'kan', 'du', 'studer', 'su', 'sp', 'stttemulighed', 'forskn', 'forskningsnyt', 'fra', 'ph.d.uddannels']
inverted matrix: [{'vocabulary': 'aau', 'postings': [0, 1, 4]}, {'vocabulary': 'viden', 'postings': [0]}, {'vocabulary': 'for', 'postings': [0]}, {'vocabulary': 'verden', 'postings': [0]}, {'vocabulary': 'aalborg', 'postings': [0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9]}, {'vocabulary': 'universitet', 'postings': [0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9]}, {'vocabulary': 'universitetsuddannels', 'postings': [1]}, {'vocabulary': 'videregend', 'postings': [1]}, {'vocabulary': 'uddannels', 'postings': [1]}, {'vocabulary': 'p', 'postings': [1, 2, 3, 4, 5, 6, 7, 9]}, {'vocabulary': 'kandidatuddannels', 'postings': [2]}, {'vocabulary': 'sidefag', 'postings': [3]}, {'vocabulary':

What is most useful for you depends on which websites you crawl. It is not essential for the exercise that the stemming always is the best possible ...!