# Self Study 3

This self study concludes our first "miniproject" on crawling and search. The tasks for this self study are:
- modify/extend the inverted index you constructed in the previous self study so that every posting also stores its term frequency (if your documents are just the titles of the web pages, you will see very few term frequencies larger than 1, but do not worry about that).
- calculate the idf values for all terms, and also include them in your index (cf. slide 3.20 for a schematic view)
- implement ranked retrieval as described on slides 3.19 and 3.20 for the ntc.bnc similarity metric 

In [103]:
import requests
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
from nltk.stem.snowball import SnowballStemmer
import string
import nltk
nltk.download('punkt')
import math

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\minhs\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [104]:
# Containers for the crawl results and the derived vocabulary; the driver
# code further down rebinds them with the real data.
titles, corpus, urls = [], [], []

# Number of linked pages to turn into documents.
numOfArticles = 10

# Danish Snowball stemmer (not referenced by the code visible in this section).
dstemmer = SnowballStemmer("danish")

def getTitlesAndUrls(link):
    """Collect the titles and URLs of pages linked from ``link``.

    The page at ``link`` is downloaded and its ``<a>`` elements are walked in
    document order. Each usable link is resolved to an absolute URL, fetched
    via ``_getTitles`` and recorded, until ``numOfArticles`` (module-level)
    pages have been collected.

    Links are skipped when they have no ``href``, are pure fragments
    (``#...``), do not resolve to an http(s) URL, or are disallowed by the
    site's robots.txt.

    Args:
        link: URL of the page whose outgoing links are crawled.

    Returns:
        A ``(titles, urls)`` tuple of two equally long lists; ``titles[k]``
        is whatever ``_getTitles`` returned for ``urls[k]``.
    """
    from urllib.parse import urljoin, urlparse

    titles = []
    # Keep the result local: appending to the module-level ``urls`` list made
    # every additional call accumulate the entries of the previous ones.
    urls = []

    # robots.txt lives at the site root; pointing the parser at the page
    # itself would make it parse HTML as robots rules.
    rp = RobotFileParser()
    rp.set_url(urljoin(link, '/robots.txt'))
    rp.read()

    r = requests.get(link)
    r_parse = BeautifulSoup(r.text, 'html.parser')

    for a in r_parse.find_all('a'):
        # Cap on collected pages, not on inspected anchors, so skipped links
        # never change how many documents come back.
        if len(urls) >= numOfArticles:
            break

        _link = a.get('href')
        if not _link or _link.startswith('#'):
            continue  # no target, or an in-page anchor such as '#main'

        # Resolve relative and root-relative hrefs against the crawled page.
        _link = urljoin(link, _link)
        if urlparse(_link).scheme not in ('http', 'https'):
            continue  # mailto:, javascript:, tel:, ... cannot be fetched
        if not rp.can_fetch('*', _link):
            continue

        titles.append(_getTitles(_link))
        urls.append(_link)
    return titles, urls

def _getTitles(link):
    """Download ``link`` and return the text of its ``<title>`` element.

    Args:
        link: Absolute URL of the page to fetch.

    Returns:
        The title string, or ``''`` when the page has no ``<title>`` element
        or the element has no single string child. Returning an empty string
        instead of ``None`` keeps the result safe to tokenize.
    """
    r = requests.get(link)
    r_parse = BeautifulSoup(r.text, 'html.parser')
    title_tag = r_parse.find('title')
    # find() yields None for a missing tag, and .string is None when the tag
    # is empty or has several children.
    if title_tag is None or title_tag.string is None:
        return ''
    return title_tag.string

def remove_non_ascii(a_str):
    """Return ``a_str`` with every character outside ``string.printable`` removed.

    Characters that are kept retain their original order; anything else
    (for example accented or other non-ASCII letters) is dropped.
    """
    allowed = frozenset(string.printable)
    kept = [ch for ch in a_str if ch in allowed]
    return ''.join(kept)

def tokenizeAndStemTitles(titles):
    """Build the vocabulary and inverted index for a list of titles.

    Each title is one document; its position in ``titles`` is its document
    id. Titles are tokenized with ``nltk.word_tokenize``, stemmed with NLTK's
    ``PorterStemmer``, stripped of characters outside ``string.printable``
    (via ``remove_non_ascii``) and of hyphens. Tokens that end up empty are
    ignored.

    Args:
        titles: Iterable of title strings. A ``None`` title is treated as an
            empty document so the remaining document ids stay unchanged.

    Returns:
        A ``(tokens, invertedIndex)`` tuple. ``tokens`` lists the distinct
        terms in first-seen order. ``invertedIndex`` is a parallel list of
        ``{'vocabulary': term, 'postings': [doc_id, ...]}`` dicts; a document
        id is appended once per occurrence, so repeated ids encode the term
        frequency.
    """
    # One stemmer is enough; it used to be rebuilt for every title.
    ps = nltk.PorterStemmer()

    tokens = []
    _invertedIndex = []
    # term -> its entry in _invertedIndex, replacing the linear scans over
    # both lists that were run for every single token.
    entryByTerm = {}

    for i, title in enumerate(titles):
        for t in nltk.word_tokenize(title or ''):
            s = remove_non_ascii(ps.stem(t)).replace("-", "")
            if not s:
                continue

            entry = entryByTerm.get(s)
            if entry is None:
                entry = dict(vocabulary=s, postings=[])
                entryByTerm[s] = entry
                tokens.append(s)
                _invertedIndex.append(entry)
            entry['postings'].append(i)

    return tokens, _invertedIndex

# Crawl the AAU front page: the titles of the linked pages are the documents,
# urls holds the address each title was fetched from.
titles, urls = getTitlesAndUrls('https://www.aau.dk/')

# corpus is the list of distinct stemmed terms; invertedIndex holds one
# {'vocabulary', 'postings'} entry per term, in the same order as corpus.
corpus, invertedIndex = tokenizeAndStemTitles(titles)
print("corpus:", corpus)
print("inverted matrix:", invertedIndex)
print()

##new exercise:

#1.modify/extend the inverted index you constructed in the previous self study to contain for all postings the term frequencies 
#(if your documents are just the titles of the web pages, you will see very few term frequencies larger than 1, but do not worry about that).

def showTF(termFrequencyMatrix):
    """Print a heading followed by every row of the term-frequency matrix."""
    print("termFrequencyMatrix:")
    rendered = [str(line) for line in termFrequencyMatrix]
    if rendered:
        print("\n".join(rendered))

# Term-frequency matrix with a header row and a header column:
#   row 0      -> "X" followed by the terms of the corpus
#   column 0   -> the URL of each document
#   cell [d+1][t+1] -> how often term t occurs in document d
# The number of rows follows the documents actually crawled (len(urls))
# instead of the configured numOfArticles, so a shorter or longer crawl can
# neither leave empty rows behind nor run past the end of the matrix.
termFrequencyMatrix = [[0 for _ in range(len(corpus)+1)] for _ in range(len(urls)+1)]
termFrequencyMatrix[0][0] = "X"

for termIdx, (term, entry) in enumerate(zip(corpus, invertedIndex), start=1):
    termFrequencyMatrix[0][termIdx] = term
    # A document id appears once per occurrence in the postings list, so
    # incrementing per posting yields the term frequency in a single pass.
    for docId in entry['postings']:
        termFrequencyMatrix[docId+1][termIdx] += 1

for docIdx, url in enumerate(urls, start=1):
    termFrequencyMatrix[docIdx][0] = url

#showTF(termFrequencyMatrix)

#2.calculate the idf values for all terms, and also include them in your index (cf. slide 3.20 for a schematic view)
# df(t) = sum over all documents d in the corpus of I[d,t]   (I = term-document incidence matrix)
# idf(t) = log(N/df(t))

# First build the term-document incidence matrix, which is used to calculate df
def column(matrix, i):
    """Return a list holding element ``i`` of every row in ``matrix``."""
    picked = []
    for line in matrix:
        picked.append(line[i])
    return picked

def showIncidentMatrix(incidentMatrix):
    """Print every row of the incidence matrix on its own line."""
    for rendered in map(str, incidentMatrix):
        print(rendered)

def showidfTable(idfTable):
    """Print every row of the idf table on its own line."""
    entries = list(idfTable)
    if entries:
        print(*entries, sep="\n")

# N in idf(t) = log10(N / df(t)) is the number of documents actually crawled.
# Using the configured numOfArticles gives wrong idf values (and a wrongly
# sized matrix) whenever the crawl returns a different number of pages.
numOfDocuments = len(urls)

# Term-document incidence matrix with a header row and a header column:
#   row 0      -> "X" followed by the terms of the corpus
#   column 0   -> the URL of each document
#   cell [d+1][t+1] -> 1 if term t occurs in document d, else 0
incidentMatrix = [[0 for _ in range(len(corpus)+1)] for _ in range(numOfDocuments+1)]
incidentMatrix[0][0] = "X"
for termIdx, (term, entry) in enumerate(zip(corpus, invertedIndex), start=1):
    incidentMatrix[0][termIdx] = term
    # Postings repeat a document id once per occurrence; presence is enough here.
    for docId in set(entry['postings']):
        incidentMatrix[docId+1][termIdx] = 1

for docIdx, url in enumerate(urls, start=1):
    incidentMatrix[docIdx][0] = url

# idf table: header row followed by one [term, df_t, idf_t] row per term.
idfTable = [['term', 'df_t', 'idf_t']]
documentRows = incidentMatrix[1:]  # drop the header row once, not per term
for termIdx, term in enumerate(corpus, start=1):
    # df(t): number of documents whose incidence entry for t is 1. It is at
    # least 1 because every term in the corpus has at least one posting.
    _df = sum(column(documentRows, termIdx))
    idfTable.append([term, _df, math.log10(numOfDocuments/_df)])

#showIncidentMatrix(incidentMatrix)
showidfTable(idfTable)

#3. implement ranked retrieval as described on slides 3.19 and 3.20 for the ntc.bnc similarity metric
# F[d,t] * idf(t) * 1 / ||V~(d)|| 

corpus: ['aau', 'viden', 'for', 'verden', 'aalborg', 'universitet', 'universitetsuddannels', 'videregend', 'uddannels', 'p', 'kandidatuddannels', 'sidefag', 'og', 'tilvalgsfag', 'studieby', 'her', 'kan', 'du', 'studer', 'su', 'sp', 'stttemulighed', 'forskn', 'forskningsnyt', 'fra', 'ph.d.uddannels']
inverted matrix: [{'vocabulary': 'aau', 'postings': [0, 1, 4]}, {'vocabulary': 'viden', 'postings': [0]}, {'vocabulary': 'for', 'postings': [0]}, {'vocabulary': 'verden', 'postings': [0]}, {'vocabulary': 'aalborg', 'postings': [0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9]}, {'vocabulary': 'universitet', 'postings': [0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9]}, {'vocabulary': 'universitetsuddannels', 'postings': [1]}, {'vocabulary': 'videregend', 'postings': [1]}, {'vocabulary': 'uddannels', 'postings': [1]}, {'vocabulary': 'p', 'postings': [1, 2, 3, 4, 5, 6, 7, 9]}, {'vocabulary': 'kandidatuddannels', 'postings': [2]}, {'vocabulary': 'sidefag', 'postings': [3]}, {'vocabulary':

In [97]:

# Incidence column of the first term: one 0/1 entry per document row.
column(incidentMatrix[1:], 1)

[1, 1, 0, 0, 1, 0, 0, 0, 0, 0]