In [7]:
import pickle
import csv
import pandas as pd
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk import sent_tokenize
import json
import time
import math

In [2]:
def cleaner(text):#input: a text to be clean:
                    #output: a list of words of the text cleaned
    text=text.lower()   
    # remove remaining tokens that are not alphabetic
    clean_text="".join([ch if ch.isalnum() else " " for ch in text]) #trasform all non alnumeric character
                                                                    #into a space
    words = word_tokenize(clean_text) 
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    # stemming of words
    porter = PorterStemmer()
    stemmed = [porter.stem(w) for w in words]
    
    return(stemmed)

In [14]:
vocab={}
index2={}
al=[]

In [15]:
path_vocab='/home/tiago/Scrivania/Libri Magistrale/1st semester/ADM/HomeWork3/vocabulary.pkl'
path_index='/home/tiago/Scrivania/Libri Magistrale/1st semester/ADM/HomeWork3/inverted_index2.pkl'
path_tsv='/home/tiago/Scrivania/Libri Magistrale/1st semester/ADM/HomeWork3/fileTsv/'
path_html='/home/tiago/Scrivania/Libri Magistrale/1st semester/ADM/HomeWork3/htmls.txt'


with open(path_index, 'rb') as f:
    index2=pickle.load(f)
with open(path_vocab, 'rb') as f:
    vocab=pickle.load(f)
with open(path_html, 'r') as f:
    al=json.loads(f.read())

In [29]:
def get_q_tfidf(query_words, term):
    freq_in_doc=1/len(query_words)
    num_of_words_in_doc=len(query_words)
    num_docs=30000
    num_docs_with_w=len(index2[vocab[term]])
    tf=freq_in_doc/num_of_words_in_doc
    idf=math.log(num_docs/num_docs_with_w)

    return tf*idf

def get_d_tfidf(term, doc):
    return index2[vocab[term]][doc]


def cosine(query_words, doc):
    #numerator
    numerator=0
    sum_q=0
    sum_d=0
    for term in query_words:
        q_tfidf=get_q_tfidf(query_words, term)
        d_tfidf=get_d_tfidf(term, doc)
        
        numerator+=(q_tfidf*d_tfidf)
        sum_q=sum_q+(q_tfidf**2)
        sum_d=sum_d+(d_tfidf**2)
    sum_q_sqrt=math.sqrt(sum_q)
    sum_d_sqrt=math.sqrt(sum_d)
    
    denominator=sum_q_sqrt*sum_d_sqrt
    print('num:', numerator, 'den: ', denominator, 'sum_q: ', sum_q_sqrt, 'sum_d: ', sum_d_sqrt)
    return numerator/denominator   

In [30]:
query=input('Give me a query :)')

query_words=set(cleaner(query))
posts_list=[]
start=time.time()
for word in query_words: # iterate over words
    if word in vocab.keys():#check if the word is present
        num_word=vocab[word]#retrive the number of the word
        posts_list.append(index[num_word])#retrive the doducemnts of the word

conjuntive_docs=set()
posts_list.sort(key=len) #sorting by the len of the dictionary inside
if len(posts_list)>0:
    conjuntive_docs=set(posts_list[0]) #take the smallest
    for posts in posts_list[1:]: #iterate over the documents from 1 to the end
        conjuntive_docs.intersection_update(set(posts.keys())) #update the set with commons documents

docs=list(conjuntive_docs)
output=[]
for doc in docs:#iterate over documents
    name_file_tsv=''.join(['article_', str(doc), '.tsv']) #obtain the name of the file
    file_tsv=None
    out_doc=[]
    with open(path_tsv+name_file_tsv, 'r') as file: #open the file
        file_tsv_reader=csv.reader(file, delimiter='\t') #read it
        list_file_content=next(file_tsv_reader)#read the only one line
        #print(type(list_file_content))
        
        out_doc.append(list_file_content[0]) #append title
        out_doc.append(list_file_content[1])#append intro
        out_doc.append(al[doc])
        out_doc.append(cosine(query_words, doc))
        

    output.append(out_doc)
seconds=round(time.time()-start, 2)
if len(output)>0:
    print('We have found {} results in {} second(s).'.format(len(output), seconds))
    output_df=pd.DataFrame(output, columns=['title', 'intro', 'url', 'similarity'])
    print(output_df)
else:
    print('No results.')
    


Give me a query :) more love


num: 0.20210713350422746 den:  0.20210713350422746 sum_q:  7.7440033031827555 sum_d:  0.026098534
num: 0.039542700706827395 den:  0.039542700706827395 sum_q:  7.7440033031827555 sum_d:  0.005106235
num: 0.07308338067750979 den:  0.07308338067750979 sum_q:  7.7440033031827555 sum_d:  0.009437416
num: 0.03907082760955126 den:  0.03907082760955126 sum_q:  7.7440033031827555 sum_d:  0.005045301
num: 0.14112653034112327 den:  0.14112653034112327 sum_q:  7.7440033031827555 sum_d:  0.018223976
num: 0.0750948545594983 den:  0.0750948545594983 sum_q:  7.7440033031827555 sum_d:  0.009697162
num: 0.024804058068100973 den:  0.024804058068100973 sum_q:  7.7440033031827555 sum_d:  0.003203002
num: 0.06496300332578574 den:  0.06496300332578574 sum_q:  7.7440033031827555 sum_d:  0.008388814
num: 0.03054231030773627 den:  0.03054231030773627 sum_q:  7.7440033031827555 sum_d:  0.003943995
num: 0.07820387054164028 den:  0.07820387054164028 sum_q:  7.7440033031827555 sum_d:  0.010098636
num: 0.11862810212