![image.png](attachment:9ecc7090-6b1a-4794-889d-cd58d12059ca.png)

In [32]:
import os
import json
import math
from collections import Counter
import pandas as pd

In [33]:
path = '../5/stemmed/gutenberg/'

In [34]:
inverted_index = '../6/inverted_index.json'

In [35]:
index = None

In [36]:
with open(inverted_index, 'r') as file:
    index = json.load(file)

In [37]:
vocabulary = list(index.keys())

In [38]:
files = os.listdir(path)
number_of_documents = len(files)

![image.png](attachment:5c36f2e5-1b9e-4349-9a65-0ee3f58d086d.png)

## Výpočet TF složky ze cesty, která je vstupem do metody

In [39]:
def get_tf_from_path(path):
    docId2tf = {}
    for docId, file_name in enumerate(os.listdir(path)):
        if '.txt' not in file_name:
            continue
        
        file_path = os.path.join(path, file_name)

        content = ''
        
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        
        words = content.split(' ')
        c = Counter(words)
        docId2tf[docId] = dict(c)
        
    return docId2tf

In [40]:
tf = get_tf_from_path(path)

![image.png](attachment:ed5a542a-f0a9-4961-90ef-eba3f9ce2672.png)

In [41]:
number_of_documents

90

## Výpočet IDF složky z invertovaného indexu, který obsahuje všechny slova a k nim v kolika dokumentech se vyskytuje

In [11]:
def get_idf(inverted_index):
    idf = {}
    print(number_of_documents)
    
    for k, v in inverted_index.items():
        idf[k] = math.log(number_of_documents / len(v))
    
    return idf

In [12]:
idf = get_idf(index)

90


In [57]:
def create_matrix(tf, idf):
    terms = list(idf.keys())
    columns = list(tf.keys())
    
    tf_df = pd.DataFrame(index=terms, columns=columns) 
    idf_df = pd.DataFrame(index=terms)
        
    idf_df['Value'] = 0
    for k, v in idf.items():
        idf_df.loc[k, 'Value'] = v
        
    tf_idf_df = pd.DataFrame(index=terms, columns=columns)
    
    for docId, values in tf.items():
        tf_df.loc[:, docId] = 0
        for term, value in values.items():
            tf_df.loc[term, docId] = value
            
    return tf_df, idf_df

In [58]:
tf_df, idf_df = create_matrix(tf, idf)

In [104]:
def get_fallback(df, k):
    try:
        return df.loc[k, 'Value']
    except:
        return 0

In [105]:
def create_tf_idf(tf_df, idf_tf):          
    terms = list(tf_df.index)
    tf_idf_df = pd.DataFrame(index=terms)
    
    for docId in tf_df.columns:
        print(docId)
        for k in terms:
            tf_idf_df.loc[k, docId] = tf_df.loc[k, docId] * get_fallback(idf_df, k)
    
    return tf_idf_df

In [106]:
tf_idf_df = create_tf_idf(tf_df, idf_df)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89


In [110]:
tf_idf_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,80,81,82,83,84,85,86,87,88,89
note,0.620936,0.275971,14.971453,1.655829,0.413957,2.966693,3.104679,0.137986,2.759715,0.620936,...,1.724822,4.001587,1.103886,0.344964,1.655829,0.068993,0.068993,0.275971,0.275971,1.379857
file,0.234639,0.03352,0.346372,0.055867,0.022347,0.06704,0.06704,0.044693,0.03352,0.044693,...,0.10056,0.089386,0.044693,0.044693,0.10056,0.06704,0.055867,0.044693,0.044693,0.089386
combin,1.363706,0.0,6.136679,0.0,1.363706,2.04556,1.704633,0.0,1.363706,0.681853,...,5.113899,3.409266,1.704633,1.363706,1.02278,0.0,0.0,0.340927,0.340927,1.704633
first,0.561821,10.180204,30.92265,0.898914,1.303426,3.033836,2.764161,1.146116,3.573184,2.337177,...,2.269758,2.449541,2.449541,2.427068,2.786634,0.494403,0.47193,0.876441,0.809023,2.651797
two,0.561821,18.80978,17.416463,2.000084,2.427068,4.741773,3.797913,0.853969,4.224897,0.921387,...,2.921471,4.539517,4.135005,0.853969,3.370928,0.606767,0.898914,0.449457,1.707937,5.19123


In [111]:
idf_df.head()

Unnamed: 0,Value
note,0.068993
file,0.011173
combin,0.340927
first,0.022473
two,0.022473


In [112]:
tf_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,80,81,82,83,84,85,86,87,88,89
note,9.0,4.0,217.0,24.0,6.0,43.0,45.0,2.0,40.0,9.0,...,25,58,16,5,24,1,1,4,4,20
file,21.0,3.0,31.0,5.0,2.0,6.0,6.0,4.0,3.0,4.0,...,9,8,4,4,9,6,5,4,4,8
combin,4.0,0.0,18.0,0.0,4.0,6.0,5.0,0.0,4.0,2.0,...,15,10,5,4,3,0,0,1,1,5
first,25.0,453.0,1376.0,40.0,58.0,135.0,123.0,51.0,159.0,104.0,...,101,109,109,108,124,22,21,39,36,118
two,25.0,837.0,775.0,89.0,108.0,211.0,169.0,38.0,188.0,41.0,...,130,202,184,38,150,27,40,20,76,231


## Znovu vytvoření indexu vzhledem k vstupní tabulce, který obsahuje složky TF, IDF, a TF-IDF

In [144]:
def rebuild_index(inverted_index, tf, idf, tf_idf):
    res = {}
    for term, docIds in inverted_index.items():
        for docId in docIds:
            value = res.get(term, [])
            tf_part = int(tf.loc[term, docId])
            idf_part = float(get_fallback(idf, term))
            tf_idf_part = float(tf_idf.loc[term, docId])
            value.append((docId, tf_part, idf_part, tf_idf_part))
            res[term] = value
    ## každý term bude obsahovat tuply (docId, tf, idf, tf-idf)    
    return res

In [145]:
new_index = rebuild_index(index, tf_df, idf_df, tf_idf_df)

In [146]:
new_index;

In [147]:
with open('new_inverted_index.json', 'w') as fp:
    json.dump(new_index, fp)

# Vyhledavácí systém

- Lepší je udělat kde všude se vyskytuje
- Nebo jen relevanci i když se nevyskytuje

In [167]:
query = "produc AND anonym"

In [186]:
def and_list(a, b):
    a_doc_id = [i[0] for i in a ]
    b_doc_id = [i[0] for i in b]
    a_s = set(a_doc_id)
    b_s = set(b_doc_id)
    keep_ids = a_s.intersection(b_s)
    
    
    res = {}
    
    for i in a:
        docid = i[0]
        if docid in keep_ids:
            res[docid] = res.get(docid, 0) + i[-1]
            
    for i in b:
        docid = i[0]
        if docid in keep_ids:
            res[docid] = res.get(docid, 0) + i[-1]
    
    return list(res.items())

In [205]:
def search(query_sequence, inverted_index, n):
    res = [v.strip() for v in query_sequence.split("AND")]
    l = []
    
    for item in res:
        if item == 'AND':
            continue
        else:
            if len(l) == 0:
                value = inverted_index.get(item, [])
                l = value
            else:
                value = inverted_index.get(item, [])
                l = and_list(l, value)
    res = sorted(l, key=lambda x: x[1], reverse=True)
    return res[0:n]

## Specific n

In [206]:
n = 10

In [207]:
search(query, new_index, n)

[(73, 4.472662258859316),
 (25, 3.9067395688493836),
 (6, 3.4410897737799573),
 (74, 2.4653837916912256),
 (35, 2.43186388989685),
 (7, 2.3759973869062234),
 (29, 2.3759973869062234),
 (13, 2.3648240863080985),
 (30, 2.3648240863080985),
 (84, 2.3424774851118477)]