# IR - HW5 Query Modeling
- In this project, you will have
    - 150 Queries
        - 60% public queries & 40% private queries
    - 30,000 Documents
- Our goal is to implement a PRF algorithm for retrieval
        
https://www.kaggle.com/c/2020-information-retrieval-and-applications-hw5/

In [1]:
import numpy as np
import pandas as pd
import math
import os

In [2]:
# Root directory of the data set; open_files() reads the query/document lists
# and the "queries" / "docs" sub-directories from here.
DATA_PATH = 'ntust-ir-2020_hw5_new'

In [29]:
# TF-IDF cut-off handed to word_vector_filter() as tfidf_thresh when the vocabulary is built.
thresh = 80
# BM25
k3 = 100  # only referenced by the commented-out query-TF factor in ranking_BM()
b = 0.75  # weight of the (document length / average length) ratio in the BM25 denominator
k1 = 0.8  # term-frequency scaling constant in the BM25 formula

# Rocchio
n_rel = 0  # negative: the last |n_rel| ranked documents are used as negative feedback; >= 0: the GAMMA term is skipped
rel = 10  # number of top-ranked documents used as positive feedback
iter_num = 3  # number of feedback iterations
ALPHA = 1  # weight of the current query vector
BETA = 0.75  # weight of the mean of the (rank-weighted) top documents
GAMMA = 0.15  # weight of the mean of the negative-feedback documents

In [4]:
def _read_first_line_terms(path):
    """Return the lower-cased, space-separated terms of the first line of *path*."""
    with open(path) as handle:
        # Only the first line is used; an empty file yields [''] instead of failing.
        return handle.readline().strip("\n").lower().split(" ")


def _load_collection(list_file, directory, extension):
    """Load every file named in *list_file* from *directory*; return (terms_by_name, names)."""
    terms_by_name = {}
    names = []
    with open(list_file) as listing:
        for line in listing:
            fname = line.strip("\n")
            if not fname:
                # Skip blank lines (e.g. a trailing newline at the end of the list).
                continue
            terms_by_name[fname] = _read_first_line_terms(
                os.path.join(directory, fname + extension)
            )
            names.append(fname)
    return terms_by_name, names


def open_files(root_path = DATA_PATH, extension = ".txt"):
    """
    Return the terms of all queries and documents plus the ordered name lists.

    The names are read from ``query_list.txt`` and ``doc_list.txt`` inside
    ``root_path``; the matching files are read from the ``queries`` and
    ``docs`` sub-directories.  Only the first line of each file is used; it
    is lower-cased and split on single spaces.

    Args:
        root_path: Directory holding the list files and the two
            sub-directories.  Defaults to ``DATA_PATH``.
        extension: Suffix appended to every listed name to form a file name.

    Returns:
        ``(querys, docs, query_name, doc_name)`` where ``querys`` / ``docs``
        map a name to its list of terms and ``query_name`` / ``doc_name``
        list the names in list-file order.

    Raises:
        OSError: If a list file or a listed file cannot be opened.
    """
    querys, query_name = _load_collection(
        os.path.join(root_path, "query_list.txt"),
        os.path.join(root_path, "queries"),
        extension,
    )
    docs, doc_name = _load_collection(
        os.path.join(root_path, "doc_list.txt"),
        os.path.join(root_path, "docs"),
        extension,
    )
    return querys, docs, query_name, doc_name

In [5]:
def TF(WV, vec_dict, word_2_id, subLinear = False):
    """
    Build one term-frequency vector per entry of ``vec_dict``.

    Every vector has ``len(WV)`` components and starts at 1 in each
    component, so a term that occurs ``c`` times ends up with the value
    ``1 + c``.  Terms that are not keys of ``word_2_id`` are ignored.

    With ``subLinear`` set, each value ``v`` is replaced by ``1 + log(v)``.

    Returns a dict mapping each key of ``vec_dict`` to its numpy vector.
    """
    vocab_size = len(WV)
    counts = {name: np.ones(vocab_size) for name in vec_dict}

    for name, tokens in vec_dict.items():
        row = counts[name]
        for token in tokens:
            column = word_2_id.get(token)
            if column is not None:
                row[column] += 1

    if not subLinear:
        return counts

    # sublinear_tf: replace tf with 1 + log(tf).
    return {name: 1 + np.log(row) for name, row in counts.items()}

In [6]:
def TFIDF(WV, vec, word_2_id, subLinear=False):
    """
    Compute TF, IDF and TF-IDF weights for every entry of ``vec``.

    * TF comes from ``TF(WV, vec, word_2_id, subLinear)``.
    * IDF of a term is ``log((N - n + 0.5) / (n + 0.5))`` where ``N`` is
      ``len(vec)`` and ``n`` the number of entries containing the term.
    * TF-IDF is ``TF * IDF`` for the terms present in an entry and 0 for
      every other component.

    Returns ``(tf_idf, _tf, _idf)``: two dicts of per-entry numpy vectors
    and one numpy vector of IDF values, all of length ``len(WV)``.
    """
    vocab_size = len(WV)
    total_entries = len(vec)

    # TF
    _tf = TF(WV, vec, word_2_id, subLinear)

    print("[INFO] TF Done...")

    # Document frequency: an entry contributes at most once per term.
    ni = np.zeros(vocab_size)
    for terms in vec.values():
        for term in set(terms):
            if term in word_2_id:
                ni[word_2_id[term]] += 1

    # IDF
    _idf = np.array(
        [math.log((total_entries - n + 0.5) / (0.5 + n)) for n in ni],
        dtype=float,
    )

    print("[INFO] IDF Done...")

    # TF-IDF
    tf_idf = {}
    for fname, terms in vec.items():
        columns = np.array(
            [word_2_id[term] for term in terms if term in word_2_id],
            dtype=int,
        )
        weights = np.zeros(vocab_size)
        weights[columns] = _tf[fname][columns] * _idf[columns]
        tf_idf[fname] = weights

    print("[INFO] TF-IDF Done...")

    return tf_idf, _tf, _idf

In [7]:
def avgDocLength(docs):
    """
    Return the average number of terms per document.

    Args:
        docs: Mapping from document name to its list of terms.

    Returns:
        The mean list length as a float, or ``0.0`` for an empty mapping
        (instead of raising ``ZeroDivisionError``).
    """
    if not docs:
        return 0.0
    return sum(len(terms) for terms in docs.values()) / len(docs)

In [8]:
def word_vector(word_vec):
    """
    Build the vocabulary of a collection and its term -> index mapping.

    Args:
        word_vec: Mapping from a name to its list of terms.

    Returns:
        ``(word_terms, word_2_id)``: the distinct terms in sorted order and
        a dict giving each term's position in that list.  Sorting makes the
        ids reproducible between runs; plain set iteration order is not
        stable for strings under hash randomisation.
    """
    vocabulary = set()
    for terms in word_vec.values():
        vocabulary.update(terms)
    word_terms = sorted(vocabulary)

    word_2_id = {word: idx for idx, word in enumerate(word_terms)}

    return word_terms, word_2_id

In [9]:
def word_vector_filter(word_vec, querys, filter_type=1, tfidf_thresh = 120):
    """
    Build the vocabulary used for retrieval, optionally pruned by TF-IDF.

    Args:
        word_vec: Mapping from a name (document or query) to its terms.
        querys: Mapping from query name to its terms.
        filter_type: ``1`` keeps the full vocabulary of ``word_vec``.
            ``2`` keeps every query term plus any other term that is longer
            than one character, is not all digits, and has a TF-IDF weight
            above ``tfidf_thresh`` in at least one entry of ``word_vec``.
        tfidf_thresh: TF-IDF cut-off used by ``filter_type == 2``.

    Returns:
        ``(WV, word_2_id)``: the kept terms and their term -> index mapping.

    Raises:
        ValueError: If ``filter_type`` is neither 1 nor 2.
    """
    if filter_type not in (1, 2):
        raise ValueError(f"filter_type must be 1 or 2, got {filter_type!r}")

    WV, word_2_id = word_vector(word_vec)
    if filter_type == 1:
        return WV, word_2_id

    # Only the TF-IDF weights are needed for the threshold test.
    tfidf, _, _ = TFIDF(WV, word_vec, word_2_id)

    # A set gives O(1) membership tests; a vocabulary list would be scanned
    # once per token of the whole collection.
    query_terms = set()
    for terms in querys.values():
        query_terms.update(terms)

    use_terms = set()
    for fname, words in word_vec.items():
        weights = tfidf[fname]
        for word in words:
            if word in query_terms:
                use_terms.add(word)
            elif len(word) > 1 and not word.isdigit() and weights[word_2_id[word]] > tfidf_thresh:
                use_terms.add(word)

    # Sorted so that term ids are reproducible between runs.
    WV = sorted(use_terms)
    word_2_id = {word: idx for idx, word in enumerate(WV)}

    return WV, word_2_id

In [10]:
import time
# Wall-clock reference point; later cells display time.time() - start.
start = time.time()

In [11]:
# Load the term lists (name -> terms) and the ordered name lists from DATA_PATH.
querys, docs, query_name, doc_name = open_files()

In [12]:
# Merge documents and queries into one mapping so that the vocabulary and the
# TF-IDF statistics cover both.  A query whose name equals a document name
# would replace that document's entry.
all_vec = docs.copy()
all_vec.update(querys)

In [13]:
# Build the pruned vocabulary: all query terms plus terms whose TF-IDF exceeds thresh.
WV, word_2_id = word_vector_filter(all_vec, querys, filter_type=2, tfidf_thresh=thresh)

[INFO] TF Done...
[INFO] IDF Done...
[INFO] TF-IDF Done...


When WV holds more than 50,000 terms, computing TF-IDF takes almost 20 minutes.

In [14]:
# Display the size of the pruned vocabulary.
len(WV)

10325

In [15]:
# Recompute the weights over the pruned vocabulary with sublinear TF.
# tfidf, tf and idf are read as globals by the ranking functions below.
tfidf, tf, idf = TFIDF(WV, all_vec, word_2_id, subLinear=True)

[INFO] TF Done...
[INFO] IDF Done...
[INFO] TF-IDF Done...


In [16]:
# normalize
query_vecs = {k:v/np.linalg.norm(v) for k, v in tfidf.items() if k in querys}
# query_vecs = {k:v for k, v in tfidf.items() if k in querys}

In [17]:
# Display the seconds elapsed since the timer cell set `start`.
time.time() - start

251.56637811660767

In [18]:
def cosine(vec1, vec2):
    """
    Return the cosine similarity of two vectors.

    Args:
        vec1: First vector (array-like accepted by numpy).
        vec2: Second vector of the same length.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either
        vector has zero norm.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # numpy scalar division by zero yields nan with a RuntimeWarning rather
    # than raising, so the zero case has to be tested explicitly.
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

In [19]:
def _ranking(query_vec):
    """
    Rank all documents for every query by cosine similarity times BM25 score.

    Reads the module-level ``querys``, ``docs``, ``tf``, ``idf``, ``tfidf``,
    ``word_2_id``, ``k1`` and ``b``.  Prints a running query counter.

    Args:
        query_vec: Mapping from query name to its query vector.  (This
            argument is now actually used; previously the global
            ``query_vecs`` was read instead.)

    Returns:
        Dict mapping each query name to the list of document names sorted
        by descending score.
    """
    avgDL = avgDocLength(docs)
    # Build each document's term set once so the membership test below is
    # O(1) instead of a list scan per (query, document, term).
    doc_term_sets = {fdname: set(terms_d) for fdname, terms_d in docs.items()}

    sorted_sim_dict = {}
    for i, (fqname, terms_q) in enumerate(querys.items()):
        print(i)
        q_vec = query_vec[fqname]

        scores = {}
        for fdname, terms_d in docs.items():
            doc_tf = tf[fdname]
            present = doc_term_sets[fdname]
            score = 0.0
            # BM25 over the query terms that occur in the document.
            for term in terms_q:
                if term not in present:
                    continue
                term_id = word_2_id[term]
                score += (k1 + 1) * doc_tf[term_id] / (k1 * ((1 - b) + b * len(terms_d) / avgDL) + doc_tf[term_id]) * idf[term_id]
            scores[fdname] = cosine(q_vec, tfidf[fdname]) * score
        sorted_sim_dict[fqname] = sorted(scores, key=scores.get, reverse=True)
    return sorted_sim_dict

In [20]:
def init_ranking(query_vec):
    """
    Return the initial BM25 x VSM ranking together with the raw BM25 scores.

    Reads the module-level ``querys``, ``docs``, ``tf``, ``idf``, ``tfidf``,
    ``word_2_id``, ``k1`` and ``b``.  Prints a running query counter.

    Args:
        query_vec: Mapping from query name to its query vector.  (This
            argument is now actually used; previously the global
            ``query_vecs`` was read instead.)

    Returns:
        ``(sorted_sim_dict, BM_rank)`` where ``sorted_sim_dict`` maps each
        query name to the document names sorted by descending
        ``cosine * BM25`` and ``BM_rank`` maps
        ``query name -> document name -> BM25 score``.
    """
    avgDL = avgDocLength(docs)
    # Build each document's term set once so the membership test below is
    # O(1) instead of a list scan per (query, document, term).
    doc_term_sets = {fdname: set(terms_d) for fdname, terms_d in docs.items()}

    sorted_sim_dict = {}
    BM_rank = {}
    for i, (fqname, terms_q) in enumerate(querys.items()):
        print(i)
        q_vec = query_vec[fqname]

        bm_scores = {}
        sim_scores = {}
        for fdname, terms_d in docs.items():
            doc_tf = tf[fdname]
            present = doc_term_sets[fdname]
            score = 0.0
            # BM25 over the query terms that occur in the document.
            for term in terms_q:
                if term not in present:
                    continue
                term_id = word_2_id[term]
                score += (k1 + 1) * doc_tf[term_id] / (k1 * ((1 - b) + b * len(terms_d) / avgDL) + doc_tf[term_id]) * idf[term_id]
            bm_scores[fdname] = score
            sim_scores[fdname] = cosine(q_vec, tfidf[fdname]) * score
        BM_rank[fqname] = bm_scores
        sorted_sim_dict[fqname] = sorted(sim_scores, key=sim_scores.get, reverse=True)
    return sorted_sim_dict, BM_rank

In [21]:
def ranking(query_vec):
    """
    Re-rank all documents with updated query vectors and cached BM25 scores.

    Meant to be used after ``init_ranking``: the BM25 part is read from the
    module-level ``BM_rank`` and only the cosine part is recomputed.  Also
    reads the module-level ``querys``, ``docs`` and ``tfidf``.  Prints a
    running query counter.

    Args:
        query_vec: Mapping from query name to its current query vector.
            (This argument is now actually used; previously the global
            ``query_vecs`` was read instead.)

    Returns:
        Dict mapping each query name to the list of document names sorted
        by descending ``cosine * BM25``.
    """
    sorted_sim_dict = {}
    for i, fqname in enumerate(querys):
        print(i)
        q_vec = query_vec[fqname]
        bm_scores = BM_rank[fqname]

        scores = {
            fdname: cosine(q_vec, tfidf[fdname]) * bm_scores[fdname]
            for fdname in docs
        }
        sorted_sim_dict[fqname] = sorted(scores, key=scores.get, reverse=True)
    return sorted_sim_dict

In [22]:
# Initial retrieval: VSM_rank is the per-query document ranking, BM_rank the
# raw BM25 scores that ranking() reuses during the feedback iterations.
VSM_rank, BM_rank = init_ranking(query_vecs)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149


In [30]:
for _ in range(iter_num):
    # Rocchio: pull every query vector toward its current top-ranked documents.
    for q_name, rank_doc in VSM_rank.items():
        total_docs = len(rank_doc)

        # Positive feedback: the top `rel` documents, scaled by rank so that
        # position p gets the factor (1 + rel - p) / rel.
        top_vectors = [
            tfidf[rank_doc[pos]] * (1 + (rel - pos)) / rel
            for pos in range(rel)
        ]
        relevant_centroid = np.mean(top_vectors, axis=0)

        updated = ALPHA * query_vecs[q_name] + BETA * relevant_centroid
        if n_rel < 0:
            # Negative feedback: a negative n_rel selects the last |n_rel|
            # ranked documents; their mean is subtracted with weight GAMMA.
            tail_vectors = [
                tfidf[rank_doc[pos]]
                for pos in range(n_rel + total_docs, total_docs)
            ]
            updated = updated - GAMMA * np.mean(tail_vectors, axis=0)

        # Store the new query vector scaled back to unit length.
        query_vecs[q_name] = updated / np.linalg.norm(updated)

    # Update Rank
    VSM_rank = ranking(query_vecs)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149


## Output file

In [24]:
def output(sorted_sim_dict, filename = "result.txt"):
    """
    Write the rankings to ``filename`` in the submission format.

    The file starts with the header ``Query,RetrievedDocuments``; each
    following line is a query name, a comma, and the ranked document names
    each followed by a single space.  An existing file of the same name is
    removed first.
    """
    # Drop any previous result file before writing the new one.
    if os.path.exists(filename):
        os.remove(filename)

    with open(filename, "w") as ofile:
        ofile.write("Query,RetrievedDocuments\n")
        for query_id, ranked_docs in sorted_sim_dict.items():
            row = "".join(doc_id + " " for doc_id in ranked_docs)
            ofile.write(query_id + "," + row + "\n")

In [31]:
# Write the final rankings to the default file name, "result.txt".
output(VSM_rank)

In [26]:
# Display the seconds elapsed since the timer cell set `start`.
time.time() - start

746.7830352783203

In [27]:
def ranking_BM(tf_q):
    """
    Rank all documents for every query by BM25 score alone.

    Reads the module-level ``querys``, ``docs``, ``tf``, ``idf``,
    ``word_2_id``, ``k1`` and ``b``.  Prints a running query counter.

    Args:
        tf_q: Query term frequencies.  Currently unused: the query-TF (k3)
            factor of BM25 is not applied.

    Returns:
        Dict mapping each query name to the list of document names sorted
        by descending BM25 score.
    """
    avgDL = avgDocLength(docs)
    # Build each document's term set once so the membership test below is
    # O(1) instead of a list scan per (query, document, term).
    doc_term_sets = {fdname: set(terms_d) for fdname, terms_d in docs.items()}

    sorted_sim_dict = {}
    for i, (fqname, terms_q) in enumerate(querys.items()):
        print(i)

        scores = {}
        for fdname, terms_d in docs.items():
            doc_tf = tf[fdname]
            present = doc_term_sets[fdname]
            score = 0.0
            # BM25
            for term in terms_q:
                if term not in present:
                    continue
                term_id = word_2_id[term]
                score += (k1 + 1) * doc_tf[term_id] / (k1 * ((1 - b) + b * len(terms_d) / avgDL) + doc_tf[term_id]) * idf[term_id]
            scores[fdname] = score
        # Sort the sim score
        sorted_sim_dict[fqname] = sorted(scores, key=scores.get, reverse=True)
    return sorted_sim_dict

In [28]:
def ranking_VSM(query_vecs):
    """
    Rank all documents for every query by cosine similarity alone.

    Reads the module-level ``querys``, ``docs`` and ``tfidf``.  Prints a
    running query counter.

    Args:
        query_vecs: Mapping from query name to its query vector.

    Returns:
        Dict mapping each query name to the list of document names sorted
        by descending cosine similarity.
    """
    sorted_sim_dict = {}

    for counter, fqname in enumerate(querys):
        print(counter)
        similarity = {
            fdname: cosine(query_vecs[fqname], tfidf[fdname])
            for fdname in docs
        }
        sorted_sim_dict[fqname] = sorted(similarity, key=similarity.get, reverse=True)

    return sorted_sim_dict