In [109]:
import datetime
import json
import re
import math
import pickle
from hazm import *

data_content = []
data_url = []
data_title = []
masked_id = {}

normalizer = Normalizer()
stemmer = Stemmer()
lemmatizer = Lemmatizer()

stop_words = set(stopwords_list())
# print("stop words: ", stop_words)

def load_data():
    _file = open('index', 'rb')
    data = pickle.load(_file)

    return data

def store_data(obj):
    _file = open('index', 'ab')
    pickle.dump(obj, _file)

def preProcess(data):
    json_keys = list(data.keys())

    for i in range(len(data)):
        masked_id[i] = json_keys[i]

    # Extracting data to arrays
    for i in json_keys:
        index = str(i)
        data_content.append(data[index]["content"])
        data_url.append(data[index]["url"])
        data_title.append(data[index]["title"])

    # Initializing variables and objects
    # stop_words = set(stopwords_list())
    # print("stop words: ", stop_words)

    punctuations = [')', '(', '>', '<', "؛", "،", '{', '}', "؟", ':', "–", '»', '"', '«', '[', ']', '"', '+', '=', '?',
                    '/', '//', '\\', '|', '!', '%', '&', '*', '$', '#', '؟', '*', '.', '_', '\u200c']
    punctuations_set = set(punctuations)

    data_tokens = []

    # Processing each document
    for content in data_content:
        # Normalizing and tokenizing the content
        content = normalizer.normalize(content)
        tokens = word_tokenize(content)

        # Removing stop words and punctuations
        tokens = [token for token in tokens if token not in stop_words and token not in punctuations_set]
        # tokens = [token for token in tokens if token not in punctuations_set]

        # Lemmatizing and stemming each token
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        # tokens = [stemmer.stem(token) for token in tokens]

        data_tokens.append(tokens)

    # temp = []
    # for data_token in data_tokens:
    #     temp.append(data_token)
    #
    # dic = set(temp)
    # print("length of dic: ", dic)

    return data_tokens


def update_postings(postings, docId, term_position):
    if docId not in postings:
        postings[docId] = {'positions': [], 'freq_in_doc': 0}
    postings[docId]['positions'].append(term_position)
    postings[docId]['freq_in_doc'] += 1


def positional_indexing(ppd):
    p_inv_index = {}
    for docId, doc in enumerate(ppd):
        for pos, term in enumerate(doc):
            if term in p_inv_index:
                update_postings(p_inv_index[term]['postings'], docId, pos)
                p_inv_index[term]['total_freq'] += 1
            else:
                p_inv_index[term] = {'postings': {docId: {'positions': [pos], 'freq_in_doc': 1}}, 'total_freq': 1}
    return p_inv_index


def get_terms_freq(positionalIndex):
    freq = {}
    for term in positionalIndex:
        freq[term] = positionalIndex[term]['total_freq']
    return freq


def query_service(and_terms, not_terms, phrase_term, pi):
    normalizer = Normalizer()
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()

    for i in range(len(and_terms)):
        and_terms[i] = normalizer.normalize(and_terms[i])
        and_terms[i] = lemmatizer.lemmatize(and_terms[i])
        and_terms[i] = stemmer.stem(and_terms[i])

    for i in range(len(not_terms)):
        not_terms[i] = normalizer.normalize(not_terms[i])
        not_terms[i] = lemmatizer.lemmatize(not_terms[i])
        not_terms[i] = stemmer.stem(not_terms[i])

    for i in range(len(phrase_term)):
        for j in range(len(phrase_term[i])):
            phrase_term[i][j] = normalizer.normalize(phrase_term[i][j])
            phrase_term[i][j] = lemmatizer.lemmatize(phrase_term[i][j])
            phrase_term[i][j] = stemmer.stem(phrase_term[i][j])

    doc_ids = set(range(len(preprocessed_docs)))

    for term in not_terms:
        if term in pi:
            term_postings = set(pi[term]['postings'].keys())
            doc_ids.difference_update(term_postings)

    doc_freqs = {}
    for term in and_terms:
        if term in pi:
            term_postings = pi[term]['postings']
            for doc_id, doc_postings in term_postings.items():
                if doc_id in doc_ids and doc_postings['freq_in_doc'] > 0:
                    if doc_id not in doc_freqs:
                        doc_freqs[doc_id] = 0
                    doc_freqs[doc_id] += doc_postings['freq_in_doc']
    phrase_dict = {}
    if len(phrase_term) > 0:
        phrase_str = ' '.join(phrase_term[0])
        phrase_dict[phrase_str] = []

    for phrase in phrase_dict:
        phrase_list = phrase.split(' ')
        dic = {}
        for i in range(len(phrase_list)):
            if phrase_list[i] in pi:
                term_postings = pi[phrase_list[i]]['postings']
                for doc_id, doc_postings in term_postings.items():
                    if doc_id not in dic:
                        dic[doc_id] = []
                    dic[doc_id].append(doc_postings['positions'])
        phrase_dict[phrase] = dic

    for phrase in phrase_dict:
        for doc_id in phrase_dict[phrase]:
            pos_lists = phrase_dict[phrase][doc_id]
            idx = 0
            while idx < len(pos_lists) - 1:
                if pos_lists[idx + 1][0] - pos_lists[idx][-1] == 1:
                    idx += 1
                else:
                    break
            if idx == len(pos_lists) - 1:
                if doc_id in doc_freqs:
                    doc_freqs[doc_id] += 1
                else:
                    doc_freqs[doc_id] = 1

    ranked_docs = sorted(doc_freqs.items(), key=lambda x: x[1], reverse=True)

    return ranked_docs


def executing_query(data):
    # Example usage
    query = 'باید'

    phrase_query = get_phrase(query)
    not_query = not_terms(query)
    flat_phrases = [item for sublist in phrase_query for item in sublist]
    not_words = not_terms(query)
    query = query.replace('"', '')
    query = query.replace('!', '')
    splitted_query = query.split()
    looking_words = []
    for x in splitted_query:
        if x not in not_words and x not in flat_phrases:
            looking_words.append(x)

    a = datetime.datetime.now()
    result_docs = query_service(looking_words, not_query, phrase_query, positional_index)
    b = datetime.datetime.now()
    print("{:<5} result in {} ms\n".format(len(result_docs), 1000 * (b - a).total_seconds()))

    if len(result_docs) > 5:
        for i in range(5):
            num = str(result_docs[i])
            num = num.split(' ')
            right = num[1].split(')')
            score = right[0]
            left = num[0].split(',')
            left = left[0].split('(')
            document_id = left[1]

            print("document id: ", document_id, end='   ')
            print("score: " + score, end='  ')
            print(data[document_id]['title'])
            print(data[document_id]['url'], end='')
            print(data[document_id]['content'])
            print('-----------------------------------------------')


def get_phrase(query):
    res = []
    quoted = re.compile('"[^"]*"')
    for value in quoted.findall(query):
        value = value.replace('"', '').strip().split()
        res.append(value)
    return res


def not_terms(query):
    splitted_query = query.split()
    indices = [i for i in range(len(splitted_query)) if splitted_query[i] == '!']
    result = [splitted_query[i + 1] for i in indices]
    return result

In [110]:
with open("../IR_data_news_5k.json") as file:
    json_data = json.load(file)

preprocessed_docs = preProcess(json_data)

positional_index = positional_indexing(preprocessed_docs)

# sw = {}
# for term in stop_words:
#     if term in positional_index:
#         sw[term] = len(positional_index[term]['postings'])
#
# ssw = sorted(sw.items(), key=lambda x: x[1], reverse=True)
# print(ssw)

# print("length postings list: ",len(positional_index))

# executing_query(positional_index)


store_data(positional_index)

In [123]:
length_for_doc = {}

# all_term_freq = 0
# for term in positional_index:
#     for doc in positional_index[term]['postings']:
#         all_term_freq += positional_index[term]['postings'][doc]['freq_in_doc']

tokennn = []
for doc in data_content:
    tokennn = word_tokenize(doc)

tokenn = {}
for i in range(len(tokennn)):
    if tokennn[i] in tokenn:
        tokenn[tokennn[i]] += 1
    else:
        tokenn[tokennn[i]] = 1

summ = 0
for i in tokenn:
    summ += tokenn[i]

avg = summ/len(tokenn)

def calculate_documents_tf_idf():
    for term in positional_index:
        for doc in positional_index[term]['postings']:
            term_frequency = positional_index[term]['postings'][doc]['freq_in_doc']
            N = len(data_content)
            df = len(positional_index[term]['postings'])
            tf_idf = (1 + math.log2(term_frequency) / 1 + math.log2(avg)) * max(float(0), math.log2((N-df)/df))
            positional_index[term]['postings'][doc]['tf_idf'] = tf_idf
            if doc in length_for_doc:
                length_for_doc[doc] += tf_idf ** 2
            else:
                length_for_doc[doc] = tf_idf ** 2
    for doc_id in length_for_doc:
        length_for_doc[doc_id] = math.sqrt(length_for_doc[doc_id])

    # temp = {}
    # for term in positional_index:
    #     temp[term] = (math.log2(len(data_content) / positional_index[term]['total_freq']))
    #
    # sortedd = sorted(temp.items(), key=lambda item: item[1], reverse=True)


    # print(sortedd)

In [86]:
def cosine_score(query, k):
    query_tf = {}
    query_length = 0
    doc_cosine_score_dic = {}

    # Calculate term frequencies for query
    for term in query.split(" "):
        word = lemmatizer.lemmatize(normalizer.normalize(term))
        query_tf[word] = query_tf.get(word, 0) + 1

    # Calculate query length and apply logarithmic scaling to term frequencies
    for term, tf in query_tf.items():
        query_tf[term] = 1 + math.log2(tf)
        query_length += query_tf[term] ** 2

    # Calculate cosine scores
    for term in query_tf.keys():
        if term in positional_index:
            for doc_id in positional_index[term]['postings']:
                tf_idf = positional_index[term]['postings'][doc_id]['tf_idf']
                doc_cosine_score_dic[doc_id] = doc_cosine_score_dic.get(doc_id, 0) + query_tf[term] * tf_idf

    # Normalize cosine scores
    for doc in doc_cosine_score_dic:
        doc_cosine_score_dic[doc] /= (length_for_doc[doc] * query_length)

    # Sort documents by score
    sorted_doc_cosine_score = sorted(doc_cosine_score_dic.items(), key=lambda item: item[1], reverse=True)
    return sorted_doc_cosine_score[:k]

In [67]:
def jacquard_score(query, k):
    query_set = set(lemmatizer.lemmatize(token.lower()) for token in word_tokenize(query) if token.isalpha())
    doc_jacquard_score_dic = {}
    for term in query_set:
        if term in positional_index:
            for doc_id in positional_index[term]['postings']:
                if doc_id not in doc_jacquard_score_dic:
                    doc_set = set(word_tokenize(data_content[doc_id]))
                    jacquard_score = len(query_set.intersection(doc_set)) / len(query_set.union(doc_set))
                    doc_jacquard_score_dic[doc_id] = jacquard_score
    sorted_doc_jacquard_score = sorted(doc_jacquard_score_dic.items(), key=lambda item: item[1], reverse=True)
    return sorted_doc_jacquard_score[:k]

In [68]:
def jacquard_score_with_champion_list(query, k, champion_lists):
    query_set = set(lemmatizer.lemmatize(token.lower()) for token in word_tokenize(query) if token.isalpha())
    doc_jacquard_score_dic = {}
    for term in query_set:
        if term in champion_lists:
            for doc_id in champion_lists[term]:
                if doc_id not in doc_jacquard_score_dic:
                    doc_set = set(word_tokenize(data_content[doc_id]))
                    jacquard_score = len(query_set.intersection(doc_set)) / len(query_set.union(doc_set))
                    doc_jacquard_score_dic[doc_id] = jacquard_score
    sorted_doc_jacquard_score = sorted(doc_jacquard_score_dic.items(), key=lambda item: item[1], reverse=True)
    return sorted_doc_jacquard_score[:k]

In [69]:
def create_champion_lists(k):
    r = 10 * k
    champion_lists = {}
    for term in positional_index:
        doc_tf_idf = {}
        for doc in positional_index[term]['postings']:
            doc_tf_idf[doc] = positional_index[term]['postings'][doc]['tf_idf'] / length_for_doc[doc]
        sorted_doc_tf_idf = sorted(doc_tf_idf.items(), key=lambda item: item[1], reverse=True)
        documents = [doc_score[0] for index, doc_score in enumerate(sorted_doc_tf_idf) if index <= r]
        champion_lists[term] = documents
    return champion_lists

In [65]:
calculate_documents_tf_idf()
query = 'تراکتور'
champ_list = create_champion_lists(4)
jacquard_score_with_champion_list(query,20,champ_list)

[(3630, 0.029411764705882353),
 (3708, 0.023255813953488372),
 (4545, 0.020833333333333332),
 (195, 0.02),
 (1757, 0.02),
 (4600, 0.019230769230769232),
 (4939, 0.01639344262295082),
 (741, 0.016129032258064516),
 (339, 0.015873015873015872),
 (3027, 0.015625),
 (2356, 0.014492753623188406),
 (4227, 0.014285714285714285),
 (1036, 0.014084507042253521),
 (4446, 0.013888888888888888),
 (3593, 0.0136986301369863),
 (2885, 0.0136986301369863),
 (2872, 0.0136986301369863),
 (42, 0.013333333333333334),
 (1181, 0.011904761904761904),
 (3989, 0.011764705882352941)]

In [70]:
def cosine_score_with_champion_lists(query, k, champion_lists):
    query_tf = {}
    query_length = 0
    doc_cosine_score_dic = {}

    # Calculate term frequencies for query
    for term in query.split(" "):
        word = lemmatizer.lemmatize(normalizer.normalize(term))
        query_tf[word] = query_tf.get(word, 0) + 1

    # Calculate query length and apply logarithmic scaling to term frequencies
    for term, tf in query_tf.items():
        query_tf[term] = 1 + math.log2(tf)
        query_length += query_tf[term] ** 2

    # Calculate cosine scores
    for term in query_tf.keys():
        if term in champion_lists:
            for doc_id in champion_lists[term]:
                tf_idf = positional_index[term]['postings'][doc_id]['tf_idf']
                doc_cosine_score_dic[doc_id] = doc_cosine_score_dic.get(doc_id, 0) + query_tf[term] * tf_idf

    # Normalize cosine scores
    for doc in doc_cosine_score_dic:
        doc_cosine_score_dic[doc] /= (length_for_doc[doc] * query_length)

    # Sort documents by score
    sorted_doc_cosine_score = sorted(doc_cosine_score_dic.items(), key=lambda item: item[1], reverse=True)
    return sorted_doc_cosine_score[:k]

In [71]:
def compare_query(k, champions_list, query):
    cosine = cosine_score(query, 20)
    cosine_with_champ = cosine_score_with_champion_lists(query, 20, champions_list)
    jacquard = jacquard_score(query, 20)
    jacquard_with_champ = jacquard_score_with_champion_list(query, 20, champions_list)

    if len(cosine) > k:
        print("cosine: \n")
        for i in range(k):
            docID, score = cosine[i]
            print("found doc: ", masked_id[docID], "with score: ", score)
            print("title: ", data_title[docID])
            print("content: ", data_content[docID])
            print("--------------------------------------")

        print("cosine with champions list: \n")
        for i in range(k):
            docID, score = cosine_with_champ[i]
            print("found doc: ", masked_id[docID], "with score: ", score)
            print("title: ", data_title[docID])
            print("content: ", data_content[docID])
            print("--------------------------------------")

        print("jacquard: \n")
        for i in range(k):
            docID, score = jacquard[i]
            print("found doc: ", masked_id[docID], "with score: ", score)
            print("title: ", data_title[docID])
            print("content: ", data_content[docID])
            print("--------------------------------------")

        print("jacquard with champions list: \n")
        for i in range(k):
            docID, score = jacquard_with_champ[i]
            print("found doc: ", masked_id[docID], "with score: ", score)
            print("title: ", data_title[docID])
            print("content: ", data_content[docID])
    else:
        print("cosine: \n")
        for i in range(len(cosine)):
            docID, score = cosine[i]
            print("found doc: ", masked_id[docID], "with score: ", score)
            print("title: ", data_title[docID])
            print("content: ", data_content[docID])
            print("--------------------------------------")

        print("cosine with champions list: \n")
        for i in range(len(cosine_with_champ)):
            docID, score = cosine_with_champ[i]
            print("found doc: ", masked_id[docID], "with score: ", score)
            print("title: ", data_title[docID])
            print("content: ", data_content[docID])
            print("--------------------------------------")

        print("jacquard: \n")
        for i in range(len(jacquard)):
            docID, score = jacquard[i]
            print("found doc: ", masked_id[docID], "with score: ", score)
            print("title: ", data_title[docID])
            print("content: ", data_content[docID])
            print("--------------------------------------")

        print("jacquard with champions list: \n")
        for i in range(len(jacquard_with_champ)):
            docID, score = jacquard_with_champ[i]
            print("found doc: ", masked_id[docID], "with score: ", score)
            print("title: ", data_title[docID])
            print("content: ", data_content[docID])

In [124]:
if __name__ == '__main__':
    calculate_documents_tf_idf()
    query = 'نتیجه بازی تیم فوتبال'
    champ_list = create_champion_lists(5)
    compare_query(10, champ_list, query)

cosine: 

found doc:  3107 with score:  0.061451896428757063
title:  واکنش AFC به پیروزی تیم ملی مقابل شاگردان مارویک/اماراتی ها با گل طارمی دبل شدند
content:  
به گزارش خبرگزاری فارس، تیم ملی فوتبال کشورمان در مصاف با امارات با نتیجه یک بر صفر به پیروزی رسید. تک گل این بازی را مهدی طارمی در اواخر نیمه اول به ثمر رساند. صفحه رسمی کنفدراسیون فوتبال آسیا به پیروزی تیم ملی کشورمان واکنش نشان داد و نوشت: گل مهدی طارمی در نیمه اول برای دبل شدن امارات مقابل ایران کافی بود. اماراتی ها در دیدار رفت هم در کشورشان مقابل تیم ملی فوتبال کشورمان شکست خورده بودند. انتهای پیام/



--------------------------------------
found doc:  2659 with score:  0.060524734568007534
title:  آبشک: بیرون زمین با هم رفیق هستیم، اما در زمین برای موفقیت تیم می‌جنگیم
content:  
به گزارش خبرنگار ورزشی خبرگزاری فارس، محمد آبشک پس از کسب عنوان قهرمانی سوپرجام فوتبال ایران اظهار داشت: در چند سال اخیر همیشه مقابل پرسپولیس خوب بازی می‌کردیم، اما در کسب نتیجه موفق نبودیم. این بازی تک مسابقه بود و حساسیت‌ها بالا رفته بود و خدا 