In [1]:
import os
import re
import bs4
import time
import hazm

import numpy as np

from itertools import chain
from tqdm.notebook import tqdm
from collections import Counter

In [2]:
# Locations of the corpus directory and the stop-word list, relative to the
# notebook.  Built with os.path.join so the separator is correct on every
# platform (on Windows the resulting strings are the same as the previous
# hard-coded '..\\dataset' style values).
DATASET_PATH = os.path.join('..', 'dataset')
STOP_WORDS_PATH = os.path.join('..', 'utils', 'stopwords.txt')

# Shortest "<tag>...<tag>" span; parse_page uses it to split a raw record
# into its fields.
TAG_REGEX = re.compile(r'<.*?>.*?<.*?>')
# The HTML is matched in entity-escaped form, i.e. "&lt;head&gt;" rather than
# "<head>".
HEAD_REGEX = re.compile(r'&lt;head&gt;.*?&lt;/head&gt;')
BODY_REGEX = re.compile(r'&lt;body.*?&gt;.*?&lt;/body&gt;')

# Text between an escaped "&gt;" and the next escaped "&lt;".
# NOTE(review): not referenced by any of the visible cells.
TEXT_REGEX = re.compile(r'gt;.+?lt;')
# Runs made only of characters in U+0600-U+06FF and whitespace.
PERSIAN_TEXT_REGEX = re.compile(r'[\u0600-\u06FF\s]+')

# Factor applied to body tf-idf scores when they are merged with head scores.
ALPHA = 0.3

# NOTE(review): these two maps are not read or written by the visible cells.
vocab_map = {}
doc_map = {}

In [3]:
def read_docs(base_path=DATASET_PATH):
    """Return the path of every file found anywhere under ``base_path``.

    The directory tree is walked recursively with ``os.walk`` and each file
    name is joined to its containing directory.

    Args:
        base_path: Root directory to scan.

    Returns:
        A list of path strings, in the order ``os.walk`` reports them.
    """
    # os.path.join picks the right separator for the running platform; the
    # previous hand-written '\\' only produced valid paths on Windows.
    return [
        os.path.join(root, name)
        for root, _, names in os.walk(base_path)
        for name in names
    ]

In [4]:
def read_stopwords(path=STOP_WORDS_PATH):
    """Load the stop-word list from a UTF-8 text file.

    Args:
        path: File to read; every line becomes one entry.

    Returns:
        A list with each line of the file, stripped of surrounding
        whitespace (blank lines therefore yield empty strings).
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return [entry.strip() for entry in handle]

In [5]:
def read_pure_pages(file_path):
    """Yield the text of each ``<DOC>`` ... ``</DOC>`` record in a file.

    The file is read as UTF-8, line by line.  Every line is stripped, and the
    stripped lines found after a ``<DOC>`` marker are concatenated with no
    separator until a ``</DOC>`` marker is reached, at which point the
    accumulated string is yielded.

    Args:
        file_path: Path of the file to read.

    Yields:
        One string per ``</DOC>`` line.  A ``</DOC>`` with nothing collected
        before it yields an empty string.
    """
    collected = []
    inside_doc = False

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()

            if stripped == '</DOC>':
                # End of record: emit what was gathered and start afresh.
                inside_doc = False
                yield ''.join(collected)
                collected = []
            elif inside_doc:
                collected.append(stripped)

            # Checked after the append, so the opening marker itself is not
            # part of the record it opens.
            if stripped == '<DOC>':
                inside_doc = True

In [6]:
def parse_page(page):
    """Split one raw record into its document id, URL and HTML.

    Args:
        page: Record text as produced by ``read_pure_pages``.

    Returns:
        A dict with keys ``'doc'`` (int), ``'url'`` (str) and ``'html'``
        (the third matched span, lower-cased, wrapper tags included).

    Raises:
        ValueError: If ``TAG_REGEX`` does not find exactly three spans, or
            if the id span does not contain an integer.
    """
    doc_span, url_span, html_span = TAG_REGEX.findall(page)

    # The fixed offsets cut the wrapper off each span: 7 leading / 8 trailing
    # characters for the id, 5 / 6 for the URL.
    # Presumably "<DOCID>...</DOCID>" and "<URL>...</URL>" -- confirm against
    # the dataset.
    doc_id = int(doc_span[7:-8])
    url = url_span[5:-6]

    return {'doc': doc_id, 'url': url, 'html': html_span.lower()}

In [7]:
def parse_html(page):
    """Extract the escaped ``<head>`` and ``<body>`` sections of a page.

    Args:
        page: Dict with ``'doc'``, ``'url'`` and ``'html'`` keys, as returned
            by ``parse_page``.

    Returns:
        A dict with the same ``'doc'`` and ``'url'`` plus a ``'tags'`` dict
        holding two lists of matched strings, ``'head'`` and ``'body'``.
    """
    html = page['html']

    # Head and body matches are collected independently.  Pairing them with
    # zip() stops at the shorter list, which silently discarded the body of
    # every page that has no head section (and vice versa).
    return {
        'doc': page['doc'],
        'url': page['url'],
        'tags': {
            'head': HEAD_REGEX.findall(html),
            'body': BODY_REGEX.findall(html),
        },
    }

In [8]:
normalizer = hazm.Normalizer()
stemmer = hazm.Stemmer()
stopwords = read_stopwords()
# Hashed copy of the stop-word list: preprocess() tests every token for
# membership, which is a linear scan on the list but O(1) on a set.
# `stopwords` itself stays a list; re-run this cell if it is changed.
_stopword_set = frozenset(stopwords)


def preprocess(text):
    """Turn a text into a list of stemmed, stop-word-free tokens.

    The text is normalised with ``hazm.Normalizer``, tokenised with
    ``hazm.word_tokenize`` and each token is stemmed with ``hazm.Stemmer``.
    Stems that are empty or that appear in the stop-word list are dropped
    (the stop-word check is applied to the stem, not the original token).

    Args:
        text: The string to process.

    Returns:
        The surviving stems, in their original order.
    """
    normalized = normalizer.normalize(text)
    stems = (stemmer.stem(word) for word in hazm.word_tokenize(normalized))
    return [stem for stem in stems if stem and stem not in _stopword_set]

In [9]:
# Number of documents seen by the last clean_text_matrix() call; read by
# build_tf_idf when computing idf.
N = 0


def _count_persian_tokens(fragments):
    """Return a Counter of preprocessed tokens over all given HTML fragments."""
    tokens = []
    for fragment in fragments:
        for run in PERSIAN_TEXT_REGEX.findall(fragment):
            # A match holds only U+0600-U+06FF characters and whitespace, so
            # it is used as-is apart from trimming.  (Slicing four characters
            # off each end, as was done before, cut into real words; the
            # entity replacements and lower() could never match anything.)
            run = run.strip()
            # Very short runs are skipped.
            if len(run) > 5:
                tokens.extend(preprocess(run))
    return Counter(tokens)


def clean_text_matrix(texts):
    """Count the tokens in the head and body of every document.

    Also stores the number of processed documents in the module-level ``N``.

    Args:
        texts: Iterable of dicts with ``'doc'`` and ``'tags'`` keys, where
            ``'tags'`` maps ``'head'`` and ``'body'`` to lists of HTML
            fragments (the shape returned by ``parse_html``).

    Returns:
        ``{'head': {doc: Counter}, 'body': {doc: Counter}}``.  Every document
        gets an entry in both maps; a section with no fragments gets an empty
        Counter.  Tokens from all fragments of a section are accumulated
        (previously each fragment overwrote the one before it).
    """
    global N
    data = {'head': {}, 'body': {}}
    counter = 0

    for document in texts:
        counter += 1
        if counter % 1000 == 0:
            print('1000 more docs processed')

        doc, tags = document['doc'], document['tags']
        data['head'][doc] = _count_persian_tokens(tags['head'])
        data['body'][doc] = _count_persian_tokens(tags['body'])

    N = counter
    return data

In [10]:
def read_all_pages(files):
    """Read and parse every record of every file.

    Each record yielded by ``read_pure_pages`` is passed through
    ``parse_page`` and then ``parse_html``.  A progress line is printed after
    every 1000 records.

    Args:
        files: Iterable of file paths.

    Returns:
        A list with one ``parse_html`` result per record, in reading order.
    """
    parsed_pages = []

    for path in files:
        for raw_page in read_pure_pages(path):
            parsed_pages.append(parse_html(parse_page(raw_page)))

            # The list length is the running record count across all files.
            if len(parsed_pages) % 1000 == 0:
                print('1000 more pages has been read')

    return parsed_pages

In [11]:
def calculate_df(data, keys):
    """Compute the document frequency of every term.

    A term's document frequency is the number of documents in ``keys`` whose
    head or body contains it; a document counts once even when the term
    occurs in both sections or many times.  (Summing the occurrence counts
    instead gives a collection frequency, which can exceed the document count
    used in the idf formula ``log((N + 1) / (df + 1))``.)

    Args:
        data: ``{'head': {doc: counts}, 'body': {doc: counts}}`` where each
            ``counts`` maps term -> number of occurrences.
        keys: Iterable of document ids to include.

    Returns:
        Dict mapping term -> number of documents containing it, with terms in
        first-seen order.
    """
    df = {}

    for doc_key in keys:
        head_terms = data['head'][doc_key]
        body_terms = data['body'][doc_key]

        for term in head_terms:
            df[term] = df.get(term, 0) + 1
        for term in body_terms:
            # Already counted for this document if it was in the head.
            if term not in head_terms:
                df[term] = df.get(term, 0) + 1

    return df

In [12]:
def build_tf_idf(data, doc_ids, main_key, helper_key, DF):
    """Compute tf-idf weights for the terms of one section of each document.

    For every term in the ``main_key`` section of a document:

    * tf  = (count in main section + count in helper section)
            / (total count over both sections)
    * idf = log((N + 1) / (DF[term] + 1)), with ``N`` the module-level
            document count.

    Args:
        data: ``{'head': {doc: Counter}, 'body': {doc: Counter}}``.
        doc_ids: Document ids to process.
        main_key: Section whose terms get a weight (``'head'`` or ``'body'``).
        helper_key: The other section; it contributes to the counts only.
        DF: Mapping term -> document frequency.

    Returns:
        Dict keyed by ``(doc_id, term)`` with the tf-idf weight as value.
    """
    weights = {}

    for doc_id in doc_ids:
        main_counts = data[main_key][doc_id]
        helper_counts = data[helper_key][doc_id]
        total_terms = sum(main_counts.values()) + sum(helper_counts.values())

        for token in main_counts:
            occurrences = main_counts[token] + helper_counts[token]

            term_freq = occurrences / total_terms
            inverse_doc_freq = np.log((N + 1) / (DF[token] + 1))

            weights[doc_id, token] = term_freq * inverse_doc_freq

    return weights

In [13]:
def merge_tf_idfs(head_if_idf, body_if_idf, alpha=None):
    """Merge head and body tf-idf tables into one.

    Head weights are kept unchanged.  Body weights are multiplied by
    ``alpha`` and used only for keys the head table does not contain, so a
    term that occurs in the head is never demoted to the down-weighted body
    score.  (Letting the body loop overwrite shared keys made such a term
    score no better than a body-only term.)

    Args:
        head_if_idf: Dict keyed by ``(doc_id, term)`` with head weights.
        body_if_idf: Dict keyed by ``(doc_id, term)`` with body weights.
        alpha: Factor applied to body weights; defaults to the module-level
            ``ALPHA``.

    Returns:
        A ``(merged, size)`` tuple: the merged dict (head keys first, then
        the body-only keys) and its number of entries.  Neither input dict is
        modified.
    """
    if alpha is None:
        alpha = ALPHA

    tf_idf = dict(head_if_idf)

    for key, weight in body_if_idf.items():
        if key not in tf_idf:
            tf_idf[key] = weight * alpha

    return tf_idf, len(tf_idf)

In [14]:
def build_index():
    """Run the whole pipeline and return the tf-idf index.

    Steps: list the corpus files, read and parse every page, count tokens per
    document, compute document frequencies, build head and body tf-idf tables
    and merge them.  The elapsed time is printed in minutes.

    Returns:
        A tuple ``(tf_idf, tokens, df, doc_ids, pages)``:
        the merged ``(doc_id, term) -> weight`` dict, the list of terms, the
        document-frequency dict, the list of document ids and the parsed
        pages.
    """
    tic = time.time()

    files = read_docs()
    pages = read_all_pages(files)
    cleaned_data = clean_text_matrix(pages)

    doc_ids = [page['doc'] for page in pages]
    df = calculate_df(cleaned_data, doc_ids)
    tokens = list(df)

    # build_tf_idf reads the module-level N, which clean_text_matrix has just
    # set to the number of documents.  No local N is assigned here: it would
    # never be read and would only shadow that global inside this function.
    head_if_idf = build_tf_idf(cleaned_data, doc_ids, 'head', 'body', df)
    body_if_idf = build_tf_idf(cleaned_data, doc_ids, 'body', 'head', df)

    tf_idf, _ = merge_tf_idfs(head_if_idf, body_if_idf)

    toc = time.time()
    print(f'total time: {(toc - tic) / 60} min(s)')

    return tf_idf, tokens, df, doc_ids, pages

In [15]:
def matching_score(k, query, tf_idf):
    """Return the ids of the ``k`` best-matching documents for a query.

    The query goes through ``preprocess``; a document's score is the sum of
    the tf-idf weights of the query terms it contains.  Every entry of
    ``tf_idf`` is visited once.

    Args:
        k: Maximum number of document ids to return.
        query: Query text.
        tf_idf: Dict keyed by ``(doc_id, term)`` with weights as values.

    Returns:
        Up to ``k`` document ids, highest score first.  Documents containing
        none of the query terms are never returned.
    """
    query_terms = set(preprocess(query))

    scores = {}
    for key, weight in tf_idf.items():
        if key[1] in query_terms:
            scores[key[0]] = scores.get(key[0], 0) + weight

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return [pair[0] for pair in ranked[:k]]

In [16]:
def display_page(head, body):
    """Print the Persian text of a page's head and body fragments.

    A line of asterisks precedes the head text and a line of dashes precedes
    the body text.  Only matches of ``PERSIAN_TEXT_REGEX`` longer than five
    characters are printed, one per line.

    Args:
        head: List of head HTML fragments.
        body: List of body HTML fragments.
    """
    sections = (
        ('**************************************************', head),
        ('---------------------------------------------------', body),
    )

    for banner, fragments in sections:
        print(banner)
        for fragment in fragments:
            for run in PERSIAN_TEXT_REGEX.findall(fragment):
                if len(run) > 5:
                    print(run)

In [None]:
# Build the index once: this reads and parses the whole corpus, and
# build_index prints its own elapsed time when it finishes.
# The later cells rely on `tf_idf` and `pages` from this call.
tf_idf, vocab, DF, doc_ids, pages = build_index()

In [None]:
# Example query: fetch the two best-scoring documents and print their text.
query = 'كاربرد كامپيوتر در زيست شناسي ملك'
indices = matching_score(2, query, tf_idf)
# `indices` holds document ids, best match first.
for page_id in indices:
    # Linear scan of the parsed pages; every page whose id matches is shown.
    for page in pages:
        if page['doc'] == page_id:
            display_page(page['tags']['head'], page['tags']['body'])