In [1]:
import os
from typing import List
from typing import Dict
from glob import glob

import re

In [2]:
RU_WEBS_DIR = "./../a-crawling/crawling/crawling/spiders/pages/ru"
ru_webs_pl = glob(f"{RU_WEBS_DIR}/**.html")

print(f"Russian websites: {len(ru_webs_pl)}")

Russian websites: 195


In [3]:
EN_WEBS_DIR = "./../a-crawling/crawling/crawling/spiders/pages/en"
en_webs_pl = glob(f"{EN_WEBS_DIR}/**.html")

print(f"English websites: {len(en_webs_pl)}")

English websites: 565


In [4]:
RU_LEMMAS_FILE = "./../b-tokens/ru_lemmas.txt"
RU_TOKENS_FILE = "./../b-tokens/ru_tokens.txt"

In [5]:
EN_LEMMAS_FILE = "./../b-tokens/en_lemmas.txt"
EN_TOKENS_FILE = "./../b-tokens/en_tokens.txt"

In [6]:
class TfForToken:
    def __init__(self, token, tf):
        self.token = token
        self.tf = tf

    def __repr__(self):
        return f"TfForToken(token = '{self.token}', tf = {self.tf})"


class FileTfIdfModel:
    def __init__(self, file_idx: int, tfs: List[TfForToken]):
        self.file_idx = file_idx
        self.tfs = tfs
        
    def __repr__(self):
        return f"FileTfIdfModel(file_idx = '{self.file_idx}', tfs = {self.tfs},)"

In [7]:
import html2text
import spacy
import spacy_transformers


h2t = html2text.HTML2Text()
h2t.ignore_links = True


nlp_en = spacy.load("en_core_web_sm")
nlp_ru = spacy.load("ru_core_news_sm")

def get_tokens(text, lang):
    if (lang == "RU"):
        doc = nlp_ru(text)
    else:
        doc = nlp_en(text)
    
    return [w for w in doc if (w.is_alpha and not w.is_stop)]

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [8]:
from math import log10

def calculate_idf_map(idf_files_map: {}, total_files_count: int) -> Dict:
    # token (string) - key, idf (double) - value
    idf_map = {}
    
    for token in idf_files_map.keys():
        files_appears_in = idf_files_map[token]
        files_appears_in_count = len(files_appears_in)
    
        idf = log10(total_files_count / files_appears_in_count)
        idf_map[token] = idf

    return idf_map

In [9]:

def write_tf_idf_results(tfs_per_file_list: List[FileTfIdfModel], idf_map: Dict, output_dir: str):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    for tfs_for_file in tfs_per_file_list:
        
        file_name = output_dir + str(tfs_for_file.file_idx) + ".txt"
        
        with open(file_name, "w") as file:
            for token_entry in tfs_for_file.tfs:
                token = token_entry.token
                
                tf = token_entry.tf
                idf = idf_map[token]
                tf_idf = tf * idf
                
                file.write(f"{token} {tf} {tf_idf}\n")

In [10]:
# TF-IDF for TOKENS

def create_tf_idf_for_tokens(lang: str, webs_pl: list, input_tokens_file: str, output_dir: str):
    
    total_files_count = len(webs_pl)
    # token (string) - key, files_appears_in (set of int) - value
    idf_files_map = {}
    # list of FileTfIdfModel
    tfs_per_file_list = []

    for pl in webs_pl:
        # list of TfForToken
        tfs_list = []
    
        file_idx_match = re.search(r'(.+)/(\d+)-.*\.html$', pl)
        file_idx = int(file_idx_match.group(2))
        
        with open(pl, "r") as file:
            file_content = file.read()
            file_text = h2t.handle(file_content)
            
            file_tokens = get_tokens(file_text, lang)
            
            unique_tokens = {el.text.lower() for el in file_tokens}
    
            total_tokens_count = len(file_tokens)
            print(f'{file_idx} file has {len(unique_tokens)} of unique tokens')
            
            for u_token in unique_tokens:
                # сколько раз встречается токен u_token в тексте данного документа file
                token_count = sum(1 for t in file_tokens if t.text == u_token)
    
                # term frequency для u_token для данного документа
                tf = token_count / total_tokens_count
    
                # добавляем u_token в idf_map
                if (u_token not in idf_files_map.keys()):
                    idf_files_map[u_token] = set()
                idf_files_map[u_token].add(file_idx)
    
                tfs_list.append(TfForToken(u_token, tf))
    
        file_tfs = FileTfIdfModel(file_idx, tfs_list)
        tfs_per_file_list.append(file_tfs)

    print()
    print(f'IDF map contains {len(idf_files_map.keys())} entries')
    print(f'Files processed = {len(tfs_per_file_list)}')

    idf_map = calculate_idf_map(idf_files_map=idf_files_map, total_files_count=total_files_count)

    write_tf_idf_results(tfs_per_file_list=tfs_per_file_list, idf_map=idf_map, output_dir=output_dir)

In [11]:

create_tf_idf_for_tokens(lang = "RU", webs_pl=ru_webs_pl, input_tokens_file=RU_TOKENS_FILE, output_dir="./ru-tokens/")


113 file has 422 of unique tokens
191 file has 880 of unique tokens
87 file has 205 of unique tokens
174 file has 434 of unique tokens
124 file has 245 of unique tokens
95 file has 95 of unique tokens
169 file has 767 of unique tokens
80 file has 81 of unique tokens
94 file has 81 of unique tokens
49 file has 245 of unique tokens
118 file has 342 of unique tokens
69 file has 206 of unique tokens
194 file has 1392 of unique tokens
139 file has 156 of unique tokens
68 file has 173 of unique tokens
71 file has 49 of unique tokens
89 file has 131 of unique tokens
184 file has 1702 of unique tokens
173 file has 525 of unique tokens
23 file has 172 of unique tokens
51 file has 86 of unique tokens
99 file has 81 of unique tokens
45 file has 69 of unique tokens
128 file has 124 of unique tokens
106 file has 494 of unique tokens
33 file has 688 of unique tokens
136 file has 149 of unique tokens
93 file has 85 of unique tokens
141 file has 146 of unique tokens
13 file has 163 of unique tokens
10

In [14]:
# TF-IDF for LEMMAs

def create_tf_idf_for_lemmas(lang: str, webs_pl: list, input_lemmas_file: str, output_dir: str):
    
    total_files_count = len(webs_pl)

    
    # known lemmas. lemma - key, list of tokens - value
    lemmas_map = dict()
    file = open(input_lemmas_file, 'r')

    lines = file.readlines()
    for line in lines:
        tokens = re.split('\\s+', line)
        
        lemma_dirty = tokens[0]
        lemma = lemma_dirty[:len(lemma_dirty)-1]
        
        lemmas_map[lemma] = []
        for i in range(1, len(tokens) - 1):
            token = tokens[i]
            if (not len(token.strip()) == 0):
                lemmas_map[lemma].append(token)
    file.close()

    
    # token (string) - key, files_appears_in (set of int) - value
    idf_files_map = {}
    # list of FileTfIdfModel
    tfs_per_file_list = []

    
    for pl in webs_pl:
        # list of TfForToken
        tfs_list = []
    
        file_idx_match = re.search(r'(.+)/(\d+)-.*\.html$', pl)
        file_idx = int(file_idx_match.group(2))
        
        with open(pl, "r") as file:
            file_content = file.read()
            file_text = h2t.handle(file_content)
            
            file_tokens = get_tokens(file_text, lang)
            file_lemmas = {el.lemma_ for el in file_tokens}
    
            total_tokens_count = len(file_tokens)
            print(f'{file_idx} file has {len(file_lemmas)} lemmas')
            
            for lemma in file_lemmas:
                # сколько раз встречаются токены lemma в тексте данного документа file
                tokens_count = 0

                tokens_list = lemmas_map[lemma] if (lemma in lemmas_map.keys()) else []
                for token in tokens_list:
                    t_count = sum(1 for t in file_tokens if t.text == token)
                    tokens_count += t_count
    
                # term frequency для lemma для данного документа
                tf = tokens_count / total_tokens_count
    
                # добавляем lemma в idf_map
                if (lemma not in idf_files_map.keys()):
                    idf_files_map[lemma] = set()
                idf_files_map[lemma].add(file_idx)
    
                tfs_list.append(TfForToken(lemma, tf))
    
        file_tfs = FileTfIdfModel(file_idx, tfs_list)
        tfs_per_file_list.append(file_tfs)

    print()
    print(f'IDF map contains {len(idf_files_map.keys())} entries')
    print(f'Files processed = {len(tfs_per_file_list)}')

    idf_map = calculate_idf_map(idf_files_map=idf_files_map, total_files_count=total_files_count)

    write_tf_idf_results(tfs_per_file_list=tfs_per_file_list, idf_map=idf_map, output_dir=output_dir)

In [15]:

create_tf_idf_for_lemmas(lang = "RU", webs_pl=ru_webs_pl, input_lemmas_file=RU_LEMMAS_FILE, output_dir="./ru-lemmas/")


113 file has 357 lemmas
191 file has 769 lemmas
87 file has 178 lemmas
174 file has 349 lemmas
124 file has 205 lemmas
95 file has 89 lemmas
169 file has 594 lemmas
80 file has 76 lemmas
94 file has 76 lemmas
49 file has 214 lemmas
118 file has 306 lemmas
69 file has 163 lemmas
194 file has 1087 lemmas
139 file has 148 lemmas
68 file has 154 lemmas
71 file has 38 lemmas
89 file has 122 lemmas
184 file has 1396 lemmas
173 file has 429 lemmas
23 file has 161 lemmas
51 file has 81 lemmas
99 file has 76 lemmas
45 file has 66 lemmas
128 file has 119 lemmas
106 file has 403 lemmas
33 file has 529 lemmas
136 file has 138 lemmas
93 file has 78 lemmas
141 file has 133 lemmas
13 file has 154 lemmas
102 file has 231 lemmas
10 file has 517 lemmas
116 file has 342 lemmas
16 file has 428 lemmas
160 file has 482 lemmas
183 file has 362 lemmas
105 file has 76 lemmas
172 file has 349 lemmas
112 file has 254 lemmas
179 file has 287 lemmas
91 file has 132 lemmas
58 file has 66 lemmas
148 file has 324 lem