In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import glob
from collections import Counter
import math
import re
import json
import subprocess
import shutil
import pickle

import pandas as pd
import umap
from tqdm.autonotebook import tqdm
from nltk.tokenize import word_tokenize

import numpy as np
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split, KFold

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.utils import class_weight

import language
import text_nn
import grab_category
import news
import groups
import libs.cpp_stuff as cpp

In [3]:
# Dump roots, all located under data/.
# `all_folders` lists every dump, including the two website ones;
# `train_folders` is the same sequence with the website dumps left out.
all_folders = [
    os.path.join("data", name)
    for name in (
        "sample",
        "sample2",
        "sample3",
        "sample4",
        "sample5",
        "website/en",
        "website/ru",
        "sample6",
        "sample7",
    )
]
train_folders = [
    os.path.join("data", name)
    for name in (
        "sample",
        "sample2",
        "sample3",
        "sample4",
        "sample5",
        "sample6",
        "sample7",
    )
]

In [8]:
# Count how many times each paragraph shows up across all HTML files.
# The glob patterns below walk <folder>/<8-char name>/<2-char name>/*.html.
paragraphs = Counter()

for folder in all_folders:
    print(folder)
    day_pattern = os.path.join(folder, "????????")
    for day_folder in tqdm(glob.glob(day_pattern)):
        batch_pattern = os.path.join(day_folder, "??")
        for batch_folder in glob.glob(batch_pattern):
            for html in glob.glob(os.path.join(batch_folder, "*.html")):
                file_data = language.read_file(html, [])
                paragraphs.update(file_data["paragraphs"])

# Persist the paragraphs seen at least `ignore_limit` times, one per line,
# most frequent first.
ignore_limit = 50
with open("data/ignore_paragraph", "w") as f:
    for text, count in paragraphs.most_common():
        if count >= ignore_limit:
            f.write(f"{text}\n")
        else:
            # most_common() is sorted by descending count, so nothing
            # further down can reach the limit either.
            break

data/sample


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


data/sample2


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


data/sample3


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


data/sample4


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


data/sample5


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




In [6]:
# Build the per-site language statistics only when the JSON file is missing;
# an existing file is left untouched.
if not os.path.exists(language.site_languages_file):
    site_languages = {}
    for folder in tqdm(all_folders):
        language.collect_site_languages(folder, site_languages)

    # Convert each site's raw per-language counts into fractions of its total.
    shares = {}
    for site, counts in site_languages.items():
        denominator = sum(counts.values())
        shares[site] = {code: amount / denominator for code, amount in counts.items()}
    site_languages.update(shares)

    with open(language.site_languages_file, "w") as f:
        json.dump(site_languages, f, indent=2)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




In [7]:
# Read the stored per-site language mapping back from JSON ...
with open(language.site_languages_file) as f:
    site_languages = json.load(f)

# ... and pass it to the `cpp` module (libs.cpp_stuff) once the file is closed.
cpp.load_sources_languages(site_languages)

In [None]:
# Split every dump into per-language article files at <folder>/langs/<lang_code>.
# A folder that already contains a "langs" directory is treated as done and
# skipped; delete that directory by hand to force it to be regenerated.
for folder in tqdm(all_folders):
    print(folder)
    # NOTE(review): merge_folder also runs for folders that are skipped below.
    # It is kept ahead of the check because its side effects are not visible
    # from this notebook - confirm before moving it after the skip.
    all_file = language.merge_folder(folder)

    lang_dumps = os.path.join(folder, "langs")
    if os.path.exists(lang_dumps):
        print("Skipping")
        continue

    file_info = language.read_dump(all_file)
    lang_data = language.detect_languages(file_info)

    # The output directory is created only after detection has succeeded, so a
    # failure above does not leave an empty "langs" directory that would make
    # the next run skip this folder.
    os.makedirs(lang_dumps)
    for ld in lang_data:
        language.dump_texts(ld["articles"], os.path.join(lang_dumps, ld["lang_code"]))

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

data/sample
Skipping
data/sample2
Skipping
data/sample3
Skipping
data/sample4
Skipping
data/sample5
Skipping
data/website/en
Skipping
data/website/ru
Skipping
data/sample6
Skipping
data/sample7


In [11]:
# Collect category labels for the Russian dump of each folder from the
# site-specific grabbers and save the joined result.
# Folders whose path contains "website", or that already have a
# "ground_truth" entry, are left alone.
for folder in all_folders:
    if "website" in folder or os.path.exists(os.path.join(folder, "ground_truth")):
        continue

    file_info = language.read_dump(os.path.join(folder, "langs", "ru"))

    # Grabbers are applied in this fixed order; join_gt receives their results
    # in the same order.
    ru_grabbers = (
        grab_category.grab_lenta_ru_categories,
        grab_category.grab_nv_categories,
        grab_category.grab_federalpress_categories,
        grab_category.grab_korrnet_categories,
        grab_category.grab_allhockey_categories,
    )
    gts = [grab(file_info) for grab in ru_grabbers]

    gt = grab_category.join_gt(gts)
    grab_category.save_gt(gt, folder)

1410 lenta.ru files


HBox(children=(FloatProgress(value=0.0, max=119414.0), HTML(value='')))


1106 GT labels saved to data/sample6


In [12]:
# Extend the saved labels of the LAST folder in `all_folders` with labels
# taken from its English dump, then save the joined result back.
for folder in all_folders[-1:]:
    file_info = language.read_dump(os.path.join(folder, "langs", "en"))

    # Previously saved labels go first, followed by the English grabbers.
    gts = [grab_category.load_gt(folder)]
    en_grabbers = (
        grab_category.grab_reuters_categories,
        grab_category.grab_theguardian_categories,
        grab_category.grab_mirror_categories,
    )
    gts += [grab(file_info) for grab in en_grabbers]

    gt = grab_category.join_gt(gts)
    grab_category.save_gt(gt, folder)

1106 GT labels loaded from data/sample6
7410 GT labels saved to data/sample6


In [13]:
# Classify the Russian articles of every dump and write one file per category
# to <folder>/categories_ru/<category>.
# A folder that already has "categories_ru" is treated as done and skipped;
# delete that directory by hand to force it to be regenerated.
for folder in tqdm(all_folders):
    print(folder)
    categories_dir = os.path.join(folder, "categories_ru")
    if os.path.exists(categories_dir):
        continue

    # Nothing to classify when the dump has no Russian language file.
    ru_dump = os.path.join(folder, "langs", "ru")
    if not os.path.exists(ru_dump):
        continue

    file_info = language.read_dump(ru_dump)
    ru_categories = news.classify_news(file_info)

    # Created only after classification succeeded, so a failure above does not
    # leave an empty directory that would make the next run skip this folder.
    os.makedirs(categories_dir)
    for cd in ru_categories:
        language.dump_texts(cd["articles"], os.path.join(categories_dir, cd["category"]))

data/sample
data/sample2
data/sample3
data/sample4
data/sample5
data/website/en
data/website/ru
data/sample6


In [14]:
# Classify the English articles of every dump and write one file per category
# to <folder>/categories_en/<category>.
# A folder that already has "categories_en" is treated as done and skipped;
# delete that directory by hand to force it to be regenerated.
for folder in tqdm(all_folders):
    print(folder)
    categories_dir = os.path.join(folder, "categories_en")
    if os.path.exists(categories_dir):
        continue

    # Nothing to classify when the dump has no English language file.
    en_dump = os.path.join(folder, "langs", "en")
    if not os.path.exists(en_dump):
        continue

    file_info = language.read_dump(en_dump)
    en_categories = news.classify_news(file_info)

    # Created only after classification succeeded, so a failure above does not
    # leave an empty directory that would make the next run skip this folder.
    os.makedirs(categories_dir)
    for cd in en_categories:
        language.dump_texts(cd["articles"], os.path.join(categories_dir, cd["category"]))

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

data/sample
data/sample2
data/sample3
data/sample4
data/sample5
data/website/en
data/website/ru
data/sample6



In [6]:
# For each language, gather the grouping texts from all training dumps and
# hand them to cpp.make_idf, which is given data/chunk_counts_<lang>.bin as
# its second argument.
for lang in ("ru", "en"):
    texts_for_grouping = []
    for folder in tqdm(train_folders):
        dump_file = os.path.join(folder, "langs", lang)
        if not os.path.exists(dump_file):
            # This dump has no file for the current language.
            continue
        articles = language.read_dump(dump_file)
        texts_for_grouping += groups.extract_texts_for_grouping(articles)

    print(f"{lang}: {len(texts_for_grouping)} examples")
    cpp.make_idf(texts_for_grouping, f"data/chunk_counts_{lang}.bin")

ru: 423873 examples
en: 394408 examples


In [None]:
# For every dump, run the external similarity tool on each Russian category
# and pickle (file_info, cats, similarities) to <folder>/threads_ru.
categories = ["society", "economy", "sports", "science", "other", "technology", "entertainment"]

for folder in all_folders:
    print(folder)
    threads_path = os.path.join(folder, "threads_ru")
    if os.path.exists(threads_path):
        os.remove(threads_path)

    file_info = []
    cats = []
    similarities = []
    for cat in categories:
        cat_fi = language.read_dump(os.path.join(folder, "categories_ru", cat))

        # NOTE(review): another cell of this notebook calls
        # groups.extract_texts_for_grouping (plural) - confirm both names exist.
        texts = groups.extract_text_for_grouping(cat_fi)

        # Tool input: the number of texts on the first line, then one text per line.
        header = f"{len(texts)}"
        process_input = "\n".join([header] + texts) + "\n"
        output = language.run_process(
            ["groups/Release/news_groups", "similarity", groups.counts_for_grouping_ru], process_input)

        # Tool output, as parsed here: one line per row, holding tab-separated
        # "<index> <similarity>" pairs. Indices are shifted by the number of
        # articles collected so far, so they point into the combined file_info.
        offset = len(file_info)
        cat_sims = []
        for line in output[:-1].split("\n"):
            row = []
            for pair in line.strip().split("\t"):
                if pair == "":
                    continue
                idx, sim = pair.split(" ")
                row.append((int(idx) + offset, float(sim)))
            cat_sims.append(row)

        similarities.extend(cat_sims)
        file_info.extend(cat_fi)
        cats.extend([cat] * len(cat_fi))

    with open(threads_path, "wb") as f:
        pickle.dump((file_info, cats, similarities), f)

In [9]:
# For every dump, run the external similarity tool on each English category
# and pickle (file_info, cats, similarities) to <folder>/threads_en.
categories = ["society", "economy", "sports", "science", "other", "technology", "entertainment"]

for folder in all_folders:
    print(folder)
    threads_path = os.path.join(folder, "threads_en")
    if os.path.exists(threads_path):
        os.remove(threads_path)

    file_info = []
    cats = []
    similarities = []
    for cat in categories:
        # NOTE(review): this reads categories_en/dump/<cat>, while the
        # classification cell of this notebook writes categories_en/<category>
        # (no "dump" level) - confirm which layout is on disk.
        cat_fi = language.read_dump(os.path.join(folder, "categories_en", "dump", cat))

        # NOTE(review): another cell of this notebook calls
        # groups.extract_texts_for_grouping (plural) - confirm both names exist.
        texts = groups.extract_text_for_grouping(cat_fi)

        # Tool input: the number of texts on the first line, then one text per line.
        header = f"{len(texts)}"
        process_input = "\n".join([header] + texts) + "\n"
        output = language.run_process(
            ["groups/Release/news_groups", "similarity", groups.counts_for_grouping_en], process_input)

        # Tool output, as parsed here: one line per row, holding tab-separated
        # "<index> <similarity>" pairs. Indices are shifted by the number of
        # articles collected so far, so they point into the combined file_info.
        offset = len(file_info)
        cat_sims = []
        for line in output[:-1].split("\n"):
            row = []
            for pair in line.strip().split("\t"):
                if pair == "":
                    continue
                idx, sim = pair.split(" ")
                row.append((int(idx) + offset, float(sim)))
            cat_sims.append(row)

        similarities.extend(cat_sims)
        file_info.extend(cat_fi)
        cats.extend([cat] * len(cat_fi))

    with open(threads_path, "wb") as f:
        pickle.dump((file_info, cats, similarities), f)

data/sample
data/sample2
data/sample3
data/sample4
