<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Download-and-Parse-Wiki" data-toc-modified-id="Download-and-Parse-Wiki-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Download and Parse Wiki</a></span></li><li><span><a href="#Process-and-Clean" data-toc-modified-id="Process-and-Clean-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Process and Clean</a></span></li><li><span><a href="#Generate-and-Upload" data-toc-modified-id="Generate-and-Upload-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Generate and Upload</a></span></li></ul></div>

**Scribe Autosuggest Generation**

This notebook is used to run the functions found in Scribe-Data to extract, clean and load autosuggestion files into Scribe apps.

In [None]:
import os
import sys
import json
import warnings

# Hide FutureWarnings whose message begins with "Passing" (the message argument
# is a regular expression matched against the start of the warning text).
# This is registered before the third-party imports below, as in the original cell.
warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)

from tqdm.auto import tqdm

# display and HTML are part of the public IPython.display API; importing them
# from IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML

# Inject CSS that stretches elements with the "container" class to 99% width.
display(HTML("<style>.container { width:99% !important; }</style>"))

In [None]:
# Start from the resolved location of this notebook (relative to the current
# working directory), then keep only the part of its directory path that comes
# before the first "scribe_data" segment and add that prefix to sys.path.
# Presumably this lets the scribe_data package be imported from a source checkout.
pwd = os.path.realpath("gen_autosuggestions.ipynb")
pwd = os.path.dirname(pwd).partition("scribe_data")[0]
sys.path.append(pwd)

In [None]:
from scribe_data.extract_transform.extract_wiki import download_wiki, parse_to_ndjson
from scribe_data.extract_transform.process_wiki import clean, gen_autosuggestions
from scribe_data.utils import get_language_iso

# Download and Parse Wiki

In [None]:
# Choose the language to process.
# Options: French, German, Italian, Portuguese, Russian, Spanish, Swedish
language = "French"
# language_abbr is interpolated into every file and directory name used below
# (e.g. f"./{language_abbr}wiki_dump"). Presumably get_language_iso returns the
# ISO code for the language name - confirm in scribe_data.utils.
language_abbr = get_language_iso(language)

In [None]:
# Download the Wikipedia dump files for the selected language into a
# language-specific directory and report how many files were returned.
# NOTE(review): dump_id is pinned to a specific dump; confirm it is still
# available before re-running.
files = download_wiki(
    language=language,
    dump_id="20220920",
    target_dir=f"./{language_abbr}wiki_dump",
    file_limit=None,  # None is all files
)
print(f"Number of files: {len(files)}")

In [None]:
# Parse the downloaded dump files into a single ndjson file.
# Paths are grouped first (input, intermediate partitions, output), followed by
# the options that control how the parse runs.
parse_to_ndjson(
    input_dir=f"./{language_abbr}wiki_dump",
    partitions_dir=f"./{language_abbr}wiki_partitions",
    output_path=f"./{language_abbr}wiki.ndjson",
    article_limit=None,  # None is all articles
    multicore=True,
    delete_parsed_files=True,
    verbose=True,
)

# Process and Clean

In [None]:
# Load one entry per line from the parsed ndjson file. Each line is decoded as
# JSON and the element at index 1 is kept (presumably the article text, with the
# title at index 0 - confirm against parse_to_ndjson).
# The encoding is pinned to UTF-8 so that reading does not depend on the
# platform's default locale encoding.
with open(f"./{language_abbr}wiki.ndjson", "r", encoding="utf-8") as fin:
    article_texts = [
        json.loads(line)[1]
        for line in tqdm(fin, desc="Articles added", unit="articles")
    ]

print(f"Number of articles: {len(article_texts)}")

In [None]:
# Define the sample size so that at most about one million articles are used.
# sample_size is the ratio of that cap to the number of loaded articles,
# clamped so it never exceeds 1; it is passed to clean() in the next step.
MAX_ARTICLES = 1_000_000

# Fail with a clear message instead of a ZeroDivisionError when nothing was loaded.
if not article_texts:
    raise ValueError(
        "No articles were loaded; re-run the parsing step before sampling."
    )

sample_size = min(MAX_ARTICLES / len(article_texts), 1)
sample_size

In [None]:
# Clean the loaded article texts for the selected language, using the sample
# fraction computed above and no extra words to remove.
text_corpus = clean(
    texts=article_texts,
    language=language,
    sample_size=sample_size,
    remove_words=None,
    verbose=True,
)

# Generate and Upload

In [None]:
# Build the autosuggestion dictionary from the cleaned corpus.
# NOTE(review): update_local_data=True presumably writes the result into the
# local Scribe-Data files - confirm in process_wiki before running.
autosuggest_dict = gen_autosuggestions(
    text_corpus,
    language=language,
    num_words=1000,
    ignore_words=None,
    verbose=True,
    update_local_data=True,
)

In [None]:
# Uncomment the line below to inspect the generated autosuggestions.
# autosuggest_dict