In [None]:
from typing import Union, List, Generator, Tuple, TypeVar
from selectolax.parser import HTMLParser, Node
from playhouse.postgres_ext import Model
from uuid import uuid5, NAMESPACE_URL
from peewee import Field
from pathlib import Path
from furl import furl
import pandas as pd
import requests
import random
import base64
import shutil
import fitz
import re

from covid19_guidance_explorer.reports import generate_search_results_report
from covid19_guidance_explorer.config import config
from covid19_guidance_explorer.database import *
from covid19_guidance_explorer.utils import *

In [None]:
import urllib.parse as url_parse
from pathlib import Path
from uuid import uuid4
import pandas as pd
import fitz

In [None]:
# Root folder holding the saved files; each issuer is read from its own
# sub-folder (see the `folder` assignment further down).
save_root = Path('D:/save/2022-09-28/')

# Issuer short name -> integer id written to a document's 'issuer' field.
issuer_ids = dict(
    cms=4,
    dchealth=5,
    doe=6,
    fda=7,
    ice=8,
    la=9,
    laacd=10,
    louisianabelieves=11,
    marin=12,
    mn=13,
    nc=14,
    newyork=15,
    nh=16,
    njhealth=17,
    nycdoh=18,
    ohio=19,
    tx=20,
)

# File-type code -> file extension; the codes are the ones returned by
# mimetype_to_file_type.
file_extensions = dict([(1, '.html'), (2, '.pdf')])

def urlkey_to_url(urlkey: str) -> str:
    """Turn a url key of the form ``tld,domain,sub)/path`` into an https URL.

    The part before the first ``)`` is a comma-separated host written in
    reverse order; it is flipped back and joined with dots. Only the path
    component of the remainder is kept, so any query string or fragment is
    dropped.
    """
    reversed_host, _, remainder = urlkey.partition(')')
    host = '.'.join(reversed(reversed_host.split(',')))
    path_only = url_parse.urlsplit(remainder).path

    return url_parse.urlunsplit(('https', host, path_only, '', ''))

def mimetype_to_file_type(mimetype: str) -> int:
    """Map a MIME type to its numeric file-type code.

    Returns 1 for ``text/html`` and 2 for ``application/pdf``. Any other
    value yields ``None`` (despite the ``int`` annotation).
    """
    known_types = {
        'text/html': 1,
        'application/pdf': 2
    }

    try:
        return known_types[mimetype]
    except KeyError:
        return None

def row_to_source(row: pd.Series) -> str:
    """Build the web.archive.org URL for one row of the sources table.

    Reads the row's ``timestamp`` and ``urlkey`` attributes; the url key is
    converted back to a regular URL with ``urlkey_to_url``.
    """
    template = 'https://web.archive.org/web/{}if_/{}'
    captured_at = row.timestamp
    original_url = urlkey_to_url(row.urlkey)

    return template.format(captured_at, original_url)

def url_to_slug(url: str) -> str:
    """Derive a slug from the path of ``url``.

    Every ``/`` in the path becomes ``_`` and leading/trailing underscores
    are stripped; scheme, host, query and fragment are ignored.
    """
    path = url_parse.urlsplit(url).path
    flattened = path.replace('/', '_')

    return flattened.strip('_')

In [None]:
# Issuer to process. It selects the sub-folder of save_root that is read and
# must be a key of issuer_ids, since the id is looked up by this name.
issuer_name = 'nc'

In [None]:
# All inputs for the selected issuer live in one sub-folder of save_root.
folder = save_root / issuer_name
sources_file = folder / 'sources.csv'

# Every non-hidden entry in the folder except the sources index itself.
files = pd.Series(
    [entry for entry in folder.glob('[!.]*') if entry != sources_file]
)

# Load the index and derive the helper columns in one pass:
#   timestamp  - kept as text
#   date       - the timestamp parsed into a datetime
#   doc_source - the regular URL rebuilt from the url key
sources = pd.read_csv(sources_file).assign(
    timestamp=lambda frame: frame['timestamp'].astype(str),
    date=lambda frame: pd.to_datetime(frame['timestamp']),
    doc_source=lambda frame: frame['urlkey'].map(urlkey_to_url),
)

In [None]:
# Build one Document row per distinct source URL, then insert the rows.
all_document_datas = []
# doc_source -> index labels of that document's rows in `sources`.
corresponding_ids = {}

for doc_source, document in sources.groupby('doc_source'):
    document_data = {
        'source': doc_source,
        'slug': url_to_slug(doc_source),
        # .iloc[0] instead of .item(): mode() returns every tied value, and
        # .item() raises ValueError unless there is exactly one.
        'file_type': mimetype_to_file_type(document['mimetype'].mode().iloc[0]),
        'variables': {},
        'issuer': issuer_ids[issuer_name]
    }

    # One title per saved version; the most frequent one becomes the
    # document title.
    titles = []

    for _, version in document.iterrows():
        file = next(folder.glob(f'{version["uuid"]}*'), None)
        if file is None:
            # A bare next() would surface as an unexplained StopIteration.
            raise FileNotFoundError(
                f'no saved file for version {version["uuid"]} in {folder}'
            )

        try:
            with fitz.open(file) as pdf:
                # `or ''` also covers a title that is present but None, so
                # the mode below never runs on an all-missing series.
                titles.append(pdf.metadata.get('title') or '')
        except Exception:
            # Best effort: files that cannot be opened or have no metadata
            # contribute an empty title. Unlike a bare `except:`, this lets
            # KeyboardInterrupt / SystemExit through.
            titles.append('')

    document_data['title'] = pd.Series(titles).mode().iloc[0]
    document_data['search_content'] = fn.to_tsvector(document_data['title'])

    all_document_datas.append(document_data)
    corresponding_ids[doc_source] = list(document.index)

# NOTE(review): presumably peewee's Database.batch_commit, committing every
# 100 rows — confirm against the `database` object.
for row in database.batch_commit(all_document_datas, 100):
    Document.create(**row)

In [None]:
# DocumentVersion: document, language, file_type
# Document: issuer, language, file_type

In [None]:
# Rebuild the per-document rows and, when enabled, one row per saved version.
#
# This cell previously had a bare `continue` right after the title lookup,
# which made the whole version-building section unreachable. The flag keeps
# that behaviour by default while making the short-circuit explicit.
BUILD_VERSIONS = False

all_document_datas = []
# doc_source -> index labels of that document's rows in `sources`.
corresponding_ids = {}

for doc_source, document in sources.groupby('doc_source'):
    document_data = {
        'source': doc_source,
        'slug': url_to_slug(doc_source),
        # .iloc[0] instead of .item(): mode() returns every tied value, and
        # .item() raises ValueError unless there is exactly one.
        'file_type': mimetype_to_file_type(document['mimetype'].mode().iloc[0]),
        'variables': {},
        'issuer': issuer_ids[issuer_name]
    }

    # Pair each row's date with the date of the next row in the group;
    # shift(-1) leaves NaT for the last row.
    # NOTE(review): assumes the rows of each group are in chronological
    # order — nothing in this notebook sorts them; confirm.
    dates = zip(document['date'], document['date'].shift(-1))

    # NOTE(review): versions_data is rebuilt for every document and is not
    # stored anywhere in this cell — decide where it should go before
    # enabling BUILD_VERSIONS.
    versions_data = []
    titles = []

    for (_, version), (effective_date, termination_date) in zip(document.iterrows(), dates):
        file = next(folder.glob(f'{version["uuid"]}*'), None)
        if file is None:
            # A bare next() would surface as an unexplained StopIteration.
            raise FileNotFoundError(
                f'no saved file for version {version["uuid"]} in {folder}'
            )

        try:
            with fitz.open(file) as pdf:
                # `or ''` also covers a title that is present but None.
                titles.append(pdf.metadata.get('title') or '')
        except Exception:
            # Best effort: unreadable files contribute an empty title, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            titles.append('')

        if not BUILD_VERSIONS:
            continue

        version_data = {
            'source': row_to_source(version),
            'slug': document_data['slug'],
            'file_type': mimetype_to_file_type(version['mimetype']),
            'effective_date': effective_date.to_pydatetime(),
            # The last version has no successor (NaT); store None instead.
            'termination_date': (
                None if pd.isna(termination_date)
                else termination_date.to_pydatetime()
            )
        }

        # NOTE(review): non-PDF versions get none of the keys set below
        # (variables, content, hashes, title, search fields) — confirm that
        # is acceptable for the target table.
        if file.suffix == '.pdf':
            with fitz.open(file) as pdf:
                version_data['variables'] = pdf.metadata
                version_data['content'] = '\n'.join(p.get_text() for p in pdf)
                version_data['content_hash'] = hash_text(version_data['content'])
                version_data['file_hash'] = hash_file(file)
                version_data['title'] = pdf.metadata.get('title') or ''
                version_data['search_content'] = fn.to_tsvector(
                    f'{version_data["title"]} {version_data["content"]}'
                )
                version_data['quick_search_content'] = fn.to_tsvector(version_data['title'])

        versions_data.append(version_data)

    # The most frequent version title becomes the document title.
    document_data['title'] = pd.Series(titles).mode().iloc[0]
    document_data['search_content'] = fn.to_tsvector(document_data['title'])

    all_document_datas.append(document_data)
    corresponding_ids[doc_source] = list(document.index)