In [None]:
from juddges.settings import NSA_DATA_PATH
import polars as pl

# Build one lazy scan over all page chunk files matching the glob pattern.
lf = pl.scan_parquet(
    NSA_DATA_PATH / "pages" / "pages_chunk_*.parquet",
)

In [None]:
# Flag every row whose "page" value occurs more than once, then keep only the
# flagged rows. The helper column "is_duplicated" remains on the lazy frame.
duplicated_page_rows = lf.with_columns(
    is_duplicated=pl.col("page").is_duplicated(),
).filter(pl.col("is_duplicated"))

In [None]:
# Collect the per-column counts of the duplicated rows and report the value
# for the "_id" column.
duplicate_count = duplicated_page_rows.count().collect()
print(f"Number of duplicated page rows: {duplicate_count.get_column('_id').item()}")

In [None]:
# Materialise the duplicated rows, without the helper flag column.
duplicated_rows_df = (
    duplicated_page_rows
    .drop("is_duplicated")
    .collect()
)

In [None]:
# Print the estimated size in megabytes before and after shrinking the frame.
print(duplicated_rows_df.estimated_size(unit="mb"))
duplicated_rows_df = duplicated_rows_df.shrink_to_fit(in_place=False)
print(duplicated_rows_df.estimated_size(unit="mb"))

In [None]:
# Collect, for every duplicated page body, the list of doc_ids that share it.
# maintain_order=True keeps groups in order of first appearance, so anything
# that inspects "the first group" shows the same group on every run.
grouped_doc_ids = duplicated_rows_df.group_by("page", maintain_order=True).agg(
    pl.col("doc_id")
)

In [None]:
# Show a single example group: the shared page body followed by the URL of
# every document that has it. head(1) limits the output to the first group.
for page, doc_ids in grouped_doc_ids.head(1).iter_rows():
    print(page)
    for doc_id in doc_ids:
        print(f"https://orzeczenia.nsa.gov.pl{doc_id}")
    print("=" * 80)

In [None]:
# Sanity check: every "doc_id" must be unique across all page chunks.
duplicated_id_rows = lf.with_columns(
    pl.col("doc_id").is_duplicated().alias("is_duplicated")
).filter(pl.col("is_duplicated"))
# Count rows rather than non-null "_id" values, so the result does not depend
# on whether another column happens to contain nulls.
duplicate_count = duplicated_id_rows.select(pl.count()).collect()[0, 0]
print(f"Number of rows with a duplicated doc_id: {duplicate_count}")
assert duplicate_count == 0, f"found {duplicate_count} rows with a duplicated doc_id"

In [None]:
# Sanity check: every "_id" must be unique across all page chunks.
duplicated_id_rows = lf.with_columns(
    pl.col("_id").is_duplicated().alias("is_duplicated")
).filter(pl.col("is_duplicated"))
# Count rows rather than non-null "_id" values: a per-column count would skip
# rows whose "_id" is null, which could hide repeated nulls from this check.
duplicate_count = duplicated_id_rows.select(pl.count()).collect()[0, 0]
print(f"Number of rows with a duplicated _id: {duplicate_count}")
assert duplicate_count == 0, f"found {duplicate_count} rows with a duplicated _id"

In [None]:
# HTML fragments that are checked for presence on every page.
test_strs = [
    '<span class="h-oper">\n             Szczegóły orzeczenia\n            </span>',
    "<title>",
    "</title>",
    '<td class="lista-label">\n            Data orzeczenia\n           </td>',
    '<span class="navl">\n        Powrót do listy\n       </span>',
    '<a href="/cbo/query">\n           Centralna Baza Orzeczeń Sądów Administracyjnych\n          </a>',
    '<div id="sp">\n      Powered by SoftProdukt\n     </div>',
]

# The total row count does not depend on the fragment, so scan for it once
# instead of once per loop iteration.
total_count = lf.select(pl.count()).collect()[0, 0]

for test_str in test_strs:
    print(f"Test string: {repr(test_str)}")

    # literal=True: the fragments are plain substrings, not regular
    # expressions, so characters such as "." or "(" must not be interpreted.
    contains_fragment = pl.col("page").str.contains(test_str, literal=True)

    # Number of pages that contain the fragment.
    filtered_count = lf.filter(contains_fragment).select(pl.count()).collect()[0, 0]

    # Check if all pages contain the substring.
    all_contain = filtered_count == total_count
    print(f"All pages contain the substring: {all_contain}")

    # On the first fragment that is missing somewhere, show one offending row
    # and stop.
    if not all_contain:
        # fill_null(False) treats a null page as "does not contain", so such
        # rows can be shown too; head(1) avoids collecting every offending row.
        example = lf.filter(contains_fragment.fill_null(False).not_()).head(1).collect()
        print("Example:")
        print(example)
        break
    print("=" * 80)
    print()