# Text Tiler Improvements
After 10h long run on ~1mln 2024 Factiva articles, pipeline requires improvements:
* Optimized text pre-processing - each article has to be supported by TextTiling Algo
* Tiling quality: we have to setup proxy metrics - "how well the long text is split into sections?" and optimize TextTiling Algo params according to it
* Speed: apply parallelization and optimize where possible

In [1]:
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

from pprint import pprint

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from typing import Collection
from collections import Counter

tqdm.pandas()

In [2]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=20)


job_id='dj-synhub-extraction-lkbi9fy6zepu8rcjuxqhjwkbld52wgt0-ouutovygqc'
df = pd.read_csv(f'../../extractions/{job_id}/result.csv')
df

INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Unnamed: 0,source_name,dateline,ingestion_datetime,currency_codes,company_codes_association_ticker_exchange,title,snippet,company_codes_lineage_ticker_exchange,an,company_codes_occur_ticker_exchange,...,document_type,modification_datetime,company_codes,action,region_codes,market_index_codes,company_codes_about_ticker_exchange,company_codes_occur,section,company_codes_relevance_ticker_exchange
0,Dow Jones Institutional News,,1739355044000,,,Press Release: XPENG announces its official la...,"\n -- XPENG officially enters the UK, streng...",,DJDN000020250212el2c001sa,,...,article,1739355044000,",bp,hkexch,hkexch,hsflia,hsflia,imotol,imotol,...",add,",eurz,uk,weurz,",",xdjgic,xdjglc,xdjiic,xnyci,",,",myjjcc,imotol,hsflia,hkexch,",,",MMTOF:PSGM,7211:XTKS,MMTOY:PSGM,MMO:XFRA,"
1,Securities and Exchange Commission (SEC) Filings,,1743191927000,,,First Busey Corporation - Statement of Changes...,Access the original document here\n\nStatement...,,SAEXC00020250328el3s00cin,,...,article,1743191927000,",firbus,firbus,firbus,seexc,seexc,",add,",namz,usa,",,,",seexc,firbus,",,
2,PR Newswire,,1744722047000,,,PLAUD.AI Acquires YC-Backed StarJar to Power I...,"SAN FRANCISCO, April 15, 2025 /PRNewswire/ -- ...",,PRN0000020250415el4f000cs,",INTU:XWBO,1INTU:XMIL,INTU:XNAS,INTU:XMEX,ITU:...",...,article,1744722047000,",amzcom,amzcom,gognew,gognew,ituit,ituit,linkd...",add,",namz,sfra,usa,usca,usw,",,,",yoinco,teslmi,pkxwks,linkd,ituit,gognew,amzcom,",,",INTU:XWBO,1INTU:XMIL,INTU:XNAS,INTU:XMEX,ITU:..."
3,Public Companies News and Documents via PUBT,,1731288771000,,,Shionogi & Co. Ltd. - SHIONOGI Story 2 :Busine...,Access the original document here\n\nSHIONOGI ...,,LCDVP00020241111ekbb0020f,",SGIOY:PSGM,SGIOF:PSGM,SH0:XFRA,SH00:XMUN,4507...",...,article,1731288771000,",pingin,pingin,shimc,shimc,shnogi,shnogi,shnog...",add,",apacz,asiaz,china,chinaz,devgcoz,dvpcoz,easia...",,",SGIOY:PSGM,SGIOF:PSGM,SH0:XFRA,SH00:XMUN,4507...",",soneti,shnogi,shimc,pingin,",,",SGIOY:PSGM,SGIOF:PSGM,SH0:XFRA,SH00:XMUN,4507..."
4,Canada Stockwatch,,1736895739000,,,Torex Gold Provides 2025 Operational Guidance ...,(All amounts expressed in U.S. dollars unless ...,,CNSW000020250114el1e00ifx,,...,article,1736895739000,",hydgld,hydgld,hydgld,",add,",cana,caon,namz,toron,",,,",hydgld,",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223516,Securities and Exchange Commission (SEC) Filings,,1741210800000,,,Coinbase Global Inc. - Statement of Changes in...,Access the original document here\n\nStatement...,,SAEXC00020250305el3500s1l,,...,article,1741210800000,",coinba,coinba,coinba,seexc,seexc,",add,",namz,usa,",,,",seexc,coinba,",,
223517,Public Companies News and Documents via PUBT,,1742297491000,,,Almawave S.p.A. - Almawave and Oracle: strateg...,Access the original document here\n\nAlmawave ...,,LCDVP00020250318el3i00kwv,,...,article,1742297491000,",hggngf,hggngf,kosco,kosco,orcle,orcle,orcle,p...",add,",eecz,eurz,italy,lombar,medz,milan,weurz,",",xf500,",,",prital,orcle,kosco,hggngf,",,
223518,Global Banking News,,1727705061000,,,QNB’s share buyback gets regulatory approval,Qatar-based banking firm QNB Group has said th...,,GLOBAN0020240930ek9u000b9,,...,article,1727705061000,",qatfma,qatfma,qatfma,qbnk,qbnk,qbnk,qma,qma,q...",add,,,,",qnbusa,qma,qbnk,qatfma,",,
223519,Securities and Exchange Commission (SEC) Filings,,1715978722000,,,FNCB Bancorp Inc. - Amendment to Statement of ...,Access the original document here\n\nAmendment...,,SAEXC00020240517ek5h00mt5,,...,article,1715978722000,",fstdun,fstdun,fstdun,",add,",namz,usa,",,,",fstdun,",,


### First thing: creating custom word_count (Factiva has it's own)

In [3]:
import re
RE_WORD = re.compile(r'\b\w+\b') # 6% mean error from factiva wc, because factiva concats all the text cols

df['alt_word_count'] = df['body'].parallel_apply(
    lambda s: len(RE_WORD.findall(s)))
df['wc_abs_diff'] = np.absolute(df['alt_word_count'] - df['word_count'])
df['wc_abs_diff_pct'] = df['wc_abs_diff'] / df['word_count']
df[['word_count', 'alt_word_count', 'wc_abs_diff', 'wc_abs_diff_pct']].describe() # WORD_RE

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11177), Label(value='0 / 11177')))…

Unnamed: 0,word_count,alt_word_count,wc_abs_diff,wc_abs_diff_pct
count,223521.0,223521.0,223521.0,223521.0
mean,2208.282953,2057.573655,150.722451,0.060454
std,5315.278784,3692.947964,2769.262868,0.078002
min,75.0,14.0,2.0,0.000239
25%,799.0,736.0,14.0,0.010736
50%,993.0,975.0,54.0,0.028736
75%,1437.0,1374.0,70.0,0.075325
max,306593.0,43289.0,267679.0,0.888268


### Let's do some cleaning
* Removing most common lines that (mostly) don't bring value

In [4]:
def most_common_lines(
    texts: Collection[str],
    top_k: int = 1000,
    min_pct_docs: float = 0.03,
    min_len: int = 10,
    max_len: int = 160,
) -> pd.DataFrame:
    """
    Return the `top_k` most frequent full lines across the corpus,
    excluding very short or very long ones (likely not boiler-plate).
    """
    n_texts = len(texts)
    counter = Counter()
    for doc in tqdm(texts):
        for line in doc.splitlines():
            stripped = line.strip()
            if min_len <= len(stripped) <= max_len:
                counter[stripped] += 1

    df = pd.DataFrame(counter.most_common(top_k), columns=["line", "count"])
    df["pct_docs"] = df["count"] / n_texts
    df = df.query("pct_docs >= @min_pct_docs").sort_values("count", ascending=False)
    
    return df

boilerplate_lines = most_common_lines(df['body'])
boilerplate_lines

  0%|          | 0/223521 [00:00<?, ?it/s]

Unnamed: 0,line,count,pct_docs
0,--------------- --------------------- ------...,219923,0.983903
1,-------------- --------------------- -------...,129212,0.578075
2,Status: Completed,109451,0.489668
3,(END) Dow Jones Newswires,75546,0.337982
4,The currency of all prices and other monetary ...,72451,0.324135
...,...,...,...
214,right exists:,6844,0.030619
215,----------- ----- ----------- -----,6840,0.030601
216,----------- ------ ---------- ------,6836,0.030583
217,(d) If an exempt fund manager connected with a...,6781,0.030337


In [5]:
# import matplotlib.pyplot as plt

# tmp = most_common_lines(df['body'], top_k=2000)
# plt.hist(tmp["pct_docs"], bins=200, log=True)
# plt.xlabel("Fraction of documents a line appears in")
# plt.ylabel("Number of distinct lines")
# plt.show()

In [6]:
import re
from typing import Iterable

from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize

HTML_TAG_SNIF = re.compile(r"<(p|div|br|h[1-6]|table|ul|ol)\b", re.I)

def html_to_parbreaks(html: str) -> str:
    """Very small helper: replace block tags with '\n\n', strip the rest."""
    soup = BeautifulSoup(html, "lxml")
    for br in soup.find_all("br"):
        br.replace_with("\n")
    for tag in soup.find_all(True):
        if tag.name.lower() in {"p", "div", "li", "table", "tr",
                                "ul", "ol", "section", "header",
                                "footer", "h1", "h2", "h3", "h4", "h5", "h6"}:
            tag.insert_before("\n\n")
            tag.insert_after("\n\n")
    text = soup.get_text(" ", strip=False)
    return re.sub(r"\n{3,}", "\n\n", text).strip()

def compile_boilerplate_regex(lines: Iterable[str]) -> re.Pattern:
    """
    Build a single regex OR-ing escaped boiler-plate lines.
    Use NON-capturing group `(?:...)` so re.sub can drop them easily.
    """
    escaped = [re.escape(l) for l in lines]
    pattern = r"^(?:%s)$" % "|".join(escaped)
    return re.compile(pattern, flags=re.MULTILINE)

def inject_paragraph_breaks(text: str,min_tokens: int = 40, max_tokens: int = 80) -> str:
     """
     Insert '\n\n' paragraph markers so that each paragraph has roughly
     min_tokens–max_tokens whitespace words.

     Needed to make TextTiler work with texts without paragraph breaks.
     """
     
     sentences = sent_tokenize(text)
     out, buf, n = [], [], 0
     
     for sent in sentences:
          tok = len(sent.split())
          buf.append(sent)
          n += tok
          
          if n >= max_tokens or (n>=min_tokens and re.search(r"[.!?]$", sent)):
               out.append(" ".join(buf))
               buf, n = [], 0
     if buf:
          out.append(" ".join(buf))
               
     return "\n\n".join(out)

def clean_document(
    raw_text: str,
    boilerplate_re: re.Pattern,
    min_caps_ratio: float = 0.6,
    max_short_tokens: int = 5,
) -> str:
    """
    1) Strip boiler-plate lines;
    2) Normalise paragraph breaks to exactly one *blank* line between paragraphs;
    3) Merge tiny paragraphs (datelines, bullets) into the next paragraph.
    """
    # -- 1. boiler-plate removal
    text = boilerplate_re.sub("", raw_text)

    # -- 2. normalise blank lines
    text = re.sub(r"\r\n|\r", "\n", text)        # convert CRLF → LF
    text = re.sub(r"\n{3,}", "\n\n", text.strip())

    # -- 3. collapse short / all-caps paragraphs
    parts = text.split("\n\n")
    good_parts = []
    buffer = []
    for p in parts:
        tokens = p.split()
        caps_ratio = sum(t.isupper() for t in tokens) / max(len(tokens), 1)
        if (len(tokens) <= max_short_tokens) or (caps_ratio >= min_caps_ratio):
            buffer.append(p)
            continue

        if buffer:
            p = " ".join(buffer) + " " + p
            buffer = []
        good_parts.append(p)
    if buffer:                          # dangling buffer at EOF
        good_parts.append(" ".join(buffer))

    return "\n\n".join(good_parts)

def preprocess_document(
    raw_text: str,
    boilerplate_re: re.Pattern,
    min_caps_ratio: float = 0.6,
    max_short_tokens: int = 5,
    inj_min_tokens: int = 40,
    inj_max_tokens: int = 80,
    trigger_ratio: float = 1.5,
) -> str:
    """
    1) Run `clean_document`.
    2) If *any* paragraph still exceeds `trigger_ratio * inj_max_tokens`
       OR there is only one paragraph at all, call `inject_paragraph_breaks`.
    Returns the fully pre-processed text, ready for TextTiling.
    """
    looks_like_html = HTML_TAG_SNIF.search(raw_text)
    
    text = html_to_parbreaks(raw_text) if looks_like_html else raw_text
    
    
    text = clean_document(
        text,
        boilerplate_re,
        min_caps_ratio=min_caps_ratio,
        max_short_tokens=max_short_tokens,
    )
    
    paragraphs = text.split("\n\n")
    if len(paragraphs) == 1:
        need_injection = True
    else:
        longest = max(len(p.split()) for p in paragraphs)
        need_injection = longest > trigger_ratio * inj_max_tokens

    if need_injection:
        text = inject_paragraph_breaks(
            text,
            min_tokens=inj_min_tokens,
            max_tokens=inj_max_tokens,
        )
    return text

bp_regex = compile_boilerplate_regex(boilerplate_lines['line'].tolist())

df['body_clean'] = df['body'].parallel_apply(lambda x: preprocess_document(x, bp_regex))
df['body_clean_wc'] = df['body_clean'].parallel_apply(
    lambda s: len(RE_WORD.findall(s)))

df[['word_count', 'alt_word_count', 'body_clean_wc']].describe()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11177), Label(value='0 / 11177')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11177), Label(value='0 / 11177')))…

Unnamed: 0,word_count,alt_word_count,body_clean_wc
count,223521.0,223521.0,223521.0
mean,2208.282953,2057.573655,1989.540634
std,5315.278784,3692.947964,3701.546339
min,75.0,14.0,14.0
25%,799.0,736.0,606.0
50%,993.0,975.0,874.0
75%,1437.0,1374.0,1338.0
max,306593.0,43289.0,43289.0


## Optimizing Tiling itself
### Part One: Quality

Let's check how good are resulting sections using:
* Depth Score - [ALGO](https://www.nltk.org/_modules/nltk/tokenize/texttiling.html#TextTilingTokenizer)
* SBERT edge - cosine similarity between the first and last sentence in each tile, computed with a Sentence-BERT encoder.

In [7]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from typing import Dict, Tuple, Any, List
from nltk.tokenize import TextTilingTokenizer, sent_tokenize
from sentence_transformers import SentenceTransformer, util

class CustomTextTilingTokenizer(TextTilingTokenizer):
    """
    A wrapper around NLTK's TextTilingTokenizer
    Current functionality:
    
    keeps:
        • self.depth_scores   – valley depths (len = #gaps)
        • self.boundaries     – 0/1 flag per gap (len = #gaps)
 
    The regular return value (a list of tiles) is unchanged.
    """
    def _depth_scores(self, scores):
        self.depth_scores = super()._depth_scores(scores)
        return self.depth_scores

    def _identify_boundaries(self, depth_scores):
        self.boundaries = super()._identify_boundaries(depth_scores)
        return self.boundaries
   

class TextTiler: # Initial Tiler: slow and consequential
     def __init__(self, text_tiling_params: Dict|None = None, sbert_model: str="intfloat/multilingual-e5-small"):
          self.encoder = SentenceTransformer(sbert_model)
          
          self.tt_params = text_tiling_params or dict(w=25, k=7, smoothing_width=4, smoothing_rounds=2, cutoff_policy="HC")
          self.splitter = CustomTextTilingTokenizer(**self.tt_params) # type: ignore
     
     @staticmethod
     def _first_last_sentences(tile: str) -> Tuple[str, str]:
          sents = sent_tokenize(tile)
          return (sents[0], sents[-1]) if sents else ("", "")
     
     def segment_and_score(self, text: str) -> List[Tuple[str, float, float]]:
          try: 
               tiles : List[str] = self.splitter.tokenize(text) # type: ignore
          except ValueError as e:
               print(f"TextTilingTokenizer error: {e}; text length: {len(text)}; text fragment: {text[:100]}")
               return []          
          
          depth = [None] + self.splitter.depth_scores[: len(tiles) - 1]

        # SBERT edge coherence
          pairs = [self._first_last_sentences(t) for t in tiles]
          flat  = [s for p in pairs for s in p if s]
          cos   = []
          if flat:
               embs = self.encoder.encode(flat, batch_size=32, show_progress_bar=False)
               cos  = util.cos_sim(embs[0::2], embs[1::2]).diagonal().tolist()

          edges, j = [], 0
          for a, b in pairs:
               if a and b:
                    edges.append(cos[j]); j += 1
               else:
                    edges.append(None)

          return list(zip(tiles, depth, edges)) # type: ignore


### Let's calculate proxy-metrics as follows:

In [8]:
from transformers import AutoTokenizer
TOK = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-small",model_max_length=512)
def bpe_len(text: str) -> int:
    """
    Return the number of BPE tokens in the text.
    """
    return len(TOK.encode(text, add_special_tokens=False))

def agg_stats(segm_results: Iterable[List[tuple]]) -> Dict[str, Any]:
     tiles_per_doc, lengths, edges, depths = [], [], [], []
     
     for doc_tiles in tqdm(segm_results, desc="Aggregating stats per text"):
         if not doc_tiles:
             continue
         tiles_per_doc.append(len(doc_tiles))
         for tile, depth, edge in doc_tiles:
                lengths.append(bpe_len(tile))
                if edge is None:
                    raise ValueError("Edge coherence is None, check the model and input text.")                
                edges.append(edge)
                if depth is not None:
                    depths.append(depth)
     
     out = {}
     
     out["n_docs"] = len(tiles_per_doc)
     out["tiles_doc_mean"] = np.mean(tiles_per_doc)
     out["tiles_doc_std"] = np.std(tiles_per_doc)
     
     for p in [25, 50, 95, 99]: #tile lengths percentiles
            out[f"tile_len_{p}"] = np.percentile(lengths, p)
     out["len_wp_mean"] = np.mean(lengths)
     out["len_wp_max"] = np.max(lengths,)
     
     def _q(a, q): return np.percentile(a, q) if a else np.nan
     
     out["edge_median"]   = _q(edges, 50)
     out["edge_p25"]      = _q(edges, 25)
     out["edge_p75"]      = _q(edges, 75)
     out["depth_median"]  = _q(depths, 50)
     out["depth_p75"]     = _q(depths, 75)
     out["depth_p25"]     = _q(depths, 25)     
 
     return out
 
def pretty_print_metrics(d, prec: int = 4,verbose: bool = True) -> None:
    """
    Print metrics with fixed precision and without the NumPy dtype noise.
    """
    clean = {
        k: (
            round(float(v), prec)               # numpy → plain float
            if isinstance(v, (np.floating, float))
            else round(v, prec) if isinstance(v, (np.integer, int))
            else int(v) if isinstance(v, np.integer)  # numpy int → plain int
            else v
        )
        for k, v in d.items()
    }
    if verbose:
        pprint(clean, compact=True)
    return

os.environ["TOKENIZERS_PARALLELISM"] = "true"     # enable Rust thread-pool

def agg_stats_fast(
    segm_results: Iterable[List[Tuple[str, float, float]]],
    *,
    batch_size: int = 1024,
) -> Dict[str, Any]:
    """
    segm_results: iterable of lists produced by ParallelTextTiler
                  each inner tuple = (tile_text, depth_score, edge_cosine)
    Returns the same dict as your original agg_stats, but ~10× faster.
    """
    tiles_per_doc, all_tiles, edges, depths = [], [], [], []

    for doc_tiles in segm_results:
        if not doc_tiles:
            continue
        tiles_per_doc.append(len(doc_tiles))
        for tile, depth, edge in doc_tiles:
            all_tiles.append(tile)
            if edge is not None:
                edges.append(edge)
            if depth is not None:
                depths.append(depth)

    lens = np.empty(len(all_tiles), dtype=np.uint16)
    idx  = 0
    for start in tqdm(range(0, len(all_tiles), batch_size),
                      desc="Token-count batches",
                      leave=False):
        batch_text = all_tiles[start : start + batch_size]
        ids = TOK(batch_text, add_special_tokens=False)["input_ids"]
        for seq in ids:
            lens[idx] = len(seq); idx += 1

    lengths_np  = lens
    edges_np    = np.asarray(edges,  dtype=np.float32)
    depths_np   = np.asarray(depths, dtype=np.float32)
    tiles_doc_np = np.asarray(tiles_per_doc, dtype=np.uint16)

    out = {
        "n_docs":            tiles_doc_np.size,
        "tiles_doc_mean":    float(tiles_doc_np.mean()),
        "tiles_doc_std":     float(tiles_doc_np.std()),
        "len_wp_mean":       float(lengths_np.mean()),
        "len_wp_max":        int(lengths_np.max()),
        "tile_len_25":       float(np.percentile(lengths_np, 25)),
        "tile_len_50":       float(np.percentile(lengths_np, 50)),
        "tile_len_95":       float(np.percentile(lengths_np, 95)),
        "tile_len_99":       float(np.percentile(lengths_np, 99)),
        "edge_median":       float(np.percentile(edges_np, 50)),
        "edge_p25":          float(np.percentile(edges_np, 25)),
        "edge_p75":          float(np.percentile(edges_np, 75)),
        "depth_median":      float(np.percentile(depths_np, 50)),
        "depth_p25":         float(np.percentile(depths_np, 25)),
        "depth_p75":         float(np.percentile(depths_np, 75)),
    }
    return out

### Let's parallelize tiling

In [13]:
import os
import torch

from functools import lru_cache, partial
from typing import Sequence, List, Tuple, Dict

os.environ.setdefault("TOKENIZERS_PARALLELISM", "true")
pandarallel.initialize(progress_bar=False, nb_workers=os.cpu_count()-2)


MAX_LEN = 512  # max BPE length for a single tile
PARAM_KEYS = ("w", "k", "smoothing_width", "smoothing_rounds", "cutoff_policy")
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

@lru_cache(maxsize=24)
def _get_splitter(params_tuple):
    params = dict(zip(PARAM_KEYS, params_tuple))
    return CustomTextTilingTokenizer(**params)

def _tiles_depth_static(text: str, params_tuple) -> List[Tuple[str, float, None]]:
    splitter = _get_splitter(params_tuple)
    try:
        tiles = splitter.tokenize(text)
    except ValueError:
        return [(text, 0., None)]  # return the whole text if it fails
    depth = [None] + splitter.depth_scores[: len(tiles) - 1]
    return [(t, d, None) for t, d in zip(tiles, depth)] # type: ignore

def _first_last_static(seg_list: Sequence[Tuple[str, float, None]]):
    out = []
    for idx, (tile, _, _) in enumerate(seg_list):
        sents = sent_tokenize(tile)
        if not sents:
            continue
        fst = sents[0]
        lst = sents[-1] if len(sents) >= 2 else sents[0]  # duplicate
        out.append((idx, fst, lst))
    return out


def rechunk(tile: str, tokenizer, max_len: int = MAX_LEN) -> List[str]:
    """
    Split an oversize tile so every chunk ≤ max_len BPE tokens.
    Tokenises each sentence *once*; O(N) time and memory.
    """
    sents = sent_tokenize(tile)
    if not sents:
        return []

    # token lengths per sentence (vectorised Rust call)
    lens  = [len(ids) for ids in tokenizer(sents,
                                           add_special_tokens=False)["input_ids"]]

    # fast path: tile already short
    if sum(lens) <= max_len:
        return [tile]

    out, buf, acc = [], [], 0
    for sent, sent_len in zip(sents, lens):
        # sentence longer than max_len by itself → hard-wrap token-wise
        if sent_len > max_len:
            if buf:
                out.append(" ".join(buf)); buf, acc = [], 0
            # chop the sentence itself
            toks = tokenizer.encode(sent, add_special_tokens=False)
            for i in range(0, len(toks), max_len):
                chunk_ids = toks[i : i + max_len]
                out.append(tokenizer.decode(chunk_ids))
            continue

        # normal case
        if acc + sent_len > max_len:
            out.append(" ".join(buf))
            buf, acc = [], 0
        buf.append(sent); acc += sent_len

    if buf:
        out.append(" ".join(buf))
    return out



class ParallelTextTiler:
    def __init__(
        self,
        text_tiling_params: Dict | None = None,
        sbert_model_id: str = "intfloat/multilingual-e5-small",
        batch_size: int = 512,
    ):
        os.environ["TOKENIZERS_PARALLELISM"] = "true"  
        self.batch_size = batch_size
        
        self.text_tiling_params = text_tiling_params or dict(
            w=25, k=6, smoothing_width=4, smoothing_rounds=2, cutoff_policy="HC"
        )       
        self.model = SentenceTransformer(sbert_model_id, device=DEVICE).half()   # single GPU/CPU copy
        _ = self.model.encode(["warm-up"] * self.batch_size, 
                batch_size=batch_size, convert_to_tensor=True, show_progress_bar=False)  # JIT warm-up
        self._tokenizer = self.model.tokenizer

    def segment_dataframe(self, df: pd.DataFrame, col: str, with_edge: bool = True, keep_depth: bool=True) -> pd.Series:
        params_tuple = tuple(self.text_tiling_params[k] for k in PARAM_KEYS)
        logging.info(f"Step 1: Tiling texts")
        # -- CPU pass #1 : tiling ---------------------------------------
        tiles_func = partial(_tiles_depth_static, params_tuple=params_tuple)
        segments   = df[col].parallel_apply(tiles_func)

        segments = segments.apply(
                lambda segs: [
                    (t2, d, e)
                    for t, d, e in segs
                    for t2 in rechunk(t, self._tokenizer)
                ]
            )
               
        if not with_edge:
            if keep_depth:
                return segments
            else: # just return tile texts
                return segments.apply(lambda lst: [t for t, _,_ in lst])
        
        # -- CPU pass #2 : extract first/last sentences -------------------------
        logging.info(f"Step 2: Extracting first/last sentences")
        pairs_per_doc = segments.parallel_apply(_first_last_static)

        pairs = [
            (doc, idx, fst, lst)
            for doc, lsts in pairs_per_doc.items()
            for idx, fst, lst in lsts
        ]
        if not pairs:
            return segments

        flat = [s for p in pairs for s in (p[2], p[3])]
        logging.info(f"Step 3: Encoding {len(flat)} first/last sentences in pairs")
        
        cosines: List[float] = [] ## cacl edge scores between first and last sentences
        for i in tqdm(range(0, len(flat), self.batch_size), desc="SBERT edge pass"):
            with torch.inference_mode():
                embs = self.model.encode(flat[i:i+self.batch_size], 
                                    batch_size=self.batch_size,
                                    convert_to_tensor=True,
                                    normalize_embeddings=False,
                                    show_progress_bar=False)
                cos   = util.cos_sim(embs[0::2], embs[1::2]).diagonal().tolist()
                cosines.extend(cos)

        for (doc, idx, *_), edge in zip(pairs, cosines):
            tile, depth, _ = segments.at[doc][idx]
            segments.at[doc][idx] = (tile, depth, edge)

        return segments

INFO: Pandarallel will run on 22 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Testing inference

In [None]:
tiler = ParallelTextTiler(batch_size=512)        # tweak params if you like

sample = df.sample(1000, random_state=42)
sample["segments"] = tiler.segment_dataframe(sample, col="body_clean", with_edge=False, keep_depth=False)
sample["num_segments"] = sample["segments"].apply(len)

sample[['body_clean', 'segments', 'num_segments']]

2025-06-04 18:03:10,702 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-small
2025-06-04 18:03:14,519 - INFO - Step 1: Tiling texts
Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors


In [10]:
tiler = ParallelTextTiler(batch_size=512)        # tweak params if you like
sample = df.sample(10000, random_state=42)
segments = tiler.segment_dataframe(sample, col="body_clean")
metrics  = agg_stats_fast(segments, )
pretty_print_metrics(metrics)

#bs: 128 - 1935 batches (247k tiles) -> 3:55s; 8m 11.7s total | fp-16 + inference: 1:41s; 5m 43s total | JIT once + tok_parallelism: 1:24s; 5m 19.2s total
#bs: 256 - 968 batches (247k tiles) -> 4:25s; 8m 59.7s total | fp-16 + inference: 1:56s; 6m 29.6s total | JIT once + tok_parallelism: 1:34s
#bs: 512 - 484 batches (247K tiles) -> 5:05s; 9m 59.3s total | fp-16 + inference+ JIT once + tok_parallelism: 1:44s; 5m 5.4s total

2025-06-04 16:25:32,146 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-small
2025-06-04 16:25:36,942 - INFO - Step 1: Tiling texts
Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors
2025-06-04 16:29:43,717 - INFO - Step 2: Extracting first/last sentences
2025-06-04 16:29:48,332 - INFO - Step 3: Encoding 257326 first/last sentences in pairs


SBERT edge pass:   0%|          | 0/503 [00:00<?, ?it/s]

Token-count batches:   0%|          | 0/126 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


{'depth_median': 0.0306,
 'depth_p25': 0.0,
 'depth_p75': 0.1421,
 'edge_median': 0.8335,
 'edge_p25': 0.8047,
 'edge_p75': 0.8711,
 'len_wp_max': 514,
 'len_wp_mean': 230.1926,
 'n_docs': 9958,
 'tile_len_25': 147.0,
 'tile_len_50': 209.0,
 'tile_len_95': 453.0,
 'tile_len_99': 512.0,
 'tiles_doc_mean': 12.9206,
 'tiles_doc_std': 22.3643}


### Hyperparams optimization
How well algo is tiling according to:
* Edge_score (how similar are start/end sentences of paragraph)
* Depth_score of TT Algo
* tile size: it should fit 512 input tokens of embedder

### First: choosing sample size


In [None]:
tiler = ParallelTextTiler(batch_size=256)        # tweak params if you like

sample_size_candidates = [100, 500, 1000, 5000, 10000, 20000, 50000, 100000]
metrics_per_sample_size = {}

for sample_size in tqdm(sample_size_candidates, desc="Calulating stats for different sample sizes"):
    logging.info(f"Sample size: {sample_size}")
    segments = tiler.segment_dataframe(df.sample(sample_size, random_state=42), col="body_clean")
    metrics  = agg_stats_fast(segments)
    metrics_per_sample_size[sample_size] = metrics
    # pretty_print_metrics(metrics)

2025-06-03 13:54:52,428 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-small


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO: Pandarallel will run on 22 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Sampling sizes:   0%|          | 0/8 [00:00<?, ?it/s]

2025-06-03 13:54:55,994 - INFO - 
Sample size: 100
2025-06-03 13:54:55,997 - INFO - Step 1: Tiling texts
2025-06-03 13:55:07,446 - INFO - Step 2: Extracting first/last sentences
2025-06-03 13:55:11,195 - INFO - Step 3: Encoding 2764 first/last sentences in pairs


SBERT edge pass:   0%|          | 0/11 [00:00<?, ?it/s]

Token-count batches:   0%|          | 0/2 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors
2025-06-03 13:55:12,795 - INFO - 
Sample size: 500
2025-06-03 13:55:12,803 - INFO - Step 1: Tiling texts


{'depth_median': 0.037,
 'depth_p25': 0.0,
 'depth_p75': 0.1643,
 'edge_median': 0.8354,
 'edge_p25': 0.8042,
 'edge_p75': 0.8739,
 'len_wp_max': 2387,
 'len_wp_mean': 244.6628,
 'n_docs': 98,
 'tile_len_25': 146.25,
 'tile_len_50': 204.0,
 'tile_len_95': 469.0,
 'tile_len_99': 840.8,
 'tiles_doc_mean': 14.102,
 'tiles_doc_std': 25.5157}


2025-06-03 13:55:37,133 - INFO - Step 2: Extracting first/last sentences
2025-06-03 13:55:40,889 - INFO - Step 3: Encoding 13294 first/last sentences in pairs


SBERT edge pass:   0%|          | 0/52 [00:00<?, ?it/s]

Token-count batches:   0%|          | 0/7 [00:00<?, ?it/s]

2025-06-03 13:55:47,047 - INFO - 
Sample size: 1000
2025-06-03 13:55:47,055 - INFO - Step 1: Tiling texts


{'depth_median': 0.0339,
 'depth_p25': 0.0,
 'depth_p75': 0.1548,
 'edge_median': 0.8335,
 'edge_p25': 0.8042,
 'edge_p75': 0.8696,
 'len_wp_max': 10762,
 'len_wp_mean': 243.1002,
 'n_docs': 497,
 'tile_len_25': 148.0,
 'tile_len_50': 203.0,
 'tile_len_95': 465.7,
 'tile_len_99': 870.16,
 'tiles_doc_mean': 13.3742,
 'tiles_doc_std': 23.202}


2025-06-03 13:56:20,548 - INFO - Step 2: Extracting first/last sentences
2025-06-03 13:56:24,202 - INFO - Step 3: Encoding 25720 first/last sentences in pairs


SBERT edge pass:   0%|          | 0/101 [00:00<?, ?it/s]

Token-count batches:   0%|          | 0/13 [00:00<?, ?it/s]

2025-06-03 13:56:34,985 - INFO - 
Sample size: 5000
2025-06-03 13:56:35,012 - INFO - Step 1: Tiling texts


{'depth_median': 0.034,
 'depth_p25': 0.0,
 'depth_p75': 0.1544,
 'edge_median': 0.833,
 'edge_p25': 0.8042,
 'edge_p75': 0.8687,
 'len_wp_max': 10762,
 'len_wp_mean': 238.5807,
 'n_docs': 993,
 'tile_len_25': 146.0,
 'tile_len_50': 200.0,
 'tile_len_95': 456.0,
 'tile_len_99': 796.0,
 'tiles_doc_mean': 12.9507,
 'tiles_doc_std': 22.6542}


2025-06-03 13:58:19,774 - INFO - Step 2: Extracting first/last sentences
2025-06-03 13:58:23,839 - INFO - Step 3: Encoding 123716 first/last sentences in pairs


SBERT edge pass:   0%|          | 0/484 [00:00<?, ?it/s]

Token-count batches:   0%|          | 0/61 [00:00<?, ?it/s]

2025-06-03 13:59:14,932 - INFO - 
Sample size: 10000
2025-06-03 13:59:14,958 - INFO - Step 1: Tiling texts


{'depth_median': 0.0325,
 'depth_p25': 0.0,
 'depth_p75': 0.1501,
 'edge_median': 0.833,
 'edge_p25': 0.8042,
 'edge_p75': 0.8672,
 'len_wp_max': 16831,
 'len_wp_mean': 239.1933,
 'n_docs': 4983,
 'tile_len_25': 145.0,
 'tile_len_50': 201.0,
 'tile_len_95': 458.0,
 'tile_len_99': 776.43,
 'tiles_doc_mean': 12.4138,
 'tiles_doc_std': 20.866}


2025-06-03 14:02:22,337 - INFO - Step 2: Extracting first/last sentences
2025-06-03 14:02:26,874 - INFO - Step 3: Encoding 247588 first/last sentences in pairs


SBERT edge pass:   0%|          | 0/968 [00:00<?, ?it/s]

Token-count batches:   0%|          | 0/121 [00:00<?, ?it/s]

2025-06-03 14:04:09,064 - INFO - 
Sample size: 20000
2025-06-03 14:04:09,107 - INFO - Step 1: Tiling texts


{'depth_median': 0.0322,
 'depth_p25': 0.0,
 'depth_p75': 0.1494,
 'edge_median': 0.833,
 'edge_p25': 0.8042,
 'edge_p75': 0.8672,
 'len_wp_max': 19780,
 'len_wp_mean': 239.2474,
 'n_docs': 9958,
 'tile_len_25': 145.0,
 'tile_len_50': 201.0,
 'tile_len_95': 458.0,
 'tile_len_99': 768.0,
 'tiles_doc_mean': 12.4316,
 'tiles_doc_std': 21.2429}


2025-06-03 14:10:12,050 - INFO - Step 2: Extracting first/last sentences
2025-06-03 14:10:17,497 - INFO - Step 3: Encoding 495160 first/last sentences in pairs


SBERT edge pass:   0%|          | 0/1935 [00:00<?, ?it/s]

Token-count batches:   0%|          | 0/242 [00:00<?, ?it/s]

2025-06-03 14:13:41,472 - INFO - 
Sample size: 50000
2025-06-03 14:13:41,568 - INFO - Step 1: Tiling texts


{'depth_median': 0.0323,
 'depth_p25': 0.0,
 'depth_p75': 0.1489,
 'edge_median': 0.833,
 'edge_p25': 0.8042,
 'edge_p75': 0.8672,
 'len_wp_max': 21119,
 'len_wp_mean': 239.692,
 'n_docs': 19910,
 'tile_len_25': 145.0,
 'tile_len_50': 201.0,
 'tile_len_95': 459.0,
 'tile_len_99': 769.21,
 'tiles_doc_mean': 12.435,
 'tiles_doc_std': 21.485}


2025-06-03 14:29:28,549 - INFO - Step 2: Extracting first/last sentences
2025-06-03 14:29:37,761 - INFO - Step 3: Encoding 1251688 first/last sentences in pairs


SBERT edge pass:   0%|          | 0/4890 [00:00<?, ?it/s]

Token-count batches:   0%|          | 0/612 [00:00<?, ?it/s]

2025-06-03 14:38:17,492 - INFO - 
Sample size: 100000


{'depth_median': 0.0326,
 'depth_p25': 0.0,
 'depth_p75': 0.1492,
 'edge_median': 0.833,
 'edge_p25': 0.8042,
 'edge_p75': 0.8677,
 'len_wp_max': 22350,
 'len_wp_mean': 240.3018,
 'n_docs': 49780,
 'tile_len_25': 145.0,
 'tile_len_50': 201.0,
 'tile_len_95': 460.0,
 'tile_len_99': 779.0,
 'tiles_doc_mean': 12.5722,
 'tiles_doc_std': 21.6975}


2025-06-03 14:38:19,608 - INFO - Step 1: Tiling texts
2025-06-03 15:08:28,930 - INFO - Step 2: Extracting first/last sentences
2025-06-03 15:08:45,430 - INFO - Step 3: Encoding 2492986 first/last sentences in pairs


SBERT edge pass:   0%|          | 0/9739 [00:00<?, ?it/s]

Token-count batches:   0%|          | 0/1218 [00:00<?, ?it/s]

{'depth_median': 0.0325,
 'depth_p25': 0.0,
 'depth_p75': 0.1489,
 'edge_median': 0.833,
 'edge_p25': 0.8042,
 'edge_p75': 0.8672,
 'len_wp_max': 23990,
 'len_wp_mean': 240.7467,
 'n_docs': 99561,
 'tile_len_25': 146.0,
 'tile_len_50': 201.0,
 'tile_len_95': 460.0,
 'tile_len_99': 790.0,
 'tiles_doc_mean': 12.5199,
 'tiles_doc_std': 21.4819}


In [15]:
metrics_df = pd.DataFrame.from_dict(metrics_per_sample_size).T
metrics_df.to_csv("../../data/text_tiling_metrics_based_on_sample_size.csv", index_label="sample_size")
metrics_df

Unnamed: 0,n_docs,tiles_doc_mean,tiles_doc_std,len_wp_mean,len_wp_max,tile_len_25,tile_len_50,tile_len_95,tile_len_99,edge_median,edge_p25,edge_p75,depth_median,depth_p25,depth_p75
100,98.0,14.102041,25.515697,244.662808,2387.0,146.25,204.0,469.0,840.8,0.835449,0.804199,0.873901,0.037006,0.0,0.164321
500,497.0,13.374245,23.201956,243.100196,10762.0,148.0,203.0,465.7,870.16,0.833496,0.804199,0.869629,0.033906,0.0,0.154802
1000,993.0,12.950655,22.654162,238.580715,10762.0,146.0,200.0,456.0,796.0,0.833008,0.804199,0.868652,0.033987,0.0,0.154361
5000,4983.0,12.413807,20.865964,239.193298,16831.0,145.0,201.0,458.0,776.43,0.833008,0.804199,0.867188,0.032484,0.0,0.150076
10000,9958.0,12.431613,21.242928,239.247379,19780.0,145.0,201.0,458.0,768.0,0.833008,0.804199,0.867188,0.032225,0.0,0.149446
20000,19910.0,12.434957,21.484992,239.692015,21119.0,145.0,201.0,459.0,769.21,0.833008,0.804199,0.867188,0.032297,0.0,0.148912
50000,49780.0,12.572198,21.697453,240.301783,22350.0,145.0,201.0,460.0,779.0,0.833008,0.804199,0.867676,0.032603,0.0,0.149209
100000,99561.0,12.519892,21.481855,240.74668,23990.0,146.0,201.0,460.0,790.0,0.833008,0.804199,0.867188,0.03247,0.0,0.148883


### Running hyperparam search
#### Sample_size: 5000


In [None]:

import itertools, time, pandas as pd
import gc, torch
SAMPLE_SIZE=5000
sample_df = df.sample(SAMPLE_SIZE, random_state=42)

cutoff_policy = ["HC", "LC"],
w = [15,20,25,30]
k = [6, 7, 8, 9, 10]
smoothing_width = [2,4,6]
smoothing_rounds = [1,2]

In [11]:
# grid = list(itertools.product( # wider combo, 240 options - too long to run
#     ["HC", "LC"],          # cutoff_policy
#     [15, 20, 25, 30],      # w
#     [6, 7, 8, 9, 10],      # k
#     [2, 4, 6],             # smoothing_width
#     [1, 2],                # smoothing_rounds
# ))

grid = list(itertools.product( # 6 most optimal combos
    [("HC", 25, 6), ("HC", 25, 7), ("HC", 20, 8),
     ("HC", 20, 7), ("LC", 25, 6), ("LC", 25, 7)],
    [4],              # smoothing_width
    [2],              # smoothing_rounds
))
logging.info(f"{len(grid)} total configurations to test")

def eval_config(cfg)->Dict[str, Any]:
    (cutoff, w, k), sm_w, sm_r = cfg
    params = dict(
        cutoff_policy=cutoff,
        w=w,
        k=k,
        smoothing_width=sm_w,
        smoothing_rounds=sm_r,
    )
    tiler = ParallelTextTiler(
        text_tiling_params=params,
        batch_size=128,
    )

    start = time.time()
    segs  = tiler.segment_dataframe(sample_df, col="body_clean",
                                    with_edge=True, keep_depth=True)
    m = agg_stats_fast(segs, batch_size=1024)
    m.update(params)                        
    m["wall_sec"] = time.time() - start
    
    del segs, tiler  
    for _ in range(3):
        torch.cuda.empty_cache() 
        gc.collect()             
    
    return m

records = [eval_config(cfg) for cfg in tqdm(grid, desc="Grid search")]

2025-06-03 17:32:53,016 - INFO - 6 total configurations to test


Grid search:   0%|          | 0/6 [00:00<?, ?it/s]

2025-06-03 17:32:53,023 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-small
2025-06-03 17:32:56,838 - INFO - Step 1: Tiling texts
Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors
2025-06-03 17:35:12,175 - INFO - Step 2: Extracting first/last sentences
2025-06-03 17:35:16,063 - INFO - Step 3: Encoding 128636 first/last sentences in pairs


SBERT edge pass:   0%|          | 0/1005 [00:00<?, ?it/s]

Token-count batches:   0%|          | 0/63 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
2025-06-03 17:36:03,881 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-small
2025-06-03 17:36:06,865 - INFO - Step 1: Tiling texts
Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors
2025-06-03 17:38:28,256 - INFO - Step 2: Extracting first/last sentences
2025-06-03 17:38:32,153 - INFO - Step 3: Encoding 129716 first/last sentences in pairs


SBERT edge pass:   0%|          | 0/1014 [00:00<?, ?it/s]

Token-count batches:   0%|          | 0/64 [00:00<?, ?it/s]

2025-06-03 17:39:20,844 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-small
2025-06-03 17:39:23,558 - INFO - Step 1: Tiling texts
Token indices sequence length is longer than the specified maximum sequence length for this model (544 > 512). Running this sequence through the model will result in indexing errors
2025-06-03 17:42:19,277 - INFO - Step 2: Extracting first/last sentences
2025-06-03 17:42:23,206 - INFO - Step 3: Encoding 153534 first/last sentences in pairs


SBERT edge pass:   0%|          | 0/1200 [00:00<?, ?it/s]

Token-count batches:   0%|          | 0/75 [00:00<?, ?it/s]

2025-06-03 17:43:17,418 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-small
2025-06-03 17:43:20,427 - INFO - Step 1: Tiling texts
Token indices sequence length is longer than the specified maximum sequence length for this model (544 > 512). Running this sequence through the model will result in indexing errors
2025-06-03 17:46:21,583 - INFO - Step 2: Extracting first/last sentences
2025-06-03 17:46:29,370 - INFO - Step 3: Encoding 152446 first/last sentences in pairs


SBERT edge pass:   0%|          | 0/1191 [00:00<?, ?it/s]

Token-count batches:   0%|          | 0/75 [00:00<?, ?it/s]

2025-06-03 17:47:24,070 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-small
2025-06-03 17:47:27,956 - INFO - Step 1: Tiling texts
Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors
2025-06-03 17:49:57,027 - INFO - Step 2: Extracting first/last sentences
2025-06-03 17:50:05,258 - INFO - Step 3: Encoding 128636 first/last sentences in pairs


SBERT edge pass:   0%|          | 0/1005 [00:00<?, ?it/s]

Token-count batches:   0%|          | 0/63 [00:00<?, ?it/s]

2025-06-03 17:50:56,388 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-small
2025-06-03 17:50:59,429 - INFO - Step 1: Tiling texts
Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors
2025-06-03 17:53:27,857 - INFO - Step 2: Extracting first/last sentences
2025-06-03 17:53:34,254 - INFO - Step 3: Encoding 129716 first/last sentences in pairs


SBERT edge pass:   0%|          | 0/1014 [00:00<?, ?it/s]

Token-count batches:   0%|          | 0/64 [00:00<?, ?it/s]

In [13]:
df_grid = pd.DataFrame(records)
df_grid.to_csv("../../data/text_tiling_grid_search_results.csv", index=False)
df_grid

Unnamed: 0,n_docs,tiles_doc_mean,tiles_doc_std,len_wp_mean,len_wp_max,tile_len_25,tile_len_50,tile_len_95,tile_len_99,edge_median,...,edge_p75,depth_median,depth_p25,depth_p75,cutoff_policy,w,k,smoothing_width,smoothing_rounds,wall_sec
0,4983,12.907485,21.959527,230.046006,513,147.0,209.0,453.0,512.0,0.833984,...,0.871582,0.031087,0.0,0.144932,HC,25,6,4,2,187.026965
1,4983,13.015854,21.974242,228.130547,513,144.0,201.0,462.0,512.0,0.834473,...,0.871094,0.033047,0.0,0.149897,HC,25,7,4,2,193.963699
2,4992,15.378005,26.607478,192.762711,513,117.0,170.0,399.0,512.0,0.838379,...,0.881348,0.035386,0.0,0.150305,HC,20,8,4,2,233.841622
3,4992,15.26903,26.62527,194.138397,513,118.0,174.0,393.0,512.0,0.836914,...,0.879883,0.031905,0.0,0.143178,HC,20,7,4,2,243.620938
4,4983,12.907485,21.959527,230.046006,513,147.0,209.0,453.0,512.0,0.833984,...,0.871582,0.031087,0.0,0.144932,LC,25,6,4,2,208.410726
5,4983,13.015854,21.974242,228.130547,513,144.0,201.0,462.0,512.0,0.834473,...,0.871094,0.033047,0.0,0.149897,LC,25,7,4,2,206.976838


In [14]:
best_set_of_params = {
     "cutoff_policy": "HC",
     "w": 25,
     "k": 6,
     "smoothing_width": 4,
     "smoothing_rounds": 2,
}

### Speed comparison: Full stats calc on sequential pipeline
*  30 min / 10k entries -> 300 mins/100k
* parallel on 22 workers:  28 mins / 100 k

speed up: 10x (depends on num_workers and ton of other staff)

In [None]:
tiler = ParallelTextTiler()        # tweak params if you like
sample = df.sample(100000, random_state=0)
tiled_texts = tiler.segment_dataframe(sample, col="body_clean", with_edge=False, keep_depth=False) #inference

In [None]:
tiler = TextTiler()

segments = [tiler.segment_and_score(text) for text in tqdm(sample['body_clean'])]
stats = agg_stats(segments)