In [2]:
import pandas as pd

# Load the preprocessed proposals workbook; the final cell reads its "body"
# column and adds a "filtered_body" column to this same frame.
df = pd.read_excel('../proposals_preprocess_0812.xlsx')

# Title & Sentence

In [3]:
# Deletion table for __remove_pattern, built once instead of on every call.
# The backslash is written as an explicit "\\": the previous literal used "\*",
# an invalid escape sequence that only worked because Python keeps the backslash
# (and warns about it). The set of removed characters is unchanged:
#   > # ' " \ * : - . and the digits 0-9
_REMOVE_PATTERN_TABLE = str.maketrans('', '', '>#\'"\\*:-.0123456789')


def __remove_pattern(text):
    """Return ``text`` without markup/numbering characters and outer whitespace.

    Every occurrence of ``> # ' " \\ * : - .`` and of the digits ``0-9`` is
    deleted, then leading and trailing whitespace is stripped. Whitespace
    inside the text is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    return text.translate(_REMOVE_PATTERN_TABLE).strip()

def __is_in(text, arr):
    """Tell whether at least one entry of ``arr`` occurs in ``text``.

    Args:
        text: The container searched with the ``in`` operator (a string at
            every call site in this notebook, so the test is substring
            containment).
        arr: Iterable of candidate words.

    Returns:
        True as soon as one entry is found in ``text``; False when none is
        found or ``arr`` is empty.
    """
    # A generator lets any() stop at the first hit instead of testing every
    # word and materialising a throw-away list first.
    return any(word in text for word in arr)

In [4]:
def _remove_duplicates_preserve_order(arr):
    """Return the distinct items of ``arr`` as a list, in first-seen order.

    Items must be hashable. When an item repeats, the first occurrence is the
    one that is kept.
    """
    seen = set()
    unique = []
    for item in arr:
        if item in seen:
            continue
        seen.add(item)
        unique.append(item)
    return unique

def _is_valid(text, category):
    """Check ``text`` against the length and keyword rules of ``category``.

    The text is first cleaned with ``__remove_pattern``. It is accepted only
    when all of the following hold:

    * ``category.limit.is_valid_length`` accepts the cleaned length,
    * the lower-cased cleaned text contains none of ``category.not_words``,
    * the lower-cased cleaned text contains at least one of ``category.words``.

    Args:
        text: Raw candidate line.
        category: Object exposing ``limit``, ``not_words`` and ``words``.

    Returns:
        True when the text passes every rule, otherwise False.
    """
    cleaned = __remove_pattern(text)
    if category.limit.is_valid_length(len(cleaned)):
        lowered = cleaned.lower()
        # An excluded word rejects the text before required words are checked.
        if __is_in(lowered, category.not_words):
            return False
        return __is_in(lowered, category.words)
    return False

In [10]:
from util.extract.data import categories
from util.extract.extract import extract_title_and_content

def _generate_title(text):
    """Insert ``#`` heading markers into ``text`` where a title is likely.

    Two passes are made:

    1. Every TL;DR marker is replaced by the heading line ``# TLDR``.
    2. The text is split into lines; each line is stripped and blank lines are
       dropped. A line is prefixed with ``"# "`` when its length is accepted
       by the "title" category's ``limit``, it contains no ``#`` yet, and its
       lower-cased form contains one of the category's ``words`` or
       ``other_words``.

    Args:
        text: Proposal body in which no title was found.

    Returns:
        The rewritten text, lines joined by a newline.
    """
    # The full phrase must be tried before the bare "TL;DR " marker: that
    # shorter marker is a substring of the phrase, so replacing it first
    # destroys the phrase and the long form would never match.
    markers = [
        "The TL;DR of the document:",
        "TLDR ", "TL;DR ", "TL,DR ", "TLDR: ", "TLDR; ",
    ]
    for marker in markers:
        text = text.replace(marker, "# TLDR\n")

    title_category = categories["title"]
    lines = []
    for line in text.split("\n"):
        line = line.strip()
        if not line:
            continue
        lowered = line.lower()
        if (
            title_category.limit.is_valid_length(len(lowered))
            and "#" not in lowered
            and (
                __is_in(lowered, title_category.words)
                or __is_in(lowered, title_category.other_words)
            )
        ):
            line = f"# {line}"
        lines.append(line)
    return "\n".join(lines)

def _get_title_and_content(text):
    """Render the titled sections of ``text`` that pass the "title" rules.

    ``extract_title_and_content`` supplies parallel sequences of titles and
    contents. Each pair whose title is accepted by ``_is_valid`` is rendered
    as ``"<Title>: {title}\\n{content}\\n"``. Identical rendered sections are
    collapsed to their first occurrence.

    Args:
        text: Proposal body to scan.

    Returns:
        The rendered sections joined by a newline, or None when no title
        qualifies.
    """
    titles, contents = extract_title_and_content(text)

    sections = []
    for title, content in zip(titles, contents):
        if not _is_valid(title, categories["title"]):
            continue
        sections.append(f"<Title>: {title}\n{content}\n")

    if not sections:
        return None
    return "\n".join(_remove_duplicates_preserve_order(sections))

# Minimum cleaned length (after __remove_pattern) for a line following a
# ":"-terminated sentence to be kept as its continuation.
MIN_LIMIT = 5


def _get_sentence(text):
    """Collect the lines of ``text`` that qualify as "sentence" entries.

    Each line accepted by ``_is_valid`` for the "sentence" category is emitted
    as ``"<Sentence>: {line}"``. When such a line ends with ``":"`` the next
    line is consumed as its continuation:

    * if its cleaned length is below ``MIN_LIMIT`` it is dropped;
    * otherwise it is emitted, together with every directly following line
      that is also long enough and starts with the same first character
      (e.g. the same bullet marker).

    Every sentence record is terminated by a ``"\\n"`` entry, which becomes
    blank lines once the entries are joined.

    Args:
        text: Proposal body to scan.

    Returns:
        The collected entries joined by a newline, or None when no line
        qualifies.
    """
    lines = text.split("\n")
    total = len(lines)
    content = []
    idx = 0  # index of the next line to examine
    while idx < total:
        line = lines[idx].strip()
        idx += 1
        # _is_valid cleans the line itself, so no separate pre-cleaning pass.
        if not _is_valid(line, categories["sentence"]):
            continue
        content.append(f"<Sentence>: {line}")

        # endswith() is safe on an empty string, unlike indexing with [-1].
        # The bound is `idx < total`, so the very last line of the text can
        # also serve as a continuation; the former `len - 2` guards skipped it.
        if line.endswith(":") and idx < total:
            follow_up = lines[idx].strip()
            idx += 1  # the line after a ":" sentence is consumed either way
            if len(__remove_pattern(follow_up)) >= MIN_LIMIT:
                content.append(follow_up)
                lead = follow_up[0]  # non-empty: cleaned length >= MIN_LIMIT
                while idx < total:
                    candidate = lines[idx].strip()
                    if (len(__remove_pattern(candidate)) < MIN_LIMIT
                            or candidate[0] != lead):
                        break
                    content.append(candidate)
                    idx += 1

        # Always close the record, including when the continuation was too
        # short; previously that path skipped the separator.
        content.append("\n")

    return "\n".join(content) if content else None


In [11]:
from util.extract.data import not_sentence

def filter_words(text):
    """Build the filtered summary (titled sections + sentences) for one body.

    Every entry of ``not_sentence`` is removed from ``text`` first. Sentences
    are then collected with ``_get_sentence`` and titled sections with
    ``_get_title_and_content``. When no titled section is found, headings are
    synthesised with ``_generate_title`` and the title extraction is retried.

    Args:
        text: Proposal body. Non-string values (e.g. NaN or None for an empty
            spreadsheet cell) are treated as "nothing to extract".

    Returns:
        The title block followed by the sentence block, separated by a
        newline; whichever of the two exists when only one does; or None when
        neither was found or ``text`` is not a string.
    """
    # A missing cell is not a str and has no .replace(); report "nothing
    # extracted" instead of crashing the whole DataFrame.apply run.
    if not isinstance(text, str):
        return None

    for v in not_sentence:
        text = text.replace(v, "")
    sentence_result = _get_sentence(text)

    result = _get_title_and_content(text)
    if not result:
        # No explicit headings: synthesise some and try once more.
        result = _get_title_and_content(_generate_title(text))

    parts = [part for part in (result, sentence_result) if part]
    return "\n".join(parts) if parts else None

# Run the extraction on every proposal body; filter_words returns None for rows
# where neither a title section nor a sentence was found.
df["filtered_body"] = df["body"].apply(filter_words)
# Export a copy sorted by the extracted text (df itself keeps its row order,
# since sort_values returns a new frame here).
df.sort_values(
    by="filtered_body"
).to_excel('proposals_preprocess_filtered.xlsx', index=False)

# Frame shape, followed by the number of rows with no extracted text.
print(df.shape, df["filtered_body"].isna().sum())

(20918, 38) 9256
