In [1]:
from datasets import load_dataset
from source.annotate import save
from source.text_processing import TextProcessing
from tqdm import tqdm



In [2]:
def main_loop(df, split='train'):
    """Tokenize, stem and lemmatize every row of ``df`` and save the result.

    Each ``itertuples()`` row is unpacked as ``(doc_id, item, label)``, so the
    frame must yield exactly three values per row (the index plus two
    columns). ``item`` is handed to ``TextProcessing``; presumably it is the
    document text -- confirm against the dataset schema.

    For every token list found in ``tokens_all`` after ``tokenize()``, the
    tokens are paired position-by-position with their stems and lemmas and
    passed to ``save`` as a list of ``[token, stem, lemma]`` triples.

    Args:
        df: DataFrame iterated row by row with ``itertuples()``.
        split: Forwarded unchanged to ``save`` as the ``split`` keyword.
    """
    rows = tqdm(df.itertuples(), total=df.shape[0], desc="Processing documents", unit="doc", ncols=80)
    for doc_id, item, label in rows:
        processor = TextProcessing(item)
        processor.tokenize()

        # One save() call per entry of tokens_all, all under the same doc_id.
        for sentence_tokens in processor.tokens_all:
            sentence_lemmas = processor.lemmatize(sentence_tokens)
            sentence_stems = processor.stemme(sentence_tokens)

            # zip stops at the shortest of the three sequences.
            pack = [
                [token, stem, lemma]
                for token, stem, lemma in zip(sentence_tokens, sentence_stems, sentence_lemmas)
            ]
            save(doc_id, pack, label, split=split)


## Processing Train Split

In [3]:
# Load the AG News train split (reusing an existing download if present),
# convert it to a pandas DataFrame and run the processing loop over it.
train_dataset = load_dataset(
    "wangrongsheng/ag_news",
    split="train",
    download_mode="reuse_dataset_if_exists",
)
df = train_dataset.to_pandas()
main_loop(df, split='train')


Processing documents: 100%|███████████| 120000/120000 [17:05<00:00, 117.06doc/s]


## Processing Test Split

In [4]:
# Load the AG News test split (reusing an existing download if present),
# convert it to a pandas DataFrame and run the processing loop over it.
test_dataset = load_dataset(
    "wangrongsheng/ag_news",
    split="test",
    download_mode="reuse_dataset_if_exists",
)
df = test_dataset.to_pandas()
main_loop(df, split='test')

Processing documents: 100%|███████████████| 7600/7600 [01:08<00:00, 110.74doc/s]


## Homonymy Sentences

In [5]:
# Homonymy check: consecutive sentences reuse the same surface form with a
# different meaning or part of speech (e.g. "leaves" as a verb, then a noun),
# so the printed lemmas show how the lemmatizer handles each case.
sentences = [
    "The wind leaves the trees bare.",
    "The leaves fell gently to the ground.",
    "She will bear a child next month.",
    "A bear wandered through the forest.",
    "He saw the man running away.",
    "The carpenter picked up his saw.",
    "The rock fell from the cliff.",
    "They rocked the baby to sleep.",
    "She hurt her arm while playing.",
    "They tried to arm the guards quickly.",
    "The light in the room is bright.",
    "This suitcase is really light."
]

for sentence in sentences:
    procesor = TextProcessing(sentence)
    procesor.tokenize()

    tokens_all = procesor.tokens_all

    # Print inside the inner loop so every token list gets its own output
    # line. Printing after the loop would show only the last token list and,
    # when tokens_all is empty, would reuse `lemmas` from a previous sentence
    # (or raise NameError on the first one).
    for tokens in tokens_all:
        lemmas = procesor.lemmatize(tokens)
        print(lemmas)

['The', 'wind', 'leave', 'the', 'tree', 'bare']
['The', 'leaf', 'fell', 'gently', 'to', 'the', 'ground']
['She', 'will', 'bear', 'a', 'child', 'next', 'month']
['A', 'bear', 'wander', 'through', 'the', 'forest']
['He', 'saw', 'the', 'man', 'run', 'away']
['The', 'carpenter', 'pick', 'up', 'his', 'saw']
['The', 'rock', 'fell', 'from', 'the', 'cliff']
['They', 'rock', 'the', 'baby', 'to', 'sleep']
['She', 'hurt', 'her', 'arm', 'while', 'play']
['They', 'try', 'to', 'arm', 'the', 'guard', 'quickly']
['The', 'light', 'in', 'the', 'room', 'is', 'bright']
['This', 'suitcase', 'is', 'really', 'light']


## Patterns in Text

In [6]:
# Sample text mixing patterns that a naive tokenizer would split incorrectly:
# an honorific with a period, a phone number, an e-mail address, a URL and
# dotted abbreviations.
example = "You can reach Dr. Smith at +1 (456) 555-1234 or using the email address: abc@abc.com. Visit us at https://www.example.com! Washington D.C. U.S.A."

procesor = TextProcessing(example)
procesor.tokenize()

# Read the tokens from the `tokens_all` attribute, as the other cells do,
# rather than binding whatever tokenize() happens to return.
tokens_all = procesor.tokens_all

print("Tokens:", tokens_all)

Tokens: [['You', 'can', 'reach', 'Dr. Smith', 'at', '+1 (456) 555-1234', 'or', 'using', 'the', 'email', 'address', ':', 'abc@abc.com'], ['Visit', 'us', 'at', 'https://www.example.com!', 'Washington', 'D.C.', 'U.S.A.']]
