Package

In [3]:
import re
import spacy
from nltk.tokenize import word_tokenize
from num2words import num2words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from textblob import TextBlob
import language_tool_python

Set up dataset

In [5]:
# Load the "jhu-clsp/jfleg" dataset with `datasets.load_dataset`. The captured
# output below reports a validation split (755 examples) and a test split
# (748 examples).
# NOTE(review): `ds` is not referenced by any later cell visible in this notebook.
from datasets import load_dataset

ds = load_dataset("jhu-clsp/jfleg")

Downloading readme:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/141k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/755 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/748 [00:00<?, ? examples/s]

In [None]:
# Module-level resources shared by the helper functions defined below.
# spaCy pipeline used by recognize_entities().
nlp = spacy.load("en_core_web_sm")
# LanguageTool checker (en-US) used by model_based_correction().
tool = language_tool_python.LanguageTool('en-US')

In [None]:
# Hand-written (sentence, label) pairs used to fit the sentence-type classifier.
# Labels present: "question", "statement", "command".
train_data = [
    ("Is this a question?", "question"),
    ("This is a statement.", "statement"),
    ("Do this now!", "command"),
    ("How are you?", "question"),
    ("Open the door.", "command")
]

In [None]:
def train_classifier(data):
    """Fit a bag-of-words Naive Bayes pipeline on labelled sentences.

    Args:
        data: Iterable of ``(text, label)`` pairs.

    Returns:
        The fitted pipeline (``CountVectorizer`` followed by ``MultinomialNB``).
    """
    sentences, targets = zip(*data)
    pipeline = make_pipeline(CountVectorizer(), MultinomialNB())
    pipeline.fit(sentences, targets)
    return pipeline

In [None]:
# Fitted sentence-type model; read as a global by classify_sentence().
classifier_model = train_classifier(train_data)

Stage 1: Pre-processing

In [None]:
def tokenize(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

In [None]:
def normalize(tokens):
    """Normalize a list of tokens for downstream processing.

    Each token is lowercased and stripped of non-word characters (regex
    ``\\W+``). Tokens that are left empty are dropped. Tokens made up solely
    of decimal digits are spelled out with ``num2words``.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list of normalized, non-empty token strings.
    """
    normalized_tokens = []
    for token in tokens:
        token = re.sub(r'\W+', '', token.lower())
        if not token:
            continue
        # isdecimal() rather than isdigit(): isdigit() is also true for
        # characters such as superscripts ("²"), which survive the \W strip
        # (they count as word characters) but cannot be parsed as a number.
        if token.isdecimal():
            # Pass an int so the call does not depend on num2words' own
            # string parsing.
            token = num2words(int(token))
        normalized_tokens.append(token)
    return normalized_tokens

In [None]:
def recognize_entities(text):
    """Run the module-level spaCy pipeline ``nlp`` over ``text``.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per item in
        ``doc.ents`` and in the same order.
    """
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_))
    return found

In [None]:
def replace_entities(text, entities):
    """Mask PERSON / GPE / ORG entities in ``text`` with ``[LABEL]`` placeholders.

    Args:
        text: The string to mask.
        entities: Iterable of ``(entity_text, label)`` pairs.

    Returns:
        ``text`` with every whole-word occurrence of each maskable entity
        replaced by ``[PERSON]``, ``[GPE]`` or ``[ORG]``. Entities with any
        other label, and empty entity strings, are ignored.
    """
    maskable_labels = ("PERSON", "GPE", "ORG")
    for entity, label in entities:
        # An empty pattern would match between every character and flood the
        # text with placeholders, so skip it.
        if label not in maskable_labels or not entity:
            continue
        placeholder = f'[{label}]'
        # Lookarounds keep the match from starting or ending inside a longer
        # word (e.g. "Ann" must not be masked inside "Annabel").
        pattern = r'(?<!\w)' + re.escape(entity) + r'(?!\w)'
        # A callable replacement inserts the placeholder literally, with no
        # backslash/group-reference interpretation.
        text = re.sub(pattern, lambda _match: placeholder, text)
    return text

Stage 2: Grammar correction

In [None]:
def classify_sentence(text):
    """Predict a label for ``text`` with the module-level ``classifier_model``.

    The model is given a single-item batch and the one resulting prediction
    is returned.
    """
    predictions = classifier_model.predict([text])
    return predictions[0]

In [None]:
def rule_based_correction(text):
    """Return ``text`` after TextBlob's ``correct()`` pass, as a plain ``str``."""
    return str(TextBlob(text).correct())

In [None]:
def model_based_correction(text):
    """Apply LanguageTool's suggestions to ``text``.

    The module-level ``tool`` checks the text, and the resulting matches are
    applied with ``language_tool_python.utils.correct``.
    """
    issues = tool.check(text)
    return language_tool_python.utils.correct(text, issues)

Stage 3: Post-processing

In [None]:
def replace_placeholders(text, entities):
    """Put entity texts back in place of ``[LABEL]`` placeholders.

    Entities are processed in the given order; each one replaces only the
    first remaining occurrence of its ``[label]`` placeholder. Entities whose
    placeholder is not present leave the text unchanged.

    Args:
        text: String that may contain ``[LABEL]`` placeholders.
        entities: Iterable of ``(entity_text, label)`` pairs.

    Returns:
        The text with placeholders substituted.
    """
    restored = text
    for surface, tag in entities:
        marker = '[{}]'.format(tag)
        restored = restored.replace(marker, surface, 1)
    return restored

In [None]:
def format_sentence(text):
    """Upper-case the first character of every sentence in ``text``.

    Sentences are split on one or more spaces that follow ``.``, ``!`` or
    ``?`` and are re-joined with a single space.

    Args:
        text: The string to format.

    Returns:
        The formatted string. All characters other than the first one of each
        sentence are left untouched.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)
    # str.capitalize() would also lowercase the rest of the sentence and undo
    # the casing of proper nouns ("New York" -> "new york"), so only the
    # first character is changed.
    formatted_sentences = [sentence[:1].upper() + sentence[1:] for sentence in sentences]
    return ' '.join(formatted_sentences)

In [None]:
def process_text(text):
    """Run the full pipeline on ``text`` and return the corrected string.

    Steps:
        1. Pre-processing: tokenize, normalize, detect entities on the raw
           text, and mask them in the normalized text.
        2. Grammar correction: classify the sentence, then apply
           ``rule_based_correction`` followed by ``model_based_correction``.
        3. Post-processing: restore the masked entities and format sentences.

    Args:
        text: Raw input string.

    Returns:
        The processed string.
    """
    # Pre-processing
    tokens = tokenize(text)
    normalized_tokens = normalize(tokens)
    normalized_text = ' '.join(normalized_tokens)
    entities = recognize_entities(text)

    # Entities are detected on the raw text, but masking is applied to the
    # normalized (lowercased, punctuation-stripped) text. Normalize every
    # entity the same way, otherwise e.g. "John Doe" never matches "john doe".
    masking_entities = []   # (normalized entity text, label) used for masking
    restorable_entities = []  # (original entity text, label) used for restoring
    for entity, label in entities:
        normalized_entity = ' '.join(normalize(tokenize(entity)))
        # Skip entities that normalize to nothing; keeping them would misalign
        # the positional placeholder restore below.
        if normalized_entity:
            masking_entities.append((normalized_entity, label))
            restorable_entities.append((entity, label))
    text_with_placeholders = replace_entities(normalized_text, masking_entities)

    # Grammar correction
    # NOTE(review): the predicted sentence type is not used by any later step.
    sentence_type = classify_sentence(text_with_placeholders)
    # NOTE(review): assumes both correctors leave "[LABEL]" placeholders
    # intact - confirm against TextBlob / LanguageTool behaviour.
    rule_based_corrected_text = rule_based_correction(text_with_placeholders)
    model_based_corrected_text = model_based_correction(rule_based_corrected_text)

    # Post-processing
    text_with_entities = replace_placeholders(model_based_corrected_text, restorable_entities)
    formatted_text = format_sentence(text_with_entities)

    return formatted_text

Test

In [None]:
# Smoke test: run the whole pipeline on one example sentence and show the result.
text = "John Doe went to New York on July 5th, 2021."
processed_text = process_text(text)
print(processed_text)