Package

In [79]:
import re
import spacy
import nltk
from nltk.tokenize import word_tokenize
from num2words import num2words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from textblob import TextBlob
import language_tool_python

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aditt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [103]:
text = "John Doe goes in New York on July 5th, 2021."

Set up dataset

In [78]:
nlp = spacy.load("en_core_web_sm")
tool = language_tool_python.LanguageTool('en-US')

In [80]:
from datasets import load_dataset

ds = load_dataset("jhu-clsp/jfleg")

In [81]:
validation_dataset = ds['validation']
test_dataset = ds['test']

validation_sentences = validation_dataset['sentence']
validation_corrections = validation_dataset['corrections']

test_sentences = test_dataset['sentence']
test_corrections = test_dataset['corrections']

Train_data = []
for corrections in validation_corrections:
    for sentence in corrections:
        Train_data.append(sentence)
for corrections in test_corrections:
    for sentence in corrections:
        Train_data.append(sentence)

In [82]:
def train_classifier(data):
    labels = ["statement"] * len(data)
    vectorizer = CountVectorizer()
    classifier = MultinomialNB()
    model = make_pipeline(vectorizer, classifier)
    model.fit(data, labels)
    return model

In [83]:
classifier_model = train_classifier(Train_data)

Stage 1

In [84]:
def tokenize(text):
    return word_tokenize(text)

In [108]:
Tokens = tokenize(text)
print(Tokens)

['John', 'Doe', 'goes', 'in', 'New', 'York', 'on', 'July', '5th', ',', '2021', '.']


In [132]:
def normalize_1(tokens):
    normalized_tokens = []
    for token in tokens:
        token = token.lower()
        token = re.sub(r'\W+', '', token)  # Remove non-alphanumeric characters
        if token.isdigit():
            token = num2words(token)
        if token:
            normalized_tokens.append(token)
    return normalized_tokens

In [133]:
normalized_tokens = ' '.join(normalize_1(Tokens))
print(normalized_tokens)

john doe goes in new york on july 5th two thousand and twenty-one


In [120]:
def normalize_2(text):
    normalized_text = []
    for word in text:
        if re.match(r'^\d+(st|nd|rd|th)$', word):
            normalized_text.append(word)
        else:
            normalized_word = str(TextBlob(word).correct())
            normalized_text.append(normalized_word)
    return ' '.join(normalized_text)

In [121]:
normalized_tokens = normalize_2(Tokens)
print(normalized_tokens)

John Toe goes in New Work on July 5th , 2021 .


In [128]:
def recognize_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

In [141]:
entities = recognize_entities(text)
print(entities)

[('John Doe', 'PERSON'), ('New York', 'GPE'), ('July 5th, 2021', 'DATE')]


In [142]:
def replace_entities(text, entities):
    for entity, label in entities:
        if label in ["PERSON", "GPE", "ORG", "DATE"]:
            placeholder = f'[{label}]'
            new_text = re.sub(re.escape(entity), placeholder, text)
    return new_text

In [143]:
text_with_placeholders = replace_entities(normalized_tokens, entities)
print(text_with_placeholders)

john doe goes in new york on july 5th two thousand and twenty-one


Stage 2

In [89]:
def classify_sentence(text):
    return classifier_model.predict([text])[0]

In [90]:
def rule_based_correction(text):
    text_blob = TextBlob(text)
    corrected_text = str(text_blob.correct())
    return corrected_text

In [91]:
def model_based_correction(text):
    matches = tool.check(text)
    corrected_text = language_tool_python.utils.correct(text, matches)
    return corrected_text

Stage 3

In [92]:
def replace_placeholders(text, entities):
    for entity, label in entities:
        placeholder = f'[{label}]'
        text = text.replace(placeholder, entity, 1)
    return text

In [93]:
def format_sentence(text):
    # Capitalize the first letter of each sentence
    sentences = re.split(r'(?<=[.!?]) +', text)
    formatted_sentences = [sentence.capitalize() for sentence in sentences]
    formatted_text = ' '.join(formatted_sentences)
    return formatted_text

In [106]:
def process_text(text):
    # Pra-pemrosesan
    tokens = tokenize(text)
    normalized_tokens = normalize(tokens)
    normalized_text = ' '.join(normalized_tokens)
    entities = recognize_entities(text)
    text_with_placeholders = replace_entities(normalized_text, entities)

    # Koreksi Tata Bahasa
    rule_based_corrected_text = rule_based_correction(text_with_placeholders)
    model_based_corrected_text = model_based_correction(rule_based_corrected_text)

    # Pasca-pemrosesan
    text_with_entities = replace_placeholders(model_based_corrected_text, entities)
    formatted_text = format_sentence(text_with_entities)

    return formatted_text


Test

In [107]:
text = "John Doe goes in New York on July 5th, 2021."
processed_text = process_text(text)
print(processed_text)

J o h n t o e g o e s i n n e w w o r k o n j u l y 5 t h , 2 0 2 1 .
