Package

In [18]:
import re
import spacy
from nltk.tokenize import word_tokenize
from num2words import num2words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from textblob import TextBlob
import language_tool_python

Set up dataset

In [19]:
nlp = spacy.load("en_core_web_sm")
tool = language_tool_python.LanguageTool('en-US')

In [20]:
from datasets import load_dataset

ds = load_dataset("jhu-clsp/jfleg")

In [23]:
validation_dataset = ds['validation']
test_dataset = ds['test']

validation_sentences = validation_dataset['sentence']
validation_corrections = validation_dataset['corrections']

test_sentences = test_dataset['sentence']
test_corrections = test_dataset['corrections']

# for v_sentence, v_correction in zip(validation_sentences, validation_corrections):
#     print(f"Sentence: {v_sentence}\nCorrection: {v_correction}\n")

print(validation_dataset)

Dataset({
    features: ['sentence', 'corrections'],
    num_rows: 755
})


In [None]:
train_data = [
    ("Is this a question?", "question"),
    ("This is a statement.", "statement"),
    ("Do this now!", "command"),
    ("How are you?", "question"),
    ("Open the door.", "command")
]

In [24]:
def train_classifier(data):
    texts, labels = zip(*data)
    vectorizer = CountVectorizer()
    classifier = MultinomialNB()
    model = make_pipeline(vectorizer, classifier)
    model.fit(texts, labels)
    return model

In [25]:
classifier_model = train_classifier(validation_dataset)

Stage 1

In [26]:
def tokenize(text):
    return word_tokenize(text)

In [27]:
def normalize(tokens):
    normalized_tokens = []
    for token in tokens:
        token = token.lower()
        token = re.sub(r'\W+', '', token)
        if token.isdigit():
            token = num2words(token)
        if token:
            normalized_tokens.append(token)
    return normalized_tokens

In [29]:
def recognize_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

In [30]:
def replace_entities(text, entities):
    for entity, label in entities:
        if label in ["PERSON", "GPE", "ORG"]:
            placeholder = f'[{label}]'
            text = re.sub(re.escape(entity), placeholder, text)
    return text

Stage 2

In [31]:
def classify_sentence(text):
    return classifier_model.predict([text])[0]

In [32]:
def rule_based_correction(text):
    text_blob = TextBlob(text)
    corrected_text = str(text_blob.correct())
    return corrected_text

In [33]:
def model_based_correction(text):
    matches = tool.check(text)
    corrected_text = language_tool_python.utils.correct(text, matches)
    return corrected_text

Stage 3

In [34]:
def replace_placeholders(text, entities):
    for entity, label in entities:
        placeholder = f'[{label}]'
        text = text.replace(placeholder, entity, 1)
    return text

In [35]:
def format_sentence(text):
    sentences = re.split(r'(?<=[.!?]) +', text)
    formatted_sentences = [sentence.capitalize() for sentence in sentences]
    formatted_text = ' '.join(formatted_sentences)
    return formatted_text

In [36]:
def process_text(text):
    # Pra-pemrosesan
    tokens = tokenize(text)
    normalized_tokens = normalize(tokens)
    normalized_text = ' '.join(normalized_tokens)
    entities = recognize_entities(text)
    text_with_placeholders = replace_entities(normalized_text, entities)

    # Koreksi Tata Bahasa
    sentence_type = classify_sentence(text_with_placeholders)
    rule_based_corrected_text = rule_based_correction(text_with_placeholders)
    model_based_corrected_text = model_based_correction(rule_based_corrected_text)

    # Pasca-pemrosesan
    text_with_entities = replace_placeholders(model_based_corrected_text, entities)
    formatted_text = format_sentence(text_with_entities)

    return formatted_text

Test

In [37]:
text = "John Doe went to New York on July 5th, 2021."
processed_text = process_text(text)
print(processed_text)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\aditt/nltk_data'
    - 'c:\\Users\\aditt\\AppData\\Local\\Programs\\Python\\Python311\\nltk_data'
    - 'c:\\Users\\aditt\\AppData\\Local\\Programs\\Python\\Python311\\share\\nltk_data'
    - 'c:\\Users\\aditt\\AppData\\Local\\Programs\\Python\\Python311\\lib\\nltk_data'
    - 'C:\\Users\\aditt\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************
