Dipendensi

In [1]:
import re
import spacy
import nltk
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric
from torch.nn.functional import softmax
import numpy as np
from nltk.tokenize import word_tokenize
from num2words import num2words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from textblob import TextBlob
from datasets import load_dataset

Error Text 

In [2]:
text = "John Doe goes in New York on July 5th, 2021."

Load en_core_web_sm, tokenizer, pre-trained BERT model, dataset, and nltk 

In [3]:
nlp = spacy.load("en_core_web_sm")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
ds = load_dataset("jhu-clsp/jfleg")
nltk.download('punkt')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aditt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Set data for Classifier

In [4]:
validation_dataset = ds['validation']
test_dataset = ds['test']

validation_corrections = validation_dataset['corrections']
test_corrections = test_dataset['corrections']

data_classifier = []
for corrections in validation_corrections:
    for sentence in corrections:
        data_classifier.append(sentence)
for corrections in test_corrections:
    for sentence in corrections:
        data_classifier.append(sentence)

Train Classifier Model

In [5]:
def train_classifier(data):
    labels = ["statement"] * len(data)
    vectorizer = CountVectorizer()
    classifier = MultinomialNB()
    model = make_pipeline(vectorizer, classifier)
    model.fit(data, labels)
    return model

In [6]:
classifier_model = train_classifier(data_classifier)

Training Model

In [7]:
def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding="max_length", truncation=True)

tokenized_dataset = ds.map(tokenize_function, batched=True)
print(tokenized_dataset)

Map:   0%|          | 0/755 [00:00<?, ? examples/s]

Map:   0%|          | 0/748 [00:00<?, ? examples/s]

DatasetDict({
    validation: Dataset({
        features: ['sentence', 'corrections', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 755
    })
    test: Dataset({
        features: ['sentence', 'corrections', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 748
    })
})


In [11]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
metric = load_metric("accuracy")

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['validation'],
    eval_dataset=tokenized_dataset['test'],
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Stage 1

In [84]:
def tokenize(text):
    return word_tokenize(text)

In [108]:
Tokens = tokenize(text)
print(Tokens)

['John', 'Doe', 'goes', 'in', 'New', 'York', 'on', 'July', '5th', ',', '2021', '.']


In [191]:
def normalize(tokens):
    normalized_tokens = []
    for token in tokens:
        token = token.lower()
        token = re.sub(r'\W+', '', token)  # Remove non-alphanumeric characters
        if token.isdigit():
            token = num2words(token)
        if token:
            normalized_tokens.append(token)
    return normalized_tokens

In [192]:
normalized_tokens = ' '.join(normalize(Tokens))
print(normalized_tokens)

john doe goes in new york on july 5th two thousand and twenty-one


In [193]:
def recognize_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

In [194]:
entities = recognize_entities(normalized_tokens)
print(entities)

[('john', 'PERSON'), ('new york', 'GPE'), ('july 5th two thousand and twenty-one', 'DATE')]


In [195]:
def replace_entities(text, entities):
    for entity, label in entities:
        if label in ["PERSON", "GPE", "ORG", "DATE"]:
            placeholder = f'[{label}]'
            text = text.replace(entity.lower(), placeholder)
    return text

In [196]:
text_with_placeholders = replace_entities(normalized_tokens, entities)
print(text_with_placeholders)

[PERSON] doe goes in [GPE] on [DATE]


Stage 2

In [89]:
def classify_sentence(text):
    return classifier_model.predict([text])[0]

In [178]:
classify = classify_sentence(text)
print(classify)

statement


In [90]:
def rule_based_correction(text):
    text_blob = TextBlob(text)
    corrected_text = str(text_blob.correct())
    return corrected_text

In [179]:
rule = rule_based_correction(text_with_placeholders)
print(rule)

[PERSON] doe goes in [GPE] on [DATE]


In [91]:
def model_based_correction(dataset):
    corrected_sentences = []
    
    for data in dataset:
        incorrect_sentence = data['sentence']
        correct_sentences = data['corrections']
        
        # Tokenize and encode input sentences
        inputs = tokenizer([incorrect_sentence] + correct_sentences, return_tensors='pt', padding=True, truncation=True)
        
        # Perform inference with the BERT model
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Compute probabilities using softmax
        probs = softmax(outputs.logits, dim=-1)
        
        # Find the correct sentence with the highest probability
        correct_index = torch.argmax(probs[1:, 1]).item()  # Adjust index to exclude the incorrect_sentence
        
        # Append the corrected sentence
        corrected_sentences.append(correct_sentences[correct_index])
    
    return corrected_sentences

In [180]:
model = model_based_correction(rule)
print(model)

[PERSON] doe goes in [GPE] on [DATE]


Stage 3

In [92]:
def replace_placeholders(text, entities):
    for entity, label in entities:
        placeholder = f'[{label}]'
        text = text.replace(placeholder, entity, 1)
    return text

In [93]:
def format_sentence(text):
    # Capitalize the first letter of each sentence
    sentences = re.split(r'(?<=[.!?]) +', text)
    formatted_sentences = [sentence.capitalize() for sentence in sentences]
    formatted_text = ' '.join(formatted_sentences)
    return formatted_text

In [106]:
def process_text(text):
    # Pra-pemrosesan
    tokens = tokenize(text)
    normalized_tokens = normalize(tokens)
    normalized_text = ' '.join(normalized_tokens)
    entities = recognize_entities(text)
    text_with_placeholders = replace_entities(normalized_text, entities)

    # Koreksi Tata Bahasa
    rule_based_corrected_text = rule_based_correction(text_with_placeholders)
    model_based_corrected_text = model_based_correction(rule_based_corrected_text)

    # Pasca-pemrosesan
    text_with_entities = replace_placeholders(model_based_corrected_text, entities)
    formatted_text = format_sentence(text_with_entities)

    return formatted_text


Test

In [107]:
text = "John Doe goes in New York on July 5th, 2021."
processed_text = process_text(text)
print(processed_text)

J o h n t o e g o e s i n n e w w o r k o n j u l y 5 t h , 2 0 2 1 .
