## ECS 289G - BERT + Rule-Based Checking (LanguageTool) for Grammatical Error Correction

In [None]:
!pip install transformers

!pip install language-tool-python


Collecting language-tool-python
  Downloading language_tool_python-2.7.1-py3-none-any.whl (34 kB)
Installing collected packages: language-tool-python
Successfully installed language-tool-python-2.7.1


In [None]:
!pip install transformers
!pip install nltk



In [None]:
!pip install language-tool-python



In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [None]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.3 rapidfuzz-3.5.2


In [None]:
import warnings
# Install a blanket "ignore" filter so warnings raised through Python's
# `warnings` module do not clutter cell output. This hides every category,
# including deprecation notices, so remove it when debugging.
warnings.filterwarnings('ignore')

In [None]:
from transformers import pipeline
import language_tool_python
from transformers import pipeline
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.gleu_score import sentence_gleu
from jiwer import wer
import pandas as pd
from concurrent.futures import ProcessPoolExecutor

In [None]:
# Heavy resources are created lazily and cached for the lifetime of the
# process, so repeated calls do not start a new LanguageTool instance or
# reload the BERT weights each time.
_GEC_RESOURCES = {}


def _get_language_tool():
    """Return the cached en-US LanguageTool checker, creating it on first use."""
    if "tool" not in _GEC_RESOURCES:
        _GEC_RESOURCES["tool"] = language_tool_python.LanguageTool('en-US')
    return _GEC_RESOURCES["tool"]


def _get_mask_filler():
    """Return the cached bert-base-uncased fill-mask pipeline, creating it on first use."""
    if "corrector" not in _GEC_RESOURCES:
        _GEC_RESOURCES["corrector"] = pipeline(task="fill-mask", model="bert-base-uncased")
    return _GEC_RESOURCES["corrector"]


def correct_grammar_with_transformer(text, tool=None, corrector=None):
    """Replace every span LanguageTool flags with a masked-LM prediction.

    Each flagged span is swapped for ``[MASK]`` and the fill-mask model is
    asked to fill it in; the first candidate it returns replaces the span.

    Args:
        text: The sentence (or passage) to correct.
        tool: Optional checker exposing ``check(text)`` that returns matches
            with ``offset`` and ``errorLength`` attributes. Defaults to a
            cached ``LanguageTool('en-US')``.
        corrector: Optional callable taking a string containing one
            ``[MASK]`` and returning a list of dicts with a ``'token_str'``
            key. Defaults to a cached ``bert-base-uncased`` fill-mask pipeline.

    Returns:
        The text with each non-overlapping flagged span replaced. If nothing
        is flagged the input is returned unchanged and the model is not loaded.
    """
    if tool is None:
        tool = _get_language_tool()

    matches = tool.check(text)
    if not matches:
        return text

    if corrector is None:
        corrector = _get_mask_filler()

    corrected_text = text
    # Start index of the most recent replacement. Everything to the left of it
    # is still identical to the original text, so original offsets stay valid
    # there; anything reaching past it refers to text that has been rewritten.
    edited_from = len(text)

    # Work right-to-left so a replacement never shifts offsets still to come.
    for match in sorted(matches, key=lambda m: m.offset, reverse=True):
        start = match.offset
        end = start + match.errorLength
        if end > edited_from:
            # Overlaps a span that was already rewritten; its offsets are stale.
            continue

        # Build the context from the running corrected text so the prediction
        # is conditioned on the corrections already applied to its right.
        masked = f"{corrected_text[:start]}[MASK]{corrected_text[end:]}"
        correction = corrector(masked)[0]['token_str']
        corrected_text = corrected_text[:start] + correction + corrected_text[end:]
        edited_from = start

    return corrected_text

In [None]:
# Smoke test: run the corrector on one deliberately ungrammatical sentence
# and show the before/after side by side.
input_text = "I is a male"
corrected_text = correct_grammar_with_transformer(input_text)

print(
    "Original Text:",
    input_text,
    "\nCorrected Text:",
    corrected_text,
    sep="\n",
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Original Text:
I is a male

Corrected Text:
I am a male


In [None]:
def calculate_metrics(args):
    """Correct one sentence and score the result against its reference.

    Takes a single tuple so it can be passed directly to ``Executor.map``.

    Args:
        args: ``(original_text, reference_text)`` - the sentence to correct
            and the reference correction to score against.

    Returns:
        ``(bleu_score, gleu_score, wer_score)`` for the corrected sentence.
    """
    # Imported here so this cell does not depend on edits to the import cell.
    from nltk.translate.bleu_score import SmoothingFunction

    original_text, reference_text = args
    corrected_text = correct_grammar_with_transformer(original_text)

    # Whitespace tokenisation, shared by BLEU and GLEU.
    reference_tokens = reference_text.split()
    hypothesis_tokens = corrected_text.split()

    # Sentence-level BLEU without smoothing collapses to 0 as soon as one
    # n-gram order has no overlap (common for short sentences), which drags
    # the corpus average down. Smoothing keeps those sentences comparable.
    bleu_score = sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

    gleu_score = sentence_gleu([reference_tokens], hypothesis_tokens)

    # jiwer expects raw strings: reference first, hypothesis second.
    wer_score = wer(reference_text, corrected_text)

    return bleu_score, gleu_score, wer_score


In [None]:
def evaluate_on_jfleg(model, dataset_path, num_rows=50):
    # Load the CSV dataset
    df = pd.read_csv(dataset_path, nrows=num_rows)

    total_bleu, total_gleu, total_wer = 0, 0, 0

    # Create a list of argument tuples for parallel processing
    args_list = [(row["input"], row["target"]) for _, row in df.iterrows()]

    with ProcessPoolExecutor() as executor:
        results = list(executor.map(calculate_metrics, args_list))

    for result in results:
        total_bleu += result[0]
        total_gleu += result[1]
        total_wer += result[2]

    num_examples = len(df)

    average_bleu = total_bleu / num_examples
    average_gleu = total_gleu / num_examples
    average_wer = total_wer / num_examples

In [None]:
model = "bert-base-uncased"
dataset_path = "eval.csv"
evaluate_on_jfleg(model, dataset_path, num_rows=50)
print("Average BLEU Score:", average_bleu)
print("Average GLEU Score:", average_gleu)
print("Average WER Score:", average_wer)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU Score: 0.5020289455700866
Average GLEU Score: 0.5504235510903287
Average WER Score: 0.3094365738661962
