# A Machine Learning project for Grammar Checking

# Explore Dataset

1) Initial Exploration

In [18]:
import pandas as pd
import numpy as np

In [19]:
df = pd.read_csv('final_preprocessed_15.csv').head(150000)
df.head()

Unnamed: 0,correct,error
0,Good luck on your new start !,Good luck on your new start !
1,My teacher is going to move to change his job .,My teacher is going to move to change his job .
2,He is a so nice guy and taught me English very...,He is a so nice guy and taught me English very...
3,And he took in my favorite subjects like soccer .,And he took in my favorite subject like soccer .
4,"Actually , he was the one who let me know abou...","Actually , who let me know about Lang - 8 was ..."


In [20]:
df.describe()

Unnamed: 0,correct,error
count,150000,150000
unique,140919,141439
top,URL,URL
freq,390,390


In [3]:
print(f'Shape: {df.shape}')
print(f'Data Types:\n{df.dtypes}')
print(f'Head:\n{df.head()}')
print(f'Tail:\n{df.tail()}')

Shape: (150000, 2)
Data Types:
correct    object
error      object
dtype: object
Head:
                                             correct  \
0                      Good luck on your new start !   
1    My teacher is going to move to change his job .   
2  He is a so nice guy and taught me English very...   
3  And he took in my favorite subjects like soccer .   
4  Actually , he was the one who let me know abou...   

                                               error  
0                      Good luck on your new start !  
1    My teacher is going to move to change his job .  
2  He is a so nice guy and taught me English very...  
3   And he took in my favorite subject like soccer .  
4  Actually , who let me know about Lang - 8 was ...  
Tail:
                                                  correct  \
149995          I live in Kagoshima prefecture in Japan .   
149996  Unfortunately it 's raining in Kagoshima right...   
149997  My goal is to become to a math and Information...

2. Basic Statistics

In [4]:
print(f'Unique Values in column1: {df.correct.nunique()}')
print(f'Unique Values in column2: {df.error.nunique()}')
print(f'Most Common Values in column1:\n{df.correct.value_counts().head()}')
print(f'Most Common Values in column2:\n{df.error.value_counts().head()}')

Unique Values in column1: 140919
Unique Values in column2: 141439
Most Common Values in column1:
correct
URL            390
Hello !        182
Thank you .    159
Hello .        130
S .            122
Name: count, dtype: int64
Most Common Values in column2:
error
URL            390
Hello !        174
Thank you .    153
Hello .        129
S .            122
Name: count, dtype: int64


3. Missing Data

In [5]:
print(f'Null Values:\n{df.isnull().sum()}')
print(f'Percentage of Missing Data:\n{df.isnull().mean() * 100}')

Null Values:
correct    0
error      0
dtype: int64
Percentage of Missing Data:
correct    0.0
error      0.0
dtype: float64


# Train

In [24]:
!nvidia-smi

Tue Feb 13 16:25:45 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      2MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   37C    P8    10W /  70W |      2MiB / 15360MiB |      0%      Default |
|       

In [23]:
df.rename(columns={'correct': 'output', 'error': 'input'}, inplace=True)

## Importing libraries

In [45]:
import pandas as pd
from tqdm import tqdm
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from datasets import Dataset
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_metric
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /home/kimili/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Set seed

In [26]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
set_seed(42)

## The dataset

In [27]:
from transformers import (
    T5ForConditionalGeneration, T5Tokenizer,
    Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
  )

from torch.utils.data import Dataset, DataLoader

# Reasons of using t5 model

Unified Text-to-Text Approach: T5 treats every NLP problem as a "text-to-text" problem, where both the input and output are sequences of text. This approach is inherently suitable for GEC, where the input is a text with potential grammatical errors, and the output is a corrected version of the text. This unified framework simplifies the model architecture and training process for GEC tasks.

Pretraining on Diverse Corpus: T5 is pretrained on a large, diverse corpus of text from the web, which includes a wide range of grammatical structures and vocabularies. This extensive pretraining helps the model develop a robust understanding of language, making it more effective at identifying and correcting grammatical errors in various contexts and styles.

Fine-tuning Capability: T5 can be fine-tuned on a specific task with a relatively small dataset to achieve high performance. For GEC, this means the model can be tailored to recognize and correct a wide array of grammatical errors, including those that are relatively rare or specific to certain types of text (e.g., academic writing, technical reports), by training on a targeted dataset of grammatical errors and corrections.

Strong Generalization Abilities: Due to its extensive pretraining and ability to learn from context, T5 demonstrates strong generalization abilities. It can effectively correct errors in unseen texts, even those that significantly differ from the texts in its training dataset. This makes T5 highly adaptable to various domains and languages (when provided with appropriate training data), enhancing its utility for GEC across diverse applications.

In [29]:
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [30]:
def calc_token_len(example):
    return len(tokenizer(example).input_ids)

In [32]:
train_df, test_df = train_test_split(df, test_size=0.10, shuffle=True)
train_df.shape, test_df.shape

((135000, 4), (15000, 4))

In [33]:
test_df['input_token_len'] = test_df['input'].apply(calc_token_len)

test_df.head()

Unnamed: 0,output,input,column1_length,column2_length,input_token_len
59770,"Tea is black tea without milk for Japanese , w...","Tea is black tea without milk for Japanese , w...",104,104,23
21362,My friends get them and use the fireworks .,My friends get them and use fireworks .,43,39,10
127324,it is great !,it is great !,13,13,6
140509,Though I know English does n't improve quickly...,"Though I know English does n't improve soon , ...",64,61,20
144297,I 've been gaining weight so fast since I star...,"I 'm getting weight so fat , after I started t...",66,60,20


In [34]:
test_df['input_token_len'].describe()

count    15000.000000
mean        16.054133
std         10.225271
min          2.000000
25%          9.000000
50%         14.000000
75%         20.000000
max        211.000000
Name: input_token_len, dtype: float64

In [39]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [42]:
from torch.utils.data import Dataset, DataLoader
class GrammarDataset(Dataset):
    def __init__(self, dataset, tokenizer,print_text=False):
        self.dataset = dataset
        self.pad_to_max_length = False
        self.tokenizer = tokenizer
        self.print_text = print_text
        self.max_len = 64

    def __len__(self):
        return len(self.dataset)


    def tokenize_data(self, example):
        input_, target_ = example['input'], example['output']

        tokenized_inputs = tokenizer(input_, pad_to_max_length=self.pad_to_max_length,
                                            max_length=self.max_len,
                                            return_attention_mask=True)

        tokenized_targets = tokenizer(target_, pad_to_max_length=self.pad_to_max_length,
                                            max_length=self.max_len,
                                            return_attention_mask=True)

        inputs={"input_ids": tokenized_inputs['input_ids'],
            "attention_mask": tokenized_inputs['attention_mask'],
            "labels": tokenized_targets['input_ids']
        }

        return inputs


    def __getitem__(self, index):
        inputs = self.tokenize_data(self.dataset[index])

        if self.print_text:
            for k in inputs.keys():
                print(k, len(inputs[k]))

        return inputs

In [43]:
dataset = GrammarDataset(test_dataset, tokenizer, True)
print(dataset[121])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


input_ids 20
attention_mask 20
labels 21
{'input_ids': [264, 3, 6, 27, 2124, 572, 27, 228, 3, 29, 31, 17, 129, 125, 3, 88, 243, 3, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [264, 3, 6, 27, 2124, 572, 27, 228, 3, 29, 31, 17, 734, 125, 3, 88, 47, 2145, 3, 5, 1]}


## Evaluator

In [46]:
rouge_metric = load_metric("rouge")

  rouge_metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

## Train Model

In [47]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding='longest', return_tensors='pt')

In [None]:
!pip show accelerate

In [54]:
batch_size = 16
args = Seq2SeqTrainingArguments(output_dir="logs/",
                        evaluation_strategy="steps",
                        per_device_train_batch_size=batch_size,
                        per_device_eval_batch_size=batch_size,
                        learning_rate=2e-5,
                        num_train_epochs=1,
                        weight_decay=0.01,
                        save_total_limit=2,
                        predict_with_generate=True,
                        fp16 = True,
                        gradient_accumulation_steps = 6,
                        eval_steps = 250,
                        save_steps = 250,
                        load_best_model_at_end=True,
                        logging_dir="/logs",
                        report_to="wandb")

In [55]:
import nltk
nltk.download('punkt')
from nltk.translate.gleu_score import sentence_gleu
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [nltk.sent_tokenize(pred.strip()) for pred in decoded_preds]
    decoded_labels = [nltk.sent_tokenize(label.strip()) for label in decoded_labels]

    gleu_scores = [sentence_gleu([ref], pred) for pred, ref in zip(decoded_preds, decoded_labels)]
    result = {"gleu": np.mean(gleu_scores) * 100}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package punkt to /home/kimili/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [57]:
trainer = Seq2SeqTrainer(model=model,
                args=args,
                train_dataset= GrammarDataset(train_dataset, tokenizer),
                eval_dataset=GrammarDataset(test_dataset, tokenizer),
                tokenizer=tokenizer,
                data_collator=data_collator,
                compute_metrics=compute_metrics)

In [58]:
trainer.train()

2024-02-13 16:44:28,058 - wandb.jupyter - ERROR - Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mabhijitpal1247[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Gleu,Gen Len
250,No log,0.409985,41.3354,13.4988




TrainOutput(global_step=351, training_loss=0.5140903831547142, metrics={'train_runtime': 2168.9205, 'train_samples_per_second': 62.243, 'train_steps_per_second': 0.162, 'total_flos': 8049662115840000.0, 'train_loss': 0.5140903831547142, 'epoch': 1.0})

In [60]:
trainer.save_model('t5_gec_model_13_02_2024')

In [68]:
import shutil
shutil.make_archive('t5_gec_model', 'zip', 't5_gec_model_13_02_2024')

'/home/kimili/t5_gec_model.zip'

# Test

In [None]:
import zipfile
import os

zip_file_path = 't5_gec_model_13_02_2024'
extract_folder_path = ''

os.makedirs(extract_folder_path, exist_ok=True)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder_path)

print("Unzipping completed.")


In [62]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_path = 't5_gec_model_13_02_2024'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [63]:
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).to(torch_device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [64]:
def correct_grammar(input_text,num_return_sequences):
    batch = tokenizer([input_text],truncation=True,padding='max_length',max_length=64, return_tensors="pt").to(torch_device)
    translated = model.generate(**batch,max_length=64,num_beams=4, num_return_sequences=num_return_sequences, temperature=1.5)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text

In [65]:
text = 'I went to school at Sunday.'
print(correct_grammar(text, num_return_sequences=1))

['I went to school on Sunday.']




In [67]:
text = 'I went to school at Sunday.'
print(correct_grammar(text, num_return_sequences=1))

['I went to school on Sunday.']
