In [None]:
import pandas as pd
import os
import spacy
from collections import defaultdict
from datetime import datetime
from os.path import exists
from os import mkdir
from multiprocessing import Pool

## Preprocessing

In [None]:
import re
def clean_up(text):
    """Tidy dash, punctuation spacing and whitespace in ``text``.

    The literal substitutions below are applied one after another, in the
    listed order; afterwards every run of spaces is collapsed to one space.
    Newline characters are deleted outright (not turned into spaces).
    """
    literal_swaps = (
        ("–", "-"),   # en dash -> plain hyphen
        (" ,", ","),  # drop the space in front of a comma
        (" .", "."),  # ... of a full stop
        (" ;", ";"),  # ... of a semicolon
        ("\n", ""),   # remove line breaks
    )
    for old, new in literal_swaps:
        text = text.replace(old, new)
    return re.sub(r" +", " ", text)

def normalize_string(text):
    """Reduce ``text`` to ASCII letters plus ``.``, ``!`` and ``?``.

    A space is first inserted in front of each ``.``, ``!`` or ``?``; every
    run of characters that are neither ASCII letters nor one of those three
    marks is then replaced by a single space; the result is finally passed
    through ``clean_up``.
    """
    spaced = re.sub(r"([.!?])", r" \1", text)
    letters_only = re.sub(r"[^a-zA-Z.!?]+", r" ", spaced)
    return clean_up(letters_only)

In [None]:
# Training CSV to load. Keep exactly one `path = ...` line active; the
# commented alternatives point at the other prepared datasets.
path = 'datasets/final_training_data/bart_score_dataset.csv'      # BART dataset
#path = 'datasets/final_training_data/bert_score_dataset.csv'     # BERT dataset
#path = 'datasets/final_training_data/dae_score_dataset.csv'      # DAE dataset
#path = 'datasets/final_training_data/merged_dataset.csv'         # Merged dataset
#path = 'datasets/final_training_data/baseline_dataset.csv'       # Baseline dataset

# Read the CSV, drop the id column, normalize both text columns, and rename
# them to the input_text / target_text names used by the rest of the notebook.
df = (
    pd.read_csv(path)
    .drop(columns=['sentence_id'])
    .assign(
        source_sentence=lambda frame: frame['source_sentence'].apply(normalize_string),
        simplified_sentence=lambda frame: frame['simplified_sentence'].apply(normalize_string),
    )
    .rename(columns={'source_sentence': 'input_text', 'simplified_sentence': 'target_text'})
)

In [None]:
# Display the normalized training frame (columns: input_text, target_text).
df

Unnamed: 0,input_text,target_text
0,Baltimore Ravens LRB present RRB,Ray
1,Diseases caused by apicomplexan organisms incl...,Diseases caused by apicomplexan organisms incl...
2,PERSON Harry Patch LRB NUMBER June NUMBER NUMB...,He remembered the Bible story about PERSON com...
3,In a book from Baron Cohen was released titled...,In a book from Cohen was released entitled Tou...
4,ORGANIZATION operates in engineering oilfield ...,ORGANIZATION operates ORGANIZATION a subsidiar...
...,...,...
14995,During Act II the characters Marplay and Spark...,The characters Marplay and Sparkish manage the...
14996,The mineral natron is often found in associati...,The mineral natron is often found in associati...
14997,The Atlas rocket family is today used as a lau...,The Atlas F LRB HGM RRB was stored vertically ...
14998,Near LOCATION it cuts through a high mountain ...,Near LOCATION it cuts through a high mountain ...


In [None]:
# Whitespace-tokenize every target sentence, count the tokens in each one,
# then print the mean token count across all sentences.
words = df['target_text'].str.split()
sentence_lengths = words.map(len)
avg_length = sentence_lengths.mean()
print(avg_length)

15.740333333333334


## Seq to Seq model for Text Simplification - Training


In [None]:
class args:
    """Static settings for the notebook, read as plain class attributes."""

    # --- checkpoints and filesystem locations ---
    model_path = "bert-base-uncased"
    tokenizer_id = "bert-base-uncased"
    pretrained_model_path = "./training/model/"
    training_output_path = './training/output/'
    save_model_path = './training/model/store/'

    # --- optimisation schedule ---
    num_train_epochs = 3
    warmup_steps = 1000
    gradient_accumulation_steps = 1
    per_device_train_batch_size = 8
    per_device_eval_batch_size = 8

    # --- evaluation / logging / checkpoint cadence ---
    eval_steps = 1000
    logging_steps = 100
    save_steps = 500
    save_total_limit = 3

    # --- run labels ---
    run_name = "My first Hugging Face Seq2Seq model"
    dataset_name = "My first Seq2Seq dataset"

    # --- generation settings ---
    seq_max_length = 100
    seq_min_length = 10
    no_repeat_ngram_size = 0
    length_penalty = 1.0
    num_beams = 1

    # --- switches, stored as the literal strings "true" / "false" ---
    tie_encoder_decoder = "true"
    hf_logging_enabled = "true"
    resume = "false"

    # --- Weights & Biases settings (left unset) ---
    # wandb_id =
    # wandb_entity =
    # wandb_project =
    # wandb_api_key =

In [None]:
# Trainer-level settings copied from ``args``, plus a fixed fp16 flag.
# Note the key is "dataset" while the source attribute is ``dataset_name``.
training_config_dict = dict(
    num_train_epochs=args.num_train_epochs,
    fp16=True,
    eval_steps=args.eval_steps,
    warmup_steps=args.warmup_steps,
    save_total_limit=args.save_total_limit,
    per_device_train_batch_size=args.per_device_train_batch_size,
    per_device_eval_batch_size=args.per_device_eval_batch_size,
    logging_steps=args.logging_steps,
    save_steps=args.save_steps,
    run_name=args.run_name,
    dataset=args.dataset_name,
    gradient_accumulation_steps=args.gradient_accumulation_steps,
)

# Generation settings: each (config key, ``args`` attribute) pair below
# becomes one entry, in the order listed.
model_config_dict = {
    config_key: getattr(args, attr_name)
    for config_key, attr_name in (
        ("max_length", "seq_max_length"),
        ("min_length", "seq_min_length"),
        ("no_repeat_ngram_size", "no_repeat_ngram_size"),
        ("length_penalty", "length_penalty"),
        ("num_beams", "num_beams"),
    )
}

In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding
!pip install transformers==4.28.1
!pip install datasets
!pip install rouge_score
!pip install bert_score
!pip install meteor
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.28.1
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m93.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.1)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.1)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transfor

In [None]:
import functools
import gc
import logging
import os
import sys
from typing import Dict, Optional, Tuple

import torch
from transformers import (
    AutoTokenizer,
    EncoderDecoderModel,
    EvalPrediction,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    BertConfig
)
from transformers import logging as hf_logging
from datasets import Dataset, load_from_disk, load_metric
from sklearn.model_selection import train_test_split

In [None]:
@functools.lru_cache(maxsize=None)
def _load_tokenizer(tokenizer_id):
    """Load the tokenizer for ``tokenizer_id`` once and reuse it afterwards."""
    return AutoTokenizer.from_pretrained(tokenizer_id)


def preprocess_function(examples, max_length=120):
    """Tokenize one batch of sentence pairs into encoder inputs and labels.

    Args:
        examples: Batch mapping with ``'input_text'`` and ``'target_text'``
            entries, each a list of strings.
        max_length: Truncation length (in tokens) applied to both inputs and
            targets. Defaults to 120, the value previously hard-coded.

    Returns:
        The tokenizer output for the prefixed inputs, with an added
        ``"labels"`` entry holding the target token ids (any pad token id
        replaced by -100).
    """
    # Previously the tokenizer was re-loaded on every call, i.e. once per
    # batch handed over by ``map``; the memoized loader does it only once.
    tokenizer = _load_tokenizer(args.tokenizer_id)

    prefix = "simplify: "
    inputs = [prefix + inp for inp in examples['input_text']]
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)

    labels = tokenizer(examples['target_text'], max_length=max_length, truncation=True)

    # Swap pad ids for -100 in the labels. No padding is requested above, so
    # this only matters if a target itself tokenizes to the pad id.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(tok if tok != pad_id else -100) for tok in label]
        for label in labels["input_ids"]
    ]
    return model_inputs

In [None]:
# Wrap the frame as a Hugging Face Dataset, then hold out 20% of the rows as
# the "test" split without shuffling the row order first.
dataset = Dataset.from_pandas(df)
ds_dict = dataset.train_test_split(test_size=0.2, shuffle=False)

In [None]:
# Tokenize both splits; batched=True hands preprocess_function lists of rows.
final_dataset = ds_dict.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# Show the splits, their feature names and row counts after tokenization.
final_dataset

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 12000
    })
    test: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [None]:
import numpy as np
import nltk
import evaluate
# Fetch NLTK's "punkt" resource; quiet=True suppresses the download log.
nltk.download("punkt", quiet=True)
# Load the ROUGE metric through the `evaluate` library.
metric = evaluate.load("rouge")


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Build a BERT-to-BERT encoder-decoder model: encoder and decoder are both
# initialised from the "bert-base-uncased" checkpoint.
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Special-token ids (taken from the tokenizer) and generation defaults,
# written onto model.config in the order listed.
model.config.update({
    "decoder_start_token_id": tokenizer.cls_token_id,
    "eos_token_id": tokenizer.sep_token_id,
    "pad_token_id": tokenizer.pad_token_id,
    "early_stopping": True,
    "max_length": 100,
    "min_length": 10,
    "no_repeat_ngram_size": 3,
    "length_penalty": 2.0,
    "num_beams": 4,
})

training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=args.training_output_path,
    logging_dir='./logs',
    logging_steps=100,
    # Evaluate and checkpoint once per epoch; afterwards reload the checkpoint
    # with the lowest eval_loss.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-5,
    warmup_steps=400,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    # Use generate() when producing predictions.
    predict_with_generate=True,
)

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=final_dataset["train"],
    eval_dataset=final_dataset["test"],
)

# Run training, then write the resulting model to disk.
trainer.train()
# Alternative save location: trainer.save_model(args.save_model_path)
trainer.save_model("models/Enc-Dec/Training")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.bias', 'cls.seq_relations

Epoch,Training Loss,Validation Loss
1,3.7808,3.623892
2,3.2181,3.400587
3,2.7157,3.374201
4,2.4308,3.342609
5,2.2979,3.372824




## Seq to Seq model for Text Simplification - Inference

In [None]:
# Held-out CSV used for inference. Unlike the training load, the
# 'sentence_id' drop step is not applied to this file.
path = 'models/test_data/test.csv'

# Read the CSV, normalize both text columns, and rename them to the
# input_text / target_text names expected by preprocess_function.
df = (
    pd.read_csv(path)
    .assign(
        source_text=lambda frame: frame['source_text'].apply(normalize_string),
        simplified_text=lambda frame: frame['simplified_text'].apply(normalize_string),
    )
    .rename(columns={'source_text': 'input_text', 'simplified_text': 'target_text'})
)

In [None]:
# Display the normalized inference frame (columns: input_text, target_text).
df

Unnamed: 0,input_text,target_text
0,There are many important peaks in the Grison A...,There are many important peaks in the Grison A...
1,Horst K ppel LRB born May RRB is a former Germ...,Horst K ppel LRB born May RRB is a former Germ...
2,PERSON LRB born NUMBER March NUMBER RRB is a D...,PERSON LRB born NUMBER March NUMBER RRB is a D...
3,Different phases are different because their o...,Different phases are different because their o...
4,Producer and screenwriter PERSON is the novel ...,Producer and screenwriter PERSON is the novel ...
...,...,...
6995,Skardu District is part of Baltistan and curre...,Skardu District is part of Baltistan and curre...
6996,With his position in the Tanzanian government ...,With his position in the Tanzanian government ...
6997,The Governor is in charge of making yearly Sta...,The Governor is in charge of making yearly Sta...
6998,Port is a commune. It is found in the region R...,Port is a commune. It is found in the region R...


In [None]:
# Wrap the inference frame as a Dataset and tokenize it in batches with the
# same preprocessing function used for the training data.
dataset = Dataset.from_pandas(df)
inference_test = dataset.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Show the feature names and row count of the tokenized inference set.
inference_test

Dataset({
    features: ['input_text', 'target_text', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 7000
})

In [None]:
#Inference test

import csv
from transformers import EncoderDecoderModel, AutoTokenizer, pipeline, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Change the model path according to the training save path.
trained_model = EncoderDecoderModel.from_pretrained("models/Enc-Dec/Training")

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
text_generator = pipeline("text2text-generation", model=trained_model, tokenizer=tokenizer)

# First row index to process. 69 is the value this cell was last run with
# (it looks like the resume point of an interrupted run); use 0 to process
# every row from the start.
start_index = 69

# preprocess_function prepends this prefix to every training input, so the
# model must see the same prefix at inference time. The CSV still records the
# un-prefixed source sentence.
task_prefix = "simplify: "

output_csv = "models/Enc-Dec/output_files/Inference.csv"
fieldnames = ["source_sentence", "pred", "gold"]

# Make sure the output directory exists before opening the file.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)

# The file is opened in append mode so a resumed run keeps earlier rows.
# Write the header only when the file is new or empty; writing it on every
# run would insert duplicate header rows into the data.
needs_header = not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0

with open(output_csv, "a", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    if needs_header:
        writer.writeheader()
    for i in range(start_index, len(inference_test)):
        row = inference_test[i]
        input_text = row["input_text"]
        target_text = row["target_text"]
        # NOTE(review): do_sample=True makes the generated text vary between
        # runs — confirm sampling is wanted for evaluation output.
        generated_text = text_generator(task_prefix + input_text, do_sample=True)[0]["generated_text"]
        writer.writerow({"source_sentence": input_text, "pred": generated_text, "gold": target_text})
        # Progress indicator: index of the row just written.
        print(i)
        



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
