In [None]:
# Install the notebook's dependencies into the runtime (Hugging Face libraries,
# ROUGE scoring, MLflow logging, ...).
# NOTE(review): no versions are pinned, and rouge-score (spelled both `rouge-score`
# and `rouge_score`) as well as transformers are each requested twice below.
!pip install pytesseract transformers datasets rouge-score nltk tensorboard py7zr --upgrade
!pip install accelerate -U
!pip install -q mlflow
!pip install -q rouge_score
!pip install transformers[torch]

In [None]:
import torch
from datasets import load_dataset, load_metric
from transformers import (
    T5ForConditionalGeneration,
    AutoTokenizer,
    T5Tokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    pipeline,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq
)


In [None]:
# install git-lfs for pushing model and logs to the Hugging Face hub
!sudo apt-get install git-lfs --yes

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [None]:
# Hub identifier of the dataset; passed to load_dataset in later cells
dataset_id = "wikisql"

In [None]:
# Fetch every split of the dataset from the hub, then report how many
# examples the train and test splits contain.
dataset = load_dataset(dataset_id)

for split_label, split_key in (("Train", "train"), ("Test", "test")):
    print(f"{split_label} dataset size: {len(dataset[split_key])}")

Downloading builder script:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.80k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.2M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/15878 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8421 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/56355 [00:00<?, ? examples/s]

Train dataset size: 56355
Test dataset size: 15878


In [None]:
from random import randrange

# Draw one training example at a random index and show the fields of interest.
train_split = dataset['train']
sample = train_split[randrange(len(train_split))]

preview_fields = (
    ('question', sample['question']),
    ('table', sample['table']),
    ('sql', sample['sql'].get('human_readable')),
)
for field_label, field_value in preview_fields:
    print(f"{field_label}: \n{field_value}\n---------------")

question: 
What was the lowest rank of an area with a 2011 Census population larger than 3,645,257, a House of Commons seat percentage of 11.7% and a July 2013 population estimate over 4,581,978?
---------------
table: 
{'header': ['Rank', 'Name', 'Population ( 2011 Census )', 'Percent of national population', '% growth (2006–11)', 'Land area (km²)', 'Population density (/km 2 )', 'House of Commons seats', 'House of Commons seats (%)', '2013 population (July est.)'], 'page_title': 'List of Canadian provinces and territories by population', 'page_id': '106104', 'types': ['real', 'text', 'real', 'text', 'text', 'real', 'real', 'real', 'text', 'real'], 'id': '2-106104-1', 'section_title': 'Listing', 'caption': 'Listing', 'rows': [['1', 'Ontario', '12,851,821', '38.4%', '5.7%', '908,607.67', '14.1', '106', '34.4%', '13,537,994'], ['2', 'Quebec', '7,903,001', '23.6%', '4.7%', '1,356,547.02', '5.8', '75', '24.4%', '8,155,334'], ['3', 'British Columbia', '4,400,057', '13.1%', '7.0%', '922,509

In [None]:
# define constants
# NOTE(review): MODEL_NAME is not referenced in any visible cell; the checkpoint
# that is actually loaded comes from `model_id` ("google/flan-t5-base") — confirm
# whether this constant is still needed.
MODEL_NAME = 't5-base'
MAX_LENGTH = 64  # token length used for tokenizer padding/truncation and for generation
BATCH_SIZE = 64  # per-device batch size for both training and evaluation
NUM_EPOCHS = 5  # number of training epochs

In [None]:

# Hub checkpoint (FLAN-T5 base) whose tokenizer is loaded here; sequences are
# capped at MAX_LENGTH tokens by default.
model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    model_max_length=MAX_LENGTH,
)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
# Load the train and validation splits. Uses the shared `dataset_id` constant
# (instead of repeating the literal name) so every load in the notebook points
# at the same dataset.
train_data = load_dataset(dataset_id, split='train')
validation_data = load_dataset(dataset_id, split='validation')

def format_dataset(example):
    """Turn one raw example into a prompt/target text pair.

    Args:
        example: mapping with a 'question' string and an 'sql' mapping that
            holds the query text under 'human_readable'.

    Returns:
        dict with 'input' (the question prefixed with the task instruction)
        and 'target' (the human-readable SQL).
    """
    prompt = 'translate to SQL: ' + example['question']
    sql_text = example['sql']['human_readable']
    return {'input': prompt, 'target': sql_text}

# Replace the raw columns of both splits with the prompt/target text pair.
train_data, validation_data = (
    split.map(format_dataset, remove_columns=split.column_names)
    for split in (train_data, validation_data)
)

# tokenize the examples
def convert_to_features(example_batch):
    """Tokenize a batch of prompt/target text pairs into fixed-length features.

    Args:
        example_batch: mapping with 'input' and 'target' entries, each a list
            of strings (one batch as passed by a batched ``map`` call).

    Returns:
        dict with 'input_ids', 'attention_mask', 'labels' and
        'decoder_attention_mask'. Every sequence is padded/truncated to
        MAX_LENGTH tokens; padding positions in 'labels' are set to -100.
    """
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
    # and calling the tokenizer directly replaces `batch_encode_plus`.
    encode_kwargs = dict(padding='max_length', max_length=MAX_LENGTH, truncation=True)
    input_encodings = tokenizer(example_batch['input'], **encode_kwargs)
    target_encodings = tokenizer(example_batch['target'], **encode_kwargs)

    # Mask padded label positions with -100 so they are excluded from the loss
    # instead of training the model to emit pad tokens. compute_metrics maps
    # -100 back to the pad token before decoding.
    labels = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(target_encodings['input_ids'], target_encodings['attention_mask'])
    ]

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels,
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings


# Feature columns that should be exposed as torch tensors after tokenization.
columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']

# Tokenize both splits batch-wise, dropping the text columns they had before.
train_data, validation_data = (
    split.map(convert_to_features, batched=True, remove_columns=split.column_names)
    for split in (train_data, validation_data)
)

train_data.set_format(type='torch', columns=columns)
validation_data.set_format(type='torch', columns=columns)


Map:   0%|          | 0/56355 [00:00<?, ? examples/s]



Map:   0%|          | 0/8421 [00:00<?, ? examples/s]

In [None]:
# Load the pretrained seq2seq model for the checkpoint named by `model_id`
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)



In [None]:
# number of trainable parameters, expressed in millions
trainable_params_m = model.num_parameters(only_trainable=True) / 1e6
print(trainable_params_m)

247.577856


In [None]:
# set training arguments - Feel free to adapt it
training_args = Seq2SeqTrainingArguments(
    # --- output locations and logging ---
    output_dir='./results',
    overwrite_output_dir=True,
    logging_dir='./logs',
    logging_steps=5,
    report_to="mlflow",  # log to mlflow
    # --- training / evaluation schedule ---
    do_train=True,
    do_eval=True,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    predict_with_generate=True,  # evaluate on generated sequences
    # --- checkpointing ---
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    push_to_hub=False,
)

In [None]:
# ROUGE metric object used by compute_metrics below.
# NOTE(review): `load_metric` appears to be deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version.
rouge = load_metric("rouge")

def compute_metrics(pred):
    """Compute ROUGE-2 precision/recall/F-measure for generated predictions.

    Args:
        pred: evaluation output exposing `predictions` (generated token ids)
            and `label_ids` (reference token ids, possibly containing -100
            at ignored positions).

    Returns:
        dict with 'rouge2_precision', 'rouge2_recall' and 'rouge2_fmeasure',
        each rounded to 4 decimals (taken from the aggregate's `mid` score).
    """
    # Local import keeps this cell self-contained; numpy is not imported at
    # the top of the notebook.
    import numpy as np

    pad_id = tokenizer.pad_token_id

    pred_ids = pred.predictions
    # Presumably predictions can arrive as a tuple whose first item holds the
    # token ids — handled defensively; verify against the trainer in use.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # -100 is not a valid token id and cannot be decoded. Replace it with the
    # pad token in BOTH arrays, building new arrays so the caller's
    # `pred.label_ids` / `pred.predictions` are not mutated.
    pred_ids = np.asarray(pred_ids)
    labels_ids = np.asarray(pred.label_ids)
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    labels_ids = np.where(labels_ids != -100, labels_ids, pad_id)

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [None]:
# instantiate trainer: model + arguments, the two tokenized splits, and the
# ROUGE-based metric callback
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=validation_data,
    compute_metrics=compute_metrics,
)

In [None]:
# initial evaluation (run before trainer.train(), i.e. on the not-yet-fine-tuned model)
# NOTE(review): the saved output of this cell is a RuntimeError, so this
# evaluation did not complete in the recorded run — investigate before relying on it.
trainer.evaluate()



RuntimeError: ignored

In [None]:
# Run training with the model, arguments and datasets configured on the trainer
trainer.train()

In [None]:
# Save the trainer's model; no path is given, so the destination is the
# trainer's default (presumably training_args.output_dir — confirm)
trainer.save_model()

In [None]:
# Save the tokenizer into the configured output directory. Reading the path
# from training_args avoids a second hard-coded copy of it.
tokenizer.save_pretrained(training_args.output_dir)

In [None]:
# Reload the tokenizer and model from './results' to evaluate the fine-tuned
# checkpoint (the tokenizer was saved there above; presumably the model too — confirm).
# Note: this rebinds the notebook-level `tokenizer` and `model` names.
tokenizer = AutoTokenizer.from_pretrained("./results")
model = T5ForConditionalGeneration.from_pretrained("./results")

In [None]:
# Test split of the dataset, used for the qualitative spot-check below
test_data = load_dataset(dataset_id, split='test')

In [None]:
def translate_to_sql(text):
    """Generate a SQL string for a single prompt using the notebook's model.

    Args:
        text: the full prompt string (callers prepend the task prefix).

    Returns:
        The decoded generation for the first (only) sequence, with special
        tokens stripped.
    """
    # truncation=True is required for max_length to take effect; without it
    # over-long prompts are passed through untruncated, unlike the
    # preprocessing used for training.
    inputs = tokenizer(
        text,
        padding='longest',
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )
    # Keep the input tensors on the same device as the model so generation
    # also works when the model has been moved off the CPU.
    inputs = inputs.to(model.device)

    output = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=MAX_LENGTH,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Spot-check every third example among the first 60 test rows: show the
# prompt, the model's prediction and the reference SQL side by side.
for idx in range(0, 60, 3):
    example = test_data[idx]
    prompt = 'translate to SQL: ' + example['question']
    print(prompt)
    print('Predicted: ' + translate_to_sql(prompt))
    print('Expected: ' + example['sql']['human_readable'])
    print('=================================\n')