<a href="https://colab.research.google.com/github/roxyrong/w266_project/blob/main/GPT_J_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Experiment 1: Baseline GPT-J 6B parameter model

The goal of this notebook is to construct a prompt design which reliably returns SQL code to test the pretrained text-to-SQL performance of the [GPT-J 6B parameter transformer model](https://huggingface.co/EleutherAI/gpt-j-6b).

## Notebook & Environment Setup

In [2]:
# Install non-natively available libraries
%%capture

#!pip install transformers
!pip install sentencepiece
!pip install accelerate -U
!pip install datasets
!pip install nltk

# Install required libraries for 8-bit runtime
!pip install --quiet bitsandbytes
!pip install --quiet git+https://github.com/huggingface/transformers.git # Install latest version of transformers
!pip install --quiet accelerate

In [3]:
# Imports
import os
import sys
from typing import Dict, List
import subprocess
import collections
import json
import random
import numpy as np
import pandas as pd
import nltk
import torch
import pprint
from google.colab import drive


# Import GPT-J for text generation + AutoTokenizer
from transformers import GPTJForCausalLM, AutoTokenizer


In [4]:
# Set up Google Drive for data storage

drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
%cd /content/drive/MyDrive/Github/w266_project


project_path = '/content/drive/MyDrive/Github/w266_project'
sys.path.append(project_path)

/content/drive/MyDrive/Github/w266_project


In [6]:
%pwd

'/content/drive/MyDrive/Github/w266_project'

In [7]:
# setup third_party drive with spider dataset and utilities, only need to do once
# %mkdir third_party
# %cd third_party
# !git clone https://github.com/taoyds/spider.git
# !git clone https://github.com/HKUNLP/UnifiedSKG.git

## Spider Data Set Structure Exploration

In [8]:
# Load Spider datasets
with open('spider/train_spider.json', 'r') as f:
    train_spider = pd.read_json(f)
with open('spider/train_others.json', 'r') as f:
    others_spider = pd.read_json(f)
with open('spider/dev.json', 'r') as f:
    dev_spider = pd.read_json(f)

In [9]:
# show first 5 elements of training set
# for i in range(0, 4):
#   print(train_spider.iloc[i])
#   print(train_spider.iloc[i]['question'])
#   print(train_spider.iloc[i]['query'])

# Show 5 random elements of training set db_id, question, query
sample = train_spider.sample(n=5)
for i in range(0,5):
  print(sample.iloc[i]['db_id'])
  print(sample.iloc[i]['question'])
  print(sample.iloc[i]['query'])
  print('\n')

customers_and_invoices
What are the full names of customers with the account name 900?
SELECT T2.customer_first_name ,  T2.customer_last_name FROM Accounts AS T1 JOIN Customers AS T2 ON T1.customer_id  =  T2.customer_id WHERE T1.account_name  =  "900"


bike_1
For each end station id, what is its name, latitude, and minimum duration for trips ended there?
SELECT T1.name ,  T1.lat ,  min(T2.duration) FROM station AS T1 JOIN trip AS T2 ON T1.id  =  T2.end_station_id GROUP BY T2.end_station_id


solvency_ii
Show the names of products and the number of events they are in.
SELECT T1.Product_Name ,  COUNT(*) FROM Products AS T1 JOIN Products_in_Events AS T2 ON T1.Product_ID  =  T2.Product_ID GROUP BY T1.Product_Name


flight_4
What is the total number of routes for each country and airline in that country?
SELECT T1.country ,  T1.name ,  count(*) FROM airlines AS T1 JOIN routes AS T2 ON T1.alid  =  T2.alid GROUP BY T1.country ,  T1.name


party_people
What are the names of members who are no

In [10]:
# Load schema for all tables
with open('spider/tables.json', 'r') as f:
    schema_df = pd.read_json(f)


In [11]:
# Helper Function to extract target schemas from Spider json to a dict
# for training and prompt usage.
def _get_schema_string(table_json):
    """Returns the schema serialized as a string."""
    table_id_to_column_names = collections.defaultdict(list)
    for table_id, name in table_json["column_names_original"]:
        table_id_to_column_names[table_id].append(name.lower())
        tables = table_json["table_names_original"]

    table_strings = []
    for table_id, table_name in enumerate(tables):
        column_names = table_id_to_column_names[table_id]
        table_string = " | %s : %s" % (table_name.lower(), " , ".join(column_names))
        table_strings.append(table_string)

    return "".join(table_strings)

schema_dict = {}
for idx, row in schema_df.iterrows():
    db_id = row['db_id']
    schema = _get_schema_string(row)
    schema_dict[db_id] = schema

In [12]:
print(schema_dict.keys())

dict_keys(['perpetrator', 'college_2', 'flight_company', 'icfp_1', 'body_builder', 'storm_record', 'pilot_record', 'race_track', 'academic', 'department_store', 'music_4', 'insurance_fnol', 'cinema', 'decoration_competition', 'phone_market', 'store_product', 'assets_maintenance', 'student_assessment', 'dog_kennels', 'music_1', 'company_employee', 'farm', 'solvency_ii', 'city_record', 'swimming', 'flight_2', 'election', 'manufactory_1', 'debate', 'network_2', 'local_govt_in_alabama', 'climbing', 'e_learning', 'scientist_1', 'ship_1', 'entertainment_awards', 'allergy_1', 'imdb', 'products_for_hire', 'candidate_poll', 'chinook_1', 'flight_4', 'pets_1', 'dorm_1', 'journal_committee', 'flight_1', 'medicine_enzyme_interaction', 'local_govt_and_lot', 'station_weather', 'shop_membership', 'driving_school', 'concert_singer', 'music_2', 'sports_competition', 'railway', 'inn_1', 'museum_visit', 'browser_web', 'baseball_1', 'architecture', 'csu_1', 'tracking_orders', 'insurance_policies', 'gas_com

In [13]:
print(schema_dict['department_management'], '\n')
print(schema_dict['flight_4'], '\n')
print(schema_dict['aircraft'], '\n')
print(schema_dict['icfp_1'], '\n')
print(schema_dict['activity_1'], '\n')
print(schema_dict['journal_committee'], '\n')

 | department : department_id , name , creation , ranking , budget_in_billions , num_employees | head : head_id , name , born_state , age | management : department_id , head_id , temporary_acting 

 | routes : rid , dst_apid , dst_ap , src_apid , src_ap , alid , airline , codeshare | airports : apid , name , city , country , x , y , elevation , iata , icao | airlines : alid , name , iata , icao , callsign , country , active 

 | pilot : pilot_id , name , age | aircraft : aircraft_id , aircraft , description , max_gross_weight , total_disk_area , max_disk_loading | match : round , location , country , date , fastest_qualifying , winning_pilot , winning_aircraft | airport : airport_id , airport_name , total_passengers , %_change_2007 , international_passengers , domestic_passengers , transit_passengers , aircraft_movements , freight_metric_tonnes | airport_aircraft : id , airport_id , aircraft_id 

 | inst : instid , name , country | authors : authid , lname , fname | papers : paperid , 

A few examples from Spider to help with the prompt model:

Schema string:

 | department : department_id , name , creation , ranking , budget_in_billions , num_employees | head : head_id , name , born_state , age | management : department_id , head_id , temporary_acting

 Question:

 How many heads of the departments are older than 56 ?

 Query:

 SELECT count(*) FROM head WHERE age  >  56

 ____
flight_4

What is the name, city, and country of the airport with the lowest altitude?
SELECT name ,  city ,  country FROM airports ORDER BY elevation LIMIT 1


aircraft

what is the name and age of the youngest winning pilot?
SELECT t1.name ,  t1.age FROM pilot AS t1 JOIN MATCH AS t2 ON t1.pilot_id  =  t2.winning_pilot ORDER BY t1.age LIMIT 1


icfp_1

How many papers are "Atsushi Ohori" the author of?
SELECT count(*) FROM authors AS t1 JOIN authorship AS t2 ON t1.authid  =  t2.authid JOIN papers AS t3 ON t2.paperid  =  t3.paperid WHERE  t1.fname  =  "Atsushi" AND t1.lname  =  "Ohori"


activity_1

How many faculty members participate in each activity? Return the activity names and the number of faculty members.
SELECT T1.activity_name ,  count(*) FROM Activity AS T1 JOIN Faculty_participates_in AS T2 ON T1.actID  =  T2.actID GROUP BY T1.actID

journal_committee

Show the names of editors of age either 24 or 25.
SELECT Name FROM editor WHERE Age  =  24 OR Age  =  25

## Set up and load stock pretrained float 16 GPT-J model


In [None]:
# Specify GPU for computations
device = "cuda"

# Load pretrained float16 GPT-J to GPU from Huggingface
# model = GPTJForCausalLM.from_pretrained(
#     "EleutherAI/gpt-j-6B",
#     revision="float16",
#     torch_dtype=torch.float16,
# ).to(device)

# Load pretrained float16 GPT-J from Google Drive
model = GPTJForCausalLM.from_pretrained(
    "pretrained_GPT-J",
).to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
#Save model to Google Drive to avoid long download times - only needed when needed :-)
# model.save_pretrained("pretrained_GPT-J", from_pt=True)

## Load tokenizer

In [None]:
# Load appropriate tokenizer for model
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", padding_side="left")


Downloading (…)okenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

## Set up an 8-bit version

Memory runs out with stock version for batch inference, so fine tuning is going to be tough.

[Huggingface page for gpt-j-6B-8bit](https://huggingface.co/hivemind/gpt-j-6B-8bit)

[source notebook](https://colab.research.google.com/drive/1ft6wQU0BhqG5PRlwgaZJv2VukKKjU4Es#scrollTo=p0dy1ZFwClcq)

In [15]:
import transformers

import torch
import torch.nn.functional as F
from torch import nn
from torch.cuda.amp import custom_fwd, custom_bwd

from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise

from tqdm.auto import tqdm

In [16]:
class FrozenBNBLinear(nn.Module):
    def __init__(self, weight, absmax, code, bias=None):
        assert isinstance(bias, nn.Parameter) or bias is None
        super().__init__()
        self.out_features, self.in_features = weight.shape
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None
        self.bias = bias

    def forward(self, input):
        output = DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias)
        if self.adapter:
            output += self.adapter(input)
        return output

    @classmethod
    def from_linear(cls, linear: nn.Linear) -> "FrozenBNBLinear":
        weights_int8, state = quantize_blockise_lowmemory(linear.weight)
        return cls(weights_int8, *state, linear.bias)

    def __repr__(self):
        return f"{self.__class__.__name__}({self.in_features}, {self.out_features})"


class DequantizeAndLinear(torch.autograd.Function):
    @staticmethod
    @custom_fwd
    def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor,
                absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor):
        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        ctx.save_for_backward(input, weights_quantized, absmax, code)
        ctx._has_bias = bias is not None
        return F.linear(input, weights_deq, bias)

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output: torch.Tensor):
        assert not ctx.needs_input_grad[1] and not ctx.needs_input_grad[2] and not ctx.needs_input_grad[3]
        input, weights_quantized, absmax, code = ctx.saved_tensors
        # grad_output: [*batch, out_features]
        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        grad_input = grad_output @ weights_deq
        grad_bias = grad_output.flatten(0, -2).sum(dim=0) if ctx._has_bias else None
        return grad_input, None, None, None, grad_bias


class FrozenBNBEmbedding(nn.Module):
    def __init__(self, weight, absmax, code):
        super().__init__()
        self.num_embeddings, self.embedding_dim = weight.shape
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None

    def forward(self, input, **kwargs):
        with torch.no_grad():
            # note: both quantuized weights and input indices are *not* differentiable
            weight_deq = dequantize_blockwise(self.weight, absmax=self.absmax, code=self.code)
            output = F.embedding(input, weight_deq, **kwargs)
        if self.adapter:
            output += self.adapter(input)
        return output

    @classmethod
    def from_embedding(cls, embedding: nn.Embedding) -> "FrozenBNBEmbedding":
        weights_int8, state = quantize_blockise_lowmemory(embedding.weight)
        return cls(weights_int8, *state)

    def __repr__(self):
        return f"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})"


def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2 ** 20):
    assert chunk_size % 4096 == 0
    code = None
    chunks = []
    absmaxes = []
    flat_tensor = matrix.view(-1)
    for i in range((matrix.numel() - 1) // chunk_size + 1):
        input_chunk = flat_tensor[i * chunk_size: (i + 1) * chunk_size].clone()
        quantized_chunk, (absmax_chunk, code) = quantize_blockwise(input_chunk, code=code)
        chunks.append(quantized_chunk)
        absmaxes.append(absmax_chunk)

    matrix_i8 = torch.cat(chunks).reshape_as(matrix)
    absmax = torch.cat(absmaxes)
    return matrix_i8, (absmax, code)


def convert_to_int8(model):
    """Convert linear and embedding modules to 8-bit with optional adapters"""
    for module in list(model.modules()):
        for name, child in module.named_children():
            if isinstance(child, nn.Linear):
                print(name, child)
                setattr(
                    module,
                    name,
                    FrozenBNBLinear(
                        weight=torch.zeros(child.out_features, child.in_features, dtype=torch.uint8),
                        absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),
                        code=torch.zeros(256),
                        bias=child.bias,
                    ),
                )
            elif isinstance(child, nn.Embedding):
                setattr(
                    module,
                    name,
                    FrozenBNBEmbedding(
                        weight=torch.zeros(child.num_embeddings, child.embedding_dim, dtype=torch.uint8),
                        absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),
                        code=torch.zeros(256),
                    )
                )

In [17]:
class GPTJBlock(transformers.models.gptj.modeling_gptj.GPTJBlock):
    def __init__(self, config):
        super().__init__(config)

        convert_to_int8(self.attn)
        convert_to_int8(self.mlp)


class GPTJModel(transformers.models.gptj.modeling_gptj.GPTJModel):
    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self)


class GPTJForCausalLM(transformers.models.gptj.modeling_gptj.GPTJForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self)


transformers.models.gptj.modeling_gptj.GPTJBlock = GPTJBlock  # monkey-patch GPT-J

In [18]:
config = transformers.GPTJConfig.from_pretrained("EleutherAI/gpt-j-6B")
tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

Downloading (…)lve/main/config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

In [19]:
model = GPTJForCausalLM.from_pretrained("hivemind/gpt-j-6B-8bit", low_cpu_mem_usage=True)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, bias=False)
fc_in Linear(in_features=4096, out_features=16384, bias=True)
fc_out Linear(in_features=16384, out_features=4096, bias=True)
k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, bias=False)
fc_in Linear(in_features=4096, out_features=16384, bias=True)
fc_out Linear(in_features=16384, out_features=4096, bias=True)
k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, 

GPTJForCausalLM(
  (transformer): GPTJModel(
    (wte): FrozenBNBEmbedding(50400, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-27): 28 x GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): FrozenBNBLinear(4096, 4096)
          (v_proj): FrozenBNBLinear(4096, 4096)
          (q_proj): FrozenBNBLinear(4096, 4096)
          (out_proj): FrozenBNBLinear(4096, 4096)
        )
        (mlp): GPTJMLP(
          (fc_in): FrozenBNBLinear(4096, 16384)
          (fc_out): FrozenBNBLinear(16384, 4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): FrozenBNBLinear(4096, 50400)
)

## Experimentation to build a workable prompt

In [20]:


# Test prompt
prompt = ("\n-- Parse the question into SQL based on the given table below.--\n\n"
          " | routes : rid , dst_apid , dst_ap , src_apid , src_ap , alid , airline , codeshare | airports : apid , name , city , country , x , y , elevation , iata , icao | airlines : alid , name , iata , icao , callsign , country , active"
          "\n Based on this schema, create an ANSI-92 SQL Query to answer the following question:\n"
          "Q:What is the name, city, and country of the airport with the lowest altitude??"
          "\n Return the SQL query ONLY. Do not include any additional explanation."
          )

# Tokenize prompt, load to GPU with model
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# Generate response tokens from GPT-J
gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.1,
    max_length=250,
    repetition_penalty=1.1,
    top_p=1,
)

# Retrieve and print response
gen_text = tokenizer.batch_decode(gen_tokens)[0]

print(gen_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



-- Parse the question into SQL based on the given table below.--

 | routes : rid, dst_apid, dst_ap, src_apid, src_ap, alid, airline, codeshare | airports : apid, name, city, country, x, y, elevation, iata, icao | airlines : alid, name, iata, icao, callsign, country, active
 Based on this schema, create an ANSI-92 SQL Query to answer the following question:
Q:What is the name, city, and country of the airport with the lowest altitude??
 Return the SQL query ONLY. Do not include any additional explanation.

A:

This is a simple example using the data you provided in your question.
SELECT * FROM airports WHERE elevation = (SELECT MIN(elevation) FROM airports);

If you want to return only one row then use LIMIT 1 at the end of the query.
SELECT * FROM airports WHERE elevation = (SELECT MIN(elevation) FROM airports LIMIT 1);

You can also use ORDER BY and LIMIT if you want to order by the lowest value first.


In [21]:
# Load appropriate tokenizer for model
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")


# Test prompt
prompt = ("\n-- Parse the question into SQL based on the given table below.--\n\n"
          "| department : department_id , name , creation , ranking , budget_in_billions , num_employees | head : head_id , name , born_state , age | management : department_id , head_id , temporary_acting"
          "\n Based on this schema, create an ANSI-92 SQL Query to answer the following question:\n"
          "Q:How many heads of the departments are older than 56 ?"
          "\n As answer, return the SQL query ONLY. Do not include any additional explanation."
          )

# Tokenize prompt, load to GPU with model
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# Generate response tokens from GPT-J
gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.1,
    max_length=250,
    repetition_penalty=1.1,
    top_p=1,
)

# Retrieve and print response
gen_text = tokenizer.batch_decode(gen_tokens)[0]

print(gen_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



-- Parse the question into SQL based on the given table below.--

| department : department_id, name, creation, ranking, budget_in_billions, num_employees | head : head_id, name, born_state, age | management : department_id, head_id, temporary_acting
 Based on this schema, create an ANSI-92 SQL Query to answer the following question:
Q:How many heads of the departments are older than 56?
 As answer, return the SQL query ONLY. Do not include any additional explanation.

A:

You can use a subquery and group by department_id and then count the number of rows in each department that have a birthdate greater than 56.  Then you can join back to the original table to get the names of those employees.
select d.department_id, count(*) as NumEmployees
from (select department_id, count(*) as NumRows
      from tbl
      where birthdate > '1956-01-01'
      group by department_id) d
join tbl


# Construct prompts for baseline test

## Baseline test of pretrained GPT-J for Spider dataset

In [22]:
prefix = '\n-- Parse the question into SQL based on the given table below.--\n\n'
infix = '\n Based on this schema, create an ANSI-92 SQL Query to answer the following question:\n'
postfix = '\n As answer, return the SQL query ONLY. Do not include any additional explanation.'

train_spider['schema'] = train_spider['db_id'].map(schema_dict)
train_spider['prompt'] = prefix + train_spider['schema'] + infix + train_spider['question'] + postfix

others_spider['schema'] = others_spider['db_id'].map(schema_dict)
others_spider['prompt'] = prefix + others_spider['schema'] + infix + others_spider['question']

dev_spider['schema'] = dev_spider['db_id'].map(schema_dict)
dev_spider['prompt'] = prefix + dev_spider['schema'] + infix + dev_spider['question'] + postfix

## Run baseline test prior to fine tuning


In [23]:
# evaluate
torch.cuda.empty_cache()

max_length = 250

tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

inputs = tokenizer(
        list(dev_spider['prompt']),
        return_tensors='pt',
        truncation=True,
        padding=True,
    ).input_ids.to(device)

# print(inputs)
# print(inputs["input_ids"])
# print(inputs["attention_mask"])

gen_tokens = model.generate(
    input_ids=inputs,
    do_sample=True,
    temperature=0.1,
    max_length=max_length,
    repetition_penalty=1.1,
    top_p=0.9,
)


outputs = [tokenizer.decode(i, skip_special_tokens=True) for i in gen_tokens]

# # Retrieve and print response
# gen_text = tokenizer.batch_decode(gen_tokens)[0]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 384, but `max_length` is set to 250. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


OutOfMemoryError: ignored

In [None]:
# evaluate results
eval_path = f"third_party/spider/evaluation.py"
gold = f"third_party/spider/evaluation_examples/gold_example.txt"
pred = f"t5base-finetuned-spider/predicted_result.txt"
db_dir = f"spider/database"
table = f"spider/tables.json"
etype = "all"

cmd_str = f"python3 \"{eval_path}\" --gold \"{gold}\" --pred \"{pred}\" --db \"{db_dir}\" --table \"{table}\" --etype {etype} "
result = subprocess.run(cmd_str, shell=True, capture_output=True, text=True)

In [None]:
# print results
pp = pprint.PrettyPrinter(width=160)
pp.pprint(result.stdout[-4633:])