<a href="https://colab.research.google.com/github/roxyrong/w266_project/blob/main/GPT_J_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Experiment 1: Baseline GPT-J 6B parameter model

The goal of this notebook is to construct a prompt design which reliably returns SQL code to test the pretrained text-to-SQL performance of the [GPT-J 6B parameter transformer model](https://huggingface.co/EleutherAI/gpt-j-6b).

## Notebook & Environment Setup

In [1]:
# Install non-natively available libraries
%%capture

!pip install transformers
!pip install sentencepiece
!pip install accelerate -U
!pip install datasets
!pip install nltk

In [2]:
# Imports
import os
import sys
from typing import Dict, List
import subprocess
import collections
import json
import random
import numpy as np
import pandas as pd
import nltk
import torch
from google.colab import drive

# Import GPT-J for text generation + AutoTokenizer
from transformers import GPTJForCausalLM, AutoTokenizer


In [3]:
# Set up Google Drive for data storage

drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/Github/w266_project


project_path = '/content/drive/MyDrive/Github/w266_project'
sys.path.append(project_path)

/content/drive/MyDrive/Github/w266_project


In [5]:
%pwd

'/content/drive/MyDrive/Github/w266_project'

In [6]:
# setup third_party drive with spider dataset and utilities, only need to do once
# %mkdir third_party
# %cd third_party
# !git clone https://github.com/taoyds/spider.git
# !git clone https://github.com/HKUNLP/UnifiedSKG.git

## Spider Data Set

In [7]:
# Load Spider datasets
with open('spider/train_spider.json', 'r') as f:
    train_spider = pd.read_json(f)
with open('spider/train_others.json', 'r') as f:
    others_spider = pd.read_json(f)
with open('spider/dev.json', 'r') as f:
    dev_spider = pd.read_json(f)

In [8]:
# show first 5 elements of training set

# for i in range(0, 4):
#   print(train_spider.iloc[i])
#   print(train_spider.iloc[i]['question'])
#   print(train_spider.iloc[i]['query'])

# Print 5 random elements of training set db_id, question, query
sample = train_spider.sample(n=5)
for i in range(0,5):
  print(sample.iloc[i]['db_id'])
  print(sample.iloc[i]['question'])
  print(sample.iloc[i]['query'])
  print('\n')

voter_2
What is the city_code of the city that the most students live in?
SELECT city_code FROM STUDENT GROUP BY city_code ORDER BY count(*) DESC LIMIT 1


match_season
Show the draft pick numbers and draft classes of players whose positions are defenders.
SELECT Draft_Pick_Number ,  Draft_Class FROM match_season WHERE POSITION  =  "Defender"


school_player
Count the number of schools.
SELECT count(*) FROM school


county_public_safety
List the names of counties that do not have any cities.
SELECT Name FROM county_public_safety WHERE County_ID NOT IN (SELECT County_ID FROM city)


music_4
What are the songs in volumes with more than 1 week on top?
SELECT Song FROM volume WHERE Weeks_on_Top  >  1




In [9]:
# Load schema for all tables
with open('spider/tables.json', 'r') as f:
    schema_df = pd.read_json(f)


In [10]:
# Helper Function to extract target schemas from Spider json to a dict
# for training and prompt usage.
def _get_schema_string(table_json):
    """Returns the schema serialized as a string."""
    table_id_to_column_names = collections.defaultdict(list)
    for table_id, name in table_json["column_names_original"]:
        table_id_to_column_names[table_id].append(name.lower())
        tables = table_json["table_names_original"]

    table_strings = []
    for table_id, table_name in enumerate(tables):
        column_names = table_id_to_column_names[table_id]
        table_string = " | %s : %s" % (table_name.lower(), " , ".join(column_names))
        table_strings.append(table_string)

    return "".join(table_strings)

schema_dict = {}
for idx, row in schema_df.iterrows():
    db_id = row['db_id']
    schema = _get_schema_string(row)
    schema_dict[db_id] = schema

In [11]:
print(schema_dict.keys())

dict_keys(['perpetrator', 'college_2', 'flight_company', 'icfp_1', 'body_builder', 'storm_record', 'pilot_record', 'race_track', 'academic', 'department_store', 'music_4', 'insurance_fnol', 'cinema', 'decoration_competition', 'phone_market', 'store_product', 'assets_maintenance', 'student_assessment', 'dog_kennels', 'music_1', 'company_employee', 'farm', 'solvency_ii', 'city_record', 'swimming', 'flight_2', 'election', 'manufactory_1', 'debate', 'network_2', 'local_govt_in_alabama', 'climbing', 'e_learning', 'scientist_1', 'ship_1', 'entertainment_awards', 'allergy_1', 'imdb', 'products_for_hire', 'candidate_poll', 'chinook_1', 'flight_4', 'pets_1', 'dorm_1', 'journal_committee', 'flight_1', 'medicine_enzyme_interaction', 'local_govt_and_lot', 'station_weather', 'shop_membership', 'driving_school', 'concert_singer', 'music_2', 'sports_competition', 'railway', 'inn_1', 'museum_visit', 'browser_web', 'baseball_1', 'architecture', 'csu_1', 'tracking_orders', 'insurance_policies', 'gas_com

In [12]:
print(schema_dict['department_management'], '\n')
print(schema_dict['flight_4'], '\n')
print(schema_dict['aircraft'], '\n')
print(schema_dict['icfp_1'], '\n')
print(schema_dict['activity_1'], '\n')
print(schema_dict['journal_committee'], '\n')

 | department : department_id , name , creation , ranking , budget_in_billions , num_employees | head : head_id , name , born_state , age | management : department_id , head_id , temporary_acting 

 | routes : rid , dst_apid , dst_ap , src_apid , src_ap , alid , airline , codeshare | airports : apid , name , city , country , x , y , elevation , iata , icao | airlines : alid , name , iata , icao , callsign , country , active 

 | pilot : pilot_id , name , age | aircraft : aircraft_id , aircraft , description , max_gross_weight , total_disk_area , max_disk_loading | match : round , location , country , date , fastest_qualifying , winning_pilot , winning_aircraft | airport : airport_id , airport_name , total_passengers , %_change_2007 , international_passengers , domestic_passengers , transit_passengers , aircraft_movements , freight_metric_tonnes | airport_aircraft : id , airport_id , aircraft_id 

 | inst : instid , name , country | authors : authid , lname , fname | papers : paperid , 

A few examples from Spider to help with the prompt model:

Schema string:

 | department : department_id , name , creation , ranking , budget_in_billions , num_employees | head : head_id , name , born_state , age | management : department_id , head_id , temporary_acting

 Question:

 How many heads of the departments are older than 56 ?

 Query:

 SELECT count(*) FROM head WHERE age  >  56

 ____
flight_4

What is the name, city, and country of the airport with the lowest altitude?
SELECT name ,  city ,  country FROM airports ORDER BY elevation LIMIT 1


aircraft

what is the name and age of the youngest winning pilot?
SELECT t1.name ,  t1.age FROM pilot AS t1 JOIN MATCH AS t2 ON t1.pilot_id  =  t2.winning_pilot ORDER BY t1.age LIMIT 1


icfp_1

How many papers are "Atsushi Ohori" the author of?
SELECT count(*) FROM authors AS t1 JOIN authorship AS t2 ON t1.authid  =  t2.authid JOIN papers AS t3 ON t2.paperid  =  t3.paperid WHERE  t1.fname  =  "Atsushi" AND t1.lname  =  "Ohori"


activity_1

How many faculty members participate in each activity? Return the activity names and the number of faculty members.
SELECT T1.activity_name ,  count(*) FROM Activity AS T1 JOIN Faculty_participates_in AS T2 ON T1.actID  =  T2.actID GROUP BY T1.actID

journal_committee

Show the names of editors of age either 24 or 25.
SELECT Name FROM editor WHERE Age  =  24 OR Age  =  25

## Model Setup

In [13]:
# Specify GPU for computations
device = "cuda"

# Load pretrained float16 GPT-J to GPU from Huggingface
# model = GPTJForCausalLM.from_pretrained(
#     "EleutherAI/gpt-j-6B",
#     revision="float16",
#     torch_dtype=torch.float16,
# ).to(device)

# Load pretrained float16 GPT-J from Google Drive
model = GPTJForCausalLM.from_pretrained(
    "pretrained_GPT-J",
).to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
#Save model to Google Drive to avoid long download times - only needed when needed :-)
# model.save_pretrained("pretrained_GPT-J", from_pt=True)

In [15]:
# Load appropriate tokenizer for model
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")


# Test prompt
prompt = ("\n-- Parse the question into SQL based on the given table below.--\n\n"
          " | routes : rid , dst_apid , dst_ap , src_apid , src_ap , alid , airline , codeshare | airports : apid , name , city , country , x , y , elevation , iata , icao | airlines : alid , name , iata , icao , callsign , country , active"
          "\n Based on this schema, create an ANSI-92 SQL Query to answer the following question:\n"
          "Q:What is the name, city, and country of the airport with the lowest altitude??"
          "\n Return the SQL query ONLY. Do not include any additional explanation."
          )

# Tokenize prompt, load to GPU with model
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# Generate response tokens from GPT-J
gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.1,
    max_length=250,
    repetition_penalty=1.1,
    top_p=1,
)

# Retrieve and print response
gen_text = tokenizer.batch_decode(gen_tokens)[0]

print(gen_text)

Downloading (…)okenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



-- Parse the question into SQL based on the given table below.--

 | routes : rid, dst_apid, dst_ap, src_apid, src_ap, alid, airline, codeshare | airports : apid, name, city, country, x, y, elevation, iata, icao | airlines : alid, name, iata, icao, callsign, country, active
 Based on this schema, create an ANSI-92 SQL Query to answer the following question:
Q:What is the name, city, and country of the airport with the lowest altitude??
 Return the SQL query ONLY. Do not include any additional explanation.

A:

You can use a subquery to find the lowest altitude for each airport. Then you can join that back to your original table to get the other information.
SELECT t1.*
FROM routes AS t1
INNER JOIN (
  SELECT r.rid, MIN(elevation) AS minElevation
  FROM routes AS r
  INNER JOIN airports AS a ON r.dst_apid = a.apid
  GROUP BY


In [16]:
# Load appropriate tokenizer for model
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")


# Test prompt
prompt = ("\n-- Parse the question into SQL based on the given table below.--\n\n"
          "| department : department_id , name , creation , ranking , budget_in_billions , num_employees | head : head_id , name , born_state , age | management : department_id , head_id , temporary_acting"
          "\n Based on this schema, create an ANSI-92 SQL Query to answer the following question:\n"
          "Q:How many heads of the departments are older than 56 ?"
          "\n Return the SQL query ONLY. Do not include any additional explanation."
          )

# Tokenize prompt, load to GPU with model
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# Generate response tokens from GPT-J
gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.1,
    max_length=250,
    repetition_penalty=1.1,
    top_p=1,
)

# Retrieve and print response
gen_text = tokenizer.batch_decode(gen_tokens)[0]

print(gen_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



-- Parse the question into SQL based on the given table below.--

| department : department_id, name, creation, ranking, budget_in_billions, num_employees | head : head_id, name, born_state, age | management : department_id, head_id, temporary_acting
 Based on this schema, create an ANSI-92 SQL Query to answer the following question:
Q:How many heads of the departments are older than 56?
 Return the SQL query ONLY. Do not include any additional explanation.

A:

SELECT COUNT(*) FROM (
  SELECT DISTINCT department_id
  FROM head
  WHERE age > 56
) AS t1;

<|endoftext|>


## Baseline test of pretrained GPT-J for Spider dataset