<a href="https://colab.research.google.com/github/sssharaf/ml-nlp/blob/master/Nlp2Sql.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/gdrive')
!ls -ltr /gdrive/'My Drive'/ML/data/nlp2sql
!ln -s  /gdrive/'My Drive'/ML/data/nlp2sql wikisql
!pip install pytorch_transformers

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive
total 180484
-rw------- 1 root root 33179166 Aug 11  2017 train.tables.jsonl
-rw------- 1 root root  9858694 Aug 11  2017 test.tables.jsonl
-rw------- 1 root root  4652052 Aug 11  2017 dev.tables.jsonl
-rw------- 1 root root 12271616 Aug 11  2017 dev.db
-rw------- 1 root root 85348352 Aug 11  2017 train.db
-rw------- 1 root root 24453120 Aug 11  2017 test.db
-rw------- 1 root root 10511763 Oct  9  2017 train.jsonl
-rw------- 1 root root  1570387 Oct  9  2017 dev.json

In [0]:
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
import pytorch_transformers as pt
from pytorch_transformers import BertTokenizer, BertConfig,BertForMaskedLM,BertModel
import os
import typing
from typing import Dict,List,Sequence,Set
from types import SimpleNamespace as SN
import numpy as np
from sklearn.utils.class_weight import compute_class_weight,compute_sample_weight

# Distinct static-typing name for the BERT tokenizer class.
T_BertTokenizer = typing.NewType("BertTokenizer", BertTokenizer)
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Sentinel column name; data_from_tables appends it to every table header.
EMPTY_COL = "[EMPTY]"

# Cap on tokens kept per header.
# NOTE(review): MyDataSet takes its own `max_hs_len` argument (same default);
# no visible cell reads this global - confirm it is still needed.
max_hs_len = 3

# Tokenizer shared by the data-loading helpers and the dataset class.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

100%|██████████| 231508/231508 [00:00<00:00, 2609574.74B/s]


In [0]:

def data_from_tables(tab_file:str,dir:str='wikisql/data') -> Dict[str,SN]:
    """Load a ``*.tables.jsonl`` file into a map keyed by table id.

    Every non-blank line of the file is parsed as one JSON object that must
    provide the keys ``header``, ``types`` and ``id``.

    Args:
        tab_file: Name of the tables file, resolved relative to ``dir``.
        dir: Directory containing ``tab_file``.

    Returns:
        Dict mapping each table id to a namespace with the attributes
        ``header`` (the column names followed by the ``EMPTY_COL`` sentinel),
        ``e_header`` (``tokenizer.encode`` applied to each entry of
        ``header``) and ``types`` (the ``types`` list exactly as stored in the
        file; the sentinel column gets no matching entry).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    tab_map: Dict[str, SN] = {}
    # The context manager closes the handle even when a line fails to parse.
    with open(os.path.join(dir, tab_file), encoding='utf-8') as tab_fh:
        for raw_line in tab_fh:
            raw_line = raw_line.strip()
            if not raw_line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            table = json.loads(raw_line)
            header = table['header']
            header.append(EMPTY_COL)
            tab_map[table['id']] = SN(
                header=header,
                e_header=[tokenizer.encode(h) for h in header],
                types=table['types'],
            )
    return tab_map

def data_from_sql(sql_file:str,dir='wikisql/data') -> typing.Tuple[List[str], List[str], List[List[int]], List[List[SN]]]:
    """Load questions and their WHERE conditions from a ``*.jsonl`` file.

    Every non-blank line of the file is parsed as one JSON object that must
    provide ``table_id``, ``question`` and ``sql`` (with a ``conds`` list whose
    items have at least three elements).

    Args:
        sql_file: Name of the examples file, resolved relative to ``dir``.
        dir: Directory containing ``sql_file``.

    Returns:
        A 4-tuple of parallel lists, one entry per example:
        ``table_id`` (id of the table the question refers to),
        ``sql_text`` (the raw question text),
        ``e_sql_text`` (``tokenizer.encode`` of each question) and
        ``all_conds`` (one list of namespaces per example, each with ``ci``,
        ``oi`` and ``c`` taken from elements 0, 1 and 2 of a condition;
        presumably column index, operator index and comparison value,
        verify against the dataset description).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    sql_text: List[str] = []
    table_id: List[str] = []
    all_conds: List[List[SN]] = []
    # The context manager closes the handle even when a line fails to parse.
    with open(os.path.join(dir, sql_file), encoding='utf-8') as sql_fh:
        for raw_line in sql_fh:
            raw_line = raw_line.strip()
            if not raw_line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            example = json.loads(raw_line)
            table_id.append(example['table_id'])
            sql_text.append(example['question'])
            all_conds.append([
                SN(ci=cond[0], oi=cond[1], c=cond[2])
                for cond in example['sql']['conds']
            ])
    e_sql_text = [tokenizer.encode(s) for s in sql_text]
    return table_id,sql_text,e_sql_text, all_conds



In [0]:

class MyDataSet(Dataset):
    """Token-id sequences paired with the number of WHERE conditions.

    Each sample is laid out as::

        question ids, SEP, header_1 ids, SEP, ..., header_n ids, SEP

    where every header contributes at most ``max_hs_len`` ids and SEP is
    whatever ``tokenizer.encode(tokenizer.sep_token)`` returns.

    NOTE(review): nothing here prepends a [CLS] id; whether one is present
    depends on what ``tokenizer.encode`` emits - confirm, since MyModel
    classifies from BERT's pooled output.

    Attributes set by ``__init__``:
        data: list of id lists, one per example (unpadded).
        max_l: length of the longest entry of ``data`` (0 when empty).
        all_conds: per-example condition lists from ``data_from_sql``.
        n_where_conds: ``len`` of each entry of ``all_conds`` (the labels).
    """

    def __init__(self,max_hs_len=3,sql_file='dev.jsonl',tab_file='dev.tables.jsonl',dir='wikisql/data'):
        """Read the tables and examples files from ``dir`` and build the samples.

        Args:
            max_hs_len: Maximum number of token ids kept per column header.
            sql_file: Examples file name passed to ``data_from_sql``.
            tab_file: Tables file name passed to ``data_from_tables``.
            dir: Directory holding both files.
        """
        # `super(MyDataSet)` without the instance is an unbound proxy, so the
        # original call never reached Dataset.__init__; use the bound form.
        super().__init__()
        tab_map = data_from_tables(tab_file,dir)
        table_ids, _sql_texts, e_sql_text, all_conds = data_from_sql(sql_file,dir)

        # The separator encoding is loop-invariant: compute it once.
        sep_ids = tokenizer.encode(tokenizer.sep_token)

        data = []
        for tab_id, enc_s in zip(table_ids, e_sql_text):
            x = list(enc_s)
            x.extend(sep_ids)
            for enc_h in tab_map[tab_id].e_header:
                x.extend(enc_h[:max_hs_len])
                x.extend(sep_ids)
            data.append(x)

        self.data = data
        self.max_l = max((len(d) for d in data), default=0)
        self.all_conds = all_conds
        self.n_where_conds = [len(c) for c in all_conds]

    def __getitem__(self, idx):
        """Return ``(X, Y)`` for example ``idx``, both already on ``DEVICE``.

        ``X`` is a long tensor of length ``max_l``: the example's ids followed
        by zeros as padding. ``Y`` is a scalar long tensor holding the number
        of WHERE conditions of the example.
        """
        d = self.data[idx]
        X = torch.zeros(self.max_l,dtype=torch.long)
        X[:len(d)] = torch.tensor(d,dtype=torch.long)
        X = X.to(DEVICE)
        Y = torch.tensor(self.n_where_conds[idx],dtype=torch.long,device=DEVICE)
        return X,Y

    def __len__(self):
        """Number of examples."""
        return len(self.data)


In [0]:
# Dataset and loader over the examples stored on the mounted Drive folder.
ds = MyDataSet(dir='/gdrive/My Drive/ML/data/nlp2sql')
dl = DataLoader(dataset=ds,batch_size=32,pin_memory=False)

# "balanced" weights for the label values that actually occur in the data.
u = np.unique(ds.n_where_conds)
balanced_w = compute_class_weight('balanced',classes=u,y=ds.n_where_conds)

# Store each weight at the index equal to its label value. CrossEntropyLoss
# looks weights up by class index, so positional alignment would shift them
# onto the wrong classes if some label value below the maximum never occurs.
# When the labels are exactly 0..k-1 this equals the plain positional vector.
class_weight = torch.zeros(int(u.max()) + 1,dtype=torch.float,device=DEVICE)
class_weight[torch.as_tensor(u,dtype=torch.long,device=DEVICE)] = torch.as_tensor(
    balanced_w,dtype=torch.float,device=DEVICE)


In [0]:
# Loader iterated by the training loop: batches of 32 in dataset order
# (same settings as the loader created together with `ds`).
dl = DataLoader(ds, batch_size=32, pin_memory=False)

In [8]:
class MyModel(nn.Module):
  """BERT encoder followed by a bias-free linear classification head.

  The head maps BERT's pooled output to ``n_classes`` logits; in this notebook
  the target is the number of WHERE conditions of a question.
  """

  # Id used for padding: MyDataSet pads with zeros, and the BERT word
  # embedding table is built with padding_idx=0.
  PAD_ID = 0

  def __init__(self, n_classes=5):
    """Load pretrained ``bert-base-uncased`` and create the head.

    Args:
      n_classes: Number of output logits. Defaults to 5, the value that was
        previously hard-coded.
    """
    super(MyModel,self).__init__()
    self.bert_model = BertModel.from_pretrained("bert-base-uncased")
    # Read the width from the loaded config instead of hard-coding 768.
    hidden_size = self.bert_model.config.hidden_size
    # Kept as a one-element Sequential so parameter names stay `linear.0.*`.
    self.linear = nn.Sequential(
      nn.Linear(hidden_size,n_classes,bias=False),
    )

  def forward(self,X):
    """Return logits of shape ``(batch, n_classes)`` for a batch of id rows.

    Args:
      X: Long tensor of token ids, right-padded with ``PAD_ID``.
    """
    # Mask the padded positions; without a mask BERT attends to the padding,
    # so a sample's logits depend on how much padding its row carries.
    attention_mask = (X != self.PAD_ID).long()
    outputs = self.bert_model(X, attention_mask=attention_mask)
    # outputs[1] is the pooled output; indexing (rather than a two-way unpack)
    # also works when the model is configured to return extra tensors.
    ctx = outputs[1]
    return self.linear(ctx)

# Build the classifier and a cross-entropy loss weighted per class by `class_weight`.
model = MyModel()
criterion = nn.CrossEntropyLoss(weight=class_weight)
# Move the parameters to DEVICE; as the last expression of the cell this also
# echoes the module tree.
model.to(DEVICE)


100%|██████████| 313/313 [00:00<00:00, 157923.39B/s]
100%|██████████| 440473133/440473133 [00:12<00:00, 35772273.21B/s]


MyModel(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [0]:
#torch.quantization.quantize(bert_model)

In [0]:
#torch.quantization.quantize(bert_model)


# Fine-tune every parameter of the model (BERT and head) with AdamW.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
print("Cuda available" if torch.cuda.is_available() else "No Cuda")

n_epochs = 10
for epoch in range(n_epochs):
  print(f'Running epoch {epoch}')
  # Sum (not mean) of the per-batch losses seen in this epoch.
  t_loss = 0
  for X, Y in dl:
    # Clear gradients left over from the previous step before the new backward pass.
    optimizer.zero_grad()
    output = model(X)
    loss = criterion(output, Y)
    loss.backward()
    optimizer.step()
    t_loss += loss.item()
  print(f'Loss={t_loss}')
  
  


Cuda available
Running epoch 0


In [0]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight,compute_sample_weight

#Y.numpy()
# Inspect the label balance. Results are bound to names and shown as the cell
# value instead of being computed and discarded. `_` is not assigned because
# IPython uses that name for the last output.
n_conds_per_query = [len(c) for c in ds.all_conds]
u = np.unique(n_conds_per_query)
balanced_class_w = compute_class_weight('balanced',classes=u,y=n_conds_per_query)
balanced_sample_w = compute_sample_weight('balanced',n_conds_per_query)
# Displayed: occurrences per label value, weight per class, first few per-sample weights.
np.bincount(n_conds_per_query), balanced_class_w, balanced_sample_w[:5]