<a href="https://colab.research.google.com/github/sssharaf/ml-nlp/blob/master/start_servers_play.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/gdrive')
!ls -ltr /gdrive/'My Drive'/ML/data/start-servers-play
!pip install pytorch_transformers
!ln -s  /gdrive/'My Drive'/ML/data/start-servers-play data
!ls -ltr data/*

In [0]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset
import pytorch_transformers as pt
from pytorch_transformers import BertTokenizer, BertConfig,BertForMaskedLM,BertModel,DistilBertTokenizer, DistilBertModel,DistilBertForSequenceClassification 
import os
import typing
from typing import Dict,List,Sequence,Set
from types import SimpleNamespace as SN
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight,compute_sample_weight
# Distinct static type for annotating tokenizer arguments.
T_BertTokenizer = typing.NewType("BertTokenizer", BertTokenizer)
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
# Tokenizer for the "bert-base-uncased" checkpoint. It is read as a
# notebook-level global by encode_X and by the inference cell.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [0]:
# Both splits are read with the two label columns as pandas categoricals.
# Note that each file infers its own category set independently.
_label_dtypes = dict(action='category', component='category')
trn_df = pd.read_csv('data/train.csv', dtype=_label_dtypes)
val_df = pd.read_csv('data/val.csv', dtype=_label_dtypes)
# Show the component column (and its categories) of the training split.
trn_df.component

In [0]:
def _dense_one_hot_encoder():
  """Build a OneHotEncoder that returns dense arrays on any scikit-learn version.

  scikit-learn 1.2 renamed the ``sparse`` constructor argument to
  ``sparse_output`` and 1.4 removed the old name, so the new spelling is tried
  first and the legacy one is used only when the installed version rejects it.
  """
  try:
    return OneHotEncoder(sparse_output=False)
  except TypeError:
    return OneHotEncoder(sparse=False)


# One-hot encoders: MyModel reads their categories_ to size its output layers.
a_ohe = _dense_one_hot_encoder()
a_ohe.fit(trn_df.loc[:,['action']])
print(a_ohe.transform([['start']]))
c_ohe = _dense_one_hot_encoder()
c_ohe.fit(trn_df.loc[:,['component']])

# Integer label encoders: used for the training targets and for decoding
# predicted class indices back to label strings.
action_le = LabelEncoder()
action_le.fit(trn_df.action)
component_le = LabelEncoder()
component_le.fit(trn_df.component)
print(action_le.classes_)
print(component_le.classes_)


In [0]:

def encode_X(comment:str,max_len,tok=None):
  """Encode one comment as fixed-length input ids plus an attention mask.

  Args:
    comment: Raw text; it is wrapped as ``"[CLS] {comment}[SEP]"`` before
      being passed to the tokenizer.
    max_len: Length of both returned tensors.
    tok: Object exposing ``encode(text)`` and ``pad_token_id``. Defaults to the
      notebook-level ``tokenizer``.

  Returns:
    ``(X, X_attn_mask)``. ``X`` is a ``torch.long`` tensor of shape
    ``(max_len,)`` holding the token ids followed by ``pad_token_id``.
    ``X_attn_mask`` is an int32 tensor of the same shape that is 1 wherever
    ``X`` differs from ``pad_token_id`` and 0 elsewhere.
  """
  if tok is None:
    tok = tokenizer
  encoded = torch.tensor(tok.encode(f"[CLS] {comment}[SEP]"),dtype=torch.long)
  if len(encoded) > max_len:
    # Truncate over-long input instead of failing on the slice assignment
    # below, keeping the final id (produced from the trailing [SEP]) in place.
    encoded = torch.cat([encoded[:max(max_len - 1, 0)], encoded[-1:]])[:max_len]
  # Start from an all-padding buffer so every unused slot holds pad_token_id.
  # Padding from len(encoded)+1 onwards would leave one slot at 0, which is
  # only correct when the pad id happens to be 0.
  X = torch.full((max_len,),tok.pad_token_id,dtype=torch.long)
  X[:len(encoded)] = encoded
  X_attn_mask = (X != tok.pad_token_id).int()
  return X,X_attn_mask

class MyDataset(Dataset):
    """Map-style dataset of encoded comments with action/component targets.

    Indexing returns ``((X, X_attn_mask), (Y1, Y2))``: the encode_X output for
    the row's ``comment_text`` and the row's ``action`` / ``component`` labels
    encoded with the notebook-level ``action_le`` / ``component_le``.
    """

    def __init__(self,df:DataFrame,max_len = 16):
        """Keep a reference to ``df`` and the padded sequence length.

        ``df`` must have categorical ``action`` and ``component`` columns
        (the ``.cat`` accessor is used on both).
        """
        self.df = df
        self.max_len=max_len
        # Raw pandas category codes. components() exposes the component codes;
        # __getitem__ takes its targets from the LabelEncoders instead.
        self.action = df.action.cat.codes
        self.component = df.component.cat.codes

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def components(self):
        """Return the pandas category codes of the ``component`` column."""
        return self.component

    def __getitem__(self,index):
        """Encode the row at positional ``index`` into model inputs and targets."""
        row = self.df.iloc[index]
        token_ids,attn_mask = encode_X(row['comment_text'],self.max_len)
        # transform() receives a one-element list; squeeze() drops that axis.
        action_id = torch.tensor(action_le.transform([row['action']]),dtype=torch.long)
        component_id = torch.tensor(component_le.transform([row['component']]),dtype=torch.long)
        return (token_ids,attn_mask),(action_id.squeeze(),component_id.squeeze())

In [0]:
# Wrap each split in a dataset; both use MyDataset's default max_len.
trn_ds, val_ds = MyDataset(trn_df), MyDataset(val_df)
# Display one encoded sample as a quick sanity check.
trn_ds[2]

In [0]:
# Training batches are reshuffled every epoch so the model does not see the
# rows in the same file order on each pass; validation order stays fixed.
# Page-locked host memory only helps when batches are copied to a CUDA device,
# so it is requested only in that case.
trn_dl = DataLoader(dataset=trn_ds,batch_size=4,shuffle=True,pin_memory=DEVICE.type == 'cuda')
val_dl = DataLoader(dataset=val_ds,batch_size=4,pin_memory=DEVICE.type == 'cuda')

In [0]:
# Model
class MyModel(nn.Module):
  """BERT encoder with two attention-pooled classification heads.

  One head produces action logits and the other component logits. Each head
  scores every token with its own linear layer, turns the scores into softmax
  weights over dim 1 (presumably the sequence axis of BERT's output), and
  feeds the weighted sum of token vectors to a bias-free linear classifier.
  """

  def __init__(self, freeze_bert = True):
    """Create the encoder and both heads.

    Args:
      freeze_bert: When true, gradients are disabled for every BERT encoder
        layer except the last two. Parameters outside ``encoder.layer`` are
        not touched by this flag.
    """
    super().__init__()
    self.bert_lyr = BertModel.from_pretrained('bert-base-uncased')
    self.a_attn = nn.Linear(768,10)
    self.c_attn = nn.Linear(768,10)
    # Output widths follow the categories learned by the notebook-level
    # one-hot encoders.
    n_actions = len(a_ohe.categories_[0])
    self.action_cls_lyr = nn.Sequential(nn.Linear(768,n_actions,bias=False))
    n_components = len(c_ohe.categories_[0])
    self.component_cls_lyr = nn.Sequential(nn.Linear(768,n_components,bias=False))

    if freeze_bert:
      frozen_layers = self.bert_lyr.encoder.layer[:-2]
      for param in (p for lyr in frozen_layers for p in lyr.parameters()):
        param.requires_grad = False

  def _attention_pool(self, scorer, token_states, pad_bias):
    """Pool ``token_states`` over dim 1 with softmax weights from ``scorer``.

    Returns ``(pooled, weights)``, where ``weights`` keeps a trailing
    singleton dimension.
    """
    scores = scorer(token_states) + pad_bias
    # Average the scorer's outputs into a single score per position.
    weights = scores.mean(dim=-1,keepdim=True).softmax(dim=1)
    pooled = torch.mul(token_states,weights).sum(dim=1)
    return pooled,weights

  def forward(self, seq, attn_masks,output_attn=False):
    """Return ``[action_logits, component_logits]``.

    When ``output_attn`` is true the two heads' attention weights are appended
    to the returned list, action first.
    """
    # Large negative bias wherever attn_masks is 0, so those positions end up
    # with near-zero softmax weight.
    pad_bias = ((1 - attn_masks)*-10000).unsqueeze(dim=-1)

    token_states,_ = self.bert_lyr(seq,attention_mask =attn_masks)

    action_vec,action_weights = self._attention_pool(self.a_attn,token_states,pad_bias)
    component_vec,component_weights = self._attention_pool(self.c_attn,token_states,pad_bias)

    outputs = [self.action_cls_lyr(action_vec),self.component_cls_lyr(component_vec)]
    if output_attn:
      outputs += [action_weights,component_weights]
    return outputs

# Build the network with all but the last two BERT encoder layers frozen.
model = MyModel(freeze_bert=True)

In [0]:
print(f'Device type is {DEVICE.type}')
model.to(DEVICE)


def _balanced_class_weight(encoder, labels):
  """Return 'balanced' class weights for ``labels`` as a float tensor on DEVICE.

  ``labels`` are first mapped to integer ids with the fitted ``encoder``; the
  weights are ordered by those ids.
  """
  encoded = encoder.transform(labels)
  weights = compute_class_weight('balanced',classes=np.unique(encoded),y=encoded)
  return torch.tensor(weights,dtype=torch.float,device=DEVICE)


# Per-class loss weights computed from the training split.
ac_class_weight = _balanced_class_weight(action_le,trn_df.action)
com_class_weight = _balanced_class_weight(component_le,trn_df.component)

print(f'ac_class_weight={ac_class_weight}')
print(f'com_class_weight={com_class_weight}')

# The BERT parameters get a far smaller learning rate than the new heads.
_head_modules = (model.a_attn,model.c_attn,model.action_cls_lyr,model.component_cls_lyr)
optimizer = torch.optim.AdamW(
    [{'params':model.bert_lyr.parameters(),'lr':1e-7}]
    + [{'params':head.parameters(),'lr':1e-4} for head in _head_modules],
    lr=1e-4)
action_criterion = nn.CrossEntropyLoss(weight=ac_class_weight,reduction='mean')
component_criterion = nn.CrossEntropyLoss(weight=com_class_weight,reduction='mean')

n_epochs = 1000

def evaluate_model(model:MyModel,dl:DataLoader,optimizer):
  """Run one pass over ``dl`` and return the summed batch loss.

  Args:
    model: Network called as ``model(X, attn_mask)`` and expected to return
      the action and component logits.
    dl: Loader yielding ``((X, attn_mask), (Y1, Y2))`` batches.
    optimizer: When not ``None``, every batch is followed by a backward pass,
      gradient clipping and an optimizer step. When ``None``, the pass only
      measures the loss and no gradients are tracked.

  Returns:
    The sum over batches of ``action_criterion + component_criterion`` as a
    Python float (0 for an empty loader).
  """
  training = optimizer is not None
  t_loss=0
  # Skip autograd bookkeeping for pure evaluation, even if the caller did not
  # wrap the call in torch.no_grad().
  with torch.set_grad_enabled(training):
    for (X, attn_mask),(Y1,Y2) in dl:
      X , attn_mask,Y1,Y2 = X.to(DEVICE),attn_mask.to(DEVICE),Y1.to(DEVICE),Y2.to(DEVICE)
      p_a,p_c = model(X,attn_mask)
      loss = action_criterion(p_a,Y1) + component_criterion(p_c,Y2)
      if training:
        optimizer.zero_grad()
        loss.backward()
        # Clip after backward() so the limit applies to this batch's
        # gradients. Clipping between zero_grad() and backward() never
        # touches the gradients that step() uses.
        torch.nn.utils.clip_grad_norm_(model.parameters(),1)
        optimizer.step()
      t_loss += loss.item()
  return t_loss

# One training pass and one validation pass per epoch, reporting both losses.
for epoch in range(n_epochs):
  t_loss = evaluate_model(model,trn_dl,optimizer)
  # Checkpoint every tenth epoch (epoch 0 included) before validating.
  if not epoch%10:
    torch.save(model.state_dict(),'data/model-2.dat')
  with torch.no_grad():
    v_loss = evaluate_model(model,val_dl,None)
  print(f'Epoch:{epoch} Training loss={t_loss} , Validation loss:{v_loss}')


In [0]:
# Inference on a single hand-written comment. eval() is set explicitly so the
# prediction does not depend on whatever mode earlier cells left the model in.
model.eval()
with torch.no_grad():
  X,X_attn_mask = encode_X('kindly kill and begin ',max_len=16)
  # Keep an independent copy of the ids for printing. On CPU, .to(DEVICE)
  # returns the very same tensor, so an in-place unsqueeze_ on the batched
  # input would also reshape an alias and break convert_ids_to_tokens.
  X_tokenized = X.clone()
  # Add the batch dimension out of place.
  X = X.to(DEVICE).unsqueeze(0)
  X_attn_mask = X_attn_mask.to(DEVICE).unsqueeze(0)
  action,component,a_attn,c_attn = model(X,X_attn_mask,output_attn=True)
  action = action.softmax(dim=1)
  component = component.softmax(dim=1)
  
  print(tokenizer.convert_ids_to_tokens(X_tokenized.tolist()))

  # Action head: class names, class probabilities, per-token attention weights.
  print(action_le.classes_)  
  print(action)
  print(a_attn.squeeze())
  action = torch.argmax(action.detach().cpu()).item()
  print(f'Selected Action: {action_le.inverse_transform([action])}')

  # Component head: same report, plus the raw predicted class index.
  print(component_le.classes_)
  print(component)
  print(c_attn.squeeze())
  component = torch.argmax(component.detach().cpu()).item()
  print(component)
  print(f'Selected component: {component_le.inverse_transform([component])}')


In [0]:
# Scratch check of the attention-weighting arithmetic on dummy data:
# i is (4, 16, 768), attn maps 768 -> 1, so a is (4, 16, 1) after a softmax
# over dim 1, and o broadcasts back to (4, 16, 768).
i = torch.ones(4,16,768)
attn = nn.Linear(768,1)
a = attn(i).softmax(dim=1)
o = torch.mul(a,i)

# Peek at the parameters of the last BERT encoder layer (shows a generator).
model.bert_lyr.encoder.layer[-1].parameters()

In [0]:
# Display the training DataFrame.
trn_df