<a href="https://colab.research.google.com/github/sssharaf/ml-nlp/blob/master/start_servers_play_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Colab-only setup: mount Google Drive so the dataset directory is reachable from this runtime.
from google.colab import drive
drive.mount('/gdrive')
# List the dataset directory on the mounted drive.
!ls -ltr /gdrive/'My Drive'/ML/data/start-servers-play
# Install the BERT library imported below (no version pin is given here).
!pip install pytorch_transformers
# Symlink the Drive dataset directory to ./data; later cells read and write through the relative 'data/...' paths.
!ln -s  /gdrive/'My Drive'/ML/data/start-servers-play data
!ls -ltr data/*

In [0]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import pytorch_transformers as pt
from pytorch_transformers import BertTokenizer, BertConfig,BertForMaskedLM,BertModel,DistilBertTokenizer, DistilBertModel,DistilBertForSequenceClassification 
import os
import typing
from typing import Dict,List,Sequence,Set
from types import SimpleNamespace as SN
import numpy as np
import pickle
import math
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight,compute_sample_weight
# Distinct static-typing name for BertTokenizer, created with typing.NewType.
T_BertTokenizer = typing.NewType("BertTokenizer",BertTokenizer)
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 

In [0]:
# Tokenizer for the 'bert-base-uncased' checkpoint; encode_X reads this module-level name.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [0]:
# Load the training and validation splits; both label columns are read as pandas categoricals.
trn_df = pd.read_csv('data/train.csv',dtype={'action':'category','component':'category'},)
val_df = pd.read_csv('data/val-new.csv',dtype={'action':'category','component':'category'})
#trn_df.loc[trn_df.action=='noaction']
# Last expression of the cell: displays the whole training frame.
trn_df

In [0]:
# Fit one label encoder per target column, using the training labels only.
# (LabelEncoder.fit returns the fitted encoder, so construction and fitting can be chained.)
action_le = LabelEncoder().fit(trn_df.action)
component_le = LabelEncoder().fit(trn_df.component)

# Show the class order; the integer id of a label is its position in classes_.
for fitted_encoder in (action_le, component_le):
  print(fitted_encoder.classes_)

# Persist both encoders next to the data so the same label <-> id mapping can be reloaded later.
with open(f'data/action_le.dat','wb') as action_file:
  pickle.dump(action_le,action_file)

with open(f'data/component_le.dat','wb') as component_file:
  pickle.dump(component_le,component_file)


In [0]:

def encode_X(comment:str,max_len,tok=None):
  """Encode one comment into a fixed-length id tensor plus its attention mask.

  The comment is wrapped as "[CLS] {comment} [SEP]", tokenized, and right-padded
  with the tokenizer's pad id up to ``max_len``.

  Args:
    comment: raw comment text.
    max_len: length of the returned tensors.
    tok: tokenizer exposing ``encode(text)`` and ``pad_token_id``. Defaults to
      the module-level ``tokenizer``.

  Returns:
    (X, X_attn_mask): ``X`` is a ``torch.long`` tensor of shape ``(max_len,)``;
    ``X_attn_mask`` is an int tensor of the same shape holding 1 where ``X``
    differs from the pad id and 0 elsewhere.
  """
  tok = tokenizer if tok is None else tok
  ids = list(tok.encode(f"[CLS] {comment} [SEP]"))
  if len(ids) > max_len:
    # Truncate instead of failing on the slice assignment below; keep the last
    # id (the closing [SEP]) so the sequence stays well-formed.
    ids = (ids[:max_len-1] + ids[-1:]) if max_len > 0 else []
  # Start from an all-pad tensor so every position after the real tokens holds
  # the pad id (the previous code skipped the first padding slot).
  X = torch.full((max_len,),tok.pad_token_id,dtype=torch.long)
  X[:len(ids)] = torch.tensor(ids,dtype=torch.long)
  X_attn_mask = (X!=tok.pad_token_id).int()
  return X,X_attn_mask

class MyDataset(Dataset):
    """Torch dataset over a comment DataFrame.

    Each item is ``((token_ids, attention_mask), (action_id, component_id))``.
    The frame must provide the columns ``comment_text``, ``action`` and
    ``component``; the two label columns must be pandas categoricals because
    their ``.cat.codes`` are captured at construction time.
    """

    def __init__(self,df:DataFrame,max_len = 16):
        self.df, self.max_len = df, max_len
        # Categorical codes of both label columns, kept for inspection (see components()).
        self.action = df.action.cat.codes
        self.component = df.component.cat.codes

    def __len__(self):
        return len(self.df)

    def components(self):
        """Return the categorical codes of the ``component`` column."""
        return self.component

    def __getitem__(self,index):
        row = self.df.iloc[index]
        token_ids,attn_mask = encode_X(row['comment_text'],self.max_len)
        # Label ids come from the module-level encoders fitted on the training labels.
        action_id = torch.tensor(action_le.transform([row['action']]),dtype=torch.long)
        component_id = torch.tensor(component_le.transform([row['component']]),dtype=torch.long)
        return (token_ids,attn_mask),(action_id.squeeze(),component_id.squeeze())

In [0]:

# Wrap both splits; every comment is encoded to a fixed length of 25 token ids.
trn_ds = MyDataset(trn_df,max_len=25)
val_ds = MyDataset(val_df,max_len=25)

# Batches of 32 with pinned host memory. Note that the validation loader is shuffled as well.
trn_dl = DataLoader(dataset=trn_ds,batch_size=32,pin_memory=True,shuffle=True)
val_dl = DataLoader(dataset=val_ds,batch_size=32,pin_memory=True,shuffle=True)


In [0]:
# Number of training examples (rows of trn_df).
len(trn_ds)

In [0]:
from pytorch_transformers import BertForSequenceClassification
############################### Model 5  ############################################
# Model
class MyModel5(nn.Module):
  """BERT with a single linear head on the pooled output.

  The head emits 9 logits per example; ``forward`` splits them into the first
  4 columns and the remaining 5 columns (the two task outputs).
  """

  def __init__(self, freeze_bert = True):
    super().__init__()
    self.model_version = 5

    self.bert_lyr = BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True,output_attentions=True)
    self.config = self.bert_lyr.config

    # Joint head: LayerNorm followed by one 768 -> 9 projection.
    # (A deeper 768 -> 64 -> 9 head with dropout was tried here earlier.)
    self.cls_lyr = nn.Sequential(nn.LayerNorm(768), nn.Linear(768,9))

    # Optionally keep the encoder fixed.
    if freeze_bert:
        self.freeze_bert()

  def freeze_bert(self):
      """Disable gradients for every parameter of the BERT encoder."""
      for weight in self.bert_lyr.parameters():
        weight.requires_grad_(False)

  def unfreeze_bert(self,from_lyr=6):
    """Re-enable gradients for the last 11 encoder layers (``from_lyr`` is not used)."""
    for block in self.bert_lyr.encoder.layer[-11:]:
      for weight in block.parameters():
        weight.requires_grad_(True)

  def forward(self, seq, attn_masks,output_attn=False,output_hs=False):
    """Return ``[logits[:, :4], logits[:, 4:]]`` computed from BERT's pooled output.

    ``output_attn`` and ``output_hs`` are accepted but not used.
    """
    # The encoder is expected to return exactly four outputs; only the pooled one is used.
    _,pooled,_,_ = self.bert_lyr(seq,attention_mask =attn_masks)
    logits = self.cls_lyr(pooled)
    return [logits[:,:4], logits[:,4:]]



In [0]:
from pytorch_transformers import BertForSequenceClassification
############################### Model 1  ############################################
# Model
class MyModel7(nn.Module):
  """BERT encoder with one learned attention-pooling head per task.

  For each task (action: 4 classes, component: 5 classes) a linear layer scores
  every token, the scores are softmax-normalised over the sequence, and the
  weighted sum of token embeddings is fed to that task's classifier.
  """

  def __init__(self, freeze_bert = True,attn_dropout=0.3):
    """
    Args:
      freeze_bert: when True, all BERT parameters start with gradients disabled.
      attn_dropout: dropout probability applied to the attention weights.
    """
    super().__init__()
    self.model_version = '7_1'

    self.bert_lyr = BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True,output_attentions=True)

    self.config = self.bert_lyr.config

    # One scalar score per token, separately for each task.
    self.a_attn = nn.Linear(768,1)

    self.c_attn = nn.Linear(768,1)

    self.attn_dropout = nn.Dropout(attn_dropout)

    self.action_cls_lyr = nn.Sequential(
        nn.Dropout(0.3),
        nn.LayerNorm(768),
        nn.Linear(768,4),
    )
    self.comp_cls_lyr = nn.Sequential(
        nn.Dropout(0.3),
        nn.LayerNorm(768),
        nn.Linear(768,5),
    )

    #Freeze bert layers
    if freeze_bert:
        self.freeze_bert()

  def freeze_bert(self):
    """Disable gradients for every parameter of the BERT encoder."""
    for p in self.bert_lyr.parameters():
      p.requires_grad = False

  def unfreeze_bert(self,from_lyr=6):
    """Enable gradients for encoder layers ``from_lyr`` and above.

    The argument used to be ignored (the last six layers were always
    unfrozen). With the default of 6 on the 12-layer base model the selected
    layers are the same as before.
    """
    for lyr in self.bert_lyr.encoder.layer[from_lyr:]:
      for p in lyr.parameters():
        p.requires_grad = True

  def forward(self, seq, attn_masks,output_attn=False,output_hs=False):
    """Return ``[action_logits, component_logits]``.

    When ``output_attn`` is true the two attention-weight tensors are appended
    to the list. ``output_hs`` is accepted but not used.
    """
    # Additive mask: 0 where attn_masks is 1, -10000 where it is 0, with a
    # trailing unit axis so it lines up with the per-token scores.
    attn_mask_cls = ((1 - attn_masks)*-10000).unsqueeze(dim=-1)

    seq_emb,pooled,hs,attn = self.bert_lyr(seq,attention_mask =attn_masks)

    a_pooled,a_output = self.attention(seq_emb,self.a_attn,attn_mask_cls)
    c_pooled,c_output = self.attention(seq_emb,self.c_attn,attn_mask_cls)

    outputs = [
        self.action_cls_lyr(a_pooled),
        self.comp_cls_lyr(c_pooled),
    ]
    if (output_attn):
      outputs += [a_output,c_output]
    return outputs

  def attention(self,seq_emb,attn_lyr,attn_mask_cls):
    """Attention-pool ``seq_emb`` along dim 1 using the scores from ``attn_lyr``.

    Returns the pooled embedding and the softmax weights as computed before
    dropout, the latter with an extra leading axis.
    """
    a = attn_lyr(seq_emb)
    a = a + attn_mask_cls
    a = a_output = a.softmax(dim=1)
    a = self.attn_dropout(a)
    a = torch.mul(seq_emb,a)
    a = a.sum(dim=1)
    return a,a_output.unsqueeze(dim=0)

# try:
#   del model
#   torch.cuda.reset_max_memory_allocated()
#   torch.cuda.reset_max_memory_cached()
# except NameError:
#   pass

# model = MyModel7(freeze_bert=False)
# model = model.to(DEVICE)


In [0]:
from pytorch_transformers import BertForSequenceClassification
############################### Model 1  ############################################
# Model
class MyModel8(nn.Module):
  """BERT encoder with per-task attention pooling plus a shared pooled-output branch.

  Each task attends over the token embeddings with position 0 removed; the
  attended vector is added to a shared transformation of BERT's pooled output
  before the task classifier (action: 4 classes, component: 5 classes).
  """

  def __init__(self, freeze_bert = True,attn_dropout=0.3):
    """
    Args:
      freeze_bert: when True, all BERT parameters start with gradients disabled.
      attn_dropout: dropout probability applied to the attention weights.
    """
    super().__init__()
    self.model_version = '8'

    self.bert_lyr = BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True,output_attentions=True)

    self.config = self.bert_lyr.config

    # One scalar score per token, separately for each task.
    self.a_attn = nn.Linear(768,1)

    self.c_attn = nn.Linear(768,1)

    self.attn_dropout = nn.Dropout(attn_dropout)

    # Transformation of the pooled output shared by both task heads.
    self.shared_lyr = nn.Sequential(
         nn.LayerNorm(768),
         nn.Dropout(0.3),
         nn.Linear(768,768),
         nn.ReLU(),
         nn.LayerNorm(768),
    )
    self.action_cls_lyr = nn.Sequential(
        nn.Dropout(0.3),
        nn.Linear(768,4),
    )
    self.comp_cls_lyr = nn.Sequential(
        nn.Dropout(0.3),
        nn.Linear(768,5),
    )

    #Freeze bert layers
    if freeze_bert:
        self.freeze_bert()

  def freeze_bert(self):
    """Disable gradients for every parameter of the BERT encoder."""
    for p in self.bert_lyr.parameters():
      p.requires_grad = False

  def unfreeze_bert(self,from_lyr=6):
    """Enable gradients for encoder layers ``from_lyr`` and above.

    The argument used to be ignored (the last six layers were always
    unfrozen). With the default of 6 on the 12-layer base model the selected
    layers are the same as before.
    """
    for lyr in self.bert_lyr.encoder.layer[from_lyr:]:
      for p in lyr.parameters():
        p.requires_grad = True

  def forward(self, seq, attn_masks,output_attn=False,output_hs=False):
    """Return ``[action_logits, component_logits]``.

    When ``output_attn`` is true the two attention-weight tensors are appended
    to the list. ``output_hs`` is accepted but not used.
    """
    # Additive mask: 0 where attn_masks is 1, -10000 where it is 0.
    attn_mask_cls = ((1 - attn_masks)*-10000).unsqueeze(dim=-1)

    seq_emb,pooled,hs,attn = self.bert_lyr(seq,attention_mask =attn_masks)
    # Leave position 0 out of the attention pooling (embeddings and mask alike).
    seq_emb = seq_emb[:,1:,:]
    attn_mask_cls = attn_mask_cls[:,1:]

    a,a_output = self.attention(seq_emb,self.a_attn,attn_mask_cls)
    c,c_output = self.attention(seq_emb,self.c_attn,attn_mask_cls)

    pooled = self.shared_lyr(pooled)
    outputs = [
        self.action_cls_lyr(a+pooled),
        self.comp_cls_lyr(c+pooled),
    ]
    if (output_attn):
      outputs += [a_output,c_output]
    return outputs

  def attention(self,seq_emb,attn_lyr,attn_mask_cls):
    """Attention-pool ``seq_emb`` along dim 1 using the scores from ``attn_lyr``.

    Returns the pooled embedding and the softmax weights as computed before
    dropout, the latter with an extra leading axis.
    """
    a = attn_lyr(seq_emb)
    a = a + attn_mask_cls
    a = a_output = a.softmax(dim=1)
    a = self.attn_dropout(a)
    a = torch.mul(seq_emb,a)
    a = a.sum(dim=1)
    return a,a_output.unsqueeze(dim=0)


In [0]:
# Smoke test: build MyModel8 with its defaults and push one training batch through it.
model = MyModel8()
model.to(DEVICE)
# Only the inputs of the batch are needed; the labels are discarded.
(X, X_attn),(_,_) = next(iter(trn_dl))
X, X_attn = X.to(DEVICE),X_attn.to(DEVICE)
print(f'X.size()={X.size()}')
# With output_attn=True the model returns both logits followed by both attention-weight tensors.
a,c,a_a,c_a = model(X,X_attn,output_attn=True,output_hs=True)
a_a.size()

In [0]:
from pytorch_transformers import BertForSequenceClassification
############################### Model 1  ############################################
# Model
class MyModel8_LSTM(nn.Module):
  """BERT encoder whose attention-weighted token embeddings feed one LSTM per task.

  For each task the token embeddings are scaled by softmax attention weights,
  run through a single-layer LSTM, and the final hidden state is classified
  (action: 4 classes, component: 5 classes).

  NOTE(review): ``model_version`` is '8', the same value MyModel8 uses —
  confirm whether the two are meant to share checkpoint file names.
  """

  def __init__(self, freeze_bert = True):
    """
    Args:
      freeze_bert: when True the whole encoder is frozen; when False the
        encoder is frozen and then ``unfreeze_bert()`` re-enables its upper layers.
    """
    super().__init__()
    self.model_version = '8'

    self.bert_lyr = BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True,output_attentions=True)

    self.config = self.bert_lyr.config

    n_features = 64
    self.a_attn = nn.Linear(768,1)

    self.c_attn = nn.Linear(768,1)

    self.action_cls_lyr = nn.Sequential(
        nn.Dropout(0.3),
        nn.Linear(n_features,4),
    )
    self.comp_cls_lyr = nn.Sequential(
        nn.Dropout(0.3),
        nn.Linear(n_features,5),
    )

    self.a_lstm = nn.LSTM(batch_first=True,input_size=768,hidden_size=n_features,num_layers=1)
    self.c_lstm = nn.LSTM(batch_first=True,input_size=768,hidden_size=n_features,num_layers=1)

    # Always start from a fully frozen encoder; optionally re-enable the upper layers.
    self.freeze_bert()
    if not freeze_bert:
        self.unfreeze_bert()

  def freeze_bert(self):
    """Disable gradients for every parameter of the BERT encoder."""
    for p in self.bert_lyr.parameters():
      p.requires_grad = False

  def unfreeze_bert(self,from_lyr=6):
    """Enable gradients for encoder layers ``from_lyr`` and above.

    The argument used to be ignored (the last six layers were always
    unfrozen). With the default of 6 on the 12-layer base model the selected
    layers are the same as before.
    """
    for lyr in self.bert_lyr.encoder.layer[from_lyr:]:
      for p in lyr.parameters():
        p.requires_grad = True

  def attention(self,seq_emb,attn_lyr,attn_mask_cls):
    """Scale each token embedding by its softmax attention weight.

    Unlike a pooling attention, the sequence axis is kept (no sum over dim 1).
    Returns the weighted embeddings and the attention weights.
    """
    a = attn_lyr(seq_emb)
    a = a + attn_mask_cls
    a = a_output = a.softmax(dim=1)
    a = torch.mul(seq_emb,a)
    return a,a_output

  def forward(self, seq, attn_masks,output_attn=False,output_hs=False):
    """Return ``[action_logits, component_logits]``.

    When ``output_attn`` is true the two attention-weight tensors are appended
    to the list. ``output_hs`` is accepted but not used.
    """
    # Additive mask: 0 where attn_masks is 1, -10000 where it is 0.
    attn_mask_cls = ((1 - attn_masks)*-10000).unsqueeze(dim=-1)
    seq_emb,pooled,hs,attn = self.bert_lyr(seq,attention_mask =attn_masks)

    a,a_output = self.attention(seq_emb,self.a_attn,attn_mask_cls)
    # Keep only the LSTM's final hidden state and sum it over its leading axis.
    _, (a,_) = self.a_lstm(a)
    a=a.sum(dim=0)

    c,c_output = self.attention(seq_emb,self.c_attn,attn_mask_cls)
    _, (c,_) = self.c_lstm(c)
    c=c.sum(dim=0)

    outputs = [
        self.action_cls_lyr(a),
        self.comp_cls_lyr(c),
    ]
    if (output_attn):
      outputs += [a_output,c_output]
    return outputs



# Drop any model left over from an earlier cell before building a new one.
# `del model` raises NameError when no model exists yet; that case is ignored.
# NOTE(review): the two reset_max_memory_* calls are CUDA statistics resets —
# confirm how they behave on a CPU-only runtime (only NameError is caught here).
try:
  del model
  torch.cuda.reset_max_memory_allocated()
  torch.cuda.reset_max_memory_cached()
except NameError:
  pass

# Although this cell defines MyModel8_LSTM, the model instantiated here is MyModel8 with BERT not frozen.
model = MyModel8(freeze_bert=False)
model = model.to(DEVICE)


In [0]:
from pytorch_transformers import BertForSequenceClassification

class MHA(nn.Module):
  """Multi-head attention pooling with a single learned query vector.

  The embedding dimension is split into ``attns`` equal slices (heads). Each
  head scores every token by the dot product between its slice of the token
  embedding and its slice of ``attn_vec``; the scores are softmax-normalised
  over the sequence and used to average the token slices. The per-head
  results are concatenated back to ``hidden_size``.
  """

  def __init__(self, attns:int=1, attn_dropout=0.1,use_attn_proj=False,use_out_proj=False,hidden_size:int=768):
    """
    Args:
      attns: number of heads; must divide ``hidden_size``.
      attn_dropout: dropout probability applied to the attention weights.
      use_attn_proj: apply a per-head linear projection before scoring.
      use_out_proj: apply Linear + LayerNorm to the concatenated result.
      hidden_size: embedding dimension of the input (768 by default).

    Raises:
      ValueError: if ``attns`` is not a positive divisor of ``hidden_size``.
    """
    super().__init__()
    if attns < 1 or hidden_size % attns != 0:
      raise ValueError(f'attns={attns} must be a positive divisor of hidden_size={hidden_size}')

    self.attns = attns
    self.hidden_size = hidden_size
    # Learned query, one slice of size hidden_size/attns per head.
    self.attn_vec = nn.Parameter(torch.empty(hidden_size))
    self.use_attn_proj = use_attn_proj
    self.use_out_proj=use_out_proj
    self.attn_dropout = nn.Dropout(attn_dropout)
    if use_attn_proj:
      att_hs = hidden_size // attns
      self.attn_proj=nn.Linear(att_hs,att_hs)
    if use_out_proj:
      self.out_proj=nn.Sequential(nn.Linear(hidden_size,hidden_size),nn.LayerNorm(hidden_size))
    # Initialised last, after the optional projections have been created.
    nn.init.normal_(self.attn_vec)

  def forward(self,seq_emb,attn_mask_cls):
    """Pool ``seq_emb`` over its sequence axis.

    Args:
      seq_emb: tensor of shape (batch, seq, hidden_size).
      attn_mask_cls: additive mask of shape (batch, seq); large negative
        values suppress the corresponding positions.

    Returns:
      (pooled, alpha): ``pooled`` has shape (batch, hidden_size). ``alpha`` is
      the (batch, heads, seq, 1) weight tensor after dropout with ALL unit
      axes squeezed out, so its rank depends on batch size and head count.
    """
    bs,ss,es = seq_emb.size()
    # (b,s,d) -> (b,s,a,d/a) -> (b,a,s,d/a)
    seq_emb_t = seq_emb.view(bs,ss,self.attns,-1).permute(0,2,1,3)

    if self.use_attn_proj:
      seq_emb_t = self.attn_proj(seq_emb_t)

    # (d) -> (a,d/a,1)
    attn = self.attn_vec.view(self.attns,-1).unsqueeze(dim=-1)
    # (b,a,s,1); scores are scaled by the square root of the full embedding size.
    alpha = torch.matmul(seq_emb_t,attn) / math.sqrt(es)
    # (b,s) -> (b,1,s,1); broadcasting over the head axis replaces the explicit copies.
    alpha = alpha + attn_mask_cls.unsqueeze(dim=1).unsqueeze(dim=-1)
    alpha = alpha.softmax(dim=2)
    alpha = self.attn_dropout(alpha)
    # Weighted sum over the sequence axis, then concatenate the heads: (b,a,d/a) -> (b,d)
    att_s = (seq_emb_t*alpha).sum(dim=-2).reshape(bs,-1)
    if self.use_out_proj:
      att_s = self.out_proj(att_s)
    return att_s,alpha.squeeze()


############################### Model 9  ############################################
# With different attention
class MyModel9(nn.Module):
  """BERT encoder with one MHA attention-pooling module per task.

  ``forward`` pools the token embeddings with ``a_attn`` / ``c_attn`` (both MHA
  instances) and classifies the results (action: 4 classes, component: 5
  classes). ``attention_base`` and ``attention`` are earlier pooling variants
  that ``forward`` no longer calls.
  """

  def __init__(self, freeze_bert = True, attns:int=1,cls_dropout=0.1, attn_dropout=0.1,use_attn_proj=False,use_out_proj=False):
    """
    Args:
      freeze_bert: when True, all BERT parameters start with gradients disabled.
      attns: number of attention heads; must divide 768.
      cls_dropout: dropout probability in front of each classifier.
      attn_dropout: dropout probability applied to attention weights.
      use_attn_proj, use_out_proj: forwarded to both MHA modules.
    """
    super().__init__()

    attns = int(attns)
    # Checked before any sub-module is built so an invalid head count fails early.
    assert 768 % attns == 0

    self.model_version = '9'

    self.bert_lyr = BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True,output_attentions=True)

    self.config = self.bert_lyr.config

    self.attns = attns

    self.a_attn = MHA(attns=attns,attn_dropout=attn_dropout,use_attn_proj=use_attn_proj,use_out_proj=use_out_proj)

    self.c_attn = MHA(attns=attns,attn_dropout=attn_dropout,use_attn_proj=use_attn_proj,use_out_proj=use_out_proj)

    # Only used by the legacy attention() method below.
    self.attn_dropout = nn.Dropout(attn_dropout)

    # (A deeper 768 -> 64 -> n head with extra dropout was tried for both classifiers earlier.)
    self.action_cls_lyr = nn.Sequential(
        nn.Dropout(cls_dropout),
        nn.LayerNorm(768),
        nn.Linear(768,4),
    )
    self.comp_cls_lyr = nn.Sequential(
        nn.Dropout(cls_dropout),
        nn.LayerNorm(768),
        nn.Linear(768,5),
    )

    #Freeze bert layers
    if freeze_bert:
        self.freeze_bert()

  def freeze_bert(self):
    """Disable gradients for every parameter of the BERT encoder."""
    for p in self.bert_lyr.parameters():
      p.requires_grad = False

  def unfreeze_bert(self,from_lyr=6):
    """Enable gradients for encoder layers ``from_lyr`` and above.

    The argument used to be ignored (the last six layers were always
    unfrozen). With the default of 6 on the 12-layer base model the selected
    layers are the same as before.
    """
    for lyr in self.bert_lyr.encoder.layer[from_lyr:]:
      for p in lyr.parameters():
        p.requires_grad = True

  def attention_base(self,seq_emb,attn_lyr,attn_mask_cls):
    """Legacy single-head pooling: softmax over dim 1 of ``attn_lyr`` scores, then weighted sum."""
    attn_mask_cls = attn_mask_cls.unsqueeze(dim=-1)
    a = attn_lyr(seq_emb)
    a = a + attn_mask_cls
    a = a_output = a.softmax(dim=1)
    a = torch.mul(seq_emb,a)
    a = a.sum(dim=1)
    return a,a_output

  def attention(self,seq_emb,attn_lyr,attn_mask_cls):
    """Legacy multi-head pooling where ``attn_lyr`` scores each head slice of every token.

    Not called by ``forward``.
    """
    attns = self.attns
    bs,ss,es = seq_emb.size()
    #(s,w,d) -> (s,w,a,d/a)
    seq_emb_t = seq_emb.view(bs,ss,attns,-1).contiguous()
    if attns > 1:
        attn_mask_cls =attn_mask_cls.repeat(1,attns).view(bs,attns,-1).permute(0,-1,-2).contiguous()
    else:
      attn_mask_cls = attn_mask_cls.unsqueeze(dim=-1)
    # trailing unit axis so the mask lines up with the per-head scores
    attn_mask_cls = attn_mask_cls.unsqueeze(dim=-1)

    # (s,w,a,d/a) -> (s,w,a,1)
    alpha = attn_lyr(seq_emb_t)
    alpha += attn_mask_cls
    alpha = alpha.softmax(dim=-3)
    alpha = self.attn_dropout(alpha)
    #(s,w,a,d/a)
    att_s = torch.mul(seq_emb_t, alpha)
    att_s_ctx = att_s.sum(dim=1).view(bs,-1).contiguous()
    alpha = alpha.squeeze(dim=-1).sum(dim=-1)
    return att_s_ctx,alpha.squeeze(dim=-1)

  def forward(self, seq, attn_masks,output_attn=False,output_hs=False):
    """Return ``[action_logits, component_logits]``.

    When ``output_attn`` is true the two attention-weight tensors are appended
    to the list. ``output_hs`` is accepted but not used.
    """
    # Additive mask: 0 where attn_masks is 1, -10000 where it is 0.
    attn_mask_cls = (1 - attn_masks)*-10000
    seq_emb,pooled,hs,attn = self.bert_lyr(seq,attention_mask =attn_masks)

    a_pooled,a_output = self.a_attn(seq_emb,attn_mask_cls)
    c_pooled,c_output = self.c_attn(seq_emb,attn_mask_cls)

    outputs = [
        self.action_cls_lyr(a_pooled),
        self.comp_cls_lyr(c_pooled),
    ]
    if (output_attn):
      outputs += [a_output,c_output]
    return outputs


# try:
#   del model
#   torch.cuda.reset_max_memory_allocated()
#   torch.cuda.reset_max_memory_cached()
# except NameError:
#   pass

# model = MyModel9(freeze_bert=False,attns=3,use_out_proj=True)
# model = model.to(DEVICE)


In [0]:
# Smoke test: push one training batch through whichever `model` is currently defined in the kernel.
(X, X_attn),(_,_) = next(iter(trn_dl))
X, X_attn = X.to(DEVICE),X_attn.to(DEVICE)
print(f'X.size()={X.size()}')
# With output_attn=True the model returns both logits followed by both attention-weight tensors.
a,c,a_a,c_a = model(X,X_attn,output_attn=True,output_hs=True)
a_a.size()

In [0]:

# #attn_lyr= model.a_attn
# seq_emb = i.clone()
# attn_mask_cls = attn_mask.clone()

# bs,ss,es = seq_emb.size()
# #(s,w,d) -> (s,w,a,d/a)
# seq_emb_t = seq_emb.view(bs,ss,attns,-1).contiguous()
# if attns > 1:
#   attn_mask_cls =attn_mask_cls.repeat(1,attns).view(bs,attns,-1).permute(0,-1,-2).contiguous()
# else:
#   attn_mask_cls = attn_mask_cls.unsqueeze(dim=-1)

# # (s,w,a,d/a) -> (s,w,a,d/a,1)
# attn_mask_cls = attn_mask_cls.unsqueeze(dim=-1)

# # (s,w,a,d/a) -> (s,w,a,1)
# alpha = attn_lyr(seq_emb_t)
# alpha += attn_mask_cls
# alpha = alpha.softmax(dim=-3)

# print(f'[Attention-1]alpha.size()={alpha.size()}')
# print(alpha)
# att_s = seq_emb_t * alpha
# print(f'att_s={att_s.size()}')
# att_s = att_s.mean(dim=1).view(bs,-1)
# print(f'att_s={att_s.size()}')
# print(att_s)

# #Attention - Base
# print('Attention - Base')
# seq_emb = i.clone()
# attn_mask_cls = attn_mask.clone()
# attn_mask_cls = attn_mask_cls.unsqueeze(dim=-1)
# alpha = attn_lyr(seq_emb)
# alpha += attn_mask_cls
# alpha = alpha.softmax(dim=1)
# print(alpha)
# att_s = torch.mul(seq_emb,alpha)
# att_s = att_s.mean(dim=1)
# print(att_s)

# Attention - 2: step-by-step scratch version of multi-head attention pooling.
# NOTE(review): `i`, `attn_mask`, `attns` and `attn_vec` are not assigned in any
# cell visible in this notebook — presumably leftover kernel state; confirm
# before relying on Restart & Run All.
seq_emb = i.clone()
attn_mask_cls = attn_mask.clone()
# NOTE(review): attn_mask_cls is modified here but not read again below (the
# score update further down uses attn_mask) — confirm this is intended.
attn_mask_cls[0:-1]=-10000
bs,ss,es = seq_emb.size()
#print(bs,ss,es)
#(s,w,d) -> (s,a,w,d/a)
seq_emb_t = seq_emb.view(bs,ss,attns,-1)
seq_emb_t = seq_emb_t.permute(0,2,1,3)

# if attns > 1:
#   # (s,w) -> (s,a,w)
#   attn_mask_cls = attn_mask_cls.repeat(attns,1).contiguous()
# else:
#   attn_mask_cls = attn_mask_cls.unsqueeze(dim=-1)
# # (s,a,w) -> (s,a,w,1)
# attn_mask_cls = attn_mask_cls.unsqueeze(dim=-1)

# (s,a,w,d/a) -> (s,a,w,1)
# Per-head query slices: (d) -> (a,d/a,1)
attn = attn_vec.view(attns,-1).unsqueeze(dim=-1)
alpha = torch.matmul(seq_emb_t,attn)
# Scale the scores by the square root of the full embedding size.
alpha /=math.sqrt(es)
#alpha += attn_mask_cls
alpha  = alpha + attn_mask.unsqueeze(dim=-1)
# Normalise over dim 2 (the `w` axis in the shape comments above).
alpha = alpha.softmax(dim=2)
att_s = seq_emb_t*alpha
#att_s = att_s.sum(dim=2)
#att_s = att_s.view(bs,-1).contiguous()
print(f'[Attention-MH]alpha.size()={alpha.size()}')
print(alpha)
print(att_s)
# Sum the weighted slices over dim 2, then flatten the heads into one vector per example.
att_s = att_s.sum(dim=2)
att_s = att_s.view(bs,-1).contiguous()
print(att_s)




In [0]:
# Single-head MyModel9 with its defaults (freeze_bert=True), moved to the active device.
model1 = MyModel9(attns=1)
model1= model1.to(DEVICE)

In [0]:
# Smoke test on one training batch.
# NOTE(review): this calls `model`, not the `model1` built in the preceding cell — confirm which one was intended.
(X, X_attn),(_,_) = next(iter(trn_dl))
X, X_attn = X.to(DEVICE),X_attn.to(DEVICE)
print(f'X.size()={X.size()}')
a,c,a_a,c_a = model(X,X_attn,output_attn=True,output_hs=True)
a_a.size()

In [0]:
def configure_bert_optim(model:nn.Module,lr=2e-6,wd=0.1):
  """Build two optimizer parameter groups for ``model``: decayed and non-decayed.

  Parameters whose name contains one of the ``no_decay`` markers (biases and
  LayerNorm weights) get ``weight_decay`` 0.0; all others get ``wd``. Both
  groups use learning rate ``lr``.

  The groups use the ``weight_decay`` key, which is the per-group option torch
  optimizers read. The previous ``weight_decay_rate`` key was stored but never
  applied, so both groups silently fell back to the optimizer-wide default.

  Args:
    model: module whose ``named_parameters()`` are grouped.
    lr: learning rate for both groups.
    wd: weight decay for the decayed group.

  Returns:
    A list of two param-group dicts, decayed group first, suitable for passing
    to a torch optimizer (and for extending with further groups).
  """
  # "gamma"/"beta" are kept for checkpoints using the older LayerNorm parameter
  # names; "LayerNorm.weight" covers the current naming.
  no_decay = ("bias", "LayerNorm.weight", "gamma", "beta")
  decay_params, no_decay_params = [], []
  for name, param in model.named_parameters():
    if any(marker in name for marker in no_decay):
      no_decay_params.append(param)
    else:
      decay_params.append(param)
  return [
      {"params": decay_params, "weight_decay": wd, "lr": lr},
      {"params": no_decay_params, "weight_decay": 0.0, "lr": lr},
  ]


# Build the model selected by MODEL together with its optimizer parameter groups.
#model = MyModel5(freeze_bert=False)
# optimizer_params += [
#                                 {'params':model.cls_lyr.parameters()},
# ]

# Which model variant to train; only 7 and 8 are handled below, any other value
# leaves `model` / `optimizer_params` as they were before this cell ran.
MODEL=8

if MODEL ==7:
  model = MyModel7(freeze_bert=False)
  # BERT parameters get their own groups with an explicit lr of 1e-5.
  optimizer_params = configure_bert_optim(model.bert_lyr,lr=1e-5)
  # The task-specific layers are added without explicit options, so they use the optimizer-wide settings.
  optimizer_params += [
                                  #{'params':model.bert_lyr.parameters(),'lr':1e-5,'weight_decay':0.1},
                                  {'params':model.a_attn.parameters()},
                                  {'params':model.c_attn.parameters()},
                                  {'params':model.action_cls_lyr.parameters()},
                                  {'params':model.comp_cls_lyr.parameters(),},                     
  ]
if MODEL == 8:
  model = MyModel8(freeze_bert=False)
  # BERT parameters get their own groups with an explicit lr of 1e-5.
  optimizer_params = configure_bert_optim(model.bert_lyr,lr=1e-5)
  # Same as for model 7, plus the shared pooled-output layer.
  optimizer_params += [
                                  #{'params':model.bert_lyr.parameters(),'lr':1e-5,'weight_decay':0.1},
                                  {'params':model.a_attn.parameters()},
                                  {'params':model.c_attn.parameters()},
                                  {'params':model.shared_lyr.parameters()},
                                  {'params':model.action_cls_lyr.parameters()},
                                  {'params':model.comp_cls_lyr.parameters(),},                     
  ]

#model = MyModel9(freeze_bert=False,attns=4,cls_dropout=0.2,attn_dropout=0.2,use_attn_proj=False,use_out_proj=False)
#model = MyModel10(freeze_bert=False,attns=1,attn_dropout=0.3)
#model.load_state_dict(torch.load(f'data/model-{model.model_version}.dat'))


# Optimizer-wide defaults, applied to every group that does not set its own value.
g_wd = 1
g_lr = 1e-3
model.to(DEVICE)
optimizer = torch.optim.AdamW(optimizer_params,lr=g_lr,weight_decay=g_wd )
# Cosine learning-rate schedule that restarts every 20 epochs (T_mult=1 keeps the period constant).
scheduler = CosineAnnealingWarmRestarts(optimizer,T_0=20,T_mult=1)

In [0]:
# Train model: class weights, loss functions and the epoch budget.
# torch.cuda.reset_max_memory_allocated()
# torch.cuda.reset_max_memory_cached()
# When True, the training loop writes a checkpoint every 10 epochs.
SAVE_TO_FILE=False
print(f'Device type is {DEVICE.type}')
print(f'Runnig model version {model.model_version}')
model.to(DEVICE)
model.train()
# 'balanced' class weights for the action labels, computed from the training split.
le_trnf = action_le.transform(trn_df.action)
u,c = np.unique(le_trnf,return_counts=True)
ac_class_weight=compute_class_weight('balanced',classes=u,y=le_trnf)
ac_class_weight = torch.tensor(ac_class_weight,dtype=torch.float,device=DEVICE)
# Same for the component labels (le_trnf, u and c are reused).
le_trnf = component_le.transform(trn_df.component)
u,c = np.unique(le_trnf,return_counts=True)
com_class_weight=compute_class_weight('balanced',classes=u,y=le_trnf)
com_class_weight = torch.tensor(com_class_weight,dtype=torch.float,device=DEVICE)

print(f'ac_class_weight={ac_class_weight}')
print(f'com_class_weight={com_class_weight}')


# One class-weighted cross-entropy loss per task; evaluate_model reads both as module-level names.
action_criterion = nn.modules.loss.CrossEntropyLoss(weight=ac_class_weight,reduction='mean')
component_criterion = nn.modules.loss.CrossEntropyLoss(weight=com_class_weight,reduction='mean')

n_epochs = 500

def evaluate_model(epoch:int,model:nn.Module,dl:DataLoader,optimizer):
  """Run one pass over ``dl``; train when an optimizer is given, otherwise only evaluate.

  Reads the module-level ``DEVICE``, ``action_criterion``,
  ``component_criterion`` and (when training) ``scheduler``.

  Args:
    epoch: current epoch index, used for the fractional scheduler step.
    model: called as ``model(X, attn_mask)`` and expected to return the action
      and component logits.
    dl: loader yielding ``((X, attn_mask), (Y1, Y2))`` batches.
    optimizer: optimizer to step after each batch, or ``None`` for a pure
      evaluation pass.

  Returns:
    ``(total_loss, action_loss, component_loss)``, each summed over the batches.
  """
  training = optimizer is not None
  # Dropout must be active only while training; restore the caller's mode afterwards.
  was_training = model.training
  model.train(training)
  t_loss=0.0
  t_a_loss=0.0
  t_c_loss=0.0
  iters = len(dl)
  try:
    for i,((X, attn_mask),(Y1,Y2)) in enumerate(dl):
      X , attn_mask,Y1,Y2 = X.to(DEVICE),attn_mask.to(DEVICE),Y1.to(DEVICE),Y2.to(DEVICE)
      p_a,p_c = model(X,attn_mask)
      action_loss = action_criterion(p_a.view(-1,4),Y1)
      component_loss = component_criterion(p_c.view(-1,5),Y2)
      loss =   action_loss + component_loss
      if training:
        optimizer.zero_grad()
        loss.backward()
        # Clip after backward(): clipping before it only saw the freshly zeroed gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(),5)
        optimizer.step()
        # Fractional-epoch scheduler step, taken after the optimizer update.
        scheduler.step(epoch+i/iters)
      t_loss += loss.item()
      t_a_loss += action_loss.item()
      t_c_loss += component_loss.item()
  finally:
    model.train(was_training)

  return t_loss,t_a_loss,t_c_loss

# Main loop: one training pass and one validation pass per epoch.
for epoch in range(n_epochs):
  # BERT is fine-tuned for the first ten epochs only; afterwards its parameters stop receiving gradients.
  # NOTE(review): the frozen parameters stay registered in the optimizer — confirm
  # they are not still being changed by weight decay after this point.
  if epoch == 10:
    print('Freezing BERT')
    model.freeze_bert()

  t_loss,a_loss,c_loss = evaluate_model(epoch,model,trn_dl,optimizer)

  # Optional checkpoint every tenth epoch.
  if SAVE_TO_FILE and epoch%10 == 0:
    torch.save(model.state_dict(),f'data/model-{model.model_version}.dat')

  # Validation pass: no optimizer, no gradient tracking; the three losses are formatted for printing.
  with torch.no_grad():
    v_loss = [f'{v:0.4}' for v in evaluate_model(epoch,model,val_dl,None)]
  print(f'Epoch:{epoch} Trn loss={t_loss:0.4}, Actn loss:{a_loss:0.4} ,Comp loss:{c_loss:0.4},  Validation loss:{v_loss}')


In [0]:
def print_tensor(t:torch.Tensor):
  """Print a tensor's values formatted to two significant digits.

  Unit axes are squeezed out first. The size of the first remaining axis is
  printed as ``sh=<n>``. A scalar or 1-D result is printed as one list;
  anything of higher rank is printed one list per entry of the first axis
  (trailing axes flattened into the row).

  Previously a 1-D tensor fell into the row loop and failed when iterating
  over a scalar, and a fully squeezed scalar failed on ``shape[0]``.
  """
  # atleast_1d keeps a fully squeezed scalar printable as a one-element row.
  values = np.atleast_1d(t.squeeze().detach().cpu().numpy())
  sh = values.shape[0]
  print(f'sh={sh}')
  if values.ndim == 1:
    print([f'{item:.2}' for item in values])
  else:
    for row in values.reshape(sh,-1):
      print([f'{item:.2}' for item in row])
      

# Inference demo: classify a single comment and show the attention weights per token.
model.eval()
with torch.no_grad():
  # The comment text is an empty string here; replace it to try a real input.
  X,X_attn_mask = encode_X('',max_len=25)
  
  # Keep a handle on the id tensor for mapping ids back to tokens below.
  X_tokenized = X
  X,X_attn_mask = X.to(DEVICE), X_attn_mask.to(DEVICE)
  # Add a leading batch axis of size 1 (in place).
  X.unsqueeze_(0)
  X_attn_mask.unsqueeze_(0)
  action,component,a_attn,c_attn = model(X,X_attn_mask,output_attn=True)
  #action,component = model(X,X_attn_mask,output_attn=False)
  # Logits -> class probabilities.
  action = action.softmax(dim=1)
  component = component.softmax(dim=1)
  action,component = [a.squeeze().detach().cpu().numpy() for a in [action,component]]

  print(action_le.classes_)  
  print(action)
  print(a_attn.size())
  # Most probable class index, mapped back to its label.
  action = np.argmax(action)
  print(f'Selected Action: {action_le.inverse_transform([action])}')

  print(component_le.classes_)
  print(component)
  component = np.argmax(component)
  print(component)
  print(f'Selected component: {component_le.inverse_transform([component])}')

  # Tokens of the encoded input, followed by the action and component attention weights.
  print(tokenizer.convert_ids_to_tokens(X_tokenized.numpy().squeeze()))
  print_tensor(a_attn)
  print_tensor(c_attn)
  # print(a_attn.squeeze().detach().cpu().numpy())
  # print(c_attn.squeeze().detach().cpu().numpy())


In [0]:
# NOTE(review): none of the model classes visible in this notebook assign an
# `a_attn_vec` attribute — presumably left over from a variant not shown here; confirm or remove.
model.a_attn_vec

In [0]:
# Same expression as the preceding cell.
# NOTE(review): `a_attn_vec` is not assigned by any model class visible in this notebook — confirm or remove.
model.a_attn_vec