# Task 1: MultiWOZ2.2 Template Generation

## Setup

In [None]:
# Mount Google Drive at /content/drive; the later cells read and write files under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#requirements
!pip install fastDamerauLevenshtein
!pip install g2p_en
import nltk
nltk.download('punkt')

In [None]:
# Point HOME at the mounted Drive folder so that `~` in the %cd below resolves inside Drive,
# then switch into the run_env directory that holds the helper modules.
%env HOME=/content/drive/MyDrive/
%cd ~/Research/ner/Multiwoz/multiwoz/data/MultiWOZ_2.2/run_env

env: HOME=/content/drive/MyDrive/
/content/drive/MyDrive/Research/ner/Multiwoz/multiwoz/data/MultiWOZ_2.2/run_env


In [None]:
# Long listing of the working directory, sorted by modification time (oldest first).
!ls -lrt

total 51
-rw------- 1 root root  7376 Aug  6 13:59 extra_methods.py
-rw------- 1 root root 25332 Aug  6 13:59 BabyTrie_DSTC10_v4.py
-rw------- 1 root root  1120 Aug  7 08:44 simple_tokenize.py
drwx------ 2 root root  4096 Aug  7 08:52 __pycache__
-rw------- 1 root root 12765 Aug  7 08:52 BabyTrie_MultiWOZ.py


## Taxi, Train/Bus DB modification

In [None]:
import json

#Taxi
# Build taxi_db_clean.json: one {'id', 'name'} record per (color, type) pair found in
# taxi_db.json, so taxis can be loaded into the trie like the other domain databases.
TAXI_DB_PATH = '/content/drive/MyDrive/Research/ner/Multiwoz/multiwoz/db/taxi_db.json'
TAXI_CLEAN_DB_PATH = '/content/drive/MyDrive/Research/ner/Multiwoz/multiwoz/db/taxi_db_clean.json'

with open(TAXI_DB_PATH,'r') as db:
    taxi_dic = json.load(db)

colors = taxi_dic['taxi_colors']
types = taxi_dic['taxi_types']

# Ids are assigned sequentially in color-major order: (color0,type0), (color0,type1), ...
taxi_final = []
for color in colors:
    for t in types:
        taxi_final.append({'id': len(taxi_final), 'name': color + ' ' + t})

# The output file is opened only after the records are built, so a failure above
# cannot leave a truncated taxi_db_clean.json behind.
with open(TAXI_CLEAN_DB_PATH,'w') as new_db:
    json.dump(taxi_final, new_db, indent = 4)

## BabyTrie modification

In [None]:
from simple_tokenize import Clean_Text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
#@title BABYTRIE
#%%writefile BabyTrie_MultiWOZ.py
import re
import json
from fastDamerauLevenshtein import damerauLevenshtein as dl
from g2p_en import G2p
from simple_tokenize import Clean_Text
from simple_tokenize import Word_Tokenize

def getphoneme(g2p,text):
    '''
    Convert text into a single phoneme string.

    g2p is called with text and must return a sequence of strings. The
    symbols are joined with '-', and every '- -' produced by a ' ' symbol
    (presumably a word boundary in the g2p output) is collapsed into '_'.
    '''
    return '-'.join(g2p(text)).replace('- -','_')

class BabyTrie:#baby version of Trie
    '''
    Word-level trie of named entities used to tag entity mentions in text.

    Each node corresponds to one token. A node with end=True terminates an
    entity; its markers list holds the entity categories and id holds the
    entity id of the most recent insertion that ended there.
    '''
    class TrieNode:#Node within a Trie
        def __init__(self,entity=None):
            self.children={}#dictionary of TrieNodes, keyed by token
            self.markers=[]#list of markers/categories
            self.end=False#the Node is leaf, or end of word
            self.entity=entity#token this node was created for (previously always None)
            self.id=None#entity id, set when the node ends an entity

    def __init__(self):
        self.root=self.new_node()

    def new_node(self,word=None):
        '''Creates a TrieNode object for the given word.'''
        return self.TrieNode(word)

    @staticmethod
    def _prepare_tokens(lst,cat):
        '''
        Returns a cleaned COPY of lst ready for insertion, or None if the
        entity must be skipped. The caller's list is never modified.

        A leading 'the' is dropped (it is ambiguous). The entity is rejected,
        with the same messages as before, when it is empty or consists only
        of the category word. Both checks are repeated after 'the' is
        dropped, so that names such as ['the'] or ['the', cat] cannot mark
        the root node as an entity end.
        '''
        def rejected(tokens):
            if len(tokens) == 0:
                print("ERROR: empty NE in knowledge file")
                return True
            if len(tokens) == 1 and tokens[0]==cat:
                print("only one elem in lst")
                return True
            return False

        tokens=list(lst)
        if rejected(tokens):
            return None
        if tokens[0]=='the':#the is ambiguous
            tokens.pop(0)
            if rejected(tokens):
                return None
        return tokens

    def restricted_insert(self,lst,cat,number):
        '''
        Inserts the entity exactly as given: every token of lst becomes a
        node and the last one is marked as the end of the entity.

        Unlike insert, no extra "<name> <category>" variant is added and the
        'good'/'ask'/'page' exclusion is not applied.
        -------------------------------------------------------------------
        lst: list of tokens that make up the entity (not modified)
        cat: category/marker of the entity
        number: id of the entity
        '''
        tokens=self._prepare_tokens(lst,cat)
        if tokens is None:
            return

        ptr=self.root
        for word in tokens:#for every elem, including the last
            if word not in ptr.children:#if already a key, skip; else add new
                ptr.children[word]=self.new_node(word)
            ptr=ptr.children[word]
        if (len(tokens)==3 and tokens[-3]=='cable' and tokens[-2]=='car'):
            ptr.end=False
        else:
            ptr.end=True#is end of word
            ptr.id=number
        if cat not in ptr.markers:#Only adds new categories to the list of markers
            ptr.markers.append(cat)

    def insert(self,lst,cat,number):
        '''
        Given a list of words that make up a Named Entity and its category,
        inserts the Entity into the Trie.

        Note: the structure resembles a tree, i.e. words as nodes, and at the end
        of the inserted word, the node is made a leaf node (end=True), and a Marker
        is added to the node's list of markers.

        For the sake of more possible matches the category word is appended
        as an extra node: if "Jade Garden" is inserted as a restaurant,
        "Jade Garden" will be recognized as an entity, and
        "Jade Garden Restaurant" will also be one.

        The list passed in is not modified.
        -------------------------------------------------------------------
        Example:
        bt=BabyTrie()
        possible_NE=['Jade','Garden']#let's say that this is a restaurant and id#1
        bt.insert(possible_NE,'restaurant',1)
        '''
        tokens=self._prepare_tokens(lst,cat)
        if tokens is None:
            return

        if tokens[-1]!=cat: #['a','and','b'] + ['restaurant']
            tokens.append(cat)

        ptr=self.root
        for word in tokens[:-1]:#for every elem but not the last (since it is the category)
            if word not in ptr.children:#if already a key, skip; else add new
                ptr.children[word]=self.new_node(word)
            ptr=ptr.children[word]
        if (len(tokens)==2 and tokens[-2] in ['good','ask','page']):#we dont want to treat good or ask as an entity (see test.txt)
            ptr.end=False
        elif (len(tokens)==3 and tokens[-3]=='cable' and tokens[-2]=='car'):
            ptr.end=False
        else:
            ptr.end=True#is end of word
            ptr.id=number
        if cat not in ptr.markers:#Only adds new categories to the list of markers
            ptr.markers.append(cat)
        if tokens[-1] not in ptr.children:#avoid possible conflict with same name
            ptr.children[tokens[-1]]=self.new_node(tokens[-1])#adds the category as one of the leaf node
        ptr=ptr.children[tokens[-1]]
        ptr.end=True
        ptr.id=number
        if cat not in ptr.markers:#only adds category if not in list of markers
            ptr.markers.append(cat)

    def isinTrie(self,sen):
        '''
        Finds entity mentions of the trie inside a sentence.

        The sentence is lowercased, stripped and tokenized with
        Word_Tokenize(Clean_Text(...)). From every token position the trie
        is walked as far as possible; tokens that are not an exact child are
        matched fuzzily with a weighted Damerau-Levenshtein distance, and one
        short (<=3 chars) alphanumeric token may be skipped as a stutter.
        -------------------------------------------------------------------
        Returns (tokens, found) where found maps
        category -> list of ((start,end), id), with start/end being inclusive
        token indices. Overlapping spans are resolved in favour of the
        longer one; categories left without spans are dropped.
        '''
        stutter_match=False
        #first tokens that must never be fuzzy-matched (known false positives)
        dl_mistakes=['train','want','american','lane','fees','fee','marina','wanna','canna','dinna','finna']
        dic={}#the uncleaned version of returned dictionary

        new_sen=sen.lower().strip()#cleans up sentence
        lst=Word_Tokenize(Clean_Text(new_sen))

        rstart=0#starting position of NE
        rend=0#ending position of NE

        for i in range(len(lst)):
            proceed=False
            ptr=self.root
            if lst[i] in ptr.children:
                proceed=True
                rstart=i
                ptr=ptr.children[lst[i]]
            elif lst[i] not in ptr.children and lst[i] not in dl_mistakes:
                for word in ptr.children:
                    if len(lst[i])<=4:
                        dl_score=dl(lst[i],word,similarity=False,deleteWeight=2,insertWeight=2,replaceWeight=2)
                        bm=0
                    else:
                        dl_score=dl(lst[i],word,similarity=False,deleteWeight=1,insertWeight=2,replaceWeight=2)
                        bm=1
                    if dl_score<=bm:
                        proceed=True
                        rstart=i
                        ptr=ptr.children[word]
                        break
            if proceed:
                stutter_utt=0
                for j in range(i+1,len(lst)):
                    if lst[j] in ptr.children:
                        ptr=ptr.children[lst[j]]
                        rend=j
                        stutter_match=False
                    else:
                        proceed=False
                        for word in ptr.children:
                            dl_score=dl(lst[j],word,similarity=False,deleteWeight=2,insertWeight=2,replaceWeight=2)
                            bm=1
                            if dl_score<=bm and lst[j].isalnum():
                                ptr=ptr.children[word]
                                rend=j
                                proceed=True
                                stutter_match=False
                                break
                        if (len(lst[j])<=3) and (lst[j].isalnum()) and (stutter_utt<1):#allow one mismatch (except for punc), could be stuttering
                            stutter_utt+=1
                            proceed=True
                            rend=j-1
                            stutter_match=True
                            #here rend not counted since if next is not matched, it will not be effected if the prev word is actually end of word
                        elif stutter_match:
                            rend=j-2
                            break
                        if not proceed:
                            rend=j-1
                            break
            if ptr.end:
                if rstart>rend:#single-token entity: span is just the start token
                    rend=rstart
                for marker in ptr.markers:
                    dic.setdefault(marker,[]).append(((rstart,rend),ptr.id))
                rstart=0
                rend=0

        #cleaning up dictionary
        rdic={}#clean version of the dictionary that is ultimately returned
        for cat in dic.keys():
            rdic[cat]=[]
            #longest spans first, so that longer matches win over shorter overlapping ones
            rlst=sorted(dic[cat],key=lambda item:item[0][1]-item[0][0],reverse=True)
            for cand in rlst:
                c_start,c_end=cand[0]
                rflag=True
                for rcat in rdic.keys():#make sure no overlap: ex. 'some hotel diner' > 'some hotel'
                    #iterate over a copy: entries may be removed from rdic[rcat] below,
                    #and removing from the list being iterated would skip elements
                    for elem in list(rdic[rcat]):
                        e_start,e_end=elem[0]
                        if (c_start<e_start and c_end<e_start):#ex. if we have (3,5) we can take (1,2)
                            continue
                        elif (c_start>e_end and c_end>e_end):#ex. if we have (3,5) we can take (6,7)
                            continue
                        else:# any equals, [0] or [1] will not work: ex. we have (3,5). cannot have (3,4) or (4,5).
                            if c_end-c_start==e_end-e_start and c_end>e_end:
                                #same length but ends later: the new span replaces the kept one
                                rdic[rcat].remove(elem)
                                continue
                            else:
                                rflag=False
                if rflag:
                    rdic[cat].append(cand)
        #drop categories that ended up without any span
        rdic={cat:spans for cat,spans in rdic.items() if len(spans)!=0}
        return (lst,rdic)

    def initialize_multiwoz(self,db_dir='/content/drive/MyDrive/Research/ner/Multiwoz/multiwoz/db'):
        '''
        Fills the trie from the MultiWOZ database json files.

        db_dir: directory that contains the *_db.json files; the default is
                the Google Drive location this notebook was written for.

        Every file is expected to hold a list of records with an 'id' and a
        'name' ('department' for the hospital file). Records without that
        field are skipped. Besides the name itself, '&' <=> 'and' variants,
        the part before a ',' and (for "... and bar" names) the part before
        'and' are inserted as well.
        '''
        sources = [('attraction_db.json','attraction'),
                   ('restaurant_db.json','restaurant'),
                   ('hotel_db.json','hotel'),
                   ('police_db.json','police'),
                   ('hospital_db_clean.json','hospital'),
                   ('taxi_db_clean.json','taxi'),
        ]

        for fname,cat in sources:
            print('Starting ', fname)
            with open(db_dir.rstrip('/')+'/'+fname,'r') as db:
                dic = json.load(db)
            name_key = 'department' if cat == 'hospital' else 'name'
            for record in dic:
                raw_name = record.get(name_key)
                if raw_name is None:#record has no usable name for this category
                    continue
                if cat != 'hospital' and raw_name == 'the place':#edge case
                    continue
                new_name=raw_name.lower().strip()#cleans up name
                elem = record['id']
                if new_name == cat:
                    print('exceptional entity---------------------')
                    continue
                lst=Word_Tokenize(Clean_Text(new_name))#final list that is used to insert into bt

                self.restricted_insert(lst,cat,elem)
                if cat == 'police':
                    #also insert the name without its last token
                    self.restricted_insert(lst[:-1], cat, elem)

                if 'and' in lst and 'bar' in lst:#de luca cucina and bar
                    partial=lst[0:lst.index('and')]
                    self.insert(partial,cat,elem)

                #lst is deliberately edited in place here: each swap inserts the
                #variant accumulated so far, and the ',' handling below sees the result
                for i in range(len(lst)):#& <=> and
                    if lst[i]=='&':
                        lst[i]='and'
                        self.restricted_insert(lst,cat,elem)
                    elif lst[i]=='and':
                        lst[i]='&'
                        self.restricted_insert(lst,cat,elem)

                if ',' in lst:#hotel vitale, ...
                    partial=lst[0:lst.index(',')]
                    self.restricted_insert(partial,cat,elem)
                    for i in range(len(partial)):#& <=> and
                        if partial[i]=='&':
                            partial[i]='and'
                            self.restricted_insert(partial,cat,elem)
                        elif partial[i]=='and':
                            partial[i]='&'
                            self.restricted_insert(partial,cat,elem)

    def gettemplate_wmap(self, g2p, title):
        '''
        Turns an utterance into a template by replacing entity mentions.

        g2p:   callable passed on to getphoneme (text -> phoneme symbols)
        title: the utterance
        -------------------------------------------------------------------
        Returns a list [template, id_map, phoneme_text, tokenized]:
        template     - title with each mention replaced by '<category-id>'
        id_map       - {'<category-id>': mention text}
        phoneme_text - title with each mention replaced by '<phonemes>'
        tokenized    - the cleaned title's tokens joined by single spaces
        '''
        title = re.sub(r'\u2019', "'", title)#right single quotation mark -> apostrophe
        title = Clean_Text(title).replace('\n',' ')
        tokenized=' '.join(Word_Tokenize(title))

        _,ind_dic=self.isinTrie(title)
        #NOTE(review): isinTrie tokenizes the lowercased title while these tokens keep
        #the original case; the span indices are assumed to line up -- confirm
        nlst=Word_Tokenize(Clean_Text(title))

        en_dic={}
        title_phoneme=title
        for key in ind_dic:
            for pair in ind_dic[key]:
                #rebuild the surface text of the mention from its tokens
                temp_str=''
                for index in range(pair[0][0],pair[0][1]+1):
                    if not nlst[index][0].isalnum():#punctuation attaches to the previous token
                        temp_str=temp_str[:-1]
                    if nlst[index]=='-':
                        temp_str+=nlst[index]
                    else:
                        temp_str+=nlst[index]+' '
                temp_str=temp_str[:-1]#getting rid of last space
                phoneme_temp_str=getphoneme(g2p,temp_str)
                tag='<'+key+'-'+str(pair[1])+'>'
                en_dic[tag]=temp_str
                title_phoneme=title_phoneme.replace(temp_str,'<'+phoneme_temp_str+'>')
                title=title.replace(temp_str,tag)
        return [title,en_dic,title_phoneme,tokenized]
def main():
    '''
    Smoke test: builds the MultiWOZ trie, then prints the raw isinTrie
    result and the four gettemplate_wmap outputs for one sample utterance.
    '''
    phonemizer = G2p()
    trie=BabyTrie()
    trie.initialize_multiwoz()
    sample="The taxi is all set, look for a grey toyota, they can be reached at 07596698267 if there are any issues. Any other questions?"
    print(trie.isinTrie(sample))
    template,entity_map,phoneme_text,tokenized = trie.gettemplate_wmap(phonemizer,sample)
    print(template,entity_map,phoneme_text,tokenized,sep='\n')

# Run the smoke test only when this code is executed as the main module.
if __name__=='__main__':
    main()

Overwriting BabyTrie_MultiWOZ.py


In [None]:
#Quick Test
from BabyTrie_MultiWOZ import BabyTrie

g2p = G2p()
bt = BabyTrie()
bt.insert(['amc','theatre'],'attraction','0')
bt.insert(['amc','theatre'],'movie','0')
#bt.insert(['amc','theatre'],'attraction','0')
res = bt.gettemplate_wmap(g2p, 'amc theatre is great.')
print(res[0], res[1], res[2], res[3], sep = '\n')

<attraction-0> is great.
{'<attraction-0>': 'amc theatre'}
<EY1-EH1-M-S-IY1_TH-IY1-AH0-T-ER0> is great.
amc theatre is great .


## Outputs (change the directory accordingly)

MultiWOZ2.2 train, test, dev templates
Note: only restaurant, hotel, and attraction are 'foolproof'.

1. attraction has an entity 'the place', which is skipped as an edge case.
2. the actual hospital name is not available in the db, so it is not inserted into the trie (department names are used instead).
3. taxi entities are enumerated from the old json file as 'color + type'.
4. bus/train share the same db.json and are not currently considered.

a. look for more edge cases
b. consider bus/train: areas (destination/departure)

In [None]:
import os
import json
from g2p_en import G2p

g2p = G2p()
bt=BabyTrie()
bt.initialize_multiwoz()


directory = '../dev/'#../train/ or ../test/
template_suffix = '_template.json'
# sorted() gives a deterministic processing order.
for filename in sorted(os.listdir(directory)):
    # Only the original dialogue files are inputs. The directory also receives this
    # cell's own *_template.json outputs and the zip archive built in the next cell;
    # without this check a re-run would re-template the outputs or fail in json.load.
    if not filename.endswith('.json') or filename.endswith(template_suffix):
        continue
    print('Now: '+filename)
    with open(directory+filename,'r') as f:
        log = json.load(f)

    # Annotate every turn in place with the four gettemplate_wmap results.
    for dialogue in log:
        for turn in dialogue['turns']:
            text = turn['utterance']
            res = bt.gettemplate_wmap(g2p, text)
            # NOTE: the key is spelled 'text_tmplate' in the files already produced;
            # it is kept as is so existing consumers keep working.
            turn['text_tmplate'] = res[0]
            turn['id_map'] = res[1]
            turn['text_phoneme'] = res[2]
            turn['text_nltk'] = res[3]

    out = directory+filename.split('.')[0]+template_suffix
    with open(out,'w') as wf:
        json.dump(log, wf, indent = 4)
    print('Done.')

Starting  attraction_db.json
Starting  restaurant_db.json
Starting  hotel_db.json
Starting  police_db.json
Starting  hospital_db_clean.json
Starting  taxi_db_clean.json
Now: dialogues_001.json
Done.
Now: dialogues_002.json
Done.


In [None]:
# Move into the split directory, list it, and bundle the generated *_template.json files into one zip.
%cd ../dev/#../train/ or ../test/
!ls -lrt
!zip multiwoz_dev_templates.zip *_template.json#rename if needed

/content/drive/My Drive/Research/ner/Multiwoz/multiwoz/data/MultiWOZ_2.2/dev
total 59521
-rw------- 1 root root 10708830 Jun 29 08:20 dialogues_002.json
-rw------- 1 root root 11771811 Jun 29 08:20 dialogues_001.json
-rw------- 1 root root 20128689 Aug  7 11:36 dialogues_001_template.json
-rw------- 1 root root 18339329 Aug  7 11:37 dialogues_002_template.json
  adding: dialogues_001_template.json (deflated 96%)
  adding: dialogues_002_template.json (deflated 96%)


# Task 2: MultiWOZ data preparation for Intent Classifier

Abandoned for now, since the MultiWOZ data has utterances similar to the KB titles.
Use data modified from DSTC9 set instead:

/scratch/yt2267/public/alexa-with-dstc9-track1-dataset/data_trainval/train/logs.json, labels.json, use target=False and last user turn for non-KB utterances @Seeger Zou 

## Setup

In [None]:
# Mount Google Drive at /content/drive for the Task 2 cells.
from google.colab import drive
drive.mount('/content/drive')