In [1]:
import numpy as np
import nltk

# Sentence generation using replacement 

Grammar-based sentence generation generates all the possible permutations and combinations of the words and phrases, which prevents us from sampling all the phrases/words in a small number of output sentences. Another method of sentence generation is to randomly sample from a group of similar phrases to fill each position of a particular sentence structure, independently of the other parts of the sentence. For example, the algorithm should be able to generate the following.

Base sentence: 

\[I\] \[want to\] \[payoff\] \[my loan\]

Generates: 

\[I\] \[would like to\] \[settle\] \[my debts\]

\[We\] \[need to\] \[pay off\] \[our mortgage\]

\[I\] \[am looking to\] \[pay off\] \[loans\]

Note that the above set of sentences can be generated from a finite-state grammar, as opposed to our previous approach, which had a more relaxed set of rules that could be classified as context-free grammars. With a finite-state grammar, the number of sentence structures needs to be increased, because we are no longer aware that the subject is plural and so cannot change all possessive personal pronouns to plural. Instead, we need separate structures for singular subjects and plural subjects. However, using finite-state grammar rules to generate sentences with NLTK would defeat the purpose of this exercise: we want each part of the sentence to be sampled from the vocabulary independently, whereas NLTK does it sequentially. 


In [2]:

# Vocabulary: maps a key to a list of interchangeable phrases.
# sample_from_vocab picks one entry of the list uniformly at random, so a phrase
# that is listed more than once (e.g. 'want to' under 'WANTTO') is proportionally
# more likely to be drawn.
# NOTE(review): it is not visible here whether the repeated entries are deliberate
# weighting or accidental duplicates - confirm.
VOCAB = {
    # --- pronouns, function words and short connectors ---
    'PPI': ['my','the','our'],
    'PPY': ['your','the'],
    'PI' : ['I'],
    "IF" : ["if"],
    "ARE": ["are",'were'],
    # NOTE(review): the ' ' entry presumably stands for "no article"; when it is
    # joined with the other parts it yields consecutive spaces - confirm intent.
    "ARTICLE": ['a','an','the',' '],
    'FROM' : ['from','by'],
    'TOTHIS' : ['to this','to the following', 'to the'],
    'RESET': ['reset','set again','give me again','unlock'],
    'PWD': ['password','pwd','passwd','pass'],
    'PLS': ['please','pls'],
    'COULDYOU': ['could you','would you','request you to','can you'],
    'COULDI': ['could I','would I','May I','can I'],
    'WANTTO': ['want to','need to','want you to','want to', 'need to','want to have','would like to','would want to'],
    'TELLME' : ['tell me','show me','give me','write here','inform me'],
    'IN': ['in'],
    'FOR': ['for'],
    'IS1': ['is','was','is now','has been','probably is'],

    # --- application / account names ---
    'SALESFORCE': ['salesforce','salesforce account','sales force account','salesforce application','sales force application'],
    'SKYPE': ['skype','skype account','skype application','application of skype','account of skype'],
    'SERVICEACC': ['service account'],
    'OMNIV': ['omniview account','omniview','omni view','account of omniview'],

    # --- greetings ---
    'HI' : ['hi','hello','hey there', 'hello!', 'hi!', 'Hello', 'hey'],
    'WELCOME' : ['Welcome!','Welcome.'],
    'TO_COMP' : ['to IT helpdesk', 'to IT help', 'to helpdesk'],

    # --- ways of saying "user id" ---
    'USERID': ['user id', 'id', 'user name', 'id name'],


    # --- example user-id values ---
    'IDS' : ['john1234','john123','abcd1234','xyz','mark1','markXYZ','mark_john','tony_stark','ab_dev','kimPossible123'],

    'WHATIS' : ['what is','which is','what value is'],
    'PY' : ['your'],
    'YOU' : ['you'],
    # Punctuation is stored as ordinary one-entry keys so templates can reference it.
    '?' : ['?'],
    '.' : ['.'],
    'AFFIRM': ['yes','yes!','sure','please do','yeah','yup','yeah sure','yes sure!','yes please','ok','fine','thats fine',
              "that's fine",'no problem, go ahead','yes go ahead','sure go ahead','please go ahead','yes do it',
              'yes I have that','Yes I am on it','yes I do have that','Yes I can do that','yes I can give you that',
              'yes it is that','yeah it seems to be the case','yes it is that','It is.','it is that','it is so.',
              'i have done that','i did that','i got that','I received that','I found that','It is with me',
              'yes it is with me'],

    # --- complete sentences (each entry is a whole utterance) ---
    'EMAIL_TO_SEND': ['An email would be sent to the id configured in our system with a security code',
                      'We need to send an email.',
                      'An email will be sent to you for security code',],
    'NEED_CODE': ['You need to provide me the code ,So i can reset the password',
                 'I will need the code to reset the password',
                 'Code will be needed'],
    'CONFIRM_AHEAD': ['Can I go ahead?', 'Should I go ahead?','Is it ok', 'Is it ok to go ahead with it?',
                     'Can I go ahead with it?', 'Should I go ahead and do that?','Can I do it?'],

    'EMAIL' : ['email','e-mail','mail'],
    'SENT' : ['sent'],
    'SECCODE' : ['security code','code','secret number','otp','passphrase', 'code', 'security code'],
    # --- example numeric values ---
    'NUMBER': ['1234','1234123','54123','123231','12311512','12351','1234151','125613'],
    'FIN': ['finished','completed','done','concluded','i have done','i have given','its done','its over','is granted',
           'is now granted'],
    'THANKS': ['Thank you','thanks','ok thanks','well thank you','I am so thankful','Thanks to you','so much thanks'],
    'HUMBLE_DENY': ['no','no thanks','no it isnt','no I dont want','No its ok','that is not','no it is not','nope','nah'],
    'ANYELSE': ['is there anything else?','anything else?','any other help?','Do you need any other help?'],
    'ACCESS': ['access','open','be added to','added to','be able to open','be able to access'],
    'MACHINE_ENTITIES': ['application1','app1','app2','application329','application-new','app1','SYSTEM','sys'],
    'MACHINE' : ['machine', 'system', 'resource'],
    'HAVE' : ['have','has','had'],
    'DO_WOULD': ['do','Would'],
    'DO_I_HAVE': ['do I have','Would I have','have I'],
    'SUPERVISOR': ['supervisor','manager','boss','chief'],
    'APPROVAL' : ['approval','confirmation','permission','request approval','approved'],
    'REQNUM' : ['request number','req number','req num'],
    'ADMINACCESS': ['admin access','administrator access','access as admin','access as administrator'],
    'LOGIN': ["login","access",'logging in','log in'],
    "ABLETO" : ['able to', 'capable of','in state of'],
    'CAN' : ['can'],
    'CHECK': ['check','verify','validate'],
    'BYE' : ['ok bye','bye','thanks bye','see ya','ok see ya','thanks see ya','bye!','bye,', 'bye.','bye talk to you later',
            'bye nice to talk to you','bye it was nice','bye youre good','bye you are awesome'],
    'RECVD' : ['received','gotten','taken'],
    # Whole-sentence refusals of admin rights; referenced by template '3' of the 'deny' intent.
    'temp': ['No as per policy we don’t give admin rights to the users on this machine.',
            'Sorry, admin rights are not available as per the policy.',
            'I am afraid you dont have administrator access according to the policy.']
}

In [3]:
# Slot definitions, looked up by gen_slots as slots[<slot name>][<value key>].
# Each value key maps template ids to templates (lists of VOCAB keys); one
# template is picked at random and rendered to produce the slot text.
#   '_' value key: the slot value is recorded as None (only the span is kept).
#   '*' value key: the slot value is exactly the text that was generated.
#   any other value key: the key itself is recorded as the slot value.
# Inside a slot template, '$<id>' refers to another template of the same value
# key (e.g. '$2' below renders template '2').
# NOTE(review): the intent templates reference slot names such as
# 'password_reset', 'number', 'security_code', 'finished', 'access' and 'machine'
# that are not defined in this dict as shown; gen_slots would raise KeyError for
# them. 'ID_reset' (commented "PASSWORD RESET") is presumably meant to be
# 'password_reset' - confirm and restore the missing slot definitions.
slots = {
        'ID_reset': {
                        # PASSWORD RESET
                        '_' : {
                                '1' : ['RESET','USERID'],
                                '2' : ['PWD','RESET'],
                                '3' : ['RESET','PPI','USERID'],
                                '4' : ['PPI','$2']
                        }
                   } ,
        'ID': {
            # '*': the sampled user id itself becomes the slot value.
            '*' : {'1':['IDS']},
            # '_': a mention of "user id" with no concrete value.
            '_' : {'1':['USERID']}
        },
        
        }

In [4]:
# INTENTS
#
# intents[<dialogue act>][<template name>] is a template: a list of tokens that
# are rendered independently and joined with single spaces. A token is one of:
#   'KEY'            - a VOCAB key; one phrase of that entry is sampled
#   'A|B'            - alternation; sampled from the union of VOCAB['A'] and VOCAB['B']
#   '$NAME'          - another template of the same dialogue act, rendered in place
#   '<slot%value>'   - a slot from `slots`, rendered by gen_slots and annotated
#
# Template names must be unique within a dialogue act: Python keeps only the last
# value of a repeated dict key, so a repeated name silently discards the earlier
# template. Variants therefore get their own names (trailing '_' or next number).

intents = {
    'inform' : {
    
        #Reset the password.
        'PASSRESET1' : ['<password_reset%_>'],
        'PASSRESET2' : ['PLS','<password_reset%_>'],
        'PASSRESET3' : ['COULDYOU','$PASSRESET2'],
        'PASSRESET3_' : ['WANTTO','$PASSRESET2'],
        'PASSRESET4_' : ['PI','$PASSRESET3_'],

        #Reset the password in sales force application 
        'PASSRESET4' : ['$PASSRESET3','IN|FOR','<application_skype%_>'],
        'PASSRESET5' : ['$PASSRESET3','IN|FOR','<application_salesforce%_>'],
        'PASSRESET6' : ['$PASSRESET3','IN|FOR','<application_serviceacc%_>'],
        # Was a second 'PASSRESET6', which overwrote the service-account template.
        'PASSRESET7' : ['$PASSRESET3','IN|FOR','<application_omniview%_>'],

        #My user id is john1234
        'USERID1' : ['<ID%*>'],
        'USERID2' : ['USERID','IS1','$USERID1'],
        'USERID3' : ['PPI','$USERID2','.'],
        
        #Security code is 1523
        'SEC1' : ['<number%*>'],
        'SEC2' : ['<security_code%_>','IS1','$SEC1'],
        'SEC3' : ['PPI','$SEC2','.'],
        
        
        #Request number is 1523
        'REQN1' : ['<request_number%_>','IS1','$SEC1'],
        'REQN2' : ['PPI','$REQN1','.'],
        
        
        #Password reset finished
        'PF1': ['PY','<password_reset%_>','IS1','<finished%_>'],
        
        
        #I have given access
        'ACG': ['<finished%_>','<access%_>',"TOTHIS","MACHINE","<machine%*>","."],
        'ACG2': ['<finished%_>','<access%_>',"TOTHIS","MACHINE"],
        # Was a second 'ACG2', which overwrote the template above.
        'ACG2_': ['PI','HAVE','<finished%_>','<access%_>',"TOTHIS","MACHINE"],
        'ACG3': ['<access%_>',"TOTHIS","MACHINE",'<finished%_>'],
        'ACG4': ['<access%_>','<finished%_>'],
        # Was a second 'ACG4', which overwrote the template above.
        'ACG4_': ["MACHINE",'<access%_>','<finished%_>'],

    },
    
    'greet' : {
        'G1': ['HI'],
        'G2': ['WELCOME'],
        'G3' : ['$G1','$G2','TO_COMP'],
    },
    
    'request' :{
        
        # What is your user id?
        'P0' : ['PY', '<ID%_>'],
        # '$P0' already starts with 'PY'; an extra 'PY' here produced "your your".
        'P1' : ['WHATIS', '$P0'],
        'P2' : ['COULDYOU','PLS','TELLME', '$P0','?'],
        
        #TELL me the security code
        'SEC1' : ['WHATIS','<security_code%_>'],
        'SEC2' : ['COULDYOU','PLS','TELLME', '<security_code%_>','?'],
        # Was a second 'SEC2', which overwrote the template above.
        'SEC3' : ['COULDI','HAVE', '<security_code%_>','PLS','?'],
        
        #TELL me the request number
        'REQ1' : ['WHATIS','<request_number%_>'],
        'REQ2' : ['COULDYOU','PLS','TELLME', '<request_number%_>','?'],
        
        #need access
        'AC0': ['PI','WANTTO',"<access%_>"],
        'AC1': ['PI','WANTTO',"<access%_>","TOTHIS","MACHINE","<machine%*>","."],
        
        
        #An email would be sent to your id, I will need the code. can I go ahead?
        'EMAL' : ['<send_otp%_>'],
    },
    
    'confirm' : {
        
        #you need access to this machine application1?
        'APP0': ['YOU','WANTTO','<access%_>','TOTHIS','MACHINE','?'],
        'AP1': ['YOU','WANTTO','<access%_>','TOTHIS','MACHINE','<machine%*>','?'],
        # Was a second 'AP1', which overwrote the template above.
        'AP2': ['TOTHIS','<machine%*>','?'],
        
        #do you have approval from supervisor?
        'SUP0' : ['YOU','HAVE','<approval%_>','FROM','SUPERVISOR'],
        'SUP01' : ['HAVE','PY','SUPERVISOR','<approval%_>'],
        'SUP02' : ['PY','SUPERVISOR','HAVE','<approval%_>'],
        # The next two were also named 'SUP02'; only the last one survived.
        'SUP03' : ['HAVE','YOU','RECVD','<approval%_>'],
        'SUP04' : ['HAVE','YOU','RECVD','<approval%_>','FROM','SUPERVISOR'],
        'SUP1' : ['DO_WOULD','$SUP0'],
        'SUP2' : ['COULDYOU','PLS','TELLME',"IF","$SUP0"],
        
        #Do I have admin access?
        'ADM' : ['DO_I_HAVE','<admin_access%_>'],
        
        #Can you check if you are able to login
        'LOG0': ['<login_able%_>'],
        'LOG1': ['CHECK',"IF","YOU","ARE",'$LOG0'],
        'LOG2': ['COULDYOU',"PLS",'$LOG1'],
    },
    
    'affirm' : {
        #Simple affirm
        '1': ['AFFIRM'],
        
        #Affirm email sent
        'EM1': ['<email_sent%_>']
    },
    
    'gratitude':{
        #thanks
        '1': ['THANKS']
    },
    
    'deny': {
        '1': ['HUMBLE_DENY'],
        '2': ['HUMBLE_DENY','<admin_access%_>'],
        '3': ['temp']
    },
    
    'anythingelse': {
        '1': ['ANYELSE']
    },
    
    'bye' : {
        '1' : ['BYE']
    }
}

Now generate the sentences for each intent and save them in <intent\>.dat

In [5]:
def process_slot_data(slot_data):
    """Turn a sequence of ``[name, value, length]`` segments into slot annotations.

    The segments are laid end to end: each one advances a running character
    offset by its length. Segments whose name is ``None`` are plain text (or a
    separator) and only move the offset; every other segment produces a dict
    with the keys ``"slot"``, ``"value"``, ``"start"`` and ``"end"``, where
    ``start``/``end`` are the offsets before and after the segment.

    Args:
        slot_data: iterable of indexable items ``[name, value, length]``.

    Returns:
        list of annotation dicts, in the order the named segments appear.
    """
    annotations = []
    cursor = 0
    for segment in slot_data:
        name = segment[0]
        stop = cursor + segment[2]
        if name is not None:
            annotations.append({
                "slot": name,
                "value": segment[1],
                "start": cursor,
                "end": stop,
            })
        # Named or not, the segment occupies `segment[2]` characters of the text.
        cursor = stop
    return annotations

In [6]:
import regex as re
def gen_sent_array(struct,dic,slots):
    """Render one sentence from a template and collect its segment records.

    Every token of ``struct`` is rendered with ``sample`` (in order), the
    resulting texts are joined with a single space, and the per-token segment
    lists are concatenated with a ``[None, None, 1]`` record between
    consecutive tokens to account for that space.

    Args:
        struct: list of template tokens.
        dic: the template dict used to resolve ``$NAME`` references.
        slots: the slot definitions passed through to ``sample``.

    Returns:
        ``(text, segments)`` where ``segments`` is a list of
        ``[name, value, length]`` records covering ``text`` from left to right.
        An empty ``struct`` gives ``("", [])``.
    """
    # Render every token first so the random draws happen in template order.
    rendered = [sample(token, dic, slots) for token in struct]

    sentence = " ".join(fragment for fragment, _ in rendered)

    segments = []
    for position, (_, fragment_segments) in enumerate(rendered):
        if position > 0:
            # One-character record for the space that joins two fragments.
            segments.append([None, None, 1])
        segments.extend(fragment_segments)
    return sentence, segments

def gen_slots(slots,slot):
    """Render a slot reference of the form ``'<name>%<value key>'``.

    One of the templates stored under ``slots[name][value key]`` is chosen
    uniformly at random and rendered with ``gen_sent_array``. Slot definitions
    are rendered with ``slots=None``, i.e. a slot template cannot itself
    contain slot references.

    Args:
        slots: nested dict ``{name: {value key: {template id: template}}}``.
        slot: the text between ``<`` and ``>`` of a slot token, e.g. ``'ID%*'``.

    Returns:
        ``(text, [[name, value, len(text)]])`` where ``value`` is ``None`` for
        the value key ``'_'``, the generated text for ``'*'``, and the value key
        itself otherwise.
    """
    name, mode = slot.split('%')
    templates = slots[name][mode]

    template_ids = list(templates.keys())
    picked = template_ids[np.random.randint(len(template_ids))]

    # No slots inside slots: nested rendering gets None instead of `slots`.
    text, _ = gen_sent_array(templates[picked], templates, None)

    if mode == '_':
        value = None
    elif mode == '*':
        value = text
    else:
        value = mode
    return text, [[name, value, len(text)]]

def sample_from_vocab(key):
    """Return one phrase drawn uniformly at random from a vocabulary entry.

    ``key`` is normally a key of the module-level ``VOCAB`` dict. A key of the
    form ``'A|B'`` that is not itself in ``VOCAB`` is treated as an alternation
    and sampled from the concatenation of ``VOCAB['A']`` and ``VOCAB['B']``.
    ``generate_sent`` registers such combined keys in ``VOCAB`` only for the
    tokens of the template it is called with, so without this fallback an
    alternation reached through a ``$NAME`` reference only worked if the same
    key happened to have been registered by an earlier call.

    Args:
        key: vocabulary key (string), optionally several keys joined by ``'|'``.

    Returns:
        One of the phrases listed for ``key``.

    Raises:
        KeyError: if ``key`` (or one of its ``'|'`` options) is not in
            ``VOCAB``. ``KeyError`` is a subclass of ``Exception``, so callers
            that caught the previous generic ``Exception`` keep working; the
            message now names the offending key.
    """
    if key in VOCAB:
        candidates = VOCAB[key]
    else:
        options = key.split('|')
        unknown = [option for option in options if option not in VOCAB]
        if unknown:
            raise KeyError('key not in vocabulary: %r (unknown: %s)'
                           % (key, ', '.join(repr(option) for option in unknown)))
        candidates = [phrase for option in options for phrase in VOCAB[option]]
    return candidates[np.random.randint(len(candidates))]

def sample(key,dic,slots):
    """Render a single template token.

    Three token forms are recognised, checked in this order:

    * ``'$NAME'`` - the template ``dic['NAME']`` is rendered in place.
    * ``'<...>'`` - a slot reference; the text between the brackets is handed
      to ``gen_slots``. The pattern is matched from the start of the token.
    * anything else - a vocabulary key, sampled with ``sample_from_vocab``.

    Args:
        key: the token (non-empty string).
        dic: template dict used to resolve ``$NAME`` references.
        slots: slot definitions used for slot references.

    Returns:
        ``(text, segments)`` with ``segments`` a list of
        ``[name, value, length]`` records; a plain vocabulary token yields the
        single record ``[None, None, len(text)]``.
    """
    is_reference = key[0] == '$'
    if is_reference:
        # Another template embedded in this one.
        return gen_sent_array(dic[key[1:]], dic, slots)

    slot_match = re.match(r'<(.*)>',key)
    if slot_match is not None:
        return gen_slots(slots, slot_match.group(1))

    phrase = sample_from_vocab(key)
    return phrase, [[None, None, len(phrase)]]


def generate_sent(struct,dic,slots, scale=10,NNEST=20):
    """Lazily generate random sentences for one template.

    The number of sentences is ``scale`` times the largest "size" among the
    tokens of ``struct``: a plain vocabulary token counts as the number of
    phrases in its ``VOCAB`` entry, while a nested token (``$NAME`` or
    ``<slot>``) counts as ``NNEST``. An empty ``struct`` yields nothing.

    Tokens of the form ``'A|B|C'`` are alternations: before sampling, the
    combined phrase list of ``VOCAB['A']``, ``VOCAB['B']`` and ``VOCAB['C']``
    is stored in the global ``VOCAB`` under the token itself.

    Args:
        struct: list of template tokens.
        dic: template dict used to resolve ``$NAME`` references.
        slots: slot definitions.
        scale: multiplier applied to the largest token size.
        NNEST: size assumed for a nested (``$`` or ``<``) token.

    Yields:
        ``(text, slot_annotations)`` pairs, the annotations being the output of
        ``process_slot_data``.
    """
    sizes = [-1]
    for token in struct:
        if '|' in token:
            # Register the alternation as its own VOCAB entry.
            pool = VOCAB[token] = []
            for option in token.split('|'):
                pool.extend(VOCAB[option])

        is_nested = token.startswith('$') or token.startswith('<')
        sizes.append(NNEST if is_nested else len(VOCAB[token]))

    total = max(sizes) * scale
    for _ in range(total):
        sentence, raw_segments = gen_sent_array(struct, dic, slots)
        yield sentence, process_slot_data(raw_segments)

In [7]:
# Smoke test: draw a single sample from the simplest password-reset template.
# generate_sent is a generator function, so nothing is sampled until next() is
# called; the sample is random, so the displayed output varies between runs.
x=generate_sent(intents['inform']['PASSRESET1'],intents['inform'],slots)
next(x)

('reset password',
 [{'slot': 'password_reset', 'value': None, 'start': 0, 'end': 14}])

In [8]:
import json
# Export: one JSON file per dialogue act, named <act>.dat, holding a list of
# {'text', 'act', 'slots'} records for every template of that act.
for qq in intents.keys():
    # Generate everything for this act before opening the file: opening with
    # 'w' truncates it, so an exception during generation would otherwise leave
    # an empty <act>.dat in place of any previously written data.
    data = []
    for q in intents[qq]:
        gen=generate_sent(intents[qq][q],intents[qq],slots,scale=5,NNEST=10)
        for a,sdata in gen:
            data.append({'text':a,'act':qq,'slots':sdata})
    with open(qq+'.dat','w') as f:
        json.dump(data,f, indent=4, sort_keys=True)