In [1]:
import collections
import copy
import json
import re

from nltk.tokenize import TweetTokenizer


In [2]:
def split_que_ans(game,human=False):
    """Split every game's dialogue string into parallel question/answer lists.

    Args:
        game: mapping of game id -> dict. Each dict must hold the dialogue
            string under 'gen_dialogue' (or under 'true_dialogue' when
            ``human`` is True).
        human: when True, the dialogue is read from 'true_dialogue', the
            standalone words Yes/yes/No/no/NA/na are wrapped in angle
            brackets so they can be used as turn delimiters, and the copy's
            'gen_dialogue' is overwritten with the raw 'true_dialogue'.

    Returns:
        A deep copy of ``game`` in which every entry additionally has
        'que' (list of question strings, each ending in ' ?') and
        'ans' (list of lower-cased, stripped answer strings).
        The input mapping is not modified.
    """
    # Whole-word match only: a plain substring replace would also rewrite
    # "know", "not", "banana", ... and break those turns apart.
    answer_word = re.compile(r'\b(Yes|yes|No|no|NA|na)\b')

    # One deep copy is enough; entries are then edited in place on the copy.
    new_game = copy.deepcopy(game)
    for key, entry in new_game.items():
        if human:
            dialogue = answer_word.sub(r'<\1>', entry['true_dialogue'])
            entry['gen_dialogue'] = entry['true_dialogue']
        else:
            dialogue = entry['gen_dialogue']

        dialogue = dialogue.replace('<start>', '')
        # replace < and > in unk token so we can split on '>' next
        dialogue = dialogue.replace('<unk>', '_unk_')

        entry['que'] = []
        entry['ans'] = []
        # Every answer token ends with '>', so each piece is "question ? <answer".
        for turn in dialogue.split('>'):
            turn = turn.replace('<', '')
            if not turn:
                continue
            try:
                que, ans = turn.split('?')
            except ValueError:
                # Piece does not contain exactly one '?' (e.g. a trailing
                # stop token): best-effort parsing, skip it.
                continue
            entry['que'].append(que + ' ?')
            entry['ans'].append(ans.lower().strip())
    print('Spliting is done')
    return new_game

In [3]:
def lexicalDiversity(game,maxQ=-1):
    """Return the type-token ratio of all question tokens, as a percentage.

    LexicalDiversity as type token ratio, see
    https://www.sltinfo.com/type-token-ratio/

    Args:
        game: mapping of game id -> dict with 'que' and 'ans' lists.
        maxQ: if > 0, only the first ``maxQ`` questions of each game are
            used (e.g. to analyse 5Q or 6Q dialogues only); otherwise all
            questions are used.

    Returns:
        ``100 * (#distinct tokens) / (#tokens)`` over the questions of all
        games, tokenized case-insensitively with TweetTokenizer.
        Returns 0.0 when there are no tokens at all.
    """
    tknzr = TweetTokenizer(preserve_case=False)
    q_tokens = []
    for key in game:
        ques = game[key]['que']
        anss = game[key]['ans']

        # Only questions that have a paired answer are counted.
        paired = [que for que, _ in zip(ques, anss)]
        if maxQ > 0:
            paired = paired[:maxQ]

        for que in paired:
            q_tokens.extend(tknzr.tokenize(que))

    if not q_tokens:
        # Empty input: avoid a ZeroDivisionError.
        return 0.0
    return len(set(q_tokens)) * 100 / len(q_tokens)


In [4]:
def questionDiversity(game,maxQ=-1, human=False):
    """Compute question diversity and the share of games with repeated questions.

    Args:
        game: mapping of game id -> dict with a 'que' list of question
            strings.
        maxQ: if > 0, only the first ``maxQ`` questions of each game are
            considered (e.g. to analyse 5Q or 6Q dialogues only).
        human: when True, additionally build a token vocabulary with
            TweetTokenizer over the same questions and print its size.

    Returns:
        dict with keys
        'num_que'        - number of questions considered,
        'num_unique_que' - number of distinct question strings,
        'que_divesity'   - 100 * num_unique_que / num_que (0.0 if no questions),
        '%_rep_game'     - percentage of games in which some question string
                           occurs more than once (0.0 if ``game`` is empty),
        'len_vocab'      - number of whitespace-separated words seen, plus
                           the 8 default vocabulary tokens.
    """
    # Default words in the vocabulary
    default_tokens = ['<padding>',
                      '<start>',
                      '<stop>',
                      '<stop_dialogue>',
                      '<unk>',
                      '<yes>',
                      '<no>',
                      '<n/a>',
                      ]

    def _limited(questions):
        # Apply the optional per-game cap on the number of questions.
        return questions[:maxQ] if maxQ > 0 else questions

    if human:
        word2i = {tok: idx for idx, tok in enumerate(default_tokens)}

        min_occ = 1
        word2occ = collections.OrderedDict()
        tknzr = TweetTokenizer(preserve_case=False)

        for key in game:
            # The cap is applied here as well, so this vocabulary is built
            # from exactly the questions counted below.
            for que in _limited(game[key]['que']):
                for tok in tknzr.tokenize(que):
                    word2occ[tok] = word2occ.get(tok, 0) + 1
        for word, occ in word2occ.items():
            if occ >= min_occ and word.count('.') <= 1:
                word2i[word] = len(word2i)
        print(len(word2i))

    # Sets give O(1) membership tests; only the sizes are reported.
    vocab = set(default_tokens)
    unique_que = set()
    all_q_count = 0
    game_rep = 0

    for key in game:
        seen_in_game = set()
        repeated = False
        for que in _limited(game[key]['que']):
            vocab.update(que.split())
            unique_que.add(que)
            if que in seen_in_game:
                repeated = True
            seen_in_game.add(que)
            all_q_count += 1
        if repeated:
            game_rep += 1

    num_unique_que = len(unique_que)
    num_games = len(game)

    out = {}
    out['num_que'] = all_q_count
    out['num_unique_que'] = num_unique_que
    # Guard both ratios against empty input instead of raising ZeroDivisionError.
    out['que_divesity'] = num_unique_que*100/all_q_count if all_q_count else 0.0
    out['%_rep_game'] = game_rep * 100 / num_games if num_games else 0.0
    out['len_vocab'] = len(vocab)

    print('all_que_count =',all_q_count )
    return out

In [5]:
# Relative path of the input file that the following cells open and parse.
file_name = '../data/dummy.json'

In [6]:
# Read the games from disk. JSON text is UTF-8, so the encoding is set
# explicitly instead of relying on the platform's default locale encoding.
with open(file_name, encoding='utf-8') as file:
    game = json.load(file)

In [7]:
# Rebind `game` to the result of split_que_ans (called with its default human=False).
game = split_que_ans(game)

Spliting is done


In [None]:
# TABLE 2 in the NAACL Paper: compute both diversity metrics on `game`
# and collect the reported numbers in a single dictionary.
lexDiv = lexicalDiversity(game)
queDiv = questionDiversity(game)

out = {
    'LexicalDiversity': lexDiv,
    'QuestionDiversity': queDiv['que_divesity'],
    '% Game with repeated Q\'s': queDiv['%_rep_game'],
}

In [19]:
# Bare expression: the notebook shows the `out` dictionary as this cell's output.
out

{'LexicalDiversity': 0.07080805140720507,
 'QuestionDiversity': 14.846348221305652,
 '% Game with repeated Q': 59.97056968677738}