## AMI corpus. 
Go through the list of files and extract passage and the summaries, using the DialogueActs as intermediary

In [1]:
import glob, os
#import xml.etree.ElementTree as et
from lxml import etree as et
from collections import OrderedDict, defaultdict
ROOT_DIR = "/Users/haileywu/Desktop/W266-fall-2020-hwu-rnair/data/AMI_manual/ami_public"

### Extract words

In [2]:
meeting_dict = dict() # key = meeting, value = ordered dict of word id: word
cur_word = None
for word_f in sorted(glob.glob(ROOT_DIR+"/words/*xml")):
    m = word_f.split("/")[-1].split(".")
    meeting_name = m[0]+"."+m[1]
    root = et.parse(word_f).getroot()
    nodes = root.findall('*')
    meet_words = OrderedDict()

    for node in nodes:
        if node.tag=='w':
            meet_words[node.attrib['{http://nite.sourceforge.net/}id']] = node.text
            cur_word = node.text
        for w in node.values():
            if "disfmarker" in w:
                meet_words [node.attrib['{http://nite.sourceforge.net/}id']] = cur_word
    meeting_dict[meeting_name] = meet_words
    
#meeting_dict['Bdb001.D']['Bdb001.w.2,391']

##### Extract complete text and store in files under ../ICSI_plus_NXT/processing/_meeting_id_words.txt file

In [3]:
len(meeting_dict.keys())

for meeting, words in meeting_dict.items():
    with open("../data/AMI_manual/processing/"+meeting+"_words.txt", "w") as f:
        for word in words.values():
            f.write(word)
            f.write(" ")

### Extract dialogues

Dialogues are equivalent to complete sentences

In [4]:
meeting_dialogues = dict() # key = meeting name, value = list of tuples start, end word ids(end can be same as start)
for dialog_f in sorted(glob.glob(ROOT_DIR+"/dialogueActs/*act.xml")):
    m = dialog_f.split("/")[-1].split(".")
    meeting_name = m[0]+"."+m[1]
    root = et.parse(dialog_f).getroot()
    dlist = root.findall('dact')
    for dl in dlist:
        nodes = dl.findall("{http://nite.sourceforge.net/}child")
        dialog_id = dl.attrib['{http://nite.sourceforge.net/}id']
        dialog_id = dialog_id.split('.')
        del dialog_id[3]
        dialog_id = '.'.join(dialog_id)
        #print(dialog_id)
        for dl_c in nodes:
            words = dl_c.attrib['href']
            words = words.split("#")[1]
            try:
                start, end = words.split("..")
                start, end = start.replace("id", ""), end.replace("id", "")
                start, end = start.replace("(", ""), end.replace("(", "")
                start, end = start.replace(")", ""), end.replace(")", "")
    #            print(start, end)
            except:
                start = end = words.replace("id", "").replace("(", "").replace(")", "")
                
    #            print(start)
            if meeting_name not in meeting_dialogues:
                meeting_dialogues[meeting_name] = OrderedDict()
            meeting_dialogues[meeting_name][dialog_id] = ((start, end))




#####  Extract complete dialogues and store in files under ../AMI_manual/processing/_meeting_id_dialogues.txt file

In [8]:
print(len(meeting_dialogues.keys()))
dialogues = dict() # key = dialogue id, value = text (required for summaries)
for meeting, sentence_map in meeting_dialogues.items():
    with open("../data/AMI_manual/processing/"+meeting+"_dialogues.txt", "w") as f:
        starts, ends = dict(), dict()
        for d_id, (start, end) in sentence_map.items():
            starts[start] = d_id
            ends[end] = d_id
        #go through entire text and start printing if start token is in "starts". stop printing if end token is in "ends"
        print_w = False
        cur_d = None
        for word_id, word in meeting_dict[meeting].items():
            if word_id in starts:
                cur_d = starts[word_id]
                print_w = True
            if print_w:
                f.write(word + " ")
                dialogues[cur_d] = dialogues.get(cur_d, "") + word+ " "
            if word_id in ends:
                print_w = False
                f.write("\n")



556


### Extract summaries

In [9]:
meeting_summaries = defaultdict(list) # key = meeting name, value = list of tuples start, end word ids(end can be same as start)

for dialog_f in sorted(glob.glob(ROOT_DIR+"/extractive/*extsumm.xml")):
    m = dialog_f.split("/")[-1].split(".")
    meeting_name = m[0]+"."+m[1]
    root = et.parse(dialog_f).getroot()
    dlist = root.findall('extsumm/{http://nite.sourceforge.net/}child')
    for dl in dlist:
        words = dl.attrib['href']
        words = words.split("#")[1]
        try:
            start, end = words.split("..")
            start, end = start.replace("id", ""), end.replace("id", "")
            start, end = start.replace("(", ""), end.replace("(", "")
            start, end = start.replace(")", ""), end.replace(")", "")
            start, end = start.split('.'),end.split('.')
            del start[3],end[3]
            start, end = '.'.join(start),'.'.join(end)
#            print(start, end)
        except:
            start = end = words.replace("id", "").replace("(", "").replace(")", "")
            start, end = start.split('.'),end.split('.')
            del start[3],end[3]
            start, end = '.'.join(start),'.'.join(end)
            #            print(start)
        meeting_summaries[meeting_name].append((start, end))


In [11]:
for meeting, summ_list in meeting_summaries.items():
    with open("../data/AMI_manual/processing/"+meeting+"_summaries.txt", "w") as f:
        for start, end in summ_list:
            st = int(start.split('.')[3])
            en = int(start.split('.')[3])+1
            for d in range(st, en):
                key = start.split(str(st))[0]+str(d)
                if key not in dialogues:
                    print("missing", key)
                else:
                    f.write(dialogues.get(key))
                    f.write("\n")


missing ES2002a.B.dialog-act.3
missing ES2002a.B.dialog-act.33
missing ES2002b.B.dialog-act.24
missing ES2002b.C.dialog-act.4
missing ES2002b.C.dialog-act.19
missing ES2002b.C.dialog-act.39
missing ES2002b.C.dialog-act.43
missing ES2002b.B.dialog-act.140
missing ES2002b.A.dialog-act.116
missing ES2
missing ES2002c.C.dialog-act.29
missing ES2002c.B.dialog-act.71
missing ES2002c.C.dialog-act.42
missing ES2002c.C.dialog-act.48
missing ES2
missing ES2003a.C.dialog-act.17
missing ES2003
missing ES2003b.C.dialog-act.42
missing ES2003b.C.dialog-act.90
missing ES2003b.C.dialog-act.134
missing ES2003b.A.dialog-act.18
missing ES20
missing ES2003b.A.dialog-act.60
missing ES2003b.B.dialog-act.129
missing ES2003b.A.dialog-act.104
missing ES200
missing ES2003b.A.dialog-act.120
missing ES2003c.C.dialog-act.54
missing ES2003c.C.dialog-act.71
missing ES2003c.C.dialog-act.95
missing ES2003c.A.dialog-act.102
missing ES2003c.C.dialog-act.148
missing ES200
missing ES2003c.D.dialog-act.126
missing ES20
miss

missing IS1009d.D.dialog-act.260
missing TS3
missing TS3003a.A.dialog-act.66
missing TS3003a.A.dialog-act.162
missing TS30
missing TS3003b.A.dialog-act.114
missing TS3003b.A.dialog-act.228
missing TS3003b.A.dialog-act.263
missing TS3003b.A.dialog-act.284
missing TS3003c.B.dialog-act.5
missing TS3003c.B.dialog-act.21
missing TS3
missing TS30
missing TS3003c.D.dialog-act.90
missing TS30
missing TS30
missing TS3003d.A.dialog-act.79
missing TS30
missing TS3003d.A.dialog-act.180
missing TS3003d.C.dialog-act.113
missing TS3003d.C.dialog-act.134
missing TS3003d.A.dialog-act.467
missing TS30
missing TS3004a.D.dialog-act.88
missing TS3004a.B.dialog-act.74
missing TS3004a.C.dialog-act.59
missing TS3004
missing TS30
missing TS3004b.D.dialog-act.109
missing TS3004b.C.dialog-act.98
missing TS300
missing TS3004b.B.dialog-act.264
missing TS30
missing TS30
missing TS3004c.A.dialog-act.53
missing TS30
missing TS3004c.A.dialog-act.70
missing TS3004c.B.dialog-act.138
missing TS300
missing TS3004c.B.dialo