## AMI corpus. 
Go through the corpus files and extract each passage and its extractive summary, using the DialogueActs annotations as the intermediary.
Uses the following mappings:

__1__) meeting id -> ordered dict of word id: word

__2__) meeting id -> ordered dict of dialog id -> `[start, end, SummaryFlag]`, where `start` and `end` are the word ids that delimit the dialog. 

__3__) summary_starts, summary_ends -> sets of start and end dialog ids that belong to extractive summaries


function `get_words(meeting, start=None, end=None)` # goes through the meeting and returns the words from `start` to `end` (inclusive) if provided; otherwise returns every word of the meeting

function `extract_dialogues(meeting)` # goes through the meeting and returns a list of (dialog id, start word id, end word id, dialogue text, SummaryFlag) tuples for the meeting


In [1]:
import glob, os
#import xml.etree.ElementTree as et
from lxml import etree as et
from collections import OrderedDict, defaultdict
import matplotlib.pyplot as plt
import numpy as np
# Root directory of the AMI annotations (contains words/, DialogueActs/, extractive/).
# Set the AMI_ROOT_DIR environment variable to run the notebook on another machine;
# the original hard-coded location is kept as the default.
ROOT_DIR = os.environ.get("AMI_ROOT_DIR", "/Users/haileywu/Desktop/W266_project/data/AMI_manual/ami_public")

### Extract words

__1__) meeting id -> ordered dict of word id: word

In [2]:
# Build mapping (1): meeting id -> OrderedDict of node id -> word text (or None).
#
# Every child element that carries a nite:id is indexed, in document order.
# Only real words (<w>) keep their text; every other node is stored as None so
# that get_words() skips it when building text. Indexing the non-word nodes
# matters because get_words() only closes a range when it meets the end id
# among these keys: an endpoint missing from the dict would make the
# extraction run on to the end of the meeting.
NITE_ID = '{http://nite.sourceforge.net/}id'
NON_WORD_MARKERS = ("disfmarker", "pause", "vocalsound")

meeting_dict = dict() # key = meeting, value = ordered dict of word id: word
for word_f in sorted(glob.glob(ROOT_DIR+"/words/*xml")):
    # The first two dot-separated parts of the file name form the meeting key
    # (e.g. 'EN2001a.A'). basename() keeps this independent of the path separator.
    m = os.path.basename(word_f).split(".")
    meeting_name = m[0]+"."+m[1]
    root = et.parse(word_f).getroot()
    meet_words = OrderedDict()

    for node in root.findall('*'):
        node_id = node.attrib.get(NITE_ID)
        if node_id is None:
            # Nothing can reference a node without an id.
            continue
        # A node counts as a non-word marker when any attribute value mentions one.
        is_marker = any(marker in value
                        for value in node.values()
                        for marker in NON_WORD_MARKERS)
        if node.tag == 'w' and not is_marker:
            meet_words[node_id] = node.text
        else:
            meet_words[node_id] = None
    meeting_dict[meeting_name] = meet_words
    
#meeting_dict['EN2001a.A']['EN2001a.A.words0']

In [3]:
meeting_dict['EN2001a.A']['EN2001a.A.words0']

'Okay'

In [4]:
def get_words(meeting, start=None, end=None):
    """Return the words of ``meeting`` between two ids as a single string.

    Args:
        meeting: key into the module-level ``meeting_dict``.
        start: id at which collection begins (inclusive). ``None`` means
            collect from the first entry of the meeting.
        end: id at which collection stops (inclusive). ``None`` means
            collect up to the last entry of the meeting.

    Returns:
        The collected words, each followed by one space (so a non-empty result
        ends with a space). Entries whose stored value is ``None`` are skipped.
        If ``start`` is given but never found, the result is ``""``.

    Raises:
        KeyError: if ``meeting`` is not a key of ``meeting_dict``.
    """
    collected = []
    include = start is None

    for word_id, word in meeting_dict[meeting].items():
        if start is not None and word_id == start:
            include = True
        if include and word is not None:
            collected.append(word)
        if end is not None and word_id == end:
            if include:
                # Ids are unique dict keys, so once the range has been closed
                # it can never reopen: stop scanning the rest of the meeting.
                break
            # `end` met before `start`: nothing to close yet, keep looking.

    return "".join(word + " " for word in collected)

#get_words('Bdb001.A', "Bdb001.w.915", "Bdb001.disfmarker.49")
#get_words('Bns002.D')

In [5]:
get_words('EN2001a.A','EN2001a.A.words0','EN2001a.A.words1')

'Okay . '

### Extract Dialogues

__2__) meeting id -> ordered dict of dialog id -> `[start, end, SummaryFlag]`, where `start` and `end` are the word ids that delimit the dialog. 

In [6]:
def _word_ref_id(ref):
    """Return the bare id from one href endpoint, removing only an enclosing ``id(...)``.

    Unlike a blanket ``replace("id", "")``, this leaves any "id" that occurs
    inside the identifier itself untouched.
    """
    if ref.startswith("id(") and ref.endswith(")"):
        return ref[len("id("):-1]
    return ref


# Build mapping (2): meeting name -> OrderedDict of dialog id -> [start, end, SummaryFlag].
# start/end are the word ids delimiting the dialogue act; SummaryFlag starts as
# None and is filled in later by extract_dialogues().
meeting_dialogues = dict()
for dialog_f in sorted(glob.glob(ROOT_DIR+"/DialogueActs/*act.xml")):
    # The first two dot-separated parts of the file name form the meeting key.
    m = os.path.basename(dialog_f).split(".")
    meeting_name = m[0]+"."+m[1]
    root = et.parse(dialog_f).getroot()
    for dl in root.findall('dact'):
        dialog_id = dl.attrib['{http://nite.sourceforge.net/}id']

        for dl_c in dl.findall("{http://nite.sourceforge.net/}child"):
            # The part after '#' is the reference: presumably either a range
            # "id(first)..id(last)" or a single "id(word)" — TODO confirm
            # against the corpus documentation.
            words = dl_c.attrib['href'].split("#")[1]
            endpoints = words.split("..")
            if len(endpoints) == 2:
                start, end = (_word_ref_id(endpoint) for endpoint in endpoints)
            else:
                # Not a two-ended range: the dialogue starts and ends on the same id.
                start = end = _word_ref_id(words)
            # NOTE: if a dialogue act has several children, the last one wins.
            meeting_dialogues.setdefault(meeting_name, OrderedDict())[dialog_id] = [start, end, None]



In [7]:
meeting_dialogues['ES2002a.A']['ES2002a.A.dialog-act.dharshi.1']

['ES2002a.A.words0', 'ES2002a.A.words12', None]

### Extract summaries

__3__) summary_starts, summary_ends -> sets of start and end dialog ids that belong to extractive summaries

In [8]:
def _dialog_ref_id(ref):
    """Return the bare id from one href endpoint, removing only an enclosing ``id(...)``.

    Unlike a blanket ``replace("id", "")``, this leaves any "id" that occurs
    inside the identifier itself untouched.
    """
    if ref.startswith("id(") and ref.endswith(")"):
        return ref[len("id("):-1]
    return ref


# Build mapping (3): two sets, shared by all meetings, holding the dialog ids
# at which an extractive-summary span starts and ends respectively.
summary_starts, summary_ends = set(), set()

for dialog_f in sorted(glob.glob(ROOT_DIR+"/extractive/*extsumm.xml")):
    root = et.parse(dialog_f).getroot()
    for dl in root.findall('extsumm/{http://nite.sourceforge.net/}child'):
        # The part after '#' is the reference: presumably either a range
        # "id(first)..id(last)" or a single "id(dialog)" — TODO confirm
        # against the corpus documentation.
        dialogs = dl.attrib['href'].split("#")[1]
        endpoints = dialogs.split("..")
        if len(endpoints) == 2:
            start, end = (_dialog_ref_id(endpoint) for endpoint in endpoints)
        else:
            # A single dialogue is both the start and the end of its span.
            start = end = _dialog_ref_id(dialogs)
        summary_starts.add(start)
        summary_ends.add(end)

In [9]:
def extract_dialogues(meeting):
    """List every dialogue of ``meeting`` with its text and summary membership.

    Walks ``meeting_dialogues[meeting]`` in order. A dialogue is flagged as
    part of the extractive summary from the first id found in
    ``summary_starts`` up to and including the next id found in
    ``summary_ends``.

    Side effect: the flag is also written into the third slot of the
    dialogue's entry in ``meeting_dialogues``.

    Returns:
        A list of ``(dialog_id, start_word, end_word, text, flag)`` tuples,
        where ``text`` comes from ``get_words`` and ``flag`` is a bool.
    """
    rows = []
    in_summary = False
    for dialog_id, span in meeting_dialogues[meeting].items():
        first_word, last_word, _ = span
        if dialog_id in summary_starts:
            in_summary = True
        text = get_words(meeting, first_word, last_word)
        rows.append((dialog_id, first_word, last_word, text, in_summary))
        span[2] = in_summary
        # The closing dialogue itself still belongs to the summary span.
        if dialog_id in summary_ends:
            in_summary = False
    return rows


In [7]:
#extract_dialogues('Bns002.D')


In [10]:
import csv


In [17]:
# Go through all meetings and write their dialogues out as CSV rows.
# Each row pairs a chunk of the original transcript ('ctext') with the
# extractive-summary dialogues that fall inside that chunk ('text').

# A chunk is flushed once its original text has reached this many characters.
# The check runs before the next dialogue is appended, so chunks can exceed it.
CHUNK_MIN_CHARS = 200

# newline='' is what the csv module requires of its file object (otherwise
# extra carriage returns appear on platforms that translate line endings).
# utf-8 matches the default that pandas.read_csv uses when reading it back.
with open("../data/AMI_manual/T5_csv/ami_ori_extrac.csv", mode='w', newline='', encoding='utf-8') as csv_file:
    # ctext is the original text, while text is the extractive summary
    fieldnames = ['meeting','text', 'ctext']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for meeting in meeting_dialogues.keys():
        extractive_summary = ""
        original_text = ""
        for _, _, _, diag, summ_flag in extract_dialogues(meeting):
            if len(original_text) >= CHUNK_MIN_CHARS:
                writer.writerow({'meeting':meeting,'text': extractive_summary, 'ctext': original_text})
                extractive_summary = ""
                original_text = ""
            if summ_flag:
                extractive_summary += diag + " "
            original_text += diag + " "
        # Flush whatever is left of the meeting as its final (possibly short) chunk.
        writer.writerow({'meeting':meeting,'text': extractive_summary, 'ctext': original_text})

In [12]:
import pandas as pd


In [18]:
# Read the generated CSV back and print its first five rows as a sanity check.
df = pd.read_csv("../data/AMI_manual/T5_csv/ami_ori_extrac.csv")
print(df.head())

     meeting                                               text  \
0  ES2002a.A                                                NaN   
1  ES2002a.A                                                NaN   
2  ES2002a.A                                                NaN   
3  ES2002a.A                                                NaN   
4  ES2002a.A  and I kind of like whales .  They come in and ...   

                                               ctext  
0  Hi , I'm David and I'm supposed to be an indus...  
1  Did you get the same thing ?    Cool .  There'...  
2   Okay .  Can't draw . Um . Yeah . Um , well an...  
3  Um .  Yeah .  Um , well anyway ,  I don't know...  
4  so um fish was a natural choice .  Um , yeah ,...  


In [19]:
len(df)

24629

In [22]:
# Show the longest original-text chunk.
# .str.len() tolerates missing cells, where map(len) would raise on NaN.
# idxmax() returns an index *label*, so the row is looked up with .loc rather
# than the positional .iloc.
idx = df['ctext'].str.len().idxmax()
df.loc[idx, 'ctext']

"Wait for the marketing director actually ,  so .  Anyways . Uh . See ,  shall we wait ?  I'm not sure if he's late or delayed or whatever ,  so I'm gonna start soon ,  we have now don't have much time anyway . There you are , okay . Uh no problem . We're about to start , so have a seat . Okay , welcome again . Today , functional design phase . I'll take you over the minutes of last last meeting . Okay , that was just to get to know each other , have a little thoughts on what your vision is and on this project , so I put the minutes on the I made on the on the p the the project share , so if you wanna review them , they're there . I will do so after every meeting , so if you have some information you wanna take back you can find it there . Anyways , um today three presentations , from every one of you . Um after that I got some new project requirements from project board , so we're gonna go af go after over this later . But I wanna start with uh stuff you did first , so we can see what

In [21]:
df[df['meeting']=='TS3012b.A']['ctext'].map(len)

23610    22583
23611      230
23612    22186
23613      233
23614      237
         ...  
23768      796
23769      234
23770      379
23771      212
23772      165
Name: ctext, Length: 163, dtype: int64

In [19]:
# # go through all meetings and write out dialogues to files
# diags=[]
# summary =[]
# all_diags =[]
# for meeting in meeting_dialogues.keys():
#     with open("../data/AMI_manual/processing/"+meeting+"_summaries.txt", "w") as f:
#         for _, _, _, diag, summ_flag in extract_dialogues(meeting):
#             diags.append(diag)
#             f.write(diag)
#             f.write("|")
#             f.write(str(int(summ_flag)))
#             f.write("\n")
#     with open("../data/AMI_manual/T5_ready/"+meeting+"_summaries.txt", "w") as f:
#         for _, _, _, diag, summ_flag in extract_dialogues(meeting):
#             if summ_flag == 1:
#                 summary.append(diag)
#                 f.write(diag)
#                 f.write(" ")
#     with open("../data/AMI_manual/T5_ready/"+meeting+"_dialogues.txt", "w") as f:
#         for _, _, _, diag, summ_flag in extract_dialogues(meeting):
#             all_diags.append(diag)
#             f.write(diag)
#             f.write(" ")