# ICSI corpus and AMI corpus

## Defined Training, Dev, and Test Dataset 
Go through the list of files and extract passage and the summaries, using the DialogueActs as intermediary
Uses the following mappings:

__1__) meeting id -> ordered dict of word id: word

__2__) meeting id -> ordered dict of dialog id -> list of (start, end) word ids that form a dialog. 

__3__) summary_starts, summary_ends -> sets of start and end dialog ids that belong to extractive summaries


function `get_words(meeting, meetingtype, start_word=None, end_word=None)` # goes through the meeting and returns words from start to end if provided. Else prints everything; `meetingtype` is either "AMI" or "ICSI"

function `extract_dialogues(meeting, meetingtype)` # goes through the meeting and returns list of  (dialogue, SummaryFlag) for the meeting, `meetingtype` is either "AMI" or "ICSI"


Ref:
* http://groups.inf.ed.ac.uk/ami/corpus/annotation.shtml
* http://groups.inf.ed.ac.uk/ami/icsi/
* https://www.groundai.com/project/end-to-end-abstractive-summarization-for-meetings/1
* https://bitbucket.org/dascim/offline_meeting_summarization/src/master/
* https://github.com/gcunhase/AMICorpusXML

*Please download the related ICSI corpus and AMI corpus from the websites, and change the root directory below to point to them each.* The saving will also require a file directory as:

* `data/ICSI_plus_NXT/Full_doc/Dev`, 
* `data/ICSI_plus_NXT/Full_doc/Train`, 
* `data/ICSI_plus_NXT/Full_doc/Test`,
* `data/AMI_manual/Full_doc/Dev`,
* `data/AMI_manual/Full_doc/Test`,
* `data/AMI_manual/Full_doc/Train`


The dataset is split on the speaker (turn) level, meaning for each hour of the meeting (e.g.ES2004a), the saving will have ES2004a.A, ES2004a.B, etc. The abstractive summary is on the meeting level (e.g. ES2004a)

**During Model Training, do separate ROUGE evaluation on AMI and ICSI datasets**

In [1]:
import glob, os
#import xml.etree.ElementTree as et
from lxml import etree as et
from collections import OrderedDict, defaultdict
import matplotlib.pyplot as plt
import numpy as np
import csv
import pandas as pd

import codecs
import re
import string
import nltk
import operator
from nltk import PerceptronTagger

ROOT_DIR_ICSI = "/Users/haileywu/Desktop/W266_project/data/ICSI_plus_NXT/ICSIplus"
ROOT_DIR_AMI = "/Users/haileywu/Desktop/W266_project/data/AMI_manual/ami_public"

## Train, Dev, Test Split

based on https://bitbucket.org/dascim/acl2018_abssumm/src/master/

In [2]:
ami_test_set = [
    'ES2004a',
    'ES2004b',
    'ES2004c',
    'ES2004d',
    'ES2014a',
    'ES2014b',
    'ES2014c',
    'ES2014d',
    'IS1009a',
    'IS1009b',
    'IS1009c',
    'IS1009d',
    'TS3003a',
    'TS3003b',
    'TS3003c',
    'TS3003d',
    'TS3007a',
    'TS3007b',
    'TS3007c',
    'TS3007d'
]

ami_development_set = [
    'ES2005b',
    'ES2005d',
    'ES2007b',
    'ES2008a',
    'ES2008d',
    'ES2015d',
    'IS1003c',
    'IS1004c',
    'IS1006b',
    'IS1006d',
    'TS3004c',
    'TS3005d',
    'TS3006c',
    'TS3008b',
    'TS3011a',

    'ES2005a',
    'ES2005c',
    'ES2007a',
    'ES2007c',
    'ES2007d',
    'ES2008b',
    'ES2008c',
    'ES2015a',
    'ES2015b',
    'ES2015c',
    'IS1003a',
#    'IS1003b',
    'IS1003d',
    'IS1004a',
    'IS1004b',
    'IS1004d',
    'IS1006a',
    'IS1006c',
    'TS3004a',
    'TS3004b',
    'TS3004d',
    'TS3005a',
    'TS3005b',
    'TS3005c',
    'TS3006a',
    'TS3006b',
    'TS3006d',
    'TS3008a',
    'TS3008c',
    'TS3008d',
    'TS3011b',
    'TS3011c',
    'TS3011d'
]

icsi_test_set = [
    'Bed004',
    'Bed009',
    'Bed016',
    'Bmr005',
    'Bmr019',
    'Bro018'
]

icsi_development_set = [
    'Bed003',
    'Bed006',
    'Bed011',
    'Bed014',
    'Bed015',
    'Bed017',
    'Bmr013',
    'Bmr014',
    'Bmr015',
    'Bmr020',
    'Bro023',
    'Bro024',
    'Bro025',
    'Bro026',
    'Bro027',

    'Bed002',
    'Bed005',
    'Bed008',
    'Bed010',
    'Bed012',
    'Bed013',
    'Bmr003',
    'Bmr006',
    'Bmr007',
    'Bmr018',
]

### Extract words

__1__) meeting id -> ordered dict of word id: word

In [3]:
meeting_dict_ICSI = dict() # key = meeting, value = ordered dict of word id: word
cur_word = None
for word_f in sorted(glob.glob(ROOT_DIR_ICSI+"/Words/*xml")):
    m = word_f.split("/")[-1].split(".")
    meeting_name = m[0]+"."+m[1]
    root = et.parse(word_f).getroot()
    nodes = root.findall('*')
    meet_words = OrderedDict()

    for node in nodes:
        if node.tag=='w':
            meet_words[node.attrib['{http://nite.sourceforge.net/}id']] = node.text
            cur_word = node.text
        for w in node.values():
            if "disfmarker" in w or "pause" in w or "vocalsound" in w:
                meet_words [node.attrib['{http://nite.sourceforge.net/}id']] = None
    meeting_dict_ICSI[meeting_name] = meet_words
    
#meeting_dict_ICSI['Bdb001.D']['Bdb001.w.2,391']

In [4]:
meeting_dict_ICSI['Bdb001.D']['Bdb001.w.2,391']

'P'

In [5]:
meeting_dict_AMI = dict() # key = meeting, value = ordered dict of word id: word
cur_word = None
for word_f in sorted(glob.glob(ROOT_DIR_AMI+"/words/*xml")):
    m = word_f.split("/")[-1].split(".")
    meeting_name = m[0]+"."+m[1]
    root = et.parse(word_f).getroot()
    nodes = root.findall('*')
    meet_words = OrderedDict()

    for node in nodes:
        if node.tag=='w':
            meet_words[node.attrib['{http://nite.sourceforge.net/}id']] = node.text
            cur_word = node.text
        for w in node.values():
            if "disfmarker" in w or "pause" in w or "vocalsound" in w:
                meet_words [node.attrib['{http://nite.sourceforge.net/}id']] = None
    meeting_dict_AMI[meeting_name] = meet_words
    
#meeting_dict_AMI['EN2001a.A']['EN2001a.A.words0']

In [6]:
meeting_dict_AMI['EN2001a.A']['EN2001a.A.words0']

'Okay'

In [7]:
def get_words(meeting, meetingtype, start=None, end=None):
    ret = ""
    include = False
    if start is None:
        include = True
    if meetingtype == "AMI":
        for meet_word in meeting_dict_AMI[meeting].keys():

            if start is not None and meet_word == start:
                include = True
            if include:
                if meeting_dict_AMI[meeting][meet_word] is not None:
                    ret += meeting_dict_AMI[meeting][meet_word] + " "
            if end is not None and meet_word == end:
                include = False
        return ret
    elif meetingtype == "ICSI":
        for meet_word in meeting_dict_ICSI[meeting].keys():

            if start is not None and meet_word == start:
                include = True
            if include:
                if meeting_dict_ICSI[meeting][meet_word] is not None:
                    ret += meeting_dict_ICSI[meeting][meet_word] + " "
            if end is not None and meet_word == end:
                include = False
        return ret
    

In [8]:
get_words('Bdb001.A', "ICSI", "Bdb001.w.915", "Bdb001.disfmarker.49")

"So you 're essentially defining a lattice . Yeah . How - how Oh , that 's "

In [9]:
get_words('EN2001a.A',"AMI",'EN2001a.A.words0','EN2001a.A.words1')

'Okay . '

### Extract Dialogues

__2__) meeting id -> ordered dict of dialog id -> tuple of (start, end) word ids that form a dialog. 

In [10]:
meeting_dialogues_ICSI = dict() # key = meeting name, value = ordered dict of dialog id -> List of (start, end, SummaryFlag) word ids that form a dialog. 
for dialog_f in sorted(glob.glob(ROOT_DIR_ICSI+"/DialogueActs/*acts.xml")):
    m = dialog_f.split("/")[-1].split(".")
    meeting_name = m[0]+"."+m[1]
    root = et.parse(dialog_f).getroot()
    dlist = root.findall('dialogueact')
    for dl in dlist:
        nodes = dl.findall("{http://nite.sourceforge.net/}child")
        dialog_id = dl.attrib['{http://nite.sourceforge.net/}id']

        for dl_c in nodes:
            words = dl_c.attrib['href']
            words = words.split("#")[1]
            try:
                start, end = words.split("..")
                start, end = start.replace("id", ""), end.replace("id", "")
                start, end = start.replace("(", ""), end.replace("(", "")
                start, end = start.replace(")", ""), end.replace(")", "")
            except:
                start = end = words.replace("id", "").replace("(", "").replace(")", "")
            if meeting_name not in meeting_dialogues_ICSI:
                meeting_dialogues_ICSI[meeting_name] = OrderedDict()
            meeting_dialogues_ICSI[meeting_name][dialog_id] = [start, end, None]



In [11]:
meeting_dialogues_ICSI['Bdb001.A']['Bdb001.A.dialogueact74']

['Bdb001.w.691', 'Bdb001.w.700', None]

In [12]:
meeting_dialogues_AMI = dict() # key = meeting name, value = ordered dict of dialog id -> List of (start, end, SummaryFlag) word ids that form a dialog. 
for dialog_f in sorted(glob.glob(ROOT_DIR_AMI+"/DialogueActs/*act.xml")):
    m = dialog_f.split("/")[-1].split(".")
    meeting_name = m[0]+"."+m[1]
    root = et.parse(dialog_f).getroot()
    dlist = root.findall('dact')
    for dl in dlist:
        nodes = dl.findall("{http://nite.sourceforge.net/}child")
        dialog_id = dl.attrib['{http://nite.sourceforge.net/}id']

        for dl_c in nodes:
            words = dl_c.attrib['href']
            words = words.split("#")[1]
            try:
                start, end = words.split("..")
                start, end = start.replace("id", ""), end.replace("id", "")
                start, end = start.replace("(", ""), end.replace("(", "")
                start, end = start.replace(")", ""), end.replace(")", "")
            except:
                start = end = words.replace("id", "").replace("(", "").replace(")", "")
            if meeting_name not in meeting_dialogues_AMI:
                meeting_dialogues_AMI[meeting_name] = OrderedDict()
            meeting_dialogues_AMI[meeting_name][dialog_id] = [start, end, None]



In [13]:
meeting_dialogues_AMI['ES2002a.A']['ES2002a.A.dialog-act.dharshi.1']

['ES2002a.A.words0', 'ES2002a.A.words12', None]

### Extractive summaries

__3__) meeting id -> set of dialog ids that belong to extractive summaries

In [14]:
summary_starts_ICSI, summary_ends_ICSI = set(),set() # key = meeting name, list (2) of set of Dialogue starts and ends

for dialog_f in sorted(glob.glob(ROOT_DIR_ICSI+"/Contributions/Summarization/extractive/*extsumm.xml")):
    m = dialog_f.split("/")[-1].split(".")
    meeting_name = m[0]+"."+m[1]
    root = et.parse(dialog_f).getroot()
    dlist = root.findall('extsumm/{http://nite.sourceforge.net/}child')
    for dl in dlist:
        dialogs = dl.attrib['href']
        dialogs = dialogs.split("#")[1]
        start = end = None
        try:
            start, end = dialogs.split("..")
            start, end = start.replace("id", ""), end.replace("id", "")
            start, end = start.replace("(", ""), end.replace("(", "")
            start, end = start.replace(")", ""), end.replace(")", "")
        except:
            start = end = dialogs.replace("id", "").replace("(", "").replace(")", "")
        summary_starts_ICSI.add(start)
        summary_ends_ICSI.add(end)

In [15]:
summary_starts_AMI, summary_ends_AMI = set(),set() # key = meeting name, list (2) of set of Dialogue starts and ends

for dialog_f in sorted(glob.glob(ROOT_DIR_AMI+"/extractive/*extsumm.xml")):
    m = dialog_f.split("/")[-1].split(".")
    meeting_name = m[0]+"."+m[1]
    root = et.parse(dialog_f).getroot()
    dlist = root.findall('extsumm/{http://nite.sourceforge.net/}child')
    for dl in dlist:
        dialogs = dl.attrib['href']
        dialogs = dialogs.split("#")[1]
        start = end = None
        try:
            start, end = dialogs.split("..")
            start, end = start.replace("id", ""), end.replace("id", "")
            start, end = start.replace("(", ""), end.replace("(", "")
            start, end = start.replace(")", ""), end.replace(")", "")
        except:
            start = end = dialogs.replace("id", "").replace("(", "").replace(")", "")
        summary_starts_AMI.add(start)
        summary_ends_AMI.add(end)

In [16]:
def extract_dialogues(meeting, meetingtype):
    # loop through all the meetings and (optionally) return only the ones that fall in range of starts/ends
    ret = []
    include = False
    if meetingtype == "AMI":
        dialogues = meeting_dialogues_AMI[meeting]
        for dialog, (start_w, end_w, _) in dialogues.items():
            if dialog in summary_starts_AMI:
                include = True
            if include:
                ret.append((dialog, start_w, end_w, get_words(meeting, meetingtype, start_w, end_w), True))
                meeting_dialogues_AMI[meeting][dialog][2] = True
            else:
                ret.append((dialog, start_w, end_w, get_words(meeting, meetingtype, start_w, end_w), False))
                meeting_dialogues_AMI[meeting][dialog][2] = False
            if dialog in summary_ends_AMI:
                include = False
        return ret

    elif meetingtype == "ICSI":
        dialogues = meeting_dialogues_ICSI[meeting]
        for dialog, (start_w, end_w, _) in dialogues.items():
            if dialog in summary_starts_ICSI:
                include = True
            if include:
                ret.append((dialog, start_w, end_w, get_words(meeting, meetingtype, start_w, end_w), True))
                meeting_dialogues_ICSI[meeting][dialog][2] = True
            else:
                ret.append((dialog, start_w, end_w, get_words(meeting, meetingtype, start_w, end_w), False))
                meeting_dialogues_ICSI[meeting][dialog][2] = False
            if dialog in summary_ends_ICSI:
                include = False
        return ret

In [17]:
extract_dialogues('Bed009.G',"ICSI")

[('Bed009.G.dialogueact1133',
  'Bed009.w.9,951',
  'Bed009.w.9,955',
  'Is it i in , ',
  False),
 ('Bed009.G.dialogueact1134',
  'Bed009.w.9,956',
  'Bed009.w.9,966',
  'then , your place , in five five - A ? ',
  False)]

In [18]:
extract_dialogues('ES2004a.A', "AMI")

[('ES2004a.A.dialog-act.s9553330.1',
  'ES2004a.A.words0',
  'ES2004a.A.words3',
  'Hmm hmm hmm . ',
  False),
 ('ES2004a.A.dialog-act.s9553330.2',
  'ES2004a.A.words4',
  'ES2004a.A.words5',
  'Yeah . ',
  False),
 ('ES2004a.A.dialog-act.s9553330.3',
  'ES2004a.A.words6',
  'ES2004a.A.words12',
  "Okay . Yep , yep . Okay . Tu tu tu tu Hi , good morning . 'Kay . Oops . Mm . Oh sorry . Mm-hmm . Yeah , me . Cat . Where did this come from ? Uh , yep . Thank you . Uh , maybe you can guess what I'm trying to make ? Yep . It's actually sitting , so it's sitting , it's not standing . Okay , I see it as one thing it's very supportive . It's your best friend and your you can talk to a dog , it can be your best friend , it doesn't discriminate between you , based on what you are . Second it's loyal and third thing it's got intuition . dogs can som sometimes can make out between a thief and a person so basically these are the three unique features I think belong to a dog . Thank you . Okay . Sorr

## Abstractive Summary


In [19]:
# return the dictionary abstractive_summary[sentenceid]
as_dict_ICSI = dict()
for sum_f in sorted(glob.glob(ROOT_DIR_ICSI+"/Contributions/Summarization/abstractive/*abssumm.xml")):
    m = sum_f.split("/")[-1].split(".")
    meeting_name = m[0]
    root = et.parse(sum_f).getroot()
    dlist = root.findall('abstract/sentence')
    # getting the abstractive summary sentences only in the abstractive summary
    # excluding question and progress part
    for dl in dlist:
        sentenceid = dl.attrib["{http://nite.sourceforge.net/}id"]
        if meeting_name not in as_dict_ICSI:
            as_dict_ICSI[meeting_name]= OrderedDict()
        as_dict_ICSI[meeting_name][sentenceid] = dl.text

In [20]:
# return the dictionary abstractive_summary[sentenceid]
as_dict_AMI = dict()
for sum_f in sorted(glob.glob(ROOT_DIR_AMI+"/abstractive/*abssumm.xml")):
    m = sum_f.split("/")[-1].split(".")
    meeting_name = m[0]
    root = et.parse(sum_f).getroot()
    dlist = root.findall('abstract/sentence')
    # getting the abstractive summary sentences only in the abstractive summary
    # excluding question and progress part
    for dl in dlist:
        sentenceid = dl.attrib["{http://nite.sourceforge.net/}id"]
        if meeting_name not in as_dict_AMI:
            as_dict_AMI[meeting_name]= OrderedDict()
        as_dict_AMI[meeting_name][sentenceid] = dl.text

### Abstractive Summary and Extractive Summary Link

#### ICSI

In [21]:
# return the relationship between extractive and abstractive 
# summary_links[extractive_sentence_id] = abstractive_sentence_id
summary_links_ICSI = dict()
for summary_link in sorted(glob.glob(ROOT_DIR_ICSI+"/Contributions/Summarization/extractive/*summlink.xml")):
    m = summary_link.split("/")[-1].split(".")
    meeting_name = m[0]
    root = et.parse(summary_link).getroot()
    dlist = root.findall('summlink')
    for dl in dlist:
        #d = dl.findall('{http://nite.sourceforge.net/}pointer')
        for d in dl: 
            if d.attrib['role'] == 'abstractive':
                abstractive_sentence_id = d.attrib['href'].split("(")[1].split(")")[0]
                #abstractive_meeting_id = abstractive_sentence_id.split(".")[0]
            elif d.attrib['role']=='extractive': 
                extractive_sentence_id = d.attrib['href'].split("(")[1].split(")")[0]
                #extractive_meeting_id = extractive_sentence_id.split(".")[0] + "." + extractive_sentence_id.split(".")[1]  
        if meeting_name not in summary_links_ICSI:
            summary_links_ICSI[meeting_name]= defaultdict(set)
        summary_links_ICSI[meeting_name][extractive_sentence_id].add(abstractive_sentence_id)

In [22]:
s1 = summary_links_ICSI['Bdb001']['Bdb001.F.dialogueact37']
for i in s1:
    print(as_dict_ICSI['Bdb001'][i])

Two main options were discussed as to the organisation of the collected data.


#### AMI

In [23]:
# return the relationship between extractive and abstractive 
# summary_links[extractive_sentence_id] = abstractive_sentence_id
summary_links_AMI = dict()
for summary_link in sorted(glob.glob(ROOT_DIR_AMI+"/extractive/*summlink.xml")):
    m = summary_link.split("/")[-1].split(".")
    meeting_name = m[0]
    root = et.parse(summary_link).getroot()
    dlist = root.findall('summlink')
    for dl in dlist:
        #d = dl.findall('{http://nite.sourceforge.net/}pointer')
        for d in dl: 
            if d.attrib['role'] == 'abstractive':
                abstractive_sentence_id = d.attrib['href'].split("(")[1].split(")")[0]
                #abstractive_meeting_id = abstractive_sentence_id.split(".")[0]
            elif d.attrib['role']=='extractive': 
                extractive_sentence_id = d.attrib['href'].split("(")[1].split(")")[0]
                #extractive_meeting_id = extractive_sentence_id.split(".")[0] + "." + extractive_sentence_id.split(".")[1]  
        if meeting_name not in summary_links_AMI:
            summary_links_AMI[meeting_name]= defaultdict(set)
        summary_links_AMI[meeting_name][extractive_sentence_id].add(abstractive_sentence_id)

### Stop Words 

In [201]:
# def clean_text(text, stopwords, remove_stopwords=True, pos_filtering=False, stemming=True, lower_case=True):
#     if lower_case:
#         # convert to lower case
#         text = text.lower()
#     # strip extra white space
#     text = re.sub(' +', ' ', text)
#     # strip leading and trailing white space
#     text = text.strip()
#     # tokenize (split based on whitespace)
#     tokens = text.split(' ')

#     # remove punctuation
#     tokens = [t for t in tokens if t not in string.punctuation]

#     if pos_filtering:
#         tagger = PerceptronTagger()
#         # apply POS-tagging
#         tagged_tokens = tagger.tag(tokens)
#         # retain only nouns and adjectives
#         tokens = [item[0] for item in tagged_tokens if item[1] in ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']]
#     if remove_stopwords:
#         # remove stopwords
#         tokens = [token for token in tokens if token.lower() not in stopwords]
#     if stemming:
#         # http://people.scs.carleton.ca/~armyunis/projects/KAPI/porter.pdf
#         stemmer = nltk.stem.PorterStemmer()
#         # apply Porter's stemmer
#         tokens_stemmed = list()
#         for token in tokens:
#             tokens_stemmed.append(stemmer.stem(token))
#         tokens = tokens_stemmed

#     return (tokens)


In [24]:
def load_stopwords(path):
    stopwords = set([])

    for line in codecs.open(path, 'r', 'utf-8'):
        if not re.search('^#', line) and len(line.strip()) > 0:
            stopwords.add(line.strip().lower())  # lowercase

    return stopwords

In [25]:
def load_filler_words(path):
    with open(path, 'r+') as f:
        filler = f.read().splitlines()

    return filler

In [232]:
def clean_utterance(utterance, filler_words):
    utt = utterance
    # replace consecutive unigrams with a single instance
    utt = re.sub('\\b(\\w+)\\s+\\1\\b', '\\1', utt)
    # same for bigrams
    utt = re.sub('(\\b.+?\\b)\\1\\b', '\\1', utt)
    # strip extra white space
    utt = re.sub(' +', ' ', utt)
    # strip leading and trailing white space
    utt = utt.strip()

    # remove filler words # highly time-consuming
    utt = ' ' + utt + ' '
    for filler_word in filler_words:
        utt = re.sub(' ' + filler_word + ' '+ '.'+ ' ', ' ', utt)
        utt = re.sub(' ' + filler_word + ' '+ ','+ ' ', ' ', utt)
        utt = re.sub(' ' + filler_word + ' '+ '?'+ ' ', ' ', utt)
        utt = re.sub(' ' + filler_word + ' ', ' ', utt) 
        #utt = re.sub(' ' + filler_word.capitalize() + ' ', ' ', utt)

    return utt

In [202]:

def read_ami_icsi(diag, filler_words):
#     asr_output = pd.read_csv(
#         path,
#         sep='\t',
#         header=None,
#         names=['ID', 'start', 'end', 'letter', 'role', 'A', 'B', 'C', 'utt']
#     )
    utt = diag
    utterances = []
#     for tmp in zip(asr_output['role'].tolist(), asr_output['utt'].tolist()):
#         role, utt = tmp
#         # lower case
    utt = str(utt).lower()

    # remove special tag
    for ch in ['{nonvocalsound}','{vocalsound}', '{gap}', '{disfmarker}', '{comment}', '{pause}', '@reject@']:
        utt = re.sub(ch, '', utt)

    utt = re.sub("'Kay", 'okay', utt)
    utt = re.sub("'kay", 'okay', utt)
    utt = re.sub('"Okay"', 'okay', utt)
    utt = re.sub("'cause", 'cause', utt)
    utt = re.sub("'Cause", 'cause', utt)
    utt = re.sub('"cause"', 'cause', utt)
    utt = re.sub('"\'em"', 'them', utt)
    utt = re.sub('"\'til"', 'until', utt)
    utt = re.sub('"\'s"', 's', utt)
    utt = re.sub('"\"', "" , utt)
    utt = re.sub("-", ' ', utt)
    # l. c. d. -> lcd
    # t. v. -> tv
    utt = re.sub('h. t. m. l.', 'html', utt)
    utt = re.sub(r"(\w)\. (\w)\. (\w)\.", r"\1\2\3", utt)
    utt = re.sub(r"(\w)\. (\w)\.", r"\1\2", utt)
    utt = re.sub(r"(\w)\.", r"\1", utt)

    # clean_utterance, remove filler_words
    utt = clean_utterance(utt, filler_words=filler_words)

    # strip extra white space
    utt = re.sub(' +', ' ', utt)
    # strip leading and trailing white space
    utt = utt.strip()

    if not re.match(r'^[_\W]+$', utt) and utt != '':
    #if utt != '' and utt != '.' and utt != ' ' and utt!= "?" and utt!= "," and :
        utterances.append(utt)
    if len(utterances)>0:
    # remove duplicate utterances per speaker
        utterances = sorted(set(utterances), key=utterances.index)[0]
        result = str(utterances)
    else:
        result = ""
    #utterances_indexed = list(zip(range(len(utterances)), list(zip(*utterances))[0], list(zip(*utterances))[1]))
    #list(zip(*utterances))[1]
    return result

In [245]:
ROOT_DIR_UTILS = "/Users/haileywu/Desktop/W266_project/data/utils/"
filler_words = load_filler_words(ROOT_DIR_UTILS+"filler_words.less.txt")
#stopwords = load_stopwords(ROOT_DIR_UTILS+"stopwords.en.dat")

In [239]:
def clean_comma(utterance_indexed):
    #utterances_processed = []
#    for utterance_indexed in rst:
        # remove the comma at the beginning
    if len(utterance_indexed) >0:
        if utterance_indexed[0] == "," or utterance_indexed[0] == "." :
            utt_cleaned = utterance_indexed[2:]  
        else:
            utt_cleaned = utterance_indexed
        #print(utt_cleaned)
        #utterances_processed.append(utt_cleaned)
    else:
        utt_cleaned = ""
    return utt_cleaned

In [205]:
# min_words = 3
# utterances_processed =[]
# for utterance_indexed in list(rst):
#     index,role,utt = utterance_indexed
#     utt_cleaned = clean_text(
#         utt,
#         stopwords=stopwords,
#         remove_stopwords=True,
#         pos_filtering=False,
#         stemming=False,
#         # clustering based on lowercase form.
#         lower_case=True
#     )
#     if len(utt_cleaned) >= min_words:
#         utterances_processed.append((index, role, ' '.join(utt_cleaned)))


In [244]:
test = "Okay . Yep , yep . Okay . Tu tu tu tu Hi , good morning . 'Kay . Oops . Mm . Oh sorry . Mm-hmm . Yeah , me . Cat . Where did this come from ? Uh , yep . Thank you . Uh , maybe you can guess what I'm trying to make ? Yep . It's actually sitting , so it's sitting , it's not standing . Okay , I see it as one thing it's very supportive . It's your best friend and your you can talk to a dog , it can be your best friend , it doesn't discriminate between you , based on what you are . Second it's loyal and third thing it's got intuition . dogs can som sometimes can make out between a thief and a person so basically these are the three unique features I think belong to a dog . Thank you . Okay . Sorry . Does it look like a dog actually ? Mm . Eagle , okay . One point four or something like that . One point four Euro would make a Pound or something like that . Yeah . Okay , pretty huge margin . So then Mm-hmm . Yeah , that c Okay , you wanna integrate everything into one like Okay . So simplification of symbols you could think of . Mm-hmm . Menu , alright . Uh uh Right , I was thinking on the same lines you , instead of having too many b buttons and make it complicated for the user , may h maybe have an L_C_D_ di display or something like that , like a mobile , yeah and with menus . And if it's s somewhat similar to what you have on mobile phone , people might find it easier to browse and navigate also maybe . You mean to save it lesser number . Right . Mm-hmm . Okay . Mm , okay . S It might it might save a b bit of space , it's i instead of looking bulky , it might look small . But it might have its cost implications . Right . Okay . Mm , yeah . "

In [233]:
print(" "+test+" ")

 Okay . Yep , yep . Okay . Tu tu tu tu Hi , good morning . 'Kay . Oops . Mm . Oh sorry . Mm-hmm . Yeah , me . Cat . Where did this come from ? Uh , yep . Thank you . Uh , maybe you can guess what I'm trying to make ? Yep . It's actually sitting , so it's sitting , it's not standing . Okay , I see it as one thing it's very supportive . It's your best friend and your you can talk to a dog , it can be your best friend , it doesn't discriminate between you , based on what you are . Second it's loyal and third thing it's got intuition . dogs can som sometimes can make out between a thief and a person so basically these are the three unique features I think belong to a dog . Thank you . Okay . Sorry . Does it look like a dog actually ? Mm . Eagle , okay . One point four or something like that . One point four Euro would make a Pound or something like that . Yeah . Okay , pretty huge margin . So then Mm-hmm . Yeah , that c Okay , you wanna integrate everything into one like Okay . So simplifi

In [246]:
result = read_ami_icsi(test, filler_words)
clean_comma(result)

"tu hi , good morning . me . cat . where did this come from ? you can guess what i'm trying to make ? it's actually sitting , it's sitting , it's not standing . i see it as one thing it's very supportive . it's your best friend and your you can talk to a dog , it can be your best friend , it doesn't discriminate between you , based on what you are . second it's loyal and third thing it's got intuition . dogs can som sometimes can make out between a thief and a person these are the three unique features belong to a dog . does it look like a dog actually ? eagle , one point four like that . one point four euro would make a pound like that . pretty huge margin . then that c you wanna integrate everything into one like simplification of symbols you could think of . menu , alright . right , i was thinking on the same lines you , instead of having too many b buttons and make it complicated for the user , may h have an l_c_d_ di display like that , like a mobile , and with menus . and if it's

## Saving Documents

The saving will also require a file directory as:

* `data/ICSI_plus_NXT/Full_doc/Dev`, 
* `data/ICSI_plus_NXT/Full_doc/Train`, 
* `data/ICSI_plus_NXT/Full_doc/Test`,
* `data/AMI_manual/Full_doc/Dev`,
* `data/AMI_manual/Full_doc/Test`,
* `data/AMI_manual/Full_doc/Train`


The dataset is split on the speaker (turn) level, meaning for each hour of the meeting (e.g.ES2004a), the saving will have ES2004a.A, ES2004a.B, etc. The abstractive summary is on the meeting level (e.g. ES2004a)

If you would like to have another format, e.g. two columns in csv file, use the dictionaries created previously to build your output.

Dictionary List:

* `meeting_dialogues_ICSI[meeting_name][dialog_id] = [start, end, T/F]`
* `meeting_dialogues_AMI[meeting_name][dialog_id] = [start, end, T/F]`

* `as_dict_ICSI[meeting_name][sentenceid] = abstractive_summary_sentence`
* `as_dict_AMI[meeting_name][sentenceid] = abstractive_summary_sentence`

* `summary_links_ICSI[meeting_name][extractive_sentence_id] = abstractive_sentence_id`
* `summary_links_AMI[meeting_name][extractive_sentence_id] = abstractive_sentence_id`


### Save less stop words, common words 

same format as the next chunk, but with cleaning of the data itself

#### ICSI

In [253]:
# go through all meetings and write out dialogues to files
with open("../data/ICSI_plus_NXT/data_cleansing/ICSI_1024_test_cleaned.csv", mode='w') as csv_file:
    # ctext is the orginal text, while text is the extractive summary
    fieldnames = ['meeting','original','extractive','abstractive']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    
    for meeting in meeting_dialogues_ICSI.keys():
        meetingid = meeting.split(".")[0] 
        #Bdb001
        if meetingid in icsi_test_set:
            original_text = ""
            extractive_summary = ""
            abstractive_summary = ""
            # make sure each abstractive sentence appears only once
            abstractive_existing = set()
            links = summary_links_ICSI.get(meetingid,{})
            this_as_dict = as_dict_ICSI.get(meetingid,{})
            paragraph_count = 0
            for diag_id, _, _, diag, summ_flag in extract_dialogues(meeting, "ICSI"):
                # cleaning words 
                #print(diag)
                diag = read_ami_icsi(diag, filler_words)
                diag = clean_comma(diag)
                #print(diag)
                # if the count of original text words are more than x, then write the row and move to the next
                diag_word_count = len(diag.split())
                if diag_word_count + paragraph_count > 1024:
                    writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})
                    original_text = ""
                    extractive_summary = ""
                    abstractive_summary = ""
                    abstractive_existing = set()  # uniqueness of abstractive sentences per row
                    paragraph_count = 0

                # add extractive summary
                if summ_flag == 1: 
                    extractive_summary+= diag + " "

                # add abstractive summary
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    for ai in abstractive_id:
                        if ai not in abstractive_existing and ai in this_as_dict.keys():
                            abstractive_summary += as_dict_ICSI[meetingid][ai] + " "
                            abstractive_existing.add(ai)

                # add the original text
                original_text+=diag + " "
                # increase the original text word counts
                paragraph_count += diag_word_count

            # write to the row when one meeting ends 
            writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})

In [254]:
# go through all meetings and write out dialogues to files
with open("../data/ICSI_plus_NXT/data_cleansing/ICSI_1024_dev_cleaned.csv", mode='w') as csv_file:
    # ctext is the orginal text, while text is the extractive summary
    fieldnames = ['meeting','original','extractive','abstractive']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    
    for meeting in meeting_dialogues_ICSI.keys():
        meetingid = meeting.split(".")[0] 
        #Bdb001
        if meetingid in icsi_development_set:
            original_text = ""
            extractive_summary = ""
            abstractive_summary = ""
            # make sure each abstractive sentence appears only once
            abstractive_existing = set()
            links = summary_links_ICSI.get(meetingid,{})
            this_as_dict = as_dict_ICSI.get(meetingid,{})
            paragraph_count = 0
            for diag_id, _, _, diag, summ_flag in extract_dialogues(meeting, "ICSI"):
                diag = read_ami_icsi(diag, filler_words)
                diag = clean_comma(diag)
                # if the count of original text words are more than x, then write the row and move to the next
                diag_word_count = len(diag.split())
                if diag_word_count + paragraph_count > 1024:
                    writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})
                    original_text = ""
                    extractive_summary = ""
                    abstractive_summary = ""
                    abstractive_existing = set()  # uniqueness of abstractive sentences per row
                    paragraph_count = 0

                # add extractive summary
                if summ_flag == 1: 
                    extractive_summary+= diag + " "

                # add abstractive summary
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    for ai in abstractive_id:
                        if ai not in abstractive_existing and ai in this_as_dict.keys():
                            abstractive_summary += as_dict_ICSI[meetingid][ai] + " "
                            abstractive_existing.add(ai)

                # add the original text
                original_text+=diag + " "
                # increase the original text word counts
                paragraph_count += diag_word_count

            # write to the row when one meeting ends 
            writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})

In [255]:
# go through all meetings and write out dialogues to files
with open("../data/ICSI_plus_NXT/data_cleansing/ICSI_512_train_cleaned.csv", mode='w') as csv_file:
    # ctext is the orginal text, while text is the extractive summary
    fieldnames = ['meeting','original','extractive','abstractive']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    
    for meeting in meeting_dialogues_ICSI.keys():
        meetingid = meeting.split(".")[0] 
        #Bdb001
        if meetingid not in icsi_development_set and meetingid not in icsi_test_set:
            original_text = ""
            extractive_summary = ""
            abstractive_summary = ""
            # make sure each abstractive sentence appears only once
            abstractive_existing = set()
            links = summary_links_ICSI.get(meetingid,{})
            this_as_dict = as_dict_ICSI.get(meetingid,{})
            paragraph_count = 0
            for diag_id, _, _, diag, summ_flag in extract_dialogues(meeting, "ICSI"):
                diag = read_ami_icsi(diag, filler_words)
                diag = clean_comma(diag)
                # if the count of original text words are more than 512, then write the row and move to the next
                diag_word_count = len(diag.split())
                if diag_word_count + paragraph_count > 512:
                    writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})
                    original_text = ""
                    extractive_summary = ""
                    abstractive_summary = ""
                    abstractive_existing = set()  # uniqueness of abstractive sentences per row
                    paragraph_count = 0

                # add extractive summary
                if summ_flag == 1: 
                    extractive_summary+= diag + " "

                # add abstractive summary
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    for ai in abstractive_id:
                        if ai not in abstractive_existing and ai in this_as_dict.keys():
                            abstractive_summary += as_dict_ICSI[meetingid][ai] + " "
                            abstractive_existing.add(ai)

                # add the original text
                original_text+=diag + " "
                # increase the original text word counts
                paragraph_count += diag_word_count

            # write to the row when one meeting ends 
            writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})

In [176]:
df = pd.read_csv("../data/ICSI_plus_NXT/data_cleansing/ICSI_512_train_cleaned.csv")
#print(df.meeting[0])
#print(df.original[0])
#print(df.abstractive[0])
#print(df.extractive[0])
print(len(df))

1010


#### AMI

In [None]:
# go through all meetings and write out dialogues to files
with open("../data/AMI_manual/data_cleansing/AMI_512_test_cleaned.csv", mode='w') as csv_file:
    # ctext is the orginal text, while text is the extractive summary
    fieldnames = ['meeting','original','extractive','abstractive']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    
    for meeting in meeting_dialogues_AMI.keys():
        meetingid = meeting.split(".")[0] 
        #Bdb001
        if meetingid in ami_test_set:
            original_text = ""
            extractive_summary = ""
            abstractive_summary = ""
            # make sure each abstractive sentence appears only once
            abstractive_existing = set()
            links = summary_links_AMI.get(meetingid,{})
            this_as_dict = as_dict_AMI.get(meetingid,{})
            paragraph_count = 0
            for diag_id, _, _, diag, summ_flag in extract_dialogues(meeting, "AMI"):
                diag = read_ami_icsi(diag, filler_words)
                diag = clean_comma(diag)
                # if the count of original text words are more than x, then write the row and move to the next
                diag_word_count = len(diag.split())
                if diag_word_count + paragraph_count > 512:
                    writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})
                    original_text = ""
                    extractive_summary = ""
                    abstractive_summary = ""
                    abstractive_existing = set()  # uniqueness of abstractive sentences per row
                    paragraph_count = 0

                # add extractive summary
                if summ_flag == 1: 
                    extractive_summary+= diag + " "

                # add abstractive summary
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    for ai in abstractive_id:
                        if ai not in abstractive_existing and ai in this_as_dict.keys():
                            abstractive_summary += as_dict_AMI[meetingid][ai] + " "
                            abstractive_existing.add(ai)

                # add the original text
                original_text+=diag + " "
                # increase the original text word counts
                paragraph_count += diag_word_count

            # write to the row when one meeting ends 
            writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})

In [None]:
# go through all meetings and write out dialogues to files
with open("../data/AMI_manual/data_cleansing/AMI_512_dev_cleaned.csv", mode='w') as csv_file:
    # ctext is the orginal text, while text is the extractive summary
    fieldnames = ['meeting','original','extractive','abstractive']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    
    for meeting in meeting_dialogues_AMI.keys():
        meetingid = meeting.split(".")[0] 
        #Bdb001
        if meetingid in ami_development_set:
            original_text = ""
            extractive_summary = ""
            abstractive_summary = ""
            # make sure each abstractive sentence appears only once
            abstractive_existing = set()
            links = summary_links_AMI.get(meetingid,{})
            this_as_dict = as_dict_AMI.get(meetingid,{})
            paragraph_count = 0
            for diag_id, _, _, diag, summ_flag in extract_dialogues(meeting, "AMI"):
                diag = read_ami_icsi(diag, filler_words)
                diag = clean_comma(diag)
                # if the count of original text words are more than 512, then write the row and move to the next
                diag_word_count = len(diag.split())
                if diag_word_count + paragraph_count > 512:
                    writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})
                    original_text = ""
                    extractive_summary = ""
                    abstractive_summary = ""
                    abstractive_existing = set()  # uniqueness of abstractive sentences per row
                    paragraph_count = 0

                # add extractive summary
                if summ_flag == 1: 
                    extractive_summary+= diag + " "

                # add abstractive summary
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    for ai in abstractive_id:
                        if ai not in abstractive_existing and ai in this_as_dict.keys():
                            abstractive_summary += as_dict_AMI[meetingid][ai] + " "
                            abstractive_existing.add(ai)

                # add the original text
                original_text+=diag + " "
                # increase the original text word counts
                paragraph_count += diag_word_count

            # write to the row when one meeting ends 
            writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})

In [None]:
# go through all meetings and write out dialogues to files
with open("../data/AMI_manual/data_cleansing/AMI_512_train_cleaned.csv", mode='w') as csv_file:
    # ctext is the orginal text, while text is the extractive summary
    fieldnames = ['meeting','original','extractive','abstractive']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    
    for meeting in meeting_dialogues_AMI.keys():
        meetingid = meeting.split(".")[0] 
        #Bdb001
        if meetingid not in ami_development_set and meetingid not in ami_test_set:
            original_text = ""
            extractive_summary = ""
            abstractive_summary = ""
            # make sure each abstractive sentence appears only once
            abstractive_existing = set()
            links = summary_links_AMI.get(meetingid,{})
            this_as_dict = as_dict_AMI.get(meetingid,{})
            paragraph_count = 0
            for diag_id, _, _, diag, summ_flag in extract_dialogues(meeting, "AMI"):
                diag = read_ami_icsi(diag, filler_words)
                diag = clean_comma(diag)
                # if the count of original text words are more than 1024, then write the row and move to the next
                diag_word_count = len(diag.split())
                if diag_word_count + paragraph_count > 512:
                    writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})
                    original_text = ""
                    extractive_summary = ""
                    abstractive_summary = ""
                    abstractive_existing = set()  # uniqueness of abstractive sentences per row
                    paragraph_count = 0

                # add extractive summary
                if summ_flag == 1: 
                    extractive_summary+= diag + " "

                # add abstractive summary
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    for ai in abstractive_id:
                        if ai not in abstractive_existing and ai in this_as_dict.keys():
                            abstractive_summary += as_dict_AMI[meetingid][ai] + " "
                            abstractive_existing.add(ai)

                # add the original text
                original_text+=diag + " "
                # increase the original text word counts
                paragraph_count += diag_word_count

            # write to the row when one meeting ends 
            writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})

### Save Original, Extractive, Abstractive in X tokens

* *ICSI_512_train.csv*
* *ICSI_512_dev.csv*
* *ICSI_512_test.csv*

* *AMI_512_train.csv*
* *AMI_512_dev.csv*
* *AMI_512_test.csv*

* *ICSI_1024_train.csv*
* *ICSI_1024_dev.csv*
* *ICSI_1024_test.csv*

* *AMI_1024_train.csv*
* *AMI_1024_dev.csv*
* *AMI_1024_test.csv*

Columns: 'meeting','original','extractive','abstractive'

#### ICSI

In [166]:
# go through all meetings and write out dialogues to files
with open("../data/ICSI_plus_NXT/T5_csv/ICSI_1024_test.csv", mode='w') as csv_file:
    # ctext is the orginal text, while text is the extractive summary
    fieldnames = ['meeting','original','extractive','abstractive']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    
    for meeting in meeting_dialogues_ICSI.keys():
        meetingid = meeting.split(".")[0] 
        #Bdb001
        if meetingid in icsi_test_set:
            original_text = ""
            extractive_summary = ""
            abstractive_summary = ""
            # make sure each abstractive sentence appears only once
            abstractive_existing = set()
            links = summary_links_ICSI.get(meetingid,{})
            this_as_dict = as_dict_ICSI.get(meetingid,{})
            paragraph_count = 0
            for diag_id, _, _, diag, summ_flag in extract_dialogues(meeting, "ICSI"):

                # if the count of original text words are more than x, then write the row and move to the next
                diag_word_count = len(diag.split())
                if diag_word_count + paragraph_count > 1024:
                    writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})
                    original_text = ""
                    extractive_summary = ""
                    abstractive_summary = ""
                    abstractive_existing = set()  # uniqueness of abstractive sentences per row
                    paragraph_count = 0

                # add extractive summary
                if summ_flag == 1: 
                    extractive_summary+= diag + " "

                # add abstractive summary
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    for ai in abstractive_id:
                        if ai not in abstractive_existing and ai in this_as_dict.keys():
                            abstractive_summary += as_dict_ICSI[meetingid][ai] + " "
                            abstractive_existing.add(ai)

                # add the original text
                original_text+=diag + " "
                # increase the original text word counts
                paragraph_count += diag_word_count

            # write to the row when one meeting ends 
            writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})

In [25]:
df = pd.read_csv("../data/ICSI_plus_NXT/T5_csv/ICSI_512_test.csv")
print(df.meeting[0])
#print(df.original[0])
print(df.abstractive[0])
print(df.extractive[0])

Bed004.A
It is not a working net yet, but identifying clusters of features that define the output mode provides a visual aid for further work. 
So , what I did for this this is uh , a pedagogical belief - net  


In [67]:
summary_links_ICSI['Bro027']

defaultdict(set,
            {'Bro027.B.dialogueact18': {'Bro027.s.14'},
             'Bro027.B.dialogueact25': {'Bro027.s.14'},
             'Bro027.B.dialogueact26': {'Bro027.s.14'},
             'Bro027.B.dialogueact39': {'Bro027.s.4'},
             'Bro027.B.dialogueact40': {'Bro027.s.4'},
             'Bro027.B.dialogueact42': {'Bro027.s.4'},
             'Bro027.B.dialogueact95': {'Bro027.s.4'},
             'Bro027.B.dialogueact96': {'Bro027.s.4'},
             'Bro027.B.dialogueact185': {'Bro027.s.4'},
             'Bro027.A.dialogueact243': {'Bro027.s.15'},
             'Bro027.A.dialogueact245': {'Bro027.s.15'},
             'Bro027.A.dialogueact247': {'Bro027.s.15'},
             'Bro027.A.dialogueact248': {'Bro027.s.15'},
             'Bro027.A.dialogueact251': {'Bro027.s.15'},
             'Bro027.A.dialogueact252': {'Bro027.s.15'},
             'Bro027.A.dialogueact253': {'Bro027.s.15'},
             'Bro027.A.dialogueact256': {'Bro027.s.15'},
             'Bro027.A.dialo

In [68]:
print(as_dict_ICSI['Bro027']['Bro027.s.3'])

The group discussed possible further investigations that arose from these areas, including better linking the two.


In [26]:
# go through all meetings and write out dialogues to files
with open("../data/ICSI_plus_NXT/T5_csv/ICSI_1024_dev.csv", mode='w') as csv_file:
    # ctext is the orginal text, while text is the extractive summary
    fieldnames = ['meeting','original','extractive','abstractive']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    
    for meeting in meeting_dialogues_ICSI.keys():
        meetingid = meeting.split(".")[0] 
        #Bdb001
        if meetingid in icsi_development_set:
            original_text = ""
            extractive_summary = ""
            abstractive_summary = ""
            # make sure each abstractive sentence appears only once
            abstractive_existing = set()
            links = summary_links_ICSI.get(meetingid,{})
            this_as_dict = as_dict_ICSI.get(meetingid,{})
            paragraph_count = 0
            for diag_id, _, _, diag, summ_flag in extract_dialogues(meeting, "ICSI"):

                # if the count of original text words are more than x, then write the row and move to the next
                diag_word_count = len(diag.split())
                if diag_word_count + paragraph_count > 1024:
                    writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})
                    original_text = ""
                    extractive_summary = ""
                    abstractive_summary = ""
                    abstractive_existing = set()  # uniqueness of abstractive sentences per row
                    paragraph_count = 0

                # add extractive summary
                if summ_flag == 1: 
                    extractive_summary+= diag + " "

                # add abstractive summary
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    for ai in abstractive_id:
                        if ai not in abstractive_existing and ai in this_as_dict.keys():
                            abstractive_summary += as_dict_ICSI[meetingid][ai] + " "
                            abstractive_existing.add(ai)

                # add the original text
                original_text+=diag + " "
                # increase the original text word counts
                paragraph_count += diag_word_count

            # write to the row when one meeting ends 
            writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})

In [27]:
# go through all meetings and write out dialogues to files
with open("../data/ICSI_plus_NXT/T5_csv/ICSI_1024_train.csv", mode='w') as csv_file:
    # ctext is the orginal text, while text is the extractive summary
    fieldnames = ['meeting','original','extractive','abstractive']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    
    for meeting in meeting_dialogues_ICSI.keys():
        meetingid = meeting.split(".")[0] 
        #Bdb001
        if meetingid not in icsi_development_set and meetingid not in icsi_test_set:
            original_text = ""
            extractive_summary = ""
            abstractive_summary = ""
            # make sure each abstractive sentence appears only once
            abstractive_existing = set()
            links = summary_links_ICSI.get(meetingid,{})
            this_as_dict = as_dict_ICSI.get(meetingid,{})
            paragraph_count = 0
            for diag_id, _, _, diag, summ_flag in extract_dialogues(meeting, "ICSI"):

                # if the count of original text words are more than 512, then write the row and move to the next
                diag_word_count = len(diag.split())
                if diag_word_count + paragraph_count > 1024:
                    writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})
                    original_text = ""
                    extractive_summary = ""
                    abstractive_summary = ""
                    abstractive_existing = set()  # uniqueness of abstractive sentences per row
                    paragraph_count = 0

                # add extractive summary
                if summ_flag == 1: 
                    extractive_summary+= diag + " "

                # add abstractive summary
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    for ai in abstractive_id:
                        if ai not in abstractive_existing and ai in this_as_dict.keys():
                            abstractive_summary += as_dict_ICSI[meetingid][ai] + " "
                            abstractive_existing.add(ai)

                # add the original text
                original_text+=diag + " "
                # increase the original text word counts
                paragraph_count += diag_word_count

            # write to the row when one meeting ends 
            writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})

In [28]:
df = pd.read_csv("../data/ICSI_plus_NXT/T5_csv/ICSI_1024_train.csv")
#print(df.meeting[0])
#print(df.original[0])
#print(df.abstractive[0])
#print(df.extractive[0])
print(len(df))

697


In [29]:
df[df['original'].notna()]['original'].apply(lambda x: x.split(" ")).map(lambda x: len(x))

0      1149
1      1120
2       301
3        63
4      1133
       ... 
692    1156
693      28
694    1135
695    1154
696    1037
Name: original, Length: 697, dtype: int64

In [30]:
df[df['extractive'].notna()]['extractive'].apply(lambda x: x.split(" ")).map(lambda x: len(x))

0      244
1      150
4      305
5      501
6      366
      ... 
691    171
692    146
694    203
695    178
696    210
Name: extractive, Length: 364, dtype: int64

In [31]:
df[df['abstractive'].notna()]['abstractive'].apply(lambda x: x.split(" ")).map(lambda x: len(x))

4       55
5      125
6       56
7       37
11      39
      ... 
690     16
692     42
694     16
695     17
696     24
Name: abstractive, Length: 222, dtype: int64

#### AMI

In [32]:
# go through all meetings and write out dialogues to files
with open("../data/AMI_manual/T5_csv/AMI_1024_test.csv", mode='w') as csv_file:
    # ctext is the orginal text, while text is the extractive summary
    fieldnames = ['meeting','original','extractive','abstractive']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    
    for meeting in meeting_dialogues_AMI.keys():
        meetingid = meeting.split(".")[0] 
        #Bdb001
        if meetingid in ami_test_set:
            original_text = ""
            extractive_summary = ""
            abstractive_summary = ""
            # make sure each abstractive sentence appears only once
            abstractive_existing = set()
            links = summary_links_AMI.get(meetingid,{})
            this_as_dict = as_dict_AMI.get(meetingid,{})
            paragraph_count = 0
            for diag_id, _, _, diag, summ_flag in extract_dialogues(meeting, "AMI"):

                # if the count of original text words are more than x, then write the row and move to the next
                diag_word_count = len(diag.split())
                if diag_word_count + paragraph_count > 1024:
                    writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})
                    original_text = ""
                    extractive_summary = ""
                    abstractive_summary = ""
                    abstractive_existing = set()  # uniqueness of abstractive sentences per row
                    paragraph_count = 0

                # add extractive summary
                if summ_flag == 1: 
                    extractive_summary+= diag + " "

                # add abstractive summary
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    for ai in abstractive_id:
                        if ai not in abstractive_existing and ai in this_as_dict.keys():
                            abstractive_summary += as_dict_AMI[meetingid][ai] + " "
                            abstractive_existing.add(ai)

                # add the original text
                original_text+=diag + " "
                # increase the original text word counts
                paragraph_count += diag_word_count

            # write to the row when one meeting ends 
            writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})

In [33]:
# go through all meetings and write out dialogues to files
with open("../data/AMI_manual/T5_csv/AMI_1024_dev.csv", mode='w') as csv_file:
    # ctext is the orginal text, while text is the extractive summary
    fieldnames = ['meeting','original','extractive','abstractive']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    
    for meeting in meeting_dialogues_AMI.keys():
        meetingid = meeting.split(".")[0] 
        #Bdb001
        if meetingid in ami_development_set:
            original_text = ""
            extractive_summary = ""
            abstractive_summary = ""
            # make sure each abstractive sentence appears only once
            abstractive_existing = set()
            links = summary_links_AMI.get(meetingid,{})
            this_as_dict = as_dict_AMI.get(meetingid,{})
            paragraph_count = 0
            for diag_id, _, _, diag, summ_flag in extract_dialogues(meeting, "AMI"):

                # if the count of original text words are more than 512, then write the row and move to the next
                diag_word_count = len(diag.split())
                if diag_word_count + paragraph_count > 1024:
                    writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})
                    original_text = ""
                    extractive_summary = ""
                    abstractive_summary = ""
                    abstractive_existing = set()  # uniqueness of abstractive sentences per row
                    paragraph_count = 0

                # add extractive summary
                if summ_flag == 1: 
                    extractive_summary+= diag + " "

                # add abstractive summary
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    for ai in abstractive_id:
                        if ai not in abstractive_existing and ai in this_as_dict.keys():
                            abstractive_summary += as_dict_AMI[meetingid][ai] + " "
                            abstractive_existing.add(ai)

                # add the original text
                original_text+=diag + " "
                # increase the original text word counts
                paragraph_count += diag_word_count

            # write to the row when one meeting ends 
            writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})

In [34]:
# go through all meetings and write out dialogues to files
with open("../data/AMI_manual/T5_csv/AMI_1024_train.csv", mode='w') as csv_file:
    # ctext is the orginal text, while text is the extractive summary
    fieldnames = ['meeting','original','extractive','abstractive']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    
    for meeting in meeting_dialogues_AMI.keys():
        meetingid = meeting.split(".")[0] 
        #Bdb001
        if meetingid not in ami_development_set and meetingid not in ami_test_set:
            original_text = ""
            extractive_summary = ""
            abstractive_summary = ""
            # make sure each abstractive sentence appears only once
            abstractive_existing = set()
            links = summary_links_AMI.get(meetingid,{})
            this_as_dict = as_dict_AMI.get(meetingid,{})
            paragraph_count = 0
            for diag_id, _, _, diag, summ_flag in extract_dialogues(meeting, "AMI"):

                # if the count of original text words are more than 1024, then write the row and move to the next
                diag_word_count = len(diag.split())
                if diag_word_count + paragraph_count > 1024:
                    writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})
                    original_text = ""
                    extractive_summary = ""
                    abstractive_summary = ""
                    abstractive_existing = set()  # uniqueness of abstractive sentences per row
                    paragraph_count = 0

                # add extractive summary
                if summ_flag == 1: 
                    extractive_summary+= diag + " "

                # add abstractive summary
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    for ai in abstractive_id:
                        if ai not in abstractive_existing and ai in this_as_dict.keys():
                            abstractive_summary += as_dict_AMI[meetingid][ai] + " "
                            abstractive_existing.add(ai)

                # add the original text
                original_text+=diag + " "
                # increase the original text word counts
                paragraph_count += diag_word_count

            # write to the row when one meeting ends 
            writer.writerow({'meeting':meeting,'original': original_text, 'extractive': extractive_summary, 'abstractive': abstractive_summary})

In [36]:
df = pd.read_csv("../data/AMI_manual/T5_csv/AMI_1024_train.csv")
print(df.meeting[4])
#print(df.original[4])
#print(df.abstractive[4])
#print(df.extractive[4])
print(len(df))

ES2002a.B
6755


In [37]:
df[df['original'].notna()]['original'].apply(lambda x: x.split(" ")).map(lambda x: len(x))

0        925
1         66
2       1474
3        179
4       1288
        ... 
6750     997
6751     887
6752     822
6753     970
6754     591
Name: original, Length: 6748, dtype: int64

In [38]:
df[df['extractive'].notna()]['extractive'].apply(lambda x: x.split(" ")).map(lambda x: len(x))

0        197
1          2
2       1474
3         56
11        13
        ... 
6727      34
6730      18
6732       7
6738      12
6743       5
Name: extractive, Length: 2832, dtype: int64

In [39]:
df[df['abstractive'].notna()]['abstractive'].apply(lambda x: x.split(" ")).map(lambda x: len(x))

0       48
1       36
2       36
3       36
11      36
        ..
6726    15
6727    46
6730    32
6732    32
6743    32
Name: abstractive, Length: 2446, dtype: int64

### Save Original Dialogue

*meeting_da.txt*

e.g. ES2004a.B_da.txt

#### ICSI

In [34]:
# go through all meetings and write out dialogues to files
for meeting in meeting_dialogues_ICSI.keys():
    meetingid = meeting.split(".")[0]    
    original_text = ""
    if meetingid in icsi_test_set:
        with open("../data/ICSI_plus_NXT/Full_doc/Test/"+meeting+"_da.txt", mode='w') as f:
            for _, _, _, diag,_ in extract_dialogues(meeting,"ICSI"):
                original_text+=diag + " "
            f.write(original_text)
            f.write("\n")
    elif meetingid in icsi_development_set:
        with open("../data/ICSI_plus_NXT/Full_doc/Dev/"+meeting+"_da.txt", mode='w') as f:
            for _, _, _, diag,_ in extract_dialogues(meeting,"ICSI"):
                original_text+=diag + " "
            f.write(original_text)
            f.write("\n")
    else:
        with open("../data/ICSI_plus_NXT/Full_doc/Train/"+meeting+"_da.txt", mode='w') as f:
            for _, _, _, diag,_ in extract_dialogues(meeting,"ICSI"):
                original_text+=diag + " "
            f.write(original_text)
            f.write("\n")

#### AMI

In [36]:
# go through all meetings and write out dialogues to files
for meeting in meeting_dialogues_AMI.keys():
    meetingid = meeting.split(".")[0]        
    original_text = ""
    if meetingid in ami_test_set:
        with open("../data/AMI_manual/Full_doc/Test/"+meeting+"_da.txt", mode='w') as f:
            for _, _, _, diag,_ in extract_dialogues(meeting,"AMI"):
                original_text+=diag + " "
            f.write(original_text)
            f.write("\n")
    elif meetingid in ami_development_set:
        with open("../data/AMI_manual/Full_doc/Dev/"+meeting+"_da.txt", mode='w') as f:
            for _, _, _, diag,_ in extract_dialogues(meeting,"AMI"):
                original_text+=diag + " "
            f.write(original_text)
            f.write("\n")
    else:
        with open("../data/AMI_manual/Full_doc/Train/"+meeting+"_da.txt", mode='w') as f:
            for _, _, _, diag,_ in extract_dialogues(meeting,"AMI"):
                original_text+=diag + " "
            f.write(original_text)
            f.write("\n")

### Save Extractive Summary

*meeting_es.txt*

e.g. ES2004a.A_es.txt

#### ICSI

In [38]:
# go through all meetings and write out dialogues to files
for meeting in meeting_dialogues_ICSI.keys():
    meetingid = meeting.split(".")[0]   
    extractive_summary = ""
    
    if meetingid in icsi_test_set:
        with open("../data/ICSI_plus_NXT/Full_doc/Test/"+meeting+"_es.txt", mode='w') as f:
            for _, _, _, diag,summ_flag in extract_dialogues(meeting,"ICSI"):
                if summ_flag == 1: 
                    extractive_summary+= diag + " "
            f.write(extractive_summary)
            f.write("\n")
    elif meetingid in icsi_development_set:
        with open("../data/ICSI_plus_NXT/Full_doc/Dev/"+meeting+"_es.txt", mode='w') as f:
            for _, _, _, diag,summ_flag in extract_dialogues(meeting,"ICSI"):
                if summ_flag == 1: 
                    extractive_summary+= diag + " "
            f.write(extractive_summary)
            f.write("\n")
    else:
        with open("../data/ICSI_plus_NXT/Full_doc/Train/"+meeting+"_es.txt", mode='w') as f:
            for _, _, _, diag,summ_flag in extract_dialogues(meeting,"ICSI"):
                if summ_flag == 1: 
                    extractive_summary+= diag + " "
            f.write(extractive_summary)
            f.write("\n")

#### AMI

In [39]:
# go through all meetings and write out dialogues to files
for meeting in meeting_dialogues_AMI.keys():
    meetingid = meeting.split(".")[0]   
    extractive_summary = ""
    
    if meetingid in ami_test_set:
        with open("../data/AMI_manual/Full_doc/Test/"+meeting+"_es.txt", mode='w') as f:
            for _, _, _, diag,summ_flag in extract_dialogues(meeting,"AMI"):
                if summ_flag == 1: 
                    extractive_summary+= diag + " "
            f.write(extractive_summary)
            f.write("\n")
    elif meetingid in ami_development_set:
        with open("../data/AMI_manual/Full_doc/Dev/"+meeting+"_es.txt", mode='w') as f:
            for _, _, _, diag,summ_flag in extract_dialogues(meeting,"AMI"):
                if summ_flag == 1: 
                    extractive_summary+= diag + " "
            f.write(extractive_summary)
            f.write("\n")
    else:
        with open("../data/AMI_manual/Full_doc/Train/"+meeting+"_es.txt", mode='w') as f:
            for _, _, _, diag,summ_flag in extract_dialogues(meeting,"AMI"):
                if summ_flag == 1: 
                    extractive_summary+= diag + " "
            f.write(extractive_summary)
            f.write("\n")

### Save Abstractive Summary

*meeting_as.txt*

e.g. Bed002_as.txt

#### ICSI

In [67]:
# go through all meetings and write out dialogues to files
for meeting, sentencedict in as_dict_ICSI.items():
    meetingid = meeting
    abstractive_summary = ""
    
    if meetingid in icsi_test_set:
        with open("../data/ICSI_plus_NXT/Full_doc/Test/"+meeting+"_as.txt", mode='w') as f:
            for s,v in sentencedict.items():
                abstractive_summary+= v + " "
            f.write(abstractive_summary)
            f.write("\n")
    elif meetingid in icsi_development_set:
        with open("../data/ICSI_plus_NXT/Full_doc/Dev/"+meeting+"_as.txt", mode='w') as f:
            for s,v in sentencedict.items():
                abstractive_summary+= v + " "
            f.write(abstractive_summary)
            f.write("\n")
    else:
        with open("../data/ICSI_plus_NXT/Full_doc/Train/"+meeting+"_as.txt", mode='w') as f:
            for s,v in sentencedict.items():
                abstractive_summary+= v + " "
            f.write(abstractive_summary)
            f.write("\n")

#### AMI

In [68]:
# go through all meetings and write out dialogues to files
for meeting, sentencedict in as_dict_AMI.items():
    meetingid = meeting
    abstractive_summary = ""
    
    if meetingid in ami_test_set:
        with open("../data/AMI_manual/Full_doc/Test/"+meeting+"_as.txt", mode='w') as f:
            for s,v in sentencedict.items():
                abstractive_summary+= v + " "
            f.write(abstractive_summary)
            f.write("\n")
    elif meetingid in ami_development_set:
        with open("../data/AMI_manual/Full_doc/Dev/"+meeting+"_as.txt", mode='w') as f:
            for s,v in sentencedict.items():
                abstractive_summary+= v + " "
            f.write(abstractive_summary)
            f.write("\n")
    else:
        with open("../data/AMI_manual/Full_doc/Train/"+meeting+"_as.txt", mode='w') as f:
            for s,v in sentencedict.items():
                abstractive_summary+= v + " "
            f.write(abstractive_summary)
            f.write("\n")

### Extractive Summary and Abstractive Summary with Links

*meeting_esas.csv*

e.g.Bed002.A_esas.csv

In [98]:
# go through all meetings and write out dialogues to files
fieldnames = ['meeting','abstractive', 'extractive']

for meeting in meeting_dialogues_ICSI.keys():
    meetingid = meeting.split(".")[0] 
    #Bdb001
    extractive_summary = ""
    abstractive_summary = ""
    # make sure each abstractive sentence appears only once
    abstractive_existing = set()
    links = summary_links_ICSI.get(meetingid,{})
    this_as_dict = as_dict_ICSI.get(meetingid,{})
    
    if meetingid in icsi_test_set:
        with open("../data/ICSI_plus_NXT/Full_doc/Test/"+meeting+"_esas.csv", mode='w') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()
            for diag_id, _, _, diag,summ_flag in extract_dialogues(meeting,"ICSI"):
                if summ_flag == 1: 
                    extractive_summary+= diag + " "
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    if abstractive_id not in abstractive_existing and abstractive_id in this_as_dict.keys():
                        abstractive_summary += as_dict_ICSI[meetingid][abstractive_id] + " "
                        abstractive_existing.add(abstractive_id)
            writer.writerow({'meeting':meeting,'abstractive': abstractive_summary, 'extractive': extractive_summary})
            
    elif meetingid in icsi_development_set:
        with open("../data/ICSI_plus_NXT/Full_doc/Dev/"+meeting+"_esas.csv", mode='w') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()
            for diag_id, _, _, diag,summ_flag in extract_dialogues(meeting,"ICSI"):
                if summ_flag == 1: 
                    extractive_summary+= diag + " "
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    if abstractive_id not in abstractive_existing and abstractive_id in this_as_dict.keys():
                        abstractive_summary += as_dict_ICSI[meetingid][abstractive_id] + " "
                        abstractive_existing.add(abstractive_id)
            writer.writerow({'meeting':meeting,'abstractive': abstractive_summary, 'extractive': extractive_summary})
            
    else:
        with open("../data/ICSI_plus_NXT/Full_doc/Train/"+meeting+"_esas.csv", mode='w') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()
            for diag_id, _, _, diag,summ_flag in extract_dialogues(meeting,"ICSI"):
                if summ_flag == 1: 
                    extractive_summary+= diag + " "
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    if abstractive_id not in abstractive_existing and abstractive_id in this_as_dict.keys():
                        abstractive_summary += as_dict_ICSI[meetingid][abstractive_id] + " "
                        abstractive_existing.add(abstractive_id)
            writer.writerow({'meeting':meeting,'abstractive': abstractive_summary, 'extractive': extractive_summary})
            

#### AMI

In [99]:
# go through all meetings and write out dialogues to files
fieldnames = ['meeting','abstractive', 'extractive']

for meeting in meeting_dialogues_AMI.keys():
    meetingid = meeting.split(".")[0] 
    #Bdb001
    extractive_summary = ""
    abstractive_summary = ""
    # make sure each abstractive sentence appears only once
    abstractive_existing = set()
    links = summary_links_AMI.get(meetingid,{})
    this_as_dict = as_dict_AMI.get(meetingid,{})
    
    if meetingid in ami_test_set:
        with open("../data/AMI_manual/Full_doc/Test/"+meeting+"_esas.csv", mode='w') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()
            for diag_id, _, _, diag,summ_flag in extract_dialogues(meeting,"AMI"):
                if summ_flag == 1: 
                    extractive_summary+= diag + " "
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    if abstractive_id not in abstractive_existing and abstractive_id in this_as_dict.keys():
                        abstractive_summary += as_dict_AMI[meetingid][abstractive_id] + " "
                        abstractive_existing.add(abstractive_id)
            writer.writerow({'meeting':meeting,'abstractive': abstractive_summary, 'extractive': extractive_summary})
            
    elif meetingid in ami_development_set:
        with open("../data/AMI_manual/Full_doc/Dev/"+meeting+"_esas.csv", mode='w') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()
            for diag_id, _, _, diag,summ_flag in extract_dialogues(meeting,"AMI"):
                if summ_flag == 1: 
                    extractive_summary+= diag + " "
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    if abstractive_id not in abstractive_existing and abstractive_id in this_as_dict.keys():
                        abstractive_summary += as_dict_AMI[meetingid][abstractive_id] + " "
                        abstractive_existing.add(abstractive_id)
            writer.writerow({'meeting':meeting,'abstractive': abstractive_summary, 'extractive': extractive_summary})
            
    else:
        with open("../data/AMI_manual/Full_doc/Train/"+meeting+"_esas.csv", mode='w') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()
            for diag_id, _, _, diag,summ_flag in extract_dialogues(meeting,"AMI"):
                if summ_flag == 1: 
                    extractive_summary+= diag + " "
                if diag_id in links.keys():
                    abstractive_id=links[diag_id]
                    if abstractive_id not in abstractive_existing and abstractive_id in this_as_dict.keys():
                        abstractive_summary += as_dict_AMI[meetingid][abstractive_id] + " "
                        abstractive_existing.add(abstractive_id)
            writer.writerow({'meeting':meeting,'abstractive': abstractive_summary, 'extractive': extractive_summary})
            