In [None]:
from google.colab import drive
drive.mount('/content/gdrive')  # mount Google Drive at /content/gdrive so later cells can read files from it

Mounted at /content/gdrive


In [None]:
import codecs
import io
import operator
import re
import string

import nltk
import pandas as pd
from nltk import PerceptronTagger


def load_stopwords(path):
    """Read a UTF-8 stop-word list and return it as a set of lowercase strings.

    Lines that start with '#' are treated as comments and lines that are
    empty after stripping whitespace are skipped. Every remaining line is
    stripped and lowercased before being added to the set.

    Args:
        path: location of the stop-word file (one entry per line).

    Returns:
        set of lowercased stop words.
    """
    stopwords = set()

    # The context manager guarantees the handle is closed even if decoding fails.
    with io.open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            entry = line.strip()
            # The comment test looks at the raw line: '#' must be its first character.
            if entry and not line.startswith('#'):
                stopwords.add(entry.lower())  # lowercase

    return stopwords


def load_filler_words(path):
    """Read a filler-word list (one entry per line) and return it as a list.

    The file is opened read-only, so it also works when the file or its
    directory is not writable. Surrounding whitespace is stripped from each
    entry and blank lines are dropped, because the entries are later embedded
    in space-delimited patterns where stray whitespace would prevent a match.

    Args:
        path: location of the filler-word file.

    Returns:
        list of filler words/phrases in file order.
    """
    with open(path, 'r') as f:
        lines = f.read().splitlines()

    return [entry.strip() for entry in lines if entry.strip()]


def clean_utterance(utterance, filler_words):
    """Collapse immediate repetitions and remove filler words from an utterance.

    Steps, in order:
      1. a word immediately repeated is reduced to one instance,
      2. a longer span immediately repeated is reduced to one instance,
      3. runs of spaces are collapsed and the text is stripped,
      4. every filler word (as given, and in its ``str.capitalize()`` form)
         is removed wherever it appears as a space-delimited token.

    Args:
        utterance: the raw utterance text.
        filler_words: iterable of filler words/phrases; they are matched
            literally (regex metacharacters have no special meaning).

    Returns:
        The cleaned text, padded with one leading and one trailing space
        (the padding is what lets fillers at either end be matched). The
        call site in this notebook strips the result afterwards.
    """
    # replace consecutive unigrams with a single instance
    utt = re.sub(r'\b(\w+)\s+\1\b', r'\1', utterance)
    # same for bigrams
    utt = re.sub(r'(\b.+?\b)\1\b', r'\1', utt)
    # strip extra white space
    utt = re.sub(' +', ' ', utt)
    # strip leading and trailing white space
    utt = utt.strip()

    # remove filler words # highly time-consuming
    utt = ' ' + utt + ' '
    for filler_word in filler_words:
        for variant in (filler_word, filler_word.capitalize()):
            # re.escape: the filler is plain text, not a pattern.
            # The trailing space is only looked at, not consumed, so it can
            # serve as the leading space of an adjacent occurrence.
            utt = re.sub(' ' + re.escape(variant) + '(?= )', '', utt)

    return utt

'''
def clean_text(text, stopwords, remove_stopwords=True, pos_filtering=False, stemming=True, lower_case=True):
    if lower_case:
        # convert to lower case
        text = text.lower()
    # strip extra white space
    text = re.sub(' +', ' ', text)
    # strip leading and trailing white space
    text = text.strip()
    # tokenize (split based on whitespace)
    tokens = text.split(' ')

    # remove punctuation
    tokens = [t for t in tokens if t not in string.punctuation]

    if pos_filtering:
        tagger = PerceptronTagger()
        # apply POS-tagging
        tagged_tokens = tagger.tag(tokens)
        # retain only nouns and adjectives
        tokens = [item[0] for item in tagged_tokens if item[1] in ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']]
    if remove_stopwords:
        # remove stopwords
        tokens = [token for token in tokens if token.lower() not in stopwords]
    if stemming:
        stemmer = nltk.stem.PorterStemmer()
        # apply Porter's stemmer
        tokens_stemmed = list()
        for token in tokens:
            tokens_stemmed.append(stemmer.stem(token))
        tokens = tokens_stemmed

    return (tokens)

'''

"""
def read_ami_icsi(path, filler_words):
    asr_output = pd.read_csv(
        path,
        sep='\t',
        header=None,
        names=['ID', 'start', 'end', 'letter', 'role', 'A', 'B', 'C', 'utt']
    )

    utterances = []
    for tmp in zip(asr_output['role'].tolist(), asr_output['utt'].tolist()):
        role, utt = tmp
        for ch in ['{vocalsound}', '{gap}', '{disfmarker}', '{comment}', '{pause}', '@reject@']:
            utt = re.sub(ch, '', utt)

        utt = re.sub("'Kay", 'Okay', utt)
        utt = re.sub("'kay", 'Okay', utt)
        utt = re.sub('"Okay"', 'Okay', utt)
        utt = re.sub("'cause", 'cause', utt)
        utt = re.sub("'Cause", 'cause', utt)
        utt = re.sub('"cause"', 'cause', utt)
        utt = re.sub('"\'em"', 'them', utt)
        utt = re.sub('"\'til"', 'until', utt)
        utt = re.sub('"\'s"', 's', utt)

        # l. c. d. -> lcd
        # t. v. -> tv
        utt = re.sub('h. t. m. l.', 'html', utt)
        utt = re.sub(r"(\w)\. (\w)\. (\w)\.", r"\1\2\3", utt)
        utt = re.sub(r"(\w)\. (\w)\.", r"\1\2", utt)
        utt = re.sub(r"(\w)\.", r"\1", utt)

        # clean_utterance, remove filler_words
        utt = clean_utterance(utt, filler_words=filler_words)

        # strip extra white space
        utt = re.sub(' +', ' ', utt)
        # strip leading and trailing white space
        utt = utt.strip()

        if utt != '' and utt != '.' and utt != ' ':
            utterances.append((role, utt))

    # remove duplicate utterances per speaker
    utterances = sorted(set(utterances), key=utterances.index)
    utterances_indexed = zip(range(len(utterances)), zip(*utterances)[0], zip(*utterances)[1])

    return utterances_indexed

'''
def accumulate(iterable, func=operator.add):
    'Return running totals'
    # accumulate([1,2,3,4,5]) --> 1 3 6 10 15
    # accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120
    it = iter(iterable)
    try:
        total = next(it)
    except StopIteration:
        return
    yield total
    for element in it:
        total = func(total, element)
        yield total
'''   

"""

'\ndef read_ami_icsi(path, filler_words):\n    asr_output = pd.read_csv(\n        path,\n        sep=\'\t\',\n        header=None,\n        names=[\'ID\', \'start\', \'end\', \'letter\', \'role\', \'A\', \'B\', \'C\', \'utt\']\n    )\n\n    utterances = []\n    for tmp in zip(asr_output[\'role\'].tolist(), asr_output[\'utt\'].tolist()):\n        role, utt = tmp\n        for ch in [\'{vocalsound}\', \'{gap}\', \'{disfmarker}\', \'{comment}\', \'{pause}\', \'@reject@\']:\n            utt = re.sub(ch, \'\', utt)\n\n        utt = re.sub("\'Kay", \'Okay\', utt)\n        utt = re.sub("\'kay", \'Okay\', utt)\n        utt = re.sub(\'"Okay"\', \'Okay\', utt)\n        utt = re.sub("\'cause", \'cause\', utt)\n        utt = re.sub("\'Cause", \'cause\', utt)\n        utt = re.sub(\'"cause"\', \'cause\', utt)\n        utt = re.sub(\'"\'em"\', \'them\', utt)\n        utt = re.sub(\'"\'til"\', \'until\', utt)\n        utt = re.sub(\'"\'s"\', \'s\', utt)\n\n        # l. c. d. -> lcd\n        # t. v. ->

In [None]:
#%cd '/content/gdrive/My Drive/summerization/Create_DATA/CoreRank/data/'

%cd '/content/gdrive/My Drive/summerization/Create_DATA/CoreRank/data/'

/content/gdrive/My Drive/summerization/Create_DATA/CoreRank/data


In [None]:
ls

'ami_params_create_community (1).gsheet'   __init__.py
 ami_params_create_community.csv           [0m[01;34mmeeting[0m/
 ami_params_create_community.gsheet        new
 clustering.py                             original_reference.txt
 clustering.pyc                            [01;34m__pycache__[0m/
 [01;34mcommunity[0m/                                Sample_transcript.txt
 [01;34mcommunity_tagged[0m/                         tf_idf.py
 core_rank.py                              tf_idf.pyc
 core_rank.pyc                             utils.py
 data                                      utils.pyc
 [01;34mdata1[0m/                                    [01;34mutterance[0m/
 dictionary_tokenizer.py                   utterance_community_detection.py


In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
'''
import os
#path_to_root = '/data/'
#os.chdir(path_to_root)
import string
#import core_rank
#from data import utils
#from data import clustering
from meeting import meeting_lists
#from collections import Counter
from nltk import PerceptronTagger
#from nltk import TweetTokenizer
#from dictionary_tokenizer import DictionaryTokenizer
#from sklearn.model_selection import ParameterGrid


domain     = 'meeting' # meeting
dataset_id = 'ami'     # ami, icsi
language   = 'en'      # en
source     = 'asr'     # asr, manual

# #########################
# ### RESOURCES LOADING ###
# #########################
if domain == 'meeting':
    path_to_stopwords    = '/content/drive/My Drive/summerization/Create_DATA/CoreRank/resources/stopwords/meeting/stopwords.en.dat'
    path_to_filler_words = '/content/drive/My Drive/summerization/Create_DATA/CoreRank/resources/stopwords/meeting/filler_words.en.txt'
    stopwords = load_stopwords(path_to_stopwords)
    filler_words = load_filler_words(path_to_filler_words)

    if dataset_id == 'ami':
        ids = meeting_lists.ami_development_set + meeting_lists.ami_test_set
    elif dataset_id == 'icsi':
        ids = meeting_lists.icsi_development_set + meeting_lists.icsi_test_set

from nltk import PerceptronTagger
tagger = PerceptronTagger()


# ######################
# ### CORPUS LOADING ###
# ######################
corpus = {}
for id in ids:
    if domain == 'meeting':
        if dataset_id == 'ami' or dataset_id == 'icsi':
            if source == 'asr':
                path = 'meeting/' + dataset_id + '/' + id + '.da-asr'
            elif source == 'manual':
                path = 'meeting/' + dataset_id + '/' + id + '.da'
            # filler words will be removed during corpus loading
            corpus[id] = read_ami_icsi(path, filler_words)
'''

In [None]:
'''
for i in corpus[id]:
  print(i)
'''  

'\nfor i in corpus[id]:\n  print(i)\n'

In [None]:
'''
# #############################
# ### CORPUS PRE-PROCESSING ###
# #############################
corpus_tagged = {}
for id in ids:
    utterances_indexed = corpus[id]
    utterances_indexed_tagged = []
    for i in range(len(utterances_indexed)):
        index, role, utt = utterances_indexed[i]

        # tokenization
        tokens = utt.split(' ')
        # tokens = tokenizer.tokenize(utt)
        corpus[id][i] = (index, role, ' '.join(tokens))  # update

        # tagging
        tokens_tagged = [tuple[0] + '/' + (tuple[1] if tuple[0] not in string.punctuation else 'PUNCT') for tuple in tagger.tag(tokens)]
        utterances_indexed_tagged.append((index, role, ' '.join(tokens_tagged)))
    corpus_tagged[id] = utterances_indexed_tagged
'''

"\n# #############################\n# ### CORPUS PRE-PROCESSING ###\n# #############################\ncorpus_tagged = {}\nfor id in ids:\n    utterances_indexed = corpus[id]\n    utterances_indexed_tagged = []\n    for i in range(len(utterances_indexed)):\n        index, role, utt = utterances_indexed[i]\n\n        # tokenization\n        tokens = utt.split(' ')\n        # tokens = tokenizer.tokenize(utt)\n        corpus[id][i] = (index, role, ' '.join(tokens))  # update\n\n        # tagging\n        tokens_tagged = [tuple[0] + '/' + (tuple[1] if tuple[0] not in string.punctuation else 'PUNCT') for tuple in tagger.tag(tokens)]\n        utterances_indexed_tagged.append((index, role, ' '.join(tokens_tagged)))\n    corpus_tagged[id] = utterances_indexed_tagged\n"

In [None]:
'''
for i,j in corpus_tagged.items():
  print(i)
'''

'\nfor i,j in corpus_tagged.items():\n  print(i)\n'

In [None]:
!pwd

/content/gdrive/My Drive/summerization/Create_DATA/CoreRank/data


In [None]:
#asr_output = pd.read_csv("meeting/ami/TS3007d.da-asr",sep='\t',header=None,names=['ID', 'start', 'end', 'letter', 'role', 'A', 'B', 'C', 'utt'])

In [None]:
# Each transcript line has the form "Speaker Name: spoken text". Only the FIRST
# colon separates the two parts; any further colon belongs to the spoken text,
# so the line is split by hand instead of using ':' as a CSV delimiter.
transcript_path = '/content/gdrive/My Drive/summerization/Create_DATA/CoreRank/data/Sample_transcript.txt'
transcript_rows = []
with io.open(transcript_path, 'r', encoding='utf-8') as transcript_file:
    for raw_line in transcript_file:
        raw_line = raw_line.rstrip('\r\n')
        if not raw_line.strip():
            continue  # blank lines carry no utterance
        speaker, colon, spoken = raw_line.partition(':')
        # A line without any colon keeps its text in 'role' and gets a missing 'utt'.
        transcript_rows.append((speaker, spoken if colon else None))
asr_output = pd.DataFrame(transcript_rows, columns=['role', 'utt'])

In [None]:
asr_output

Unnamed: 0,role,utt
0,Chairman Wormsley,Each of you has received the agenda. I will e...
1,Commissioner Brown,So moved.
2,Commissioner Hobbs,Seconded
3,Chairman Wormsley,It has been moved and seconded that the agend...
4,Commissioner McCroskey,"Mister Chairman, my name has been omitted fro..."
5,Chairman Wormsley,"Thank you. If there are no objections, the mi..."
6,Chairman Wormsley,"Commissioner Adkins, the first item on the ag..."
7,Commissioner Adkins,"Mister Chairman, I would like to make a motio..."
8,Commissioner Carmical,I second the motion.
9,Chairman Wormsley,This resolution has a motion and second. Will...


In [None]:
# Split every speaker label on single spaces; expand=True returns one column per token
# (column 0 = first token, column 1 = second token, ...).
train2 = asr_output['role'].str.split(" ", expand = True) 
train2.head(5)

Unnamed: 0,0,1,2
0,Chairman,Wormsley,
1,Commissioner,Brown,
2,Commissioner,Hobbs,
3,Chairman,Wormsley,
4,Commissioner,McCroskey,


In [None]:
# Keep only the second token of each speaker label (column 1 of train2),
# e.g. 'Chairman Wormsley' -> 'Wormsley'.
# NOTE: this overwrites the 'role' column in place, so re-running the split
# cell after this one operates on the already-shortened labels.
asr_output['role']=train2[1]

In [None]:
asr_output.head()

Unnamed: 0,role,utt
0,Wormsley,Each of you has received the agenda. I will e...
1,Brown,So moved.
2,Hobbs,Seconded
3,Wormsley,It has been moved and seconded that the agend...
4,McCroskey,"Mister Chairman, my name has been omitted fro..."


In [None]:
labels=list(asr_output['utt'])  # raw (uncleaned) utterance strings; `labels` is reassigned to the cleaned list `fil` in a later cell

In [None]:
# Both word lists sit in the same resources folder of the CoreRank checkout.
meeting_resources = '/content/gdrive/My Drive/summerization/Create_DATA/CoreRank/resources/stopwords/meeting/'
path_to_stopwords = meeting_resources + 'stopwords.en.dat'
path_to_filler_words = meeting_resources + 'filler_words.en.txt'

filler_words = load_filler_words(path_to_filler_words)
stopwords = load_stopwords(path_to_stopwords)  # loaded for completeness; not referenced by the visible active cells

In [None]:
# Transcription markers that carry no spoken content; removed verbatim.
noise_markers = ['{vocalsound}', '{gap}', '{disfmarker}', '{comment}', '{pause}', '@reject@']

# Literal spelling normalisations, applied in this order.
spelling_fixes = [
    ("'Kay", 'Okay'),
    ("'kay", 'Okay'),
    ('"Okay"', 'Okay'),
    ("'cause", 'cause'),
    ("'Cause", 'cause'),
    ('"cause"', 'cause'),
    ('"\'em"', 'them'),
    ('"\'til"', 'until'),
    ('"\'s"', 's'),
]

utterances = []           # (role, cleaned utterance) pairs, first occurrence order
seen_utterances = set()   # pairs already kept, for O(1) duplicate checks
for role, utt in zip(asr_output['role'].tolist(), asr_output['utt'].tolist()):
    # Strip every marker first, so the cleaning below sees the final text
    # and each utterance is cleaned and recorded exactly once.
    for marker in noise_markers:
        utt = utt.replace(marker, '')
    for variant, replacement in spelling_fixes:
        utt = utt.replace(variant, replacement)

    # l. c. d. -> lcd
    # t. v. -> tv
    # The dots are escaped so that only a literal "h. t. m. l." is rewritten.
    utt = re.sub(r'h\. t\. m\. l\.', 'html', utt)
    utt = re.sub(r"(\w)\. (\w)\. (\w)\.", r"\1\2\3", utt)
    utt = re.sub(r"(\w)\. (\w)\.", r"\1\2", utt)
    # drops any period that directly follows a word character
    utt = re.sub(r"(\w)\.", r"\1", utt)

    # clean_utterance, remove filler_words
    utt = clean_utterance(utt, filler_words=filler_words)

    # strip extra white space
    utt = re.sub(' +', ' ', utt)
    # strip leading and trailing white space
    utt = utt.strip()

    if utt in ('', '.'):
        continue

    # remove duplicate utterances per speaker (keep the first occurrence)
    entry = (role, utt)
    if entry not in seen_utterances:
        seen_utterances.add(entry)
        utterances.append(entry)

# (index, role, utterance) triples, built once after the loop. A real list, so
# it can be iterated more than once and an empty transcript yields [].
utterances_indexed = [
    (position, speaker, text)
    for position, (speaker, text) in enumerate(utterances)
]



In [None]:
# Cleaned utterance text only (second element of every (role, utterance) pair).
fil = [pair[1] for pair in utterances]


In [None]:
fil

['Each of you has received the agenda I will entertain a motion that the agenda be approved',
 'So moved',
 'Seconded',
 'It has been moved and seconded that the agenda be approved as received by the members All those in favor signify by saying "Aye"?...Opposed by saying "No"?...The agenda is approved You have received a copy of the minutes of the last meeting Are there any corrections or additions to the meeting?',
 'Mister Chairman, my name has been omitted from the Special Committee on Indigent Care',
 'If there are no objections, the minutes will be corrected to include the name of Commissioner McCroskey Will the clerk make this correction Any further corrections? Seeing none, without objection the minutes will stand approved as read (This is a short cut way that is commonly used for approval of minutes and/or the agenda rather than requiring a motion and second)',
 'Commissioner Adkins, the first item on the agenda is yours',
 "Mister Chairman, I would like to make a motion to app

In [None]:
# #############################
# ### CORPUS PRE-PROCESSING ###
# #############################
from nltk import PerceptronTagger
tagger = PerceptronTagger()

# Build (index, role, "word/TAG word/TAG ...") triples for every utterance.
final = []
for index, role, utt in utterances_indexed:
    # tokenization: plain split on single spaces
    tokens = utt.split(' ')
    # tagging: `in string.punctuation` is a substring test, so a token is
    # labelled PUNCT only when it occurs verbatim inside that string
    # (in practice, single punctuation characters).
    tokens_tagged = [
        word + '/' + (tag if word not in string.punctuation else 'PUNCT')
        for word, tag in tagger.tag(tokens)
    ]
    final.append((index, role, ' '.join(tokens_tagged)))


In [None]:
texts=final
texts

[(0,
  'Wormsley',
  'Each/DT of/IN you/PRP has/VBZ received/VBN the/DT agenda/NN I/PRP will/MD entertain/VB a/DT motion/NN that/IN the/DT agenda/NN be/VB approved/VBN'),
 (1, 'Brown', 'So/RB moved/VBD'),
 (2, 'Hobbs', 'Seconded/VBN'),
 (3,
  'Wormsley',
  'It/PRP has/VBZ been/VBN moved/VBN and/CC seconded/VBD that/IN the/DT agenda/NN be/VB approved/VBN as/IN received/VBN by/IN the/DT members/NNS All/VBP those/DT in/IN favor/NN signify/NN by/IN saying/VBG "Aye"?...Opposed/VBN by/IN saying/VBG "No"?...The/JJ agenda/NN is/VBZ approved/VBN You/PRP have/VBP received/VBN a/DT copy/NN of/IN the/DT minutes/NNS of/IN the/DT last/JJ meeting/NN Are/NNP there/RB any/DT corrections/NNS or/CC additions/NNS to/TO the/DT meeting?/NN'),
 (4,
  'McCroskey',
  'Mister/NNP Chairman,/NNP my/PRP$ name/NN has/VBZ been/VBN omitted/VBN from/IN the/DT Special/NNP Committee/NNP on/IN Indigent/NNP Care/NNP'),
 (5,
  'Wormsley',
  'If/IN there/EX are/VBP no/DT objections,/IN the/DT minutes/NNS will/MD be/VB corre

In [None]:
'''
texts = Text
labels = str(labels)
word_pos=None
dialogues = []
data_list = []
for each in texts:
  role = each[1]
  sentence = ' '.join(word_pos.split('/')[0] for word_pos in each[2].split())
  sentence = sentence.strip().lower()
  pos_sentence = ' '.join(word_pos.split('/')[1] for word_pos in each[2].split())
  pos_sentence = pos_sentence.strip().lower()
  dialogues.append({'role': role, 'sentence': sentence, 'pos_sentence': pos_sentence})
data_list.append({'labels': labels, 'dialogues': dialogues})
'''
       

"\ntexts = Text\nlabels = str(labels)\nword_pos=None\ndialogues = []\ndata_list = []\nfor each in texts:\n  role = each[1]\n  sentence = ' '.join(word_pos.split('/')[0] for word_pos in each[2].split())\n  sentence = sentence.strip().lower()\n  pos_sentence = ' '.join(word_pos.split('/')[1] for word_pos in each[2].split())\n  pos_sentence = pos_sentence.strip().lower()\n  dialogues.append({'role': role, 'sentence': sentence, 'pos_sentence': pos_sentence})\ndata_list.append({'labels': labels, 'dialogues': dialogues})\n"

In [None]:
'''
with open("testing.txt", "w") as file:
    file.write(str(data_list))

with open("testing.txt", "r") as file:
    data2 = eval(file.readline())
'''    


'\nwith open("testing.txt", "w") as file:\n    file.write(str(data_list))\n\nwith open("testing.txt", "r") as file:\n    data2 = eval(file.readline())\n'

In [None]:
labels=fil

In [None]:
print(len(labels))
print(len(texts))

44
44


In [None]:
# Append the ' ||' utterance separator to every label, skipping bare '{vocalsound}' entries.
l1 = [label + ' ||' for label in labels if label != '{vocalsound}']


In [None]:
l1[43]

'Without objection, the meeting will stand adjourned ||'

In [None]:
# The ' ||' separator belongs between utterances, so remove it from the last
# one. Works for any transcript length and is safe to re-run: once the
# separator is gone the condition is false and nothing changes.
if l1 and l1[-1].endswith(' ||'):
    l1[-1] = l1[-1][:-len(' ||')]

In [None]:
l1

['Each of you has received the agenda I will entertain a motion that the agenda be approved ||',
 'So moved ||',
 'Seconded ||',
 'It has been moved and seconded that the agenda be approved as received by the members All those in favor signify by saying "Aye"?...Opposed by saying "No"?...The agenda is approved You have received a copy of the minutes of the last meeting Are there any corrections or additions to the meeting? ||',
 'Mister Chairman, my name has been omitted from the Special Committee on Indigent Care ||',
 'If there are no objections, the minutes will be corrected to include the name of Commissioner McCroskey Will the clerk make this correction Any further corrections? Seeing none, without objection the minutes will stand approved as read (This is a short cut way that is commonly used for approval of minutes and/or the agenda rather than requiring a motion and second) ||',
 'Commissioner Adkins, the first item on the agenda is yours ||',
 "Mister Chairman, I would like to

In [None]:
# Concatenate all separator-terminated utterances into one reference string.
l2 = ' '.join(l1)

In [None]:
replaced_string=l2

In [None]:
with open("original_reference.txt", "w") as output:
  output.write(str(replaced_string))

In [None]:
A=dict(texts=texts,labels=replaced_string)

In [None]:
for key,value in A.items():
  print(key)

texts
labels


In [None]:
import torch
# Persist the corpus to the file 'corrected_corpus' as a one-entry dict whose value is A
# (the dict holding the tagged 'texts' and the joined 'labels' string).
# NOTE(review): the key 'ES2004a' is hard-coded and looks like a meeting id, while the
# data was read from Sample_transcript.txt — confirm this is the key downstream code expects.
torch.save({
            'ES2004a': A
            }, 'corrected_corpus')

In [None]:
new=torch.load('corrected_corpus')

In [None]:
for key, value in new.items():
  print(key)
  #texts = value['texts']
  #labels = value['labels']
  #tok=key

ES2004a
