# Task 2 Generate Sparse Representations 

In [235]:
from collections import defaultdict
from os import listdir,makedirs
from os.path import isfile, join, split, exists, splitext
import re
from collections import Counter

In [236]:
# Load the stopword list (one word per line) into a list.
stopwords_file = './stopwords_en.txt'
with open(stopwords_file) as stopwords_fh:
    # splitlines() handles "\r\n" as well as "\n" line endings; splitting on
    # the literal "\r\n" leaves the whole file as a single entry whenever the
    # endings are (or are translated to) "\n". Blank lines are skipped because
    # an empty string can never match a token.
    stopwords = [word.strip() for word in stopwords_fh.read().splitlines() if word.strip()]

In [237]:
# Tokenise a document with a regular expression and drop the stopwords.
def extract_tokens(document, stopwords):
    """Return the lower-cased tokens of ``document`` that are not stopwords.

    A token is a run of word characters, optionally followed by a single
    hyphen or apostrophe and another run of word characters
    (e.g. "well-known", "cat's").

    Args:
        document: Text to tokenise; it is lower-cased before matching.
        stopwords: Iterable of words to exclude from the result.

    Returns:
        List of tokens in document order, duplicates preserved.
    """
    # Raw string: "\w" inside a plain literal is an invalid escape sequence.
    words = re.findall(r"\w+(?:[-']\w+)?", document.lower())
    # Build the lookup set once so every membership test is O(1) instead of a
    # scan over the whole stopword list.
    stopword_set = set(stopwords)
    return [word for word in words if word not in stopword_set]

In [238]:
# Load one meeting transcript generated in task 1.
def load_txt(txt_file):
    """Read ``txt_file`` and return its lines without the separator lines.

    Every line is stripped of surrounding whitespace. Lines that equal
    "**********" after stripping are dropped; all other lines, including
    blank ones (as empty strings), are kept in file order.

    Args:
        txt_file: Path of the transcript file to read.

    Returns:
        List of stripped lines.
    """
    # The context manager closes the file even if reading fails.
    with open(txt_file) as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line != '**********']

In [239]:
# Batch-load the meeting transcripts generated in task 1.
def batch_load_txt(onlyfiles):
    """Concatenate the lines returned by ``load_txt`` for every given file.

    Args:
        onlyfiles: Iterable of transcript file paths.

    Returns:
        One flat list holding the loaded lines of all files, in the order
        the files are given.
    """
    return [
        sentence
        for transcript_path in onlyfiles
        for sentence in load_txt(transcript_path)
    ]

In [240]:
# Build the word dictionary: key is the word, value is its index.
def generate_token_dict(tokens_list):
    """Map every distinct token to its position in the sorted vocabulary.

    Args:
        tokens_list: Iterable of token lists.

    Returns:
        Dict from token to a 0-based index; indices follow the sorted order
        of the distinct tokens.
    """
    vocabulary = {token for tokens in tokens_list for token in tokens}
    return {token: position for position, token in enumerate(sorted(vocabulary))}

In [241]:
# Tokenise each loaded transcript line and filter the tokens by corpus frequency.
def generate_segment_tokens_list(meeting_transcript_list):
    """Tokenise every entry and keep the tokens that pass the frequency filter.

    Each entry is tokenised with ``extract_tokens`` using the module-level
    ``stopwords``. Token frequencies are counted over all entries, then only
    the tokens whose total count is at most 1 are kept.

    Args:
        meeting_transcript_list: Iterable of text entries to tokenise.

    Returns:
        List with one token list per input entry, in input order.
    """
    # Materialise the token lists: they are walked twice below, and a lazy
    # map object would already be exhausted on the second pass.
    meeting_transcript_list_tokens = [
        extract_tokens(transcript, stopwords)
        for transcript in meeting_transcript_list
    ]
    frequency = Counter(
        token
        for transcript_tokens in meeting_transcript_list_tokens
        for token in transcript_tokens
    )
    # Filter on each token's own count (the lookup key must be the token
    # being tested).
    # NOTE(review): "<= 1" keeps only tokens that occur once in the whole
    # corpus - confirm this, rather than "> 1", is the intended threshold.
    return [
        [token for token in transcript_tokens if frequency[token] <= 1]
        for transcript_tokens in meeting_transcript_list_tokens
    ]

In [242]:
# Write the word dictionary to the vocabulary file.
def output_vocab(vocab_dict, output_file):
    """Write ``vocab_dict`` to ``output_file`` as "word:index" lines.

    The entries are written in ascending order of the word; any existing
    file content is overwritten.

    Args:
        vocab_dict: Dict mapping word to index.
        output_file: Path of the file to write.
    """
    vocab_list_sorted = sorted(vocab_dict.items(), key=lambda item: item[0])
    # The context manager closes the file even if a write fails.
    with open(output_file, 'w') as f:
        for word, idx in vocab_list_sorted:
            f.write("%s:%s\n" % (word, idx))

In [243]:
# Build the topic boundary vector for one meeting transcript generated in task 1.
def generate_topic_seg(topic_txt_file):
    """Encode the topic boundaries of a transcript as a 0/1 vector.

    Every line that is not a "**********" separator contributes one entry.
    An entry is 1 when a separator directly follows that line, else 0.

    Args:
        topic_txt_file: Path of the transcript file.

    Returns:
        String "<name>:<flags>", where <name> is the file name without a
        trailing ".txt" and <flags> is the comma-joined 0/1 vector.
    """
    count = 0
    boundaries = []
    with open(topic_txt_file, 'r') as f:
        for line in f:
            if line.strip() != "**********":
                count += 1
            elif count > 0:
                # A separator closes the segment that ends at the previous
                # content line. One seen before any content has nothing to
                # close; recording it would index position -1 and flag the
                # last line by mistake (or fail on an empty vector).
                boundaries.append(count)
    zero_list = [0] * count
    for boundary in boundaries:
        zero_list[boundary - 1] = 1
    topic_seg = ",".join(map(str, zero_list))
    # Strip only a trailing ".txt" so a ".txt" inside the name is preserved.
    meeting_transcript = split(topic_txt_file)[-1]
    if meeting_transcript.endswith(".txt"):
        meeting_transcript = meeting_transcript[:-len(".txt")]
    return "%s:%s" % (meeting_transcript, topic_seg)


In [244]:
# Build the topic_seg string for several transcript files.
def batch_generate_topic_seg(onlyfiles):
    """Apply ``generate_topic_seg`` to every given file.

    Args:
        onlyfiles: Iterable of transcript file paths.

    Returns:
        List with one topic_seg string per file, in input order.
    """
    return [generate_topic_seg(transcript_path) for transcript_path in onlyfiles]

In [245]:
# Write the topic_seg strings of all transcripts to the topic segmentation file.
def output_topic_seg(topic_seg_list, output_file):
    """Write every entry of ``topic_seg_list`` to ``output_file``, one per line.

    Any existing file content is overwritten.

    Args:
        topic_seg_list: Iterable of topic_seg strings.
        output_file: Path of the file to write.
    """
    # The context manager closes the file even if a write fails.
    with open(output_file, 'w') as f:
        for topic_seg in topic_seg_list:
            f.write("%s\n" % topic_seg)

In [246]:
def trans_tokens_to_index(tokens_list, vocab_dict):
    """Turn a token list into a sparse "index:count" representation.

    Tokens missing from ``vocab_dict`` are ignored; the remaining ones are
    replaced by their index and the occurrences of each index are counted.

    Args:
        tokens_list: Iterable of tokens.
        vocab_dict: Dict mapping token to index.

    Returns:
        Comma-joined "index:count" pairs; an empty string when no token is
        in the vocabulary.
    """
    # "in" works on every Python version; dict.has_key() exists on Python 2 only.
    tokens_index_list = [vocab_dict[token] for token in tokens_list if token in vocab_dict]
    counts = Counter(tokens_index_list)
    return ",".join("%s:%s" % (index, freq) for index, freq in counts.items())

In [247]:
# Build the sparse representation of every paragraph of one transcript file:
# words are mapped to their index, counted, and joined with ','.
def sparse_txt_file(txt_file, vocab_dict):
    """Return one sparse "index:count" string per paragraph of ``txt_file``.

    The stripped lines are joined with spaces and split into paragraphs at
    every "**********" separator. Each paragraph is tokenised with
    ``extract_tokens`` (using the module-level ``stopwords``) and encoded
    with ``trans_tokens_to_index``.

    Args:
        txt_file: Path of the transcript file.
        vocab_dict: Dict mapping token to index.

    Returns:
        List of sparse representation strings in paragraph order; an empty
        list for an empty file.
    """
    separator = "**********"
    with open(txt_file, 'r') as f:
        lines = [line.strip() for line in f]
    if not lines:
        return []
    # Drop the closing separator so it does not produce a trailing empty
    # paragraph. Only a separator is removed: a final content line must not
    # be thrown away when the file lacks the closing separator.
    if lines[-1] == separator:
        lines.pop()
    paragraph_list = " ".join(lines).strip().split(separator)
    # Return a real list so the result can be iterated more than once.
    return [
        trans_tokens_to_index(extract_tokens(paragraph, stopwords), vocab_dict)
        for paragraph in paragraph_list
    ]

In [248]:
# Write sparse representations to a txt file.
def output_sparse_txt(paragraph_sparse_rep, output_file):
    """Write every sparse representation to ``output_file``, one per line.

    Any existing file content is overwritten.

    Args:
        paragraph_sparse_rep: Iterable of sparse representation strings.
        output_file: Path of the file to write.
    """
    # The context manager closes the file even if a write fails.
    with open(output_file, 'w') as f:
        for sparse_rep in paragraph_sparse_rep:
            f.write("%s\n" % sparse_rep)

In [249]:
# Write the sparse representations of several transcript files, one output file each.
def batch_output_sparse_txt(txt_file_list, token_dict, output_dir):
    """Write a sparse representation file for every transcript.

    Each output file is placed in ``output_dir`` under the same file name
    as its source transcript.

    Args:
        txt_file_list: Iterable of transcript file paths.
        token_dict: Dict mapping token to index.
        output_dir: Directory receiving the output files; it is created
            when it does not exist yet.
    """
    # Opening an output file inside a missing directory would fail.
    if not exists(output_dir):
        makedirs(output_dir)
    for topic_txt_file in txt_file_list:
        sparse_rep_output_file = join(output_dir, split(topic_txt_file)[-1])
        paragraph_sparse_rep = sparse_txt_file(topic_txt_file, token_dict)
        output_sparse_txt(paragraph_sparse_rep, sparse_rep_output_file)

In [250]:
# preprocess: collect the transcript files, tokenise them and build the vocabulary
txt_files_dir = './txt_files'
# listdir() returns the entries in arbitrary order; sorting keeps the order of
# the files (and of everything written per file) reproducible between runs.
onlyfiles = [
    join(txt_files_dir, f)
    for f in sorted(listdir(txt_files_dir))
    if isfile(join(txt_files_dir, f)) and splitext(f)[1] == '.txt'
]

meeting_transcript_list = batch_load_txt(onlyfiles)
segment_tokens_list = generate_segment_tokens_list(meeting_transcript_list)
token_dict = generate_token_dict(segment_tokens_list)

In [251]:
# Save the vocabulary to vocab.txt, one "word_string:integer_index" entry per line.
vocab_output_file = './vocab.txt'
output_vocab(token_dict, vocab_output_file)

In [252]:
# Save the topic boundaries to topic_segs.txt: one "name:flags" line per
# transcript file, where flags is a comma-separated 0/1 vector.
topic_seg_output_file = './topic_segs.txt'
topic_seg_list = batch_generate_topic_seg(onlyfiles)
output_topic_seg(topic_seg_list, topic_seg_output_file)

In [253]:
# Save the sparse representations of all paragraphs of each transcript to a
# file of the same name in the "sparse_files" folder.
sparse_rep_output_dir = './sparse_files'
batch_output_sparse_txt(onlyfiles, token_dict , sparse_rep_output_dir)