In [11]:
import shutil

In [5]:
%run embeddings.ipynb

In [1]:
def parse_folder(read_folder, write_folder):
    '''
    Split every article file in read_folder into three parallel files.

    Each input file is expected to hold four sections separated by blank
    lines: <article title>, <article body>, <article highlights>, <entity map>.
    Every body line is "<sentence><TAB>...<label>", where the last character
    of the line is the binary label of the sentence. Every entity map line
    is "<entity id>:<entity name>".

    The parsed result is written to
    write_folder/
        documents/      # article body, entity ids replaced by names
        summaries/      # article highlights, entity ids replaced by names
        outputs/        # one binary sentence label per line

    Output files are named 0, 1, 2, ... in sorted input filename order.
    Files that cannot be read or do not follow the format are skipped and
    do not consume a number. Missing output folders (including write_folder
    itself) are created.

    TODO: parsed files end with a newline, which adds an empty line while
        converting to indices; make sure parsed files don't end with new line
    '''
    doc_folder = os.path.join(write_folder, "documents")
    summary_folder = os.path.join(write_folder, "summaries")
    output_folder = os.path.join(write_folder, "outputs")
    for folder in (doc_folder, summary_folder, output_folder):
        os.makedirs(folder, exist_ok=True)

    i = 0
    for filename in sorted(os.listdir(read_folder)):
        try:
            print(i, filename, end='\r')
            with open(os.path.join(read_folder, filename)) as article:
                parts = article.read().split('\n\n')

            body = parts[1].split('\n')
            labels = [int(line[-1]) for line in body]
            sentences = [line.split('\t')[0] for line in body]
            summary = parts[2].split('\n')
            entity_map = _read_entity_map(parts[3].split('\n'))

            with open(os.path.join(doc_folder, str(i)), 'w') as file:
                file.writelines(_replace_entities(sentences, entity_map))

            with open(os.path.join(summary_folder, str(i)), 'w') as file:
                file.writelines(_replace_entities(summary, entity_map))

            with open(os.path.join(output_folder, str(i)), 'w') as file:
                file.writelines(str(label) + '\n' for label in labels)
        except (OSError, ValueError, IndexError):
            # Best effort: unreadable or malformed files are skipped. Only
            # the expected failures are caught so that KeyboardInterrupt and
            # programming errors are no longer silently swallowed.
            continue
        # Only successfully parsed files get a number, so output names stay
        # contiguous.
        i += 1


def _read_entity_map(entries):
    '''Build an {entity id: entity name} dict from "id:name" lines.'''
    entity_map = {}
    for entry in entries:
        try:
            entity_id, name = entry.split(":")
        except ValueError:
            # Not exactly one ':' in the line -> not an entity definition.
            continue
        entity_map[entity_id] = name
    return entity_map


def _replace_entities(lines, entity_map):
    '''Replace entity ids by their names in each space separated line.'''
    replaced = []
    for line in lines:
        words = [entity_map.get(word, word) for word in line.split(' ')]
        # Every word is followed by a space (including the last one) to keep
        # the on-disk format unchanged.
        replaced.append(" ".join(words) + " \n")
    return replaced

In [13]:
def parse_all(root_folder, write_folder):
    '''
    Run parse_folder on every direct subfolder of root_folder.

    For each subfolder a folder of the same name is created inside
    write_folder and handed to parse_folder as its destination. A destination
    folder that already exists is deleted first, so results of earlier runs
    are discarded. Plain files inside root_folder are ignored.
    '''
    if not os.path.exists(write_folder):
        os.makedirs(write_folder)

    # Decide which entries are folders before anything is created or deleted.
    subfolders = []
    for name in os.listdir(root_folder):
        if os.path.isdir(os.path.join(root_folder, name)):
            subfolders.append(name)

    for name in subfolders:
        target = os.path.join(write_folder, name)
        if os.path.exists(target):
            shutil.rmtree(target)
        os.mkdir(target)
        parse_folder(os.path.join(root_folder, name), target)

In [7]:
class NeuralSumToEmbedding(GloveEmbeddings):
    '''
    Class which handles conversion of text to indices
    Inherits from GloveEmbeddings class in embeddings.ipynb

    NOTE(review): load_glove, load_dump, add_to_vocab, convert_to_indices and
    the words / vectors attributes are not defined here; they are presumably
    provided by GloveEmbeddings - confirm against embeddings.ipynb.
    '''

    def __init__(self, glove_filename=None, dump_filename=None, extra_vocab_filename=None):
        '''
        glove_filename: if given, passed to load_glove.
        dump_filename: used only when glove_filename is not given; passed to
            load_dump.
        extra_vocab_filename: optional pickle file; the keys of the object
            stored at index 1 of the pickled sequence are added to the
            vocabulary.
        '''
        super(NeuralSumToEmbedding, self).__init__()
        if glove_filename:
            self.load_glove(glove_filename)
        elif dump_filename:
            self.load_dump(dump_filename)
        if extra_vocab_filename:
            # NOTE: unpickling can execute arbitrary code; only load files
            # from a trusted source.
            with open(extra_vocab_filename, 'rb') as vocab_file:
                extra_vocab = pickle.load(vocab_file)[1]
            for word in extra_vocab.keys():
                self.add_to_vocab(word)
        # reshape returns a new object instead of working in place, so the
        # result has to be stored (it used to be discarded, which made this
        # line a no-op).
        # NOTE(review): assumes vectors is array-like with one row per word.
        self.vectors = self.vectors.reshape(len(self.words), -1)

    def tokeniser(self, text):
        '''
        text is newline separated lines and whitespace separated tokens.
        Returns one list of tokens per line; an empty line gives an empty list.
        '''
        return [line.split() for line in text.split('\n')]

    def convert_to_indices_(self, document_folder, dump_file=None):
        '''
        Converts all files in document_folder to indices.

        Files must have integer names and are processed in numeric order.
        The list with one entry per file is pickled into dump_file when
        dump_file is given, and is always returned so the function is also
        usable without a dump file.
        '''
        filenames = sorted(os.listdir(document_folder), key=int)
        indices = []
        for filename in filenames:
            with open(os.path.join(document_folder, filename)) as docfile:
                text = self.tokeniser(docfile.read())
            indices.append(self.convert_to_indices(text))
            print(filename, end='\r')

        if dump_file:
            # Close the file explicitly so the pickle is fully flushed to disk.
            with open(dump_file, 'wb') as dump:
                pickle.dump(indices, dump)
        return indices

    def root_convert(self, root_read, root_dest):
        '''
        Calls convert_to_indices_ on the "documents" folder of every subfolder
        of root_read and writes the result to root_dest/<subfolder>.pkl.

        root_dest is deleted first if it already exists.
        '''
        folders = [folder for folder in os.listdir(root_read)
                   if os.path.isdir(os.path.join(root_read, folder))]
        if os.path.exists(root_dest):
            shutil.rmtree(root_dest)
        os.makedirs(root_dest)
        for folder in folders:
            dest_file = os.path.join(root_dest, folder + '.pkl')
            document_folder = os.path.join(root_read, folder, "documents")
            self.convert_to_indices_(document_folder, dest_file)

In [8]:
class DataHandler:
    '''
    Pads index documents to a fixed sentence length and groups them into
    batches of documents with the same number of sentences.

    Attributes set by the methods:
        data    - documents loaded by load_dump (list of documents, each a
                  list of sentences, each a list of indices)
        lines   - padded documents created by pad or loaded from a padded dump
        lengths - buckets of document positions, lengths[k] holds the
                  documents with k + 1 sentences (set by make_batches)
        batches - list of batches, each a list of document positions
    '''

    def __init__(self):
        pass

    def load_dump(self, filename):
        '''Loads pickled documents from filename into self.data.'''
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(filename, 'rb') as dump:
            self.data = pickle.load(dump)

    def pad(self, max_sen_len=50, max_doc_len=90, output=False, padding_idx=400001):
        '''
        Builds self.lines from self.data.

        Documents are cut to max_doc_len sentences. Sentences are cut to
        max_sen_len indices and shorter ones are padded with padding_idx on
        the left. The output argument is currently unused.
        TODO: masks? right padding?
        '''
        self.lines = []
        for doc in self.data:
            padded_doc = []
            for line in doc[:max_doc_len]:
                # A non-positive count yields no padding at all.
                padding = [padding_idx] * (max_sen_len - len(line))
                padded_doc.append(padding + line[:max_sen_len])
            self.lines.append(padded_doc)

    def padded_dump(self, filename):
        '''
        Pickles [self.lines, self.output] into filename.

        NOTE(review): self.output is never assigned inside this class, so it
        has to be set by the caller before calling this method - confirm.
        '''
        with open(filename, 'wb') as dump:
            pickle.dump([self.lines, self.output], dump)

    def load_padded_dump(self, filename, truncate_sum=5):
        '''
        Replaces self.lines by the object pickled in filename.
        truncate_sum is currently unused.

        NOTE(review): the whole pickled object is stored as self.lines, while
        padded_dump writes a [lines, output] pair - confirm which file format
        is expected here.
        '''
        with open(filename, 'rb') as dump:
            self.lines = pickle.load(dump)

    def extend_padded_dump(self, filename, truncate_sum=15):
        '''
        Appends the documents pickled in filename to self.lines.
        truncate_sum is currently unused.
        '''
        with open(filename, 'rb') as dump:
            lines = pickle.load(dump)
        self.lines.extend(lines)

    def make_batches(self, batch_size, max_doc_len=90):
        '''
        Creates batches by grouping documents with the same number of
        sentences together.

        Works on self.lines, so it can be used after pad as well as after
        load_padded_dump / extend_padded_dump. max_doc_len is the minimum
        number of length buckets; more buckets are added when a document is
        longer. Documents without any sentence are left out of the batches.
        The order inside the buckets and the order of the batches is random.
        TODO: make documents of different length to be in same batch
        '''
        longest = max((len(doc) for doc in self.lines), default=0)
        self.lengths = [[] for _ in range(max(max_doc_len, longest))]
        self.batches = []
        for position, doc in enumerate(self.lines):
            # An empty document would otherwise end up in bucket -1, i.e.
            # together with the longest documents.
            if doc:
                self.lengths[len(doc) - 1].append(position)

        for bucket in self.lengths:
            random.shuffle(bucket)

        for bucket in self.lengths:
            for start in range(0, len(bucket), batch_size):
                self.batches.append(bucket[start:start + batch_size])
        random.shuffle(self.batches)