In [1]:
import os
import _pickle as pickle
import numpy as np
import random

In [36]:
class parseNeuralSum():
    """Split raw summarisation files into per-document text, label and summary files.

    Each input file is expected to consist of sections separated by blank
    lines.  Section 1 (zero-based) holds one sentence per line, whose last
    character is a single-digit label and whose sentence text precedes the
    first tab.  Section 2 holds the summary, one line per entry.
    """

    def __init__(self):
        # Neither mapping is filled by the methods of this class; they are
        # only serialised by dump_all().
        self.vocab = {}
        self.entity_map = {}

    def parse_file(self, filename):
        """Parse one raw file.

        Args:
            filename: Path of the file to read.

        Returns:
            A tuple ``(lines, output, summary)``: the sentence texts, the
            integer label of each sentence, and the summary lines.

        Raises:
            IndexError: If the file has fewer than three blank-line separated
                sections, or a sentence line is empty.
            ValueError: If the last character of a sentence line is not a digit.
        """
        # Context manager so the handle is closed even if parsing fails.
        with open(filename) as handle:
            content = handle.read()
        parts = content.split('\n\n')
        rows = parts[1].split('\n')
        # The label is the final character of each row.
        output = [int(row[-1]) for row in rows]
        lines = [row.split('\t')[0] for row in rows]
        summary = parts[2].split('\n')
        return lines, output, summary

    def parse_folder_1(self, input_folder, target_document_folder, target_output_folder=None,
                     target_summary_folder=None):
        """Parse every file in ``input_folder`` and write numbered result files.

        Files are processed in sorted name order so the numeric id given to
        each document is reproducible between runs (``os.listdir`` order is
        arbitrary).  Folder arguments are used as plain string prefixes, so
        they must end with a path separator.

        Args:
            input_folder: Prefix of the folder holding the raw files.
            target_document_folder: Prefix under which sentence files are written.
            target_output_folder: Optional prefix for the label files.
            target_summary_folder: Optional prefix for the summary files.
        """
        for doc_id, filename in enumerate(sorted(os.listdir(input_folder))):
            lines, output, summary = self.parse_file(input_folder + filename)
            name = str(doc_id)
            with open(target_document_folder + name, 'w') as file:
                file.writelines(line + '\n' for line in lines)
            if target_output_folder:
                with open(target_output_folder + name, 'w') as file:
                    file.writelines(str(num) + '\n' for num in output)
            if target_summary_folder:
                with open(target_summary_folder + name, 'w') as file:
                    file.writelines(line + '\n' for line in summary)
            # Progress indicator: number of files processed so far.
            print(doc_id + 1, end='\r')

    def dump_all(self, filename):
        """Pickle ``[vocab, entity_map]`` to ``filename``."""
        with open(filename, 'wb') as handle:
            pickle.dump([self.vocab, self.entity_map], handle)

In [35]:
class GloveEmbeddings:
    """Vocabulary and embedding matrix loaded from a GloVe-style text file.

    Attributes are kept in step with each other: ``words[i]`` is the token
    whose vector is ``vectors[i]`` and ``word2id[words[i]]`` is ``i`` (for
    repeated tokens ``word2id`` keeps the last index).
    """

    def __init__(self):
        self.word2id = {}   # token -> row index
        self.vectors = []   # becomes a 2-D float array once vectors are loaded
        self.words = []     # row index -> token
        self.dim = None     # embedding width, set by load_glove / load_dump

    def load_glove(self, filename):
        """Load whitespace separated ``word v1 v2 ...`` lines from ``filename``.

        Any previously loaded vocabulary is replaced.  Blank lines are
        skipped.  After loading, a randomly initialised ``'<unk>'`` entry is
        appended.

        Args:
            filename: Path of the text file to read.
        """
        self.word2id = {}
        self.words = []
        rows = []
        with open(filename) as file:
            for raw_line in file:
                fields = raw_line.split()
                if not fields:
                    # A blank line carries no token; indexing it would fail.
                    continue
                word = fields[0]
                self.word2id[word] = len(self.words)
                self.words.append(word)
                # Built-in float: the np.float alias was removed in NumPy 1.24.
                rows.append(np.asarray(fields[1:], dtype=float))
        self.vectors = np.array(rows)
        self.dim = self.vectors.shape[-1]
        self.add_to_vocab('<unk>')

    def modify_pretrained(self, vocab):
        """Placeholder; currently does nothing."""
        pass

    def add_to_vocab(self, word):
        """Append the lower-cased ``word`` with a random vector in ``[0, 1)``.

        The embedding matrix stays two-dimensional, shaped
        ``(number of words, dim)``.

        Args:
            word: Token to add; it is lower-cased before being stored.

        Raises:
            ValueError: If no embedding width is known yet.
        """
        if self.dim is None:
            raise ValueError('embedding dimension is unknown; load vectors first')
        word = word.lower()
        self.words.append(word)
        self.word2id[word] = len(self.words) - 1
        table = np.asarray(self.vectors, dtype=float)
        if table.size == 0:
            table = table.reshape(0, self.dim)
        elif table.ndim != 2:
            table = table.reshape(-1, self.dim)
        fresh_row = np.random.random(self.dim).reshape(1, -1)
        # Concatenate along axis 0; np.append without an axis would flatten
        # the whole matrix into one dimension.
        self.vectors = np.concatenate([table, fresh_row], axis=0)

    def dump_all(self, filename):
        """Pickle ``[word2id, flattened vectors, words, dim]`` to ``filename``."""
        payload = [self.word2id, np.asarray(self.vectors).reshape(-1), self.words, self.dim]
        with open(filename, 'wb') as handle:
            pickle.dump(payload, handle)

    def load_dump(self, filename):
        """Restore state written by :meth:`dump_all`.

        NOTE: unpickling executes arbitrary code; only load trusted files.
        """
        # Plain 'rb' so read-only dump files can be loaded as well.
        with open(filename, 'rb') as handle:
            self.word2id, self.vectors, self.words, self.dim = pickle.load(handle)
        self.vectors = self.vectors.reshape(-1, self.dim)

    def convert_to_indices(self, lines):
        """Map every token of every line to its vocabulary index.

        Tokens missing from the vocabulary map to the index of ``'<unk>'``.

        Args:
            lines: Iterable of token sequences.

        Returns:
            A list with one list of indices per input line.

        Raises:
            KeyError: If an unknown token is met and ``'<unk>'`` is not in
                the vocabulary.
        """
        lookup = self.word2id
        # '<unk>' is looked up lazily so fully known input needs no '<unk>'.
        return [
            [lookup[word] if word in lookup else lookup['<unk>'] for word in line]
            for line in lines
        ]

In [38]:
class NeuralSumGlove(GloveEmbeddings):
    """GloVe vocabulary that converts folders of document files to index lists."""

    def __init__(self, glove_filename=None, dump_filename=None):
        """Load embeddings from a GloVe text file or from an earlier dump.

        Args:
            glove_filename: Text file passed to ``load_glove``; takes
                precedence when given.
            dump_filename: Pickle passed to ``load_dump`` when no GloVe file
                is given.

        Raises:
            ValueError: If neither filename is supplied.
        """
        super().__init__()
        if glove_filename:
            self.load_glove(glove_filename)
        elif dump_filename is not None:
            self.load_dump(dump_filename)
        else:
            raise ValueError('either glove_filename or dump_filename is required')

    @staticmethod
    def _split_lines(text):
        """Split ``text`` on newlines, dropping only a trailing empty piece."""
        pieces = text.split('\n')
        # A final '' just means the text ended with a newline.  Blindly
        # dropping the last piece would lose a real line when it did not.
        if pieces and pieces[-1] == '':
            pieces.pop()
        return pieces

    def tokeniser(self, text):
        """Return one whitespace-split token list per line of ``text``."""
        return [line.split() for line in self._split_lines(text)]

    def convert_to_indices_(self, document_folder, output_folder=None, dump_file=None):
        """Convert every document in ``document_folder`` to vocabulary indices.

        File names must be integers; they are processed in numeric order.
        Folder arguments are used as plain string prefixes, so they must end
        with a path separator.

        Args:
            document_folder: Prefix of the folder with one document per file.
            output_folder: Optional prefix of a folder with same-named files
                holding one integer per line.
            dump_file: Optional path; when given the result is pickled there.

        Returns:
            A list with one entry per document: ``[sentence_indices]`` or,
            when ``output_folder`` is given, ``[sentence_indices, integers]``.
        """
        filenames = sorted(os.listdir(document_folder), key=int)
        indices = []
        for filename in filenames:
            entry = []
            with open(document_folder + filename) as docfile:
                sentences = self.tokeniser(docfile.read())
            entry.append(self.convert_to_indices(sentences))
            if output_folder:
                with open(output_folder + filename) as outputfile:
                    rows = self._split_lines(outputfile.read())
                entry.append([int(row) for row in rows])
            indices.append(entry)
            print(filename, end='\r')
        if dump_file:
            with open(dump_file, 'wb') as handle:
                pickle.dump(indices, handle)
        # Returned as well, so the result is usable without a dump file.
        return indices

In [None]:
class NeuralSumDataHandler:
    """Pad index-encoded documents and group them into same-length batches."""

    def __init__(self):
        pass

    def load_dump(self, filename):
        """Unpickle the document list from ``filename`` into ``self.data``.

        NOTE: unpickling executes arbitrary code; only load trusted files.
        """
        with open(filename, 'rb') as handle:
            self.data = pickle.load(handle)

    def pad(self, max_sen_len=50, max_doc_len=90, output=False):
        """Truncate documents and left-pad their sentences with zeros.

        Every entry of ``self.data`` must hold the document (a list of index
        lists) first and, when ``output`` is true, the label list second.

        Args:
            max_sen_len: Length every sentence is padded or truncated to.
            max_doc_len: Maximum number of sentences (and labels) kept.
            output: Whether entries carry labels that should be collected.
                When false only ``self.lines`` is filled and ``self.output``
                stays empty.
        """
        self.lines = []
        self.output = []
        for entry in self.data:
            sentences = entry[0][:max_doc_len]
            # Zeros go in front; sentences longer than max_sen_len are cut.
            self.lines.append([
                [0] * (max_sen_len - len(sentence)) + sentence[:max_sen_len]
                for sentence in sentences
            ])
            if output:
                self.output.append(entry[1][:max_doc_len])

    def padded_dump(self, filename):
        """Pickle ``[lines, output]`` to ``filename``."""
        with open(filename, 'wb') as handle:
            pickle.dump([self.lines, self.output], handle)

    def load_padded_dump(self, filename):
        """Load a dump written by :meth:`padded_dump`, folding label 2 into 0.

        When all documents have the same number of labels ``self.output`` is
        a regular 2-D array; otherwise it is a 1-D object array with one label
        list per document.

        NOTE: unpickling executes arbitrary code; only load trusted files.
        """
        with open(filename, 'rb') as handle:
            self.lines, labels = pickle.load(handle)
        # Relabel per document: a vectorised `== 2` never matches inside an
        # object array of lists, and ragged input cannot form a 2-D array.
        relabelled = [[0 if label == 2 else label for label in doc] for doc in labels]
        if len({len(doc) for doc in relabelled}) <= 1:
            self.output = np.array(relabelled)
        else:
            ragged = np.empty(len(relabelled), dtype=object)
            for position, doc in enumerate(relabelled):
                ragged[position] = doc
            self.output = ragged

    def make_batches(self, batch_size):
        """Group document indices into shuffled batches of equal-length documents.

        ``self.lengths[k]`` lists the indices of documents with ``k + 1``
        sentences; it has at least 90 buckets and grows to fit the longest
        document.  Documents without sentences are left out of every batch.

        Args:
            batch_size: Maximum number of documents per batch.
        """
        longest = max((len(doc) for doc in self.lines), default=0)
        self.lengths = [[] for _ in range(max(90, longest))]
        self.batches = []
        for index, doc in enumerate(self.lines):
            if not doc:
                # Index -1 would wrongly file it with the longest documents.
                continue
            self.lengths[len(doc) - 1].append(index)
        for bucket in self.lengths:
            for start in range(0, len(bucket), batch_size):
                self.batches.append(bucket[start:start + batch_size])
        random.shuffle(self.batches)