# In [None]:
#read corpus with positive and negative samples


#most commonly occurring utilities
#dealing with corpus containing the following structure
#sentence1:- w1 w2 w3
#sentence2:- w1 w2 w3
#create the baseline feeder class
import warnings
class Feeder(object):
    """Baseline feeder holding the sampling configuration and a vocabulary.

    The three sentence hooks defined here are placeholders that do nothing
    and return ``None``; concrete feeders are meant to override them.
    """

    def __init__(self, min_count=1, pos_sampling=1, neg_sampling=2, verbose=0):
        """Store the configuration, resetting any count below 1 to its default.

        Args:
            min_count: minimum word count; values below 1 are reset to 1.
            pos_sampling: number of positive samples; values below 1 are
                reset to 1.
            neg_sampling: number of negative samples; values below 1 are
                reset to 2.
            verbose: verbosity level, stored unchanged.

        Every reset is reported through ``warnings.warn``.
        """
        # (given value, fallback, warning text), checked in this exact order.
        requested = (
            (min_count, 1,
             'min_count must be greater than 0; set to default 1'),
            (neg_sampling, 2,
             'Need at least one negative sample; neg_sampling set to default 2'),
            (pos_sampling, 1,
             'Need at least one positive sample; pos_sampling set to default 1'),
        )
        accepted = []
        for given, fallback, message in requested:
            if given < 1:
                warnings.warn(message)
                accepted.append(fallback)
            else:
                accepted.append(given)
        min_count, neg_sampling, pos_sampling = accepted

        self.vocab = {}
        self.min_count = min_count
        self.pos_sampling = pos_sampling
        self.neg_sampling = neg_sampling
        self.verbose = verbose

    def base_sentence_generator(self):
        """Placeholder: meant to yield (word_indices, info) per base sentence."""
        return None

    def get_pos_sentences(self, info=None):
        """Placeholder: meant to return positive partners as lists of word indices."""
        return None

    def get_neg_sentences(self, info=None):
        """Placeholder: meant to return negative partners as lists of word indices."""
        return None

    
from random import choice

#a specific implementation of the base Feeder class
#input file:- one sentence per line, with words separated by single spaces
class SimpleFeeder(Feeder):
    """Feeder for a text file with one space-separated sentence per line.

    Sentences are stored as lists of word indices. The sentences directly
    before and after a base sentence are its positive partners; negative
    partners are drawn at random from the stored sentences.
    """

    def __init__(self, filename, min_count=5, neg_sampling=2, available_words=None, verbose=1):
        """Read the corpus at ``filename`` and build the vocabulary.

        Args:
            filename: path to the data file.
            min_count: minimum number of occurrences for a word to be kept.
            neg_sampling: number of negative sentences per request.
            available_words: optional container of words; when given, only
                words contained in it are considered.
            verbose: progress messages are printed when greater than 0.
        """
        # Two positive partners per base sentence: the previous and the next one.
        super(SimpleFeeder, self).__init__(min_count=min_count, pos_sampling=2,
                                           neg_sampling=neg_sampling, verbose=verbose)
        self.filename  = filename
        self.sentences = list()

        self._read_data(available_words)

    @staticmethod
    def _tokenize(line):
        """Split a line on single spaces, discarding empty tokens."""
        # split(' ') yields '' for blank lines and doubled spaces; those are
        # not words and must not end up in the vocabulary.
        return [word for word in line.rstrip('\n').split(' ') if word]

    def _read_data(self, available_words=None):
        """Count words, then store every sentence as a list of word indices.

        The file is read twice: once to count words and once to index them.
        Words occurring fewer than ``self.min_count`` times, and words not in
        ``available_words`` (when given), are dropped. Sentences left without
        any word are not stored, so sentence ids refer to positions in
        ``self.sentences`` rather than to line numbers of the file.
        """
        # first pass: count words
        word_count = dict()
        if self.verbose > 0:
            print('count words...')
        with open(self.filename) as f:
            for line in f:
                for word in self._tokenize(line):
                    word_count[word] = word_count.get(word, 0) + 1

        # mark unavailable words with a count that can never reach min_count
        if available_words is not None:
            if isinstance(available_words, (list, tuple)):
                # one-off conversion gives O(1) membership tests below
                available_words = frozenset(available_words)
            for word in word_count:
                if word not in available_words:
                    word_count[word] = -1

        if self.verbose > 0:
            print('read sentences from filename: {}'.format(self.filename))

        # second pass: assign indices in order of first appearance
        with open(self.filename) as f:
            for line in f:
                words = self._tokenize(line)
                for word in words:
                    if word_count[word] >= self.min_count:
                        self.vocab.setdefault(word, len(self.vocab))
                sentence = [self.vocab[word] for word in words if word in self.vocab]
                # don't add sentence if no words are left after removing infrequent words
                if sentence:
                    self.sentences.append(sentence)

        if self.verbose > 0:
            print('found {} sentences'.format(len(self.sentences)))

    def base_sentence_generator(self):
        """Yield ``(word_indices, sentence_id)`` for every base sentence.

        The first and the last stored sentence are skipped because each of
        them lacks one of the two neighbours used as positive partners.
        """
        for sentence_id in range(1, len(self.sentences) - 1):
            yield self.sentences[sentence_id], sentence_id

    def get_pos_sentences(self, info=None):
        """Return the sentences before and after the base sentence.

        Args:
            info: sentence id of the base sentence, as yielded by
                ``base_sentence_generator``.

        Returns:
            ``[previous_sentence, next_sentence]`` as lists of word indices.

        Raises:
            ValueError: if ``info`` is None.
            IndexError: if the sentence has no neighbour on either side.
        """
        if info is None:
            raise ValueError('Need sentence_id of base sentence as info to retrieve positive examples')
        total = len(self.sentences)
        # Resolve negative ids so the boundary check covers them as well.
        position = info + total if info < 0 else info
        if not 0 < position < total - 1:
            # Plain indexing would wrap around here and hand back the sentence
            # at the other end of the corpus as a neighbour.
            raise IndexError('sentence_id {} has no previous and next sentence '
                             '(corpus holds {} sentences)'.format(info, total))
        return [self.sentences[position - 1], self.sentences[position + 1]]

    def get_neg_sentences(self, info=None):
        """Return ``self.neg_sampling`` randomly drawn sentences.

        Only the base sentence itself is excluded; its neighbours may still
        be drawn, and the same sentence may be drawn more than once.

        Args:
            info: optional sentence id of the base sentence to exclude.

        Returns:
            A list of sentences, each a list of word indices.

        Raises:
            IndexError: if no sentences are stored.
            ValueError: if the only stored sentence is the excluded one.
        """
        total = len(self.sentences)
        if total == 0:
            raise IndexError('Cannot draw negative sentences from an empty corpus')
        if total == 1 and info is not None and 0 == info:
            # Every draw would be rejected, so the sampling loop would never end.
            raise ValueError('Cannot draw negative sentences: the only sentence is the base sentence')

        neg_sentences = list()
        for _ in range(self.neg_sampling):
            sentence_id = choice(range(total))
            # redraw until the sample differs from the base sentence
            while info is not None and sentence_id == info:
                sentence_id = choice(range(total))
            neg_sentences.append(self.sentences[sentence_id])
        return neg_sentences
