## Utils

In [1]:
import logging
import os
import pickle 
def get_logger(name, log_file=None):
    """Return the logger called ``name``, configured at DEBUG level.

    A console handler is attached when ``log_file`` is falsy, otherwise a
    file handler writing to ``log_file``. Calling the function again with
    the same arguments reuses the handler that is already attached, so
    messages are not emitted more than once.

    Args:
        name: Logger name passed to ``logging.getLogger``.
        log_file: Optional path of the log file.

    Returns:
        The configured ``logging.Logger``.
    """
    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    if log_file:
        # FileHandler stores the absolute path in ``baseFilename``.
        target = os.path.abspath(log_file)
        already_attached = any(
            isinstance(h, logging.FileHandler) and h.baseFilename == target
            for h in logger.handlers
        )
    else:
        # Exact type check on purpose: FileHandler subclasses StreamHandler
        # and must not count as an existing console handler.
        already_attached = any(
            type(h) is logging.StreamHandler for h in logger.handlers
        )

    if not already_attached:
        handler = logging.FileHandler(log_file) if log_file else logging.StreamHandler()
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger


def load_pkl(pkl_path):
    """Open ``pkl_path`` in binary mode and return the unpickled object.

    Unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(pkl_path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


def dump_pkl(vocab, pkl_path, overwrite=True):
    """Pickle ``vocab`` to ``pkl_path`` using pickle protocol 0.

    When the file already exists and ``overwrite`` is false, nothing is
    written. The function always returns ``None``.
    """
    keep_existing = os.path.exists(pkl_path) and not overwrite
    if not keep_existing:
        with open(pkl_path, 'wb') as pkl_file:
            pickle.dump(vocab, pkl_file, protocol=0)

In [2]:
# segmentation part
def edit_distance_word(word, char_set):
    """Return the set of strings reachable from ``word`` by one edit.

    Two kinds of edit are generated: swapping two adjacent characters, and
    replacing a single character with each character from ``char_set``.
    """
    candidates = set()
    for pos in range(len(word)):
        head, tail = word[:pos], word[pos:]
        # Transposition needs at least two characters left in the tail.
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])
        for ch in char_set:
            candidates.add(head + ch + tail[1:])
    return candidates


def get_sub_array(nums):
    """Group ``nums`` into runs of consecutive values.

    A run of two or more values where each is the previous one plus 1 is
    reported as ``[first, last + 1]``; a value that is not part of such a
    run is reported as ``[value]``.
    """
    groups = []
    total = len(nums)
    start = 0
    while start < total:
        end = start
        # Extend the run while the next value continues the sequence.
        while end < total - 1 and nums[end + 1] == nums[end] + 1:
            end += 1
        if end > start:
            groups.append([nums[start], nums[end] + 1])
        else:
            groups.append([nums[start]])
        start = end + 1
    return groups

In [3]:
import re
import jieba
from jieba import posseg
def is_chinese(uchar):
    """Return True when ``uchar`` sorts within the range U+4E00..U+9FA5."""
    return '\u4e00' <= uchar <= '\u9fa5'


def is_chinese_string(string):
    """Return True when every character of ``string`` passes ``is_chinese``.

    An empty string yields True.
    """
    return all(is_chinese(c) for c in string)


def is_number(uchar):
    """Return True when ``uchar`` is an ASCII digit (U+0030..U+0039).

    The earlier bounds were written without the backslash (``u'u0030'``),
    so the comparison was made against the literal text "u0030" and digits
    were never recognised.
    """
    return '\u0030' <= uchar <= '\u0039'


def is_alphabet(uchar):
    """Return True when ``uchar`` is an ASCII letter (A-Z or a-z).

    The earlier bounds were written without the backslash (``u'u0041'``),
    so they compared against literal text instead of the intended code
    points.
    """
    is_upper = '\u0041' <= uchar <= '\u005a'
    is_lower = '\u0061' <= uchar <= '\u007a'
    return is_upper or is_lower


def is_alphabet_string(string):
    """Return True when every character of ``string`` lies in 'a'..'z'.

    Only lowercase ASCII letters qualify; an empty string yields True.
    """
    return all('a' <= c <= 'z' for c in string)


def is_other(uchar):
    """Return True when ``uchar`` is neither Chinese, a digit nor a letter.

    The decision is delegated to ``is_chinese``, ``is_number`` and
    ``is_alphabet``, consulted in that order.
    """
    recognised = is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar)
    return not recognised


def B2Q(uchar):
    """Convert a half-width character to its full-width counterpart.

    Characters outside 0x20..0x7E are returned unchanged. The space maps
    to U+3000; every other character in range is shifted up by 0xFEE0.
    """
    code_point = ord(uchar)
    # Not a half-width character: hand it back untouched.
    if not 0x0020 <= code_point <= 0x7e:
        return uchar
    # The space is the one exception to the fixed 0xFEE0 offset.
    if code_point == 0x0020:
        return chr(0x3000)
    return chr(code_point + 0xfee0)


def Q2B(uchar):
    """Convert a full-width character to its half-width counterpart.

    U+3000 maps to the space; any other character is shifted down by
    0xFEE0. If the result falls outside 0x20..0x7E the original character
    is returned unchanged.
    """
    code_point = ord(uchar)
    half_width = 0x0020 if code_point == 0x3000 else code_point - 0xfee0
    # Anything that does not land on a half-width character is kept as is.
    return chr(half_width) if 0x0020 <= half_width <= 0x7e else uchar


def stringQ2B(ustring):
    """Apply ``Q2B`` to every character of ``ustring`` and join the results."""
    return "".join(map(Q2B, ustring))


def uniform(ustring):
    """Normalise ``ustring``: full-width to half-width, then lowercase."""
    half_width = stringQ2B(ustring)
    return half_width.lower()


def remove_punctuation(strs):
    """Strip ``strs`` and delete whitespace and the listed punctuation marks.

    The pattern is a raw string so the regex escapes (``\\s``, ``\\.``, ...)
    reach the ``re`` module unchanged instead of relying on Python passing
    unknown string escapes through, which triggers invalid-escape warnings
    on current interpreters. The set of removed characters is unchanged.
    """
    pattern = r"[\s+\.\!\/<>“”,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+"
    return re.sub(pattern, "", strs.strip())


def segment(sentence, cut_type='word', pos=False):
    """Split ``sentence`` into words or characters, optionally with POS flags.

    Args:
        sentence: Text to segment.
        cut_type: ``'word'`` segments with jieba; ``'char'`` splits into
            single characters. Any other value yields ``None``.
        pos: When true, a ``(tokens, flags)`` pair is returned, the flags
            coming from ``jieba.posseg``; otherwise only the token list.
    """
    import logging
    jieba.default_logger.setLevel(logging.ERROR)

    if cut_type == 'word':
        if not pos:
            return jieba.lcut(sentence)
        tagged = posseg.lcut(sentence)
        words = [word for word, _ in tagged]
        flags = [flag for _, flag in tagged]
        return words, flags

    if cut_type == 'char':
        chars = list(sentence)
        if not pos:
            return chars
        # Tag each character on its own and keep the first result's flag.
        flags = [posseg.lcut(ch)[0].flag for ch in chars]
        return chars, flags

    return None


def tokenize(sentence, mode='default'):
    """Return the items produced by ``jieba.tokenize(sentence, mode=mode)`` as a list.

    jieba's logger is set to ERROR first to silence its start-up messages.
    """
    import logging
    jieba.default_logger.setLevel(logging.ERROR)
    spans = jieba.tokenize(sentence, mode=mode)
    return list(spans)


# Smoke test for the text helpers above; prints results instead of asserting.
if __name__ == "__main__":
    a = 'nihao'
    print(a, is_alphabet_string(a))
    # Round-trip each code point 0x20..0x7E through B2Q and back through Q2B,
    # printing the recovered character next to its full-width form.
    for i in range(0x0020, 0x007F):
        print(Q2B(B2Q(chr(i))), B2Q(chr(i)))
    # uniform() converts full-width characters to half-width and lowercases.
    ustring = '中国 人名ａ高频Ａ  扇'
    ustring = uniform(ustring)
    print(ustring)
    print(is_other(','))
    print(uniform('你干么！ｄ７＆８８８学英 语ＡＢＣ？ｎｚ'))
    print(is_chinese('喜'))
    # Both strings end with a punctuation mark, one ASCII and one full-width.
    print(is_chinese_string('喜,'))
    print(is_chinese_string('丽，'))

nihao True
  　
! ！
" ＂
# ＃
$ ＄
% ％
& ＆
' ＇
( （
) ）
* ＊
+ ＋
, ，
- －
. ．
/ ／
0 ０
1 １
2 ２
3 ３
4 ４
5 ５
6 ６
7 ７
8 ８
9 ９
: ：
; ；
< ＜
= ＝
> ＞
? ？
@ ＠
A Ａ
B Ｂ
C Ｃ
D Ｄ
E Ｅ
F Ｆ
G Ｇ
H Ｈ
I Ｉ
J Ｊ
K Ｋ
L Ｌ
M Ｍ
N Ｎ
O Ｏ
P Ｐ
Q Ｑ
R Ｒ
S Ｓ
T Ｔ
U Ｕ
V Ｖ
W Ｗ
X Ｘ
Y Ｙ
Z Ｚ
[ ［
\ ＼
] ］
^ ＾
_ ＿
` ｀
a ａ
b ｂ
c ｃ
d ｄ
e ｅ
f ｆ
g ｇ
h ｈ
i ｉ
j ｊ
k ｋ
l ｌ
m ｍ
n ｎ
o ｏ
p ｐ
q ｑ
r ｒ
s ｓ
t ｔ
u ｕ
v ｖ
w ｗ
x ｘ
y ｙ
z ｚ
{ ｛
| ｜
} ｝
~ ～
中国 人名a高频a  扇
True
你干么!d7&888学英 语abc?nz
True
False
False


## config

In [4]:
import os
# Input corpora.
data_dir = './data'
train_path = os.path.join(data_dir, 'train_sample.txt')
test_path = os.path.join(data_dir, 'test_sample.txt')

# Generated artefacts: the vocabulary file and the model weights.
output_dir = './output'
save_vocab_path = os.path.join(output_dir, 'vocab.txt')
attn_model_path = os.path.join(output_dir, 'attn_model.weight')

# Training hyper-parameters.
batch_size = 32
epochs = 50
rnn_hidden_dim = 128
maxlen = 400
min_count = 5
dropout = 0.0
use_gpu = False

# Separator between the source and target columns of a corpus line.
sep = '\t'

# exist_ok avoids the check-then-create race of a separate os.path.exists()
# test; it still raises if the path exists but is not a directory.
os.makedirs(output_dir, exist_ok=True)

## reader

In [5]:
from collections import defaultdict
# Ids reserved for the special tokens: padding, sequence start, sequence end
# and unknown token, numbered 0 to 3 in that order.
PAD_ID, GO_ID, EOS_ID, UNK_ID = range(4)

# Surface forms of the special tokens.
PAD_TOKEN, GO_TOKEN, EOS_TOKEN, UNK_TOKEN = 'PAD', 'GO', 'EOS', 'UNK'


class Reader(object):
    """Base class mapping tokens to integer ids and reading training samples.

    Subclasses must implement ``read_tokens``, ``unknown_token`` and
    ``read_samples_by_string``.

    Args:
        train_path: Corpus path handed to ``read_tokens`` when the
            vocabulary has to be built.
        token_2_id: Existing token -> id mapping. When truthy it is used
            as is and the corpus is not read.
        special_tokens: Tokens placed at the front of a newly built
            vocabulary, so they receive the lowest ids.
        min_count: Minimum frequency for a token to enter the vocabulary.
        sep: Separator stored on the instance for subclasses to use.
    """

    def __init__(self, train_path=None, token_2_id=None,
                 special_tokens=(), min_count=1, sep='\t'):
        # Stored first so that a subclass's read_tokens() may rely on it.
        self.sep = sep
        if token_2_id:
            self.token_2_id = token_2_id
        else:
            token_counts = defaultdict(int)
            for tokens in self.read_tokens(train_path):
                for token in tokens:
                    token_counts[token] += 1
            self.token_counts = {
                token: count
                for token, count in token_counts.items()
                if count >= min_count
            }
            # Most frequent first; ties broken by the token itself.
            count_pairs = sorted(self.token_counts.items(),
                                 key=lambda kv: (-kv[1], kv[0]))
            # Building the list directly keeps an empty (or fully filtered)
            # corpus valid: the vocabulary then holds the special tokens only.
            vocab = list(special_tokens) + [token for token, _ in count_pairs]
            self.token_2_id = {token: idx for idx, token in enumerate(vocab)}
        self.id_2_token = {int(v): k for k, v in self.token_2_id.items()}

    def read_tokens(self, path):
        """Yield one list of tokens per corpus line (subclass hook)."""
        raise NotImplementedError("Must implement read_tokens")

    def unknown_token(self):
        """Return the token standing in for out-of-vocabulary tokens."""
        raise NotImplementedError("Must implement unknown_token")

    def read_samples_by_string(self, path):
        """Yield ``(source_tokens, target_tokens)`` pairs (subclass hook)."""
        raise NotImplementedError("Must implement read_samples_by_string")

    def convert_token_2_id(self, token):
        """Return the id of ``token``, or the unknown token's id if unseen."""
        token_id = token if token in self.token_2_id else self.unknown_token()
        return self.token_2_id[token_id]

    def convert_id_2_token(self, id):
        """Return the token stored for ``id``; raises KeyError if absent."""
        return self.id_2_token[id]

    def is_unknown_token(self, token):
        """Return True for out-of-vocabulary tokens and the unknown token."""
        return token not in self.token_2_id or token == self.unknown_token()

    def sentence_2_token_ids(self, sentence):
        """Split ``sentence`` on whitespace and map each piece to its id."""
        return [self.convert_token_2_id(w) for w in sentence.split()]

    def token_ids_2_tokens(self, word_ids):
        """Map a sequence of ids back to their tokens."""
        return [self.convert_id_2_token(w) for w in word_ids]

    def read_samples(self, path):
        """Yield ``(source_ids, target_ids)``; targets are wrapped in GO/EOS ids."""
        for source_words, target_words in self.read_samples_by_string(path):
            source = [self.convert_token_2_id(w) for w in source_words]
            target = [self.convert_token_2_id(w) for w in target_words]
            # head: "GO"; last: "EOS"
            yield source, [GO_ID] + target + [EOS_ID]

    def read_samples_tokens(self, path):
        """Yield ``(source_tokens, target_tokens)``; targets are wrapped in GO/EOS."""
        for source_words, target_words in self.read_samples_by_string(path):
            # head: "GO"; last: "EOS". A new list is built so the sequence
            # produced by read_samples_by_string is left untouched.
            yield source_words, [GO_TOKEN] + list(target_words) + [EOS_TOKEN]

    def build_dataset(self, path):
        """Read every sample of ``path`` into two parallel lists."""
        print('Read data, path:{0}'.format(path))
        sources, targets = [], []
        for source, target in self.read_samples_tokens(path):
            sources.append(source)
            targets.append(target)
        return sources, targets

## corpus_reader

In [6]:
from codecs import open

# from generator.utils.io_utils import get_logger
# from generator.reader import Reader, PAD_TOKEN, EOS_TOKEN, GO_TOKEN, UNK_TOKEN

# Logger for this cell's helpers; with no log file, get_logger attaches a
# console (stream) handler.
logger = get_logger(__name__)


def save_word_dict(dict_data, save_path):
    """Write ``dict_data`` to ``save_path`` as UTF-8 text.

    Each entry becomes one line holding the key, a tab, and the value
    formatted as an integer.
    """
    with open(save_path, 'w', encoding='utf-8') as out_file:
        for token in dict_data:
            out_file.write("%s\t%d\n" % (token, dict_data[token]))


def load_word_dict(save_path):
    """Load a token -> id mapping from a UTF-8 text file.

    Each line is split on whitespace; the first field is the token and the
    second its integer id. Blank lines are skipped. A line without a
    second field, or whose id is not an integer, is reported through
    ``logger`` and ignored.

    Returns:
        dict mapping each token to its ``int`` id.
    """
    dict_data = dict()
    with open(save_path, 'r', encoding='utf-8') as f:
        for line in f:
            items = line.strip().split()
            if not items:
                continue
            try:
                dict_data[items[0]] = int(items[1])
            except (IndexError, ValueError):
                # The placeholder is required: passing ``line`` without one
                # makes the logging call itself fail while formatting.
                logger.error('error: %s', line)
    return dict_data


class CorpusReader(Reader):
    """Reader for a UTF-8 text corpus with one ``source<sep>target`` pair per line.

    The vocabulary always starts with the PAD, GO, EOS and UNK tokens, in
    that order. All text is lowercased when read.
    """

    def __init__(self, train_path=None, token_2_id=None, min_count=1, sep='\t'):
        super(CorpusReader, self).__init__(
            train_path=train_path,
            token_2_id=token_2_id,
            special_tokens=[PAD_TOKEN, GO_TOKEN, EOS_TOKEN, UNK_TOKEN],
            min_count=min_count,
            sep=sep)
        self.UNKNOWN_ID = self.token_2_id[UNK_TOKEN]

    def read_samples_by_string(self, path):
        """Yield ``(source_tokens, target_tokens)`` for every usable line.

        Blank lines and lines without the separator are skipped. Iterating
        the file directly matters here: stopping at the first blank line
        would silently drop the rest of the corpus.
        """
        with open(path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.lower().strip()
                if not line or self.sep not in line:
                    continue
                # Split on the first separator only, so a line holding more
                # than one separator no longer breaks the tuple unpacking.
                source, target = line.split(self.sep, 1)
                yield source.split(), target.split()

    def unknown_token(self):
        """Return the token used for out-of-vocabulary words."""
        return UNK_TOKEN

    def read_tokens(self, path, is_infer=False):
        """Yield the lowercased, whitespace-split tokens of each line.

        ``is_infer`` is accepted but not used.
        """
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                yield line.lower().strip().split()


def str2id(s, char2id, maxlen):
    """Map the first ``maxlen`` items of ``s`` to ids via ``char2id``.

    Items missing from ``char2id`` receive the id stored under UNK_TOKEN.
    """
    ids = []
    for item in s[:maxlen]:
        ids.append(char2id.get(item, char2id[UNK_TOKEN]))
    return ids


def padding(x, char2id):
    """Right-pad every id list in ``x`` to the length of the longest one.

    The filler is the id stored under PAD_TOKEN in ``char2id``.
    """
    longest = max([len(seq) for seq in x])
    padded = []
    for seq in x:
        filler = [char2id[PAD_TOKEN]] * (longest - len(seq))
        padded.append(seq + filler)
    return padded


def id2str(ids, id2char):
    """Concatenate the strings that ``id2char`` holds for ``ids``.

    Ids missing from ``id2char`` contribute nothing; no separator is
    inserted between the pieces.
    """
    pieces = (id2char.get(token_id, '') for token_id in ids)
    return ''.join(pieces)

## Extract

In [7]:
import jieba.analyse
sentence = 'when someone commits a murder they typically go to extreme lengths to cover up their brutal crime . the harsh prison sentences that go along with killing someone are enough to deter most people from ever wanting to be caught , not to mention the intense social scrutiny they would face . occasionally , however , there are folks who come forward and admit guilt in their crime . this can be for any number of reasons , like to gain notoriety or to clear their conscience , though , in other instances , people do it to come clean to the people they care about . when rachel hutson was just 19 years old , she murdered her own mother in cold blood . as heinous and unimaginable as her crime was , it was what she did after that shocked people the most … rachel was just a teenager when she committed an unthinkable act against her own other … while that in and of itself was a heinous crime , it ’s what rachel did in the aftermath of her own mother ’s murder that shook people to their core . you ’re not going to believe what strange thing she decided to do next … it ’s hard to understand what drove rachel to commit this terrible act , but sending the photo afterward seems to make even less sense . share this heartbreaking story with your friends below .'
# Top 20 keywords of `sentence` from jieba.analyse.extract_tags; with
# withWeight=True each item is indexed below as (keyword, weight).
keywords = jieba.analyse.extract_tags(sentence, topK=20, withWeight=True)

for item in keywords:
    print(item[0], item[1])
# Visual separator between the two rankings.
print('*' * 42)
# Same request through jieba.analyse.textrank, restricted to the POS tags
# listed in allowPOS.
keywords = jieba.analyse.textrank(sentence, topK=20, withWeight=True, allowPOS=('n', 'nr', 'ns','eng'))
for item in keywords:
    print(item[0], item[1])

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\sujal\AppData\Local\Temp\jieba.cache
Loading model cost 0.501 seconds.
Prefix dict has been built successfully.


people 0.38072507971019104
was 0.38072507971019104
their 0.30458006376815283
crime 0.30458006376815283
rachel 0.30458006376815283
she 0.30458006376815283
her 0.30458006376815283
what 0.30458006376815283
when 0.22843504782611462
they 0.22843504782611462
own 0.22843504782611462
someone 0.15229003188407642
murder 0.15229003188407642
go 0.15229003188407642
most 0.15229003188407642
come 0.15229003188407642
other 0.15229003188407642
do 0.15229003188407642
just 0.15229003188407642
mother 0.15229003188407642
******************************************
her 1.0
people 0.939535544696493
she 0.9050636065888443
their 0.8998168470055914
they 0.8474956845601105
what 0.7728483618043107
was 0.7553193797252628
rachel 0.7482045941740926
when 0.5876266445450075
own 0.5749351275800411
go 0.4933802679645524
come 0.4781172142187957
crime 0.45992833107989767
act 0.4269534435942861
just 0.4231355960295955
your 0.41912524309343174
mother 0.39352575584470395
number 0.38389680194447895
gain 0.3809415453547345
scru

# sequential 

In [8]:
import os

from keras import backend as K
from keras.layers import Input, Lambda, Layer, Embedding, Bidirectional, Dense, Activation, GRU, CuDNNGRU
from keras.models import Model
from keras.optimizers import Adam


class ScaleShift(Layer):
    """Layer computing ``exp(log_scale) * inputs + shift``.

    Both weights start at zero and have shape 1 on every axis except the
    last, which matches the last input axis.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        rank = len(input_shape)
        # Size 1 everywhere but the last axis, so the weights broadcast.
        param_shape = (1,) * (rank - 1) + (input_shape[-1],)
        self.log_scale = self.add_weight(
            name='log_scale', shape=param_shape, initializer='zeros')
        self.shift = self.add_weight(
            name='shift', shape=param_shape, initializer='zeros')

    def call(self, inputs):
        scale = K.exp(self.log_scale)
        return scale * inputs + self.shift


class Interact(Layer):
    """Attention-style layer over a query sequence and a masked value sequence.

    Called with a list ``[q, v, v_mask]``. The output concatenates, along
    axis 2, the attention-weighted values, ``q`` itself, and the masked
    maximum of ``v`` over axis 1.

    NOTE(review): assumes q is (batch, q_len, q_dim), v is
    (batch, v_len, v_dim) and v_mask is (batch, v_len, 1) with 1.0 at
    valid positions -- confirm against the caller.
    """
    def __init__(self, **kwargs):
        super(Interact, self).__init__(**kwargs)

    def build(self, input_shape):
        # input_shape[0] describes q, input_shape[1] describes v.
        in_dim = input_shape[0][-1]
        out_dim = input_shape[1][-1]
        # Maps q's last axis to the size of v's last axis.
        self.kernel = self.add_weight(name='kernel',
                                      shape=(in_dim, out_dim),
                                      initializer='glorot_normal')

    def call(self, inputs):
        q, v, v_mask = inputs
        k = v
        # Max over axis 1 of v; masked positions are pushed down by 1e10 so
        # they cannot win the max.
        mv = K.max(v - (1. - v_mask) * 1e10, axis=1, keepdims=True)  # maxpooling1d
        # Adding zeros shaped like q[:, :, :1] broadcasts mv along q's axis 1.
        mv = mv + K.zeros_like(q[:, :, :1]) 
        qw = K.dot(q, self.kernel)
        # Scores between projected queries and keys, divided by a fixed 10.
        a = K.batch_dot(qw, k, [2, 2]) / 10.
        # Same masking trick before the softmax, with the mask transposed.
        a -= (1. - K.permute_dimensions(v_mask, [0, 2, 1])) * 1e10
        a = K.softmax(a)
        # Weighted sum of the values for every query position.
        o = K.batch_dot(a, v, [2, 1])
        return K.concatenate([o, q, mv], 2)

    def compute_output_shape(self, input_shape):
        # Last axis: q's size plus v's size twice (once for o, once for mv).
        return (None, input_shape[0][1],
                input_shape[0][2] + input_shape[1][2] * 2)


class Seq2seqAttnModel(object):
    """Builder for a GRU encoder/decoder Keras model with an ``Interact`` step.

    Args:
        chars: Vocabulary container; only ``len(chars)`` is used.
        hidden_dim: Embedding size and decoder GRU width. Each direction
            of the bidirectional encoder uses half of it.
        attn_model_path: Optional weights file; loaded by ``build_model``
            when it is set and exists.
        use_gpu: When true, ``CuDNNGRU`` layers are used and ``dropout``
            is not applied.
        dropout: Dropout rate passed to the ``GRU`` layers otherwise.
    """

    def __init__(self, chars, hidden_dim=128, attn_model_path=None, use_gpu=False, dropout=0.2):
        self.chars = chars
        self.hidden_dim = hidden_dim
        self.model_path = attn_model_path
        self.use_gpu = use_gpu
        self.dropout = float(dropout)

    def build_model(self):
        """Build and compile the model, loading saved weights if available.

        The loss is attached with ``add_loss``: masked sparse categorical
        cross-entropy between the target shifted by one step and the
        prediction, averaged over non-padding target positions.

        Returns:
            The compiled Keras ``Model`` taking ``[x_in, y_in]``.
        """
        vocab_size = len(self.chars)
        x_in = Input(shape=(None,))
        y_in = Input(shape=(None,))
        x = x_in
        y = y_in
        # Masks are 1.0 where the id is greater than 0, i.e. id 0 is padding.
        x_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(x)
        y_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(y)

        # Prior over the vocabulary: which tokens occur in the input.
        x_one_hot = Lambda(self._one_hot)([x, x_mask])
        x_prior = ScaleShift()(x_one_hot)

        # embedding (shared by encoder and decoder)
        embedding = Embedding(vocab_size, self.hidden_dim)
        x = embedding(x)
        y = embedding(y)

        if self.use_gpu:
            def make_gru(units):
                return CuDNNGRU(units, return_sequences=True)
        else:
            def make_gru(units):
                return GRU(units, return_sequences=True, dropout=self.dropout)

        half_dim = int(self.hidden_dim / 2)
        # encoder: two stacked bidirectional layers
        for _ in range(2):
            x = Bidirectional(make_gru(half_dim))(x)
        # decoder: two stacked unidirectional layers
        for _ in range(2):
            y = make_gru(self.hidden_dim)(y)

        xy = Interact()([y, x, x_mask])
        xy = Dense(512, activation='relu')(xy)
        xy = Dense(vocab_size)(xy)
        # Average the decoder output with the input-token prior.
        xy = Lambda(lambda x: (x[0] + x[1]) / 2)([xy, x_prior])
        xy = Activation('softmax')(xy)

        # The prediction at step t is scored against the target at step t + 1.
        cross_entropy = K.sparse_categorical_crossentropy(y_in[:, 1:], xy[:, :-1])
        loss = K.sum(cross_entropy * y_mask[:, 1:, 0]) / K.sum(y_mask[:, 1:, 0])

        model = Model([x_in, y_in], xy)
        model.add_loss(loss)
        model.compile(optimizer=Adam(1e-3))
        # attn_model_path defaults to None, which os.path.exists() rejects
        # with a TypeError, so check that a path was given first.
        if self.model_path and os.path.exists(self.model_path):
            model.load_weights(self.model_path)
        return model

    def _one_hot(self, x):
        """Return a 0/1 indicator of the vocabulary ids present in the input.

        One-hot vectors are masked, summed over axis 1 (kept as size 1) and
        thresholded at 0.5.
        """
        x, x_mask = x
        x = K.cast(x, 'int32')
        x = K.one_hot(x, len(self.chars))
        x = K.sum(x_mask * x, 1, keepdims=True)
        x = K.cast(K.greater(x, 0.5), 'float32')
        return x

## Evaluate

In [9]:
import numpy as np
from keras.callbacks import Callback

# from generator.corpus_reader import str2id, id2str
# from generator.reader import GO_TOKEN, EOS_TOKEN


def gen_target(input_text, model, char2id, id2char, maxlen=400, topk=3, max_target_len=50):
    """Decode a target string for ``input_text`` with a beam search of width ``topk``.

    Args:
        input_text: Sequence whose items are looked up in ``char2id``
            (truncated to ``maxlen`` items by ``str2id``).
            NOTE(review): a plain ``str`` is iterated character by
            character, whereas the training data is built from
            whitespace-split token lists -- confirm what callers pass.
        model: Model whose ``predict([xid, yid])`` returns per-step
            probabilities over the vocabulary.
        char2id: Token -> id mapping containing GO_TOKEN and EOS_TOKEN.
        id2char: Id -> token mapping used to render the result.
        maxlen: Maximum number of input items kept.
        topk: Beam width.
        max_target_len: Maximum number of decoding steps.

    Returns:
        The decoded string without the GO and EOS markers. Decoding stops
        as soon as one of the kept candidates ends with EOS; otherwise the
        best-scoring candidate after ``max_target_len`` steps is returned.
    """
    # Every beam shares the same encoded input.
    xid = np.array([str2id(input_text, char2id, maxlen)] * topk)
    yid = np.array([[char2id[GO_TOKEN]]] * topk)
    scores = [0] * topk
    eos_id = char2id[EOS_TOKEN]
    for i in range(max_target_len):
        proba = model.predict([xid, yid])[:, i, :]
        log_proba = np.log(proba + 1e-6)
        arg_topk = log_proba.argsort(axis=1)[:, -topk:]
        _yid = []
        _scores = []
        if i == 0:
            # All beams are identical at the first step, so only the first
            # row is expanded.
            for j in range(topk):
                _yid.append(list(yid[j]) + [arg_topk[0][j]])
                _scores.append(scores[j] + log_proba[0][arg_topk[0][j]])
        else:
            # Expand every beam by its topk continuations, then keep the
            # topk best candidates overall.
            for j in range(len(xid)):
                for k in range(topk):
                    _yid.append(list(yid[j]) + [arg_topk[j][k]])
                    _scores.append(scores[j] + log_proba[j][arg_topk[j][k]])
            _arg_topk = np.argsort(_scores)[-topk:]
            _yid = [_yid[k] for k in _arg_topk]
            _scores = [_scores[k] for k in _arg_topk]
        yid = []
        scores = []
        for k in range(len(xid)):
            if _yid[k][-1] == eos_id:
                # Finished candidate: drop the leading GO and trailing EOS.
                return id2str(_yid[k][1:-1], id2char)
            else:
                yid.append(_yid[k])
                scores.append(_scores[k])
        yid = np.array(yid)
    # Length limit reached without EOS: only the leading GO is removed. The
    # last id is a real token here, so it must not be sliced off.
    return id2str(yid[np.argmax(scores)][1:], id2char)


class Evaluate(Callback):
    """Callback that prints a sample generation and checkpoints the best weights.

    After every epoch a fixed sample text is decoded with ``gen_target``
    and printed. The weights are saved to ``attn_model_path`` whenever the
    monitored loss is less than or equal to the lowest value seen so far.
    """

    def __init__(self, model, attn_model_path, char2id, id2char, maxlen):
        super(Evaluate, self).__init__()
        self.lowest = 1e10
        self.model = model
        self.attn_model_path = attn_model_path
        self.char2id = char2id
        self.id2char = id2char
        self.maxlen = maxlen

    def on_epoch_end(self, epoch, logs=None):
        """Print the sample generation and save the weights on improvement."""
        sents = [
            "Field &amp; Main Bank purchased a new position in PowerShares Fin . Preferred Port . ( NYSEARCA : PGF )  "
            "in the fourth quarter , according to its most recent disclosure with the SEC . The institutional investor "
            "purchased 22,550 shares of the exchange traded fund 's stock , valued at approximately $ 425,000 . "
            "Other large investors also recently modified their holdings of the company . Cedar Hill Associates LLC "
            "acquired a new stake in shares of PowerShares Fin . Preferred Port .",
            ]

        for sent in sents:
            target = gen_target(sent, self.model, self.char2id, self.id2char, self.maxlen)
            print('input:' + sent)
            print('output:' + target)

        # ``logs`` may be None, and 'val_loss' is absent when training runs
        # without validation data; fall back to the training loss so the
        # callback still checkpoints instead of raising.
        logs = logs or {}
        monitored = logs.get('val_loss', logs.get('loss'))
        if monitored is not None and monitored <= self.lowest:
            self.lowest = monitored
            self.model.save_weights(self.attn_model_path)
# training

In [34]:
import sys

sys.path.append('..')
import os

import numpy as np

# from corpus_reader import CorpusReader, str2id, padding, load_word_dict, save_word_dict
# from generator.evaluate import Evaluate
# from generator import config
# from generator.seq2seq_attn_model import Seq2seqAttnModel


def data_generator(input_texts, target_texts, char2id, batch_size, maxlen=400):
    """Yield padded training batches forever as ``([X, Y], None)``.

    Args:
        input_texts: Source token sequences.
        target_texts: Target token sequences, expected to be the same
            length as ``input_texts``.
        char2id: Token -> id mapping used by ``str2id`` and ``padding``.
        batch_size: Number of samples per batch.
        maxlen: Maximum number of tokens kept per sequence.

    Yields:
        ``([X, Y], None)`` where ``X`` and ``Y`` are padded id arrays. The
        last batch of each pass may hold fewer than ``batch_size`` samples.

    Raises:
        ValueError: If ``input_texts`` is empty, since the generator could
            then never produce a batch.
    """
    if len(input_texts) == 0:
        raise ValueError('data_generator needs at least one sample')

    def as_batch(xs, ys):
        return [np.array(padding(xs, char2id)), np.array(padding(ys, char2id))], None

    while True:
        X, Y = [], []
        for source, target in zip(input_texts, target_texts):
            X.append(str2id(source, char2id, maxlen))
            Y.append(str2id(target, char2id, maxlen))
            if len(X) == batch_size:
                yield as_batch(X, Y)
                X, Y = [], []
        # Emit the remainder as well. Dropping it loses data on every pass
        # and, with fewer samples than batch_size, means no batch is ever
        # produced and the caller hangs.
        if X:
            yield as_batch(X, Y)


def get_validation_data(input_texts, target_texts, char2id, maxlen=400):
    """Convert the whole validation set into one padded batch.

    Returns:
        ``([X, Y], None)`` with padded id arrays covering every sample, or
        ``None`` when ``input_texts`` is empty.
    """
    X, Y = [], []
    for source, target in zip(input_texts, target_texts):
        X.append(str2id(source, char2id, maxlen))
        Y.append(str2id(target, char2id, maxlen))
    if not X:
        return None
    # Pad once all samples are collected; padding and returning inside the
    # loop would hand back the first sample only.
    X = np.array(padding(X, char2id))
    Y = np.array(padding(Y, char2id))
    return [X, Y], None


def train(train_path='',
          test_path='',
          save_vocab_path='',
          attn_model_path='',
          batch_size=64,
          epochs=100,
          maxlen=400,
          hidden_dim=128,
          min_count=5,
          dropout=0.2,
          use_gpu=False,
          sep='\t'):
    """Train the seq2seq attention model and checkpoint its best weights.

    Args:
        train_path: Training corpus, one ``source<sep>target`` per line.
        test_path: Validation corpus in the same format.
        save_vocab_path: Vocabulary file; loaded when it exists, otherwise
            built from ``train_path`` and written there.
        attn_model_path: Weights file, loaded when present and overwritten
            by the ``Evaluate`` callback.
        batch_size: Samples per training batch.
        epochs: Number of training epochs.
        maxlen: Maximum number of tokens kept per sequence.
        hidden_dim: Hidden size passed to the model.
        min_count: Minimum token frequency for the vocabulary.
        dropout: Dropout rate passed to the model.
        use_gpu: Whether the model uses its GPU-only layers.
        sep: Separator between source and target in the corpus files.
    """
    # load or save word dict
    if os.path.exists(save_vocab_path):
        token_2_id = load_word_dict(save_vocab_path)
        data_reader = CorpusReader(train_path=train_path, token_2_id=token_2_id, min_count=min_count, sep=sep)
    else:
        print('Training data...')
        data_reader = CorpusReader(train_path=train_path, min_count=min_count, sep=sep)
        token_2_id = data_reader.token_2_id
        save_word_dict(token_2_id, save_vocab_path)

    id_2_token = data_reader.id_2_token
    input_texts, target_texts = data_reader.build_dataset(train_path)
    test_input_texts, test_target_texts = data_reader.build_dataset(test_path)

    model = Seq2seqAttnModel(token_2_id,
                             attn_model_path=attn_model_path,
                             hidden_dim=hidden_dim,
                             use_gpu=use_gpu,
                             dropout=dropout).build_model()
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print().
    model.summary()
    evaluator = Evaluate(model, attn_model_path, token_2_id, id_2_token, maxlen)
    model.fit(data_generator(input_texts, target_texts, token_2_id, batch_size, maxlen),
              # Ceiling division: one step per (possibly partial) batch.
              steps_per_epoch=(len(input_texts) + batch_size - 1) // batch_size,
              epochs=epochs,
              validation_data=get_validation_data(test_input_texts, test_target_texts, token_2_id, maxlen),
              callbacks=[evaluator])


# Entry point of the training cell.
if __name__ == "__main__":
    # Paths are rebuilt from data_dir and output_dir, which this cell does not
    # define itself.
    tr_path=os.path.join(data_dir, 'train_sample.txt')
    ts_path=os.path.join(data_dir, 'test_sample.txt')
    sv=os.path.join(output_dir, 'vocab.txt')
    at_model=os.path.join(output_dir, 'attn_model.weight')
    # Hyper-parameters are passed as literals here.
    train(train_path=tr_path,
          test_path=ts_path,
          save_vocab_path=sv,
          attn_model_path=at_model,
          batch_size=32,
          epochs=50,
          maxlen=400,
          hidden_dim=128,
          min_count=5,
          dropout=0.0,
          use_gpu=False,
          sep='\t')

Read data, path:./data\train_sample.txt
Read data, path:./data\test_sample.txt
Model: "model_24"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_50 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 input_49 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 embedding_24 (Embedding)       (None, None, 128)    1227520     ['input_49[0][0]',               
                                                                  'input_50[0][0]']               
                                                                                                  
 gru_98 (GRU

ValueError: in user code:

    File "C:\Users\sujal\Desktop\machine_learning\project1\env\lib\site-packages\keras\engine\training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\sujal\Desktop\machine_learning\project1\env\lib\site-packages\keras\engine\training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\sujal\Desktop\machine_learning\project1\env\lib\site-packages\keras\engine\training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\sujal\Desktop\machine_learning\project1\env\lib\site-packages\keras\engine\training.py", line 1050, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\sujal\Desktop\machine_learning\project1\env\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\sujal\Desktop\machine_learning\project1\env\lib\site-packages\keras\engine\input_spec.py", line 219, in assert_input_compatibility
        raise ValueError(

    ValueError: Layer "model_24" expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, None) dtype=int32>]


## Infer

In [None]:
import sys
import os

sys.path.append('..')

# Import necessary dependencies here
# from generator import config
# from generator.corpus_reader import load_word_dict
# from generator.evaluate import gen_target
# from generator.seq2seq_attn_model import Seq2seqAttnModel


class Inference(object):
    """Load a saved vocabulary and model, and generate targets for input text.

    Args:
        save_vocab_path: Vocabulary file readable by ``load_word_dict``.
        attn_model_path: Weights file handed to ``Seq2seqAttnModel``.
        maxlen: Maximum number of input items kept during inference.

    Raises:
        FileNotFoundError: If ``save_vocab_path`` does not exist.
    """

    def __init__(self, save_vocab_path='', attn_model_path='', maxlen=400):
        # Fail here with a clear error: merely printing a message leaves the
        # vocabulary attributes unset, so construction would break a few
        # lines later with an unrelated AttributeError.
        if not os.path.exists(save_vocab_path):
            raise FileNotFoundError(
                'Vocabulary path does not exist: %r' % (save_vocab_path,))
        self.char2id = load_word_dict(save_vocab_path)
        self.id2char = {int(j): i for i, j in self.char2id.items()}
        self.chars = set(self.char2id.keys())

        # Initialize other attributes and load the model
        self.maxlen = maxlen
        self.model = Seq2seqAttnModel(self.chars, attn_model_path=attn_model_path).build_model()

    def infer(self, sentence):
        """Return the text generated for ``sentence`` with a beam width of 3."""
        return gen_target(sentence, self.model, self.char2id, self.id2char, self.maxlen, topk=3)


# Entry point of the inference cell: decode a fixed sample, then read
# sentences interactively.
if __name__ == "__main__":
    inputs = [
        "Field &amp; Main Bank purchased a new position in PowerShares Fin . Preferred Port . ( NYSEARCA : PGF )  "
        "in the fourth quarter , according to its most recent disclosure with the SEC . The institutional investor "
        "purchased 22,550 shares of the exchange traded fund 's stock , valued at approximately $ 425,000 . "
        "Other large investors also recently modified their holdings of the company . Cedar Hill Associates LLC "
        "acquired a new stake in shares of PowerShares Fin . Preferred Port .",
    ]

    # Placeholder paths: replace them with the real vocabulary and weights files.
    save_vocab_path = 'path_to_vocab_file'
    attn_model_path = 'path_to_model_file'
    maxlen = 400

    inference = Inference(save_vocab_path=save_vocab_path, attn_model_path=attn_model_path, maxlen=maxlen)
    for i in inputs:
        target = inference.infer(i)
        print('input:' + i)
        print('output:' + target)

    # Interactive loop; end-of-input or an interrupt leaves it cleanly
    # instead of ending the cell with a traceback.
    while True:
        try:
            sent = input('input:')
        except (EOFError, KeyboardInterrupt):
            break
        print("output:" + inference.infer(sent))
