In [1]:
import datetime
import os
import pickle
import sys
import sqlite3
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Make the project's helper modules in ../py/ importable (text_cnn and
# text_helpers are imported from there). Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if '../py/' not in sys.path:
    sys.path.append('../py/')

In [3]:
from text_cnn import TextCNN
from text_helpers import (
    build_dataset, clean_data, jieba_cut, load_dict, 
    pad_crop, read_stopwords, word_to_number)

In [61]:
# IPython introspection: `??` displays the source of `build_dataset`.
# Interactive aid only -- none of the cells shown here use its output.
build_dataset??

In [48]:
def add_sample_to_sql(frac, db_path='../py/reviews.sqlite',
                      data_path='../data/test_shuffle.txt'):
    """Append a random sample of labelled reviews to the ``review_db`` table.

    The tab-separated file at ``data_path`` is read as two columns
    (``review``, ``sentiment``), every row is stamped with the current
    local time in a ``date`` column, and a random fraction of the rows is
    appended to the ``review_db`` table of the SQLite database.

    Args:
        frac: Fraction of the rows to sample, passed to ``DataFrame.sample``.
        db_path: Path of the SQLite database file. Defaults to the path the
            notebook originally hard-coded.
        data_path: Path of the tab-separated review file. Defaults to the
            path the notebook originally hard-coded.

    Side effects:
        Writes to the database and prints the number of rows added.
    """
    # Read and sample first, so a missing or unreadable data file fails
    # before a database connection is opened.
    df = pd.read_csv(data_path, names=['review', 'sentiment'], sep='\t')
    df['date'] = datetime.datetime.now()
    sample_test = df.sample(frac=frac)
    row_counts = sample_test.shape[0]

    conn = sqlite3.connect(db_path)
    try:
        sample_test.to_sql('review_db', conn, if_exists='append', index=False)
    finally:
        # Release the connection even when the insert raises.
        conn.close()
    print(f'Added {row_counts} records to review_db.')

In [12]:
def sqltext_to_number(
        X,
        vocab2ix_path='../data/vocab2ix.pkl',
        ix2vocab_path='../data/ix2vocab.pkl',
        stopwords_path='../data/stop_words_chinese.txt',
        max_words=20):
    """Turn raw review texts into a NumPy array of padded number sequences.

    The work is delegated to the ``text_helpers`` functions, applied in this
    order: ``jieba_cut`` -> ``clean_data`` -> ``word_to_number`` ->
    ``pad_crop``. Their exact semantics are defined in that module and are
    not visible in this notebook.

    Args:
        X: Collection of review texts, passed straight to ``jieba_cut``.
        vocab2ix_path: First path handed to ``load_dict``.
        ix2vocab_path: Second path handed to ``load_dict``; the second value
            ``load_dict`` returns is discarded here.
        stopwords_path: Path handed to ``read_stopwords``.
        max_words: Length argument handed to ``pad_crop``.

    Returns:
        ``np.ndarray`` built from the sequences that ``pad_crop`` yields.
    """
    # Resources: the word -> index mapping and the stop-word collection.
    word_index, _ = load_dict(vocab2ix_path, ix2vocab_path)
    stop_set = read_stopwords(stopwords_path)

    # Segment, clean, then map words to numbers.
    cleaned_tokens = clean_data(jieba_cut(X), stop_set)
    encoded = word_to_number(cleaned_tokens, word_index)

    # Materialise whatever pad_crop yields before handing it to NumPy.
    return np.asarray(list(pad_crop(encoded, max_words)))

In [6]:
def get_continue_train_dataset(db_path, continue_train_size):
    """Fetch the most recent reviews from SQLite for continued training.

    Rows of ``review_db`` are read newest first (``ORDER BY date DESC``) and
    at most ``continue_train_size`` of them are kept. The ``review`` and
    ``sentiment`` columns are selected by name, so the result does not
    depend on the physical column order of the table.

    Args:
        db_path: Path of the SQLite database file.
        continue_train_size: Maximum number of rows to fetch.

    Returns:
        Tuple ``(x_train, y)``: ``x_train`` is the result of
        ``sqltext_to_number`` on the review texts, ``y`` is an int array of
        the sentiment labels.

    Raises:
        ValueError: If the query returns no rows, so there is nothing to
            train on.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.execute(
            'SELECT review, sentiment FROM review_db ORDER BY date DESC')
        rows = cursor.fetchmany(continue_train_size)
    finally:
        # Close the connection even if the query fails (e.g. missing table).
        conn.close()

    if not rows:
        raise ValueError(
            f'No rows fetched from review_db in {db_path!r}; '
            'nothing to continue training on.')

    reviews, sentiments = zip(*rows)
    # dtype=str keeps the texts as a NumPy string array, matching what the
    # downstream helper received before this rewrite.
    X = np.array(reviews, dtype=str)
    y = np.array(sentiments).astype(int)
    x_train = sqltext_to_number(X)
    return x_train, y

In [7]:
# Optimiser / loop settings for the continued-training run below.
learning_rate = 1e-3   # passed to AdamOptimizer
batch_size = 50        # rows taken from the continued-training set per step
training_steps = 100   # number of training iterations

# Value fed to the graph's `keep_proba` placeholder during training steps
# (evaluation feeds 1.0 instead).
keep_proba = 0.5

# Metrics are printed on every step whose index is a multiple of this.
print_loss_every = 2

In [8]:
# Load the integer matrices saved as text. In each matrix the last column is
# split off as the label vector and the remaining columns are the inputs.
train = np.loadtxt('../data/train_data.txt', dtype=int)
test = np.loadtxt('../data/test_data.txt', dtype=int)

x_train, y_train = train[:, :-1], train[:, -1]
x_test, y_test = test[:, :-1], test[:, -1]

接下来，我们试着从测试数据中随机抽取一部分样本推送到数据库，再用数据库中的数据继续训练模型，看看 Online-Learning 对测试集上的表现有什么影响。

In [55]:
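# NOTE: the saved output below shows this call has already been run once.
# It is presumably left commented out so that re-running the notebook does
# not append another sample to review_db -- uncomment to add more rows.
# add_sample_to_sql(0.001)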

Added 11 records to review_db.


In [57]:
# Pull up to 10000 of the newest rows from the review database; the number
# of rows actually returned is the size of the continued-training set.
x_con_train, y_con_train = get_continue_train_dataset(
    db_path='../py/reviews.sqlite',
    continue_train_size=10000,
)
dataset_size = len(x_con_train)

In [58]:
# Display how many rows were fetched for continued training.
dataset_size

162

In [59]:
# Continue training the previously saved model on the rows fetched from the
# review database, reporting accuracy on the original train/test sets before
# and during the extra training steps.
tf.reset_default_graph()
graph = tf.Graph()
with graph.as_default() as g:
    with tf.Session(graph=g) as sess:
        # Rebuild the saved graph from the .meta file, then load the saved
        # variable values into this session.
        saver = tf.train.import_meta_graph('../save_model/model.ckpt.meta')
        saver.restore(sess, '../save_model/model.ckpt')
        # Look up the tensors needed for feeding and evaluation by the names
        # of the operations that produce them.
        input_x = graph.get_operation_by_name('input_x').outputs[0]
        input_y = graph.get_operation_by_name('input_y').outputs[0]
        keep_proba_ph = graph.get_operation_by_name('keep_proba').outputs[0]
        accuracy = graph.get_operation_by_name('accuracy/accuracy').outputs[0]
        loss = graph.get_operation_by_name('loss/loss').outputs[0]
        
        # Initialize opt variables
        # Snapshot the variables that exist before the new optimizer is
        # created; the set difference afterwards is exactly what `minimize`
        # added, so only those are initialized and the restored values are
        # left untouched. The statement order here matters.
        temp = set(tf.global_variables())
        train_step = tf.train.AdamOptimizer(learning_rate, name='adam2').minimize(loss)
        sess.run(tf.variables_initializer(set(tf.global_variables()) - temp))

        # Set eval feed_dict and print previous accuracy
        # Evaluation feeds 1.0 to the keep_proba placeholder.
        train_feed_dict = {input_x: x_train, input_y: y_train, keep_proba_ph: 1.0}
        test_feed_dict = {input_x: x_test, input_y: y_test, keep_proba_ph: 1.0}
        previous_train_acc = accuracy.eval(feed_dict=train_feed_dict)
        previous_test_acc = accuracy.eval(feed_dict=test_feed_dict)
        print(f"Previous: Train/Test ACC: {previous_train_acc:.3f}/{previous_test_acc:.3f}")
        
        # Train
        # Batches walk through the continued-training rows in stored order;
        # the start index wraps around via the modulo, and a slice that hits
        # the end of the data is simply shorter than batch_size.
        # NOTE(review): dataset_size == 0 would raise ZeroDivisionError here.
        for i in range(training_steps):
            start = (i * batch_size) % dataset_size
            end = min(start + batch_size, dataset_size)
            feed_dict={input_x: x_con_train[start:end],
                       input_y: y_con_train[start:end],
                       keep_proba_ph: keep_proba}
            sess.run(train_step, feed_dict=feed_dict)
            if i % print_loss_every == 0:
                # NOTE(review): the loss is evaluated on the current batch
                # with the training feed (keep_proba_ph = keep_proba, not
                # 1.0) and after the update step, so "AvgCost" is a noisy
                # per-batch value. "Epoch" in the message is the step
                # index i, not a full pass over the data.
                avg_cost = loss.eval(feed_dict=feed_dict)
                train_acc = accuracy.eval(feed_dict=train_feed_dict)
                test_acc = accuracy.eval(feed_dict=test_feed_dict)
                print(f"Epoch: {i:04d} | AvgCost: {avg_cost:7.4f}", end="")
                print(f" | Train/Test ACC: {train_acc:.3f}/{test_acc:.3f}")

        # After training, save the sess
        # NOTE(review): saving is disabled, and the path below ('model/...')
        # differs from the '../save_model/...' path the model was restored
        # from -- confirm the intended location before enabling.
        # save_path = saver.save(sess, 'model/model.ckpt')

Previous: Train/Test ACC: 0.953/0.869
Epoch: 0000 | AvgCost:  0.8999 | Train/Test ACC: 0.955/0.874
Epoch: 0002 | AvgCost:  0.5421 | Train/Test ACC: 0.959/0.880
Epoch: 0004 | AvgCost:  0.7504 | Train/Test ACC: 0.959/0.880
Epoch: 0006 | AvgCost:  1.1248 | Train/Test ACC: 0.959/0.881
Epoch: 0008 | AvgCost:  0.4133 | Train/Test ACC: 0.959/0.882
Epoch: 0010 | AvgCost:  0.5314 | Train/Test ACC: 0.959/0.882
Epoch: 0012 | AvgCost:  0.6451 | Train/Test ACC: 0.958/0.881
Epoch: 0014 | AvgCost:  0.4869 | Train/Test ACC: 0.958/0.882
Epoch: 0016 | AvgCost:  1.7890 | Train/Test ACC: 0.957/0.882
Epoch: 0018 | AvgCost:  0.3451 | Train/Test ACC: 0.957/0.880
Epoch: 0020 | AvgCost:  0.6135 | Train/Test ACC: 0.957/0.881
Epoch: 0022 | AvgCost:  0.7336 | Train/Test ACC: 0.957/0.881
Epoch: 0024 | AvgCost:  0.2813 | Train/Test ACC: 0.956/0.881
Epoch: 0026 | AvgCost:  0.4135 | Train/Test ACC: 0.956/0.880
Epoch: 0028 | AvgCost:  0.3224 | Train/Test ACC: 0.955/0.879
Epoch: 0030 | AvgCost:  0.3150 | Train/Test ACC

**小结：**

从上面打印的结果可以看到，Train ACC 变化不大（因为继续训练用的不是训练集的数据），而 Test ACC 从 0.869 提升到了 0.88 左右，说明 Online-Learning 是有意义的。需要注意：继续训练所用的数据中有一部分正是从测试集中抽取的，因此 Test ACC 的提升有一部分来自模型已经见过这些测试样本，并不完全代表泛化能力的提升。

### References

* [python - In TensorFlow is there any way to just initialize uninitialised variables? - Stack Overflow](http://stackoverflow.com/questions/35164529/in-tensorflow-is-there-any-way-to-just-initialize-uninitialised-variables/37291254)