In [None]:
import tensorflow as tf
import numpy as np
import os
import time
import datetime
from functions_helpers import *
from TextCNN import *

# Dataset location and embedding size.
data_folder = 'twitter-datasets/'
embeddings_dim = 20

# Every input artefact lives directly inside `data_folder`; build all the
# paths in one pass by prefixing each file name with the folder.
(pos_train_file,
 neg_train_file,
 vocab_pickle,
 cooc_pickle,
 embeddings_file) = (
    data_folder + basename
    for basename in ('pos_train.txt',
                     'neg_train.txt',
                     'vocab.pkl',
                     'cooc.pkl',
                     'embeddings.npy')
)

# Convolution settings. `filter_sizes` must stay a comma-separated string
# (not a list of ints) because it is registered below as a string flag.
filter_sizes = "3,4,5"
num_filters = 128

# Parameters
# ==================================================
# Each tunable is registered as a tf.flags flag. Path and model defaults come
# from the constants defined earlier in this cell; the help strings describe
# the flag they are attached to.

# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")
tf.flags.DEFINE_string("positive_data_file", pos_train_file, "Data source for the positive data.")
tf.flags.DEFINE_string("negative_data_file", neg_train_file, "Data source for the negative data.")
tf.flags.DEFINE_string("vocab_file", vocab_pickle, "Pickle file containing the vocabulary.")
tf.flags.DEFINE_string("embeddings_file", embeddings_file, "NumPy .npy file containing the embeddings.")


# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", embeddings_dim,
                        "Dimensionality of the embeddings (default: {})".format(embeddings_dim))
tf.flags.DEFINE_string("filter_sizes", filter_sizes, "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", num_filters, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)")
tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
# NOTE(review): `_parse_flags()` and `__flags` are private members of the
# tf.flags object; they presumably only exist on early TensorFlow 1.x
# releases -- confirm against the pinned TensorFlow version before upgrading.
FLAGS._parse_flags()
# Dump every flag (name upper-cased, sorted alphabetically) so the run
# configuration is recorded in the cell output.
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================
# Loads the labelled samples, the vocabulary and the embeddings, shuffles the
# samples with a fixed seed, and holds out the last
# `FLAGS.dev_sample_percentage` fraction of the shuffled data as the dev set.

# Load data
print("Loading data...")
x_train, y_train = load_data_label(FLAGS.positive_data_file, FLAGS.negative_data_file)
print('X_train shape', x_train.shape,'Y_train shape',  y_train.shape)
# Write the loaded arrays as text files into the current working directory.
np.savetxt('x_train_padded.txt', x_train)
np.savetxt('y_train_padded.txt', y_train)

# load_vocab
# NOTE(review): `load_pickle` presumably unpickles the file; only use it on
# trusted files, since unpickling can execute arbitrary code.
vocab = load_pickle(FLAGS.vocab_file)
embeddings = np.load(FLAGS.embeddings_file)


print(embeddings.shape)
print(len(vocab))

# Randomly shuffle data
# Fixed seed so the permutation (and therefore the split) is reproducible.
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y_train)))
x_shuffled = x_train[shuffle_indices]
y_shuffled = y_train[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
num_samples = len(y_shuffled)
num_dev = int(FLAGS.dev_sample_percentage * float(num_samples))
# Same value the original cell exposed under this name; kept in case a later
# cell reads it. It is NOT used for slicing: when `num_dev` is 0 the negative
# index collapses to 0, which would leave the training set empty and move
# every sample into the dev set.
dev_sample_index = -1 * num_dev
# Slice at a non-negative position instead, so a zero-sized dev set keeps all
# samples for training. Identical to the negative index whenever num_dev > 0.
split_index = num_samples - num_dev
x_train, x_dev = x_shuffled[:split_index], x_shuffled[split_index:]
y_train, y_dev = y_shuffled[:split_index], y_shuffled[split_index:]
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))



Parameters:
ALLOW_SOFT_PLACEMENT=True
BATCH_SIZE=64
CHECKPOINT_EVERY=100
DEV_SAMPLE_PERCENTAGE=0.1
DROPOUT_KEEP_PROB=0.5
EMBEDDING_DIM=20
EMBEDDINGS_FILE=twitter-datasets/embeddings.npy
EVALUATE_EVERY=100
FILTER_SIZES=3,4,5
L2_REG_LAMBDA=0.0
LOG_DEVICE_PLACEMENT=False
NEGATIVE_DATA_FILE=twitter-datasets/neg_train.txt
NUM_EPOCHS=200
NUM_FILTERS=128
POSITIVE_DATA_FILE=twitter-datasets/pos_train.txt
VOCAB_FILE=twitter-datasets/vocab.pkl

Loading data...
