In [1]:
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
import numpy as np
import talos as ta

Using TensorFlow backend.


In [2]:
# Load the pickled corpus DataFrame; later cells read its "Data_type",
# "language" and "Class" columns.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
Corpus = pd.read_pickle("../out/MLDoc/Shuffled_RS168_tagged_englishAndchinese_corpus_pd.pkl")

In [4]:
# Class distribution of the Chinese documents in the test split.
Corpus.loc[
    (Corpus["Data_type"] == "test") & (Corpus["language"] == "Chinese"),
    "Class",
].value_counts()

MCAT    1253
ECAT    1215
CCAT    1169
GCAT     363
Name: Class, dtype: int64

In [3]:
def read_theta(theta_file):
    """Read a space-delimited theta file into a list of token lists.

    Every line of the file becomes one inner list: the line is stripped of
    its newline, split on single spaces, and the final token is discarded
    (presumably because each line ends with a trailing delimiter, which
    leaves an empty last token - TODO confirm against the file writer).

    Args:
        theta_file: path of the text file to read.

    Returns:
        A list with one entry per line; each entry is the list of that
        line's tokens as strings (no numeric conversion happens here).
    """
    delimiter = " "
    # The context manager closes the handle even when reading fails; the
    # previous version opened the file and never closed it.
    with open(theta_file, "r") as file:
        return [each_theta.strip("\n").split(delimiter)[:-1] for each_theta in file]

def transform_pd_to_numpy_data(pd_Corpus, test_language="Chinese"):
    """Turn the corpus DataFrame into numpy features and one-hot labels.

    Args:
        pd_Corpus: DataFrame with the columns "Data_type" (rows are selected
            for the values "train", "dev" and "test"), "language", "Class"
            and "theta" (one feature vector per row).
        test_language: value of the "language" column the test split is
            restricted to. Defaults to "Chinese", which was previously
            hard-coded; pass None to keep test rows of every language.

    Returns:
        A tuple (Data_Object, class_dictionary). Data_Object maps "x_train",
        "y_train", "x_dev", "y_dev", "x_test" and "y_test" to arrays, the
        "y_*" entries being one-hot encoded. class_dictionary maps each class
        label to the integer id used for that encoding.
    """
    # groupby sorts its keys, so ids are handed out in sorted label order.
    class_dictionary = {}
    for key in pd_Corpus.groupby('Class').size().index:
        class_dictionary[key] = len(class_dictionary)

    class_size = len(class_dictionary)

    def _features(rows):
        # Stack the per-row theta vectors into one 2-D array.
        return np.array(rows["theta"].values.tolist())

    def _one_hot_labels(rows):
        # Map class labels to their ids, then one-hot encode them.
        class_ids = rows["Class"].apply(lambda x: class_dictionary[x]).tolist()
        return keras.utils.to_categorical(np.array(class_ids), num_classes=class_size)

    # Every selection is made on the DataFrame that was passed in; the old
    # body silently read the notebook-global `Corpus` instead.
    test_mask = pd_Corpus["Data_type"] == "test"
    if test_language is not None:
        test_mask = test_mask & (pd_Corpus["language"] == test_language)

    splits = [
        ("train", pd_Corpus[pd_Corpus["Data_type"] == "train"]),
        ("dev", pd_Corpus[pd_Corpus["Data_type"] == "dev"]),
        ("test", pd_Corpus[test_mask]),
    ]

    Data_Object = {}
    for split_name, rows in splits:
        Data_Object["x_" + split_name] = _features(rows)
        Data_Object["y_" + split_name] = _one_hot_labels(rows)

    return Data_Object, class_dictionary

In [4]:
# Read the theta file for this experiment: one list of string tokens per
# line of the file (see read_theta).
CLTM_theta = read_theta("../out/Experiment_results/CLTM/2018-11-27/10dim-MLDoc-engAndchi-LFLDA10T100I1e-1beta.theta")

In [5]:
# Attach one theta row to every corpus row. This mutates Corpus in place and
# assumes the theta file lists documents in the same order as Corpus -
# TODO confirm.
Corpus["theta"] = CLTM_theta
# Build the train / dev / test arrays and the class-label -> id mapping.
Data_Object, class_dictionary = transform_pd_to_numpy_data(Corpus)

In [6]:
# Sanity check: shape of the test feature array (rows, theta tokens per row).
Data_Object["x_test"].shape

(4000, 10)

In [10]:
import pickle

# Persist the prepared arrays so later runs can reuse them.
# NOTE(review): the file name says "90dim" while the x_test shape shown above
# is (4000, 10) - confirm the name is intended.
with open('MLDoc_90dim_10topics_data_object.pickle', 'wb') as handle:
    pickle.dump(Data_Object, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Keras Model definition

In [13]:
# Baseline classifier: one fully-connected hidden layer of 64 ReLU units,
# dropout, and a softmax output layer.
model = Sequential()
# The first layer must be told the input width. It is read from the training
# features rather than hard-coded so the cell follows the theta file in use.
model.add(Dense(64, activation='relu', input_dim=Data_Object["x_train"].shape[1]))
model.add(Dropout(0.5))
# One output unit per class: the width of the one-hot training labels.
model.add(Dense(Data_Object["y_train"].shape[1], activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
              optimizer=sgd,
              metrics=['accuracy'])

In [None]:
# Train the baseline model, reporting dev-set metrics after every epoch.
# validation_data is passed as the (x_val, y_val) tuple that Keras documents;
# a list can be misread as multiple model inputs by newer Keras releases.
model.fit(Data_Object["x_train"], Data_Object["y_train"],
          epochs=500,
          batch_size=256,
          validation_data=(Data_Object["x_dev"], Data_Object["y_dev"]))

In [95]:
# Evaluate the baseline model on the test split; the result holds the loss
# followed by the compiled metrics (here: accuracy).
score = model.evaluate(
    x=Data_Object["x_test"],
    y=Data_Object["y_test"],
    batch_size=256,
)
score



[0.6689969706535339, 0.747625]

# Talos tuning model

In [7]:
def MLDoc_Predictive_Model(x_train, y_train, x_val, y_val, params):
    """Build, compile and train one candidate classifier for the Talos scan.

    Args:
        x_train: training features; the input width is taken from
            x_train.shape[1].
        y_train: one-hot training labels; the number of output units is
            taken from y_train.shape[1].
        x_val: validation features.
        y_val: one-hot validation labels.
        params: mapping with the keys 'first_neuron', 'dropout',
            'learning_rate', 'decay_rate', 'epochs_num' and 'batch_size_num'.

    Returns:
        A tuple (out, model): the value returned by model.fit and the
        trained model itself.
    """
    # Values are cast explicitly so they reach Keras as plain int / float.
    first_neuron = int(params['first_neuron'])
    dropout = float(params['dropout'])
    learning_rate = float(params['learning_rate'])
    decay_rate = float(params['decay_rate'])
    epochs_num = int(params['epochs_num'])
    batch_size_num = int(params['batch_size_num'])

    model = Sequential()
    # Hidden layer: `first_neuron` fully-connected ReLU units. The first layer
    # must be told the input width, which is read from the training data.
    model.add(Dense(first_neuron, activation='relu', input_dim=x_train.shape[1]))
    model.add(Dropout(dropout))
    # One softmax unit per class instead of a hard-coded 4.
    model.add(Dense(y_train.shape[1], activation='softmax'))

    sgd = SGD(lr=learning_rate, decay=decay_rate, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])

    # validation_data is the (x_val, y_val) tuple documented by Keras.
    out = model.fit(x_train, y_train,
                    epochs=epochs_num,
                    batch_size=batch_size_num,
                    verbose=0,
                    validation_data=(x_val, y_val))

    return out, model

In [8]:
# Hyper-parameter search space handed to ta.Scan; the keys match those read
# by MLDoc_Predictive_Model. Lists enumerate the candidate values.
# NOTE(review): the tuples are presumably Talos range specs of the form
# (min, max, steps) - confirm against the Talos version in use.
p = {'learning_rate': (0.01, 0.1, 5),
     'decay_rate': [1e-6, 1e-5, 1e-4],
     'first_neuron': [8, 16, 32, 64, 128],
     'batch_size_num': [32, 64, 128, 256, 512],
     'epochs_num': [500],
     'dropout': (0, 0.50, 5)}

In [9]:
# Run the Talos scan: train on the train split, validate on the dev split.
# grid_downsample=0.05 presumably keeps a 5% sample of the parameter grid
# (the recorded run trained 93 configurations) - confirm in the Talos docs.
# NOTE(review): dataset_name says "90dim" although the features used here are
# 10-dimensional - confirm the name is intended.
h = ta.Scan(x=Data_Object["x_train"], y=Data_Object["y_train"],
          x_val=Data_Object["x_dev"],y_val=Data_Object["y_dev"],
          params=p,
          dataset_name='MLDoc_90dim_10topics',
          experiment_no='1',
          model=MLDoc_Predictive_Model,
          grid_downsample=0.05)

100%|██████████| 93/93 [38:47<00:00, 20.06s/it]

Scan Finished!





In [10]:
# Pick the scan round with the highest validation accuracy.
# The old expression `Series.argmax() - 1` only worked while pandas' argmax
# returned the index *label* (deprecated behaviour) and that label was
# 1-based. On current pandas argmax is positional, so the "- 1" selects the
# wrong model and even wraps to the last one when the best row is first.
# A positional argmax over the raw values does not depend on the index.
# Assumes h.saved_models is ordered like the rows of h.data - TODO confirm.
best_model_id = int(h.data['val_acc'].astype('float').values.argmax())
print(h.saved_models[best_model_id])

{"class_name": "Sequential", "config": {"name": "sequential_1", "layers": [{"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "batch_input_shape": [null, 10], "dtype": "float32", "units": 128, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.4, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 4, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_i

In [11]:
# Rebuild the selected architecture from its JSON description and restore
# the weights recorded for the same scan round.
best_model = keras.models.model_from_json(h.saved_models[best_model_id])
best_model.set_weights(h.saved_weights[best_model_id])
# The model is compiled here so that evaluate() can report loss and accuracy.
best_model.compile(
    optimizer="sgd",
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [12]:
# Evaluate the tuned model on the test split (loss followed by accuracy).
best_model.evaluate(
    x=Data_Object["x_test"],
    y=Data_Object["y_test"],
    batch_size=256,
)



[0.6402635846138001, 0.7855000009536743]