# In [3]:
import numpy as np
import lasagne
import theano
import theano.tensor as T
import sys
import batch_char as batch
import pickle as pkl
import io
import os

from t2v import tweet2vec, init_params, load_params
from settings_char import N_BATCH, MAX_LENGTH, MAX_CLASSES


# Theano configuration.
# Extra flag handed to the C++ compiler Theano uses for its generated code
# (presumably to silence the c++11-narrowing diagnostic - confirm on the target toolchain).
theano.config.gcc.cxxflags = "-Wno-c++11-narrowing"
# NOTE(review): theano has already been imported above, and Theano documents
# THEANO_FLAGS as being read at import time, so this assignment likely does not
# change device/floatX for the current process - confirm, and if CPU/float32 is
# required, export the variable before `import theano` runs.
os.environ['THEANO_FLAGS'] = "device=cpu,floatX=float32"


def invert(d):
    """Return a new dict that maps every value of *d* back to its key.

    If several keys share the same value, the key encountered last while
    iterating over *d* is the one kept.
    """
    return {value: key for key, value in d.items()}

def classify(tweet, t_mask, params, n_classes, n_chars):
    """Stack a softmax classification layer on top of the tweet2vec encoder.

    Args:
        tweet: Symbolic input passed through to ``tweet2vec``.
        t_mask: Symbolic mask passed through to ``tweet2vec``.
        params: Parameter mapping; ``params['W_cl']`` and ``params['b_cl']``
            supply the weights and bias of the dense layer, and the whole
            mapping is forwarded to ``tweet2vec``.
        n_classes: Number of units in the softmax output layer.
        n_chars: Character vocabulary size forwarded to ``tweet2vec``.

    Returns:
        A pair ``(class_output, embedding_output)``: the Lasagne outputs of
        the softmax layer and of the embedding layer, in that order.
    """
    # Encoder producing the tweet embedding
    encoder = tweet2vec(tweet, t_mask, params, n_chars)

    # Softmax layer over the label set, initialised from the stored parameters
    softmax_layer = lasagne.layers.DenseLayer(
        encoder,
        n_classes,
        W=params['W_cl'],
        b=params['b_cl'],
        nonlinearity=lasagne.nonlinearities.softmax,
    )

    class_output = lasagne.layers.get_output(softmax_layer)
    embedding_output = lasagne.layers.get_output(encoder)
    return class_output, embedding_output

def main(args):
    """Predict tags and compute embeddings for every line of the input file.

    Reads one text per line from the hard-coded data file, loads the model
    parameters and the character/label dictionaries from the model directory,
    runs the network batch by batch and writes two files to the save
    directory: ``predicted_tags-nlp-t2v.txt`` (the five highest-ranked labels
    per input line, space separated) and ``embeddings-nlp-t2v.npy`` (one
    embedding row per input line).

    Args:
        args: Command-line arguments without the program name. Only
            ``args[3]`` is used: when present it is parsed as an integer
            checkpoint number and ``model-nlp-t2v_<n>.npz`` is loaded instead
            of ``best_model-nlp-t2v.npz``.

    Raises:
        ValueError: If ``args[3]`` is present but is not an integer.
    """
    data_path = "/users/kumaraprasannajayaraju/Downloads/NLP_Final_Project/Method 1/data/life_t2v_ds_en_op.txt"
    model_path = "/users/kumaraprasannajayaraju/Downloads/NLP_Final_Project/Method 1/src"
    save_path = "/users/kumaraprasannajayaraju/Downloads/NLP_Final_Project/Method 1/data"

    # Optional checkpoint number; None selects the best-model file.
    m_num = int(args[3]) if len(args) > 3 else None

    print("Preparing Data...")
    # Test data: one example per line, truncated to the model's maximum length
    with io.open(data_path, 'r', encoding='utf-8') as f:
        Xt = [line.rstrip('\n')[:MAX_LENGTH] for line in f]

    # Model
    print("Loading model params...")
    if m_num is not None:
        params = load_params('%s/model-nlp-t2v_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model-nlp-t2v.npz' % model_path)

    print("Loading dictionaries...")
    # NOTE: unpickling can execute arbitrary code; only load dictionary files
    # that come from a trusted source.
    with open('%s/dict-2-nlp-t2v.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict-2-nlp-t2v.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict) + 1
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print("Compiling theano functions...")
    # A single compiled function returns both outputs, so the shared encoder
    # is evaluated once per batch instead of twice.
    predict_and_encode = theano.function([tweet, t_mask], [predictions, embeddings])

    # Test
    print("Encoding...")
    out_pred = []
    out_emb = []
    # Stepping by N_BATCH never yields an empty slice, so the network is not
    # invoked on an empty batch when len(Xt) is a multiple of N_BATCH (or 0).
    for start in range(0, len(Xt), N_BATCH):
        xr = Xt[start:start + N_BATCH]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p, e = predict_and_encode(x, x_m)
        # Class indices sorted by descending score for every row
        ranks = np.argsort(p)[:, ::-1]

        for idx in range(len(xr)):
            out_pred.append(' '.join(inverse_labeldict.get(r, 'UNK') for r in ranks[idx, :5]))
            out_emb.append(e[idx, :])

    # Save
    print("Saving...")
    # Written as UTF-8 to match the input encoding; labels may be non-ASCII.
    with io.open('%s/predicted_tags-nlp-t2v.txt' % save_path, 'w', encoding='utf-8') as f:
        for item in out_pred:
            f.write(item + '\n')
    with open('%s/embeddings-nlp-t2v.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
        
# Script entry point: when this file is executed directly (not imported),
# call main() with the command-line arguments, excluding the program name.
if __name__ == '__main__':
    main(sys.argv[1:])