In [None]:
import json

from torchmoji.sentence_tokenizer import SentenceTokenizer
from torchmoji.model_def import torchmoji_feature_encoding
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

In [None]:
from __future__ import division, unicode_literals

In [None]:
import numpy as np

In [None]:
from tqdm import tqdm_notebook

In [None]:
# Sample sentences used to try out the tokenizer and the encoder below.
TEST_SENTENCES = ['I love mom\'s cooking',
                  'I love how you never reply back..',
                  'I love cruising with my homies',
                  'I love messing with yo mind!!',
                  'I love you and now you\'re just gone..',
                  'This is shit',
                  'This is the shit']

# Second argument handed to SentenceTokenizer (presumably the maximum
# sentence length in tokens -- TODO confirm against torchmoji).
maxlen = 30
# NOTE(review): batch_size is assigned but never read in the visible cells.
batch_size = 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
# The vocabulary is stored as JSON at VOCAB_PATH.
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
# tokenize_sentences returns three values; only the first one is used here.
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

print('Loading model from {}.'.format(PRETRAINED_PATH))
# `model` is reused by every later cell that encodes sentences.
model = torchmoji_feature_encoding(PRETRAINED_PATH)
print(model)

In [None]:
# Show the sample sentences. The call form prints the same text as the old
# `print x` statement on Python 2 and is also valid on Python 3.
print(TEST_SENTENCES)

In [None]:
print('Encoding texts..')
# Run the already-tokenized sample sentences through the model.
encoding = model(tokenized)

print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
# Print only the first five values so the output matches the message above
# (the whole vector used to be printed).
print(encoding[0][:5])

In [None]:
# Shape of the encoding computed for TEST_SENTENCES.
encoding.shape

In [None]:
# Input directories for the processed datasets. Each path ends with '/'
# because get_sentences builds file names by plain string concatenation.
# Only sent_race_dir is used by the visible cells.
sent_race_dir = '../data/processed/sent_race/'
sent_gender_dir = '../data/processed/sent_gender/'
mention_age_dir = '../data/processed/author_mention_age/'
mention_gender_dir = '../data/processed/author_mention_gender/'

In [None]:
# Upper bound on the number of sentences get_sentences keeps from each split.
total = 100000

In [None]:
def get_sentences(d, limit=None):
    """Load the four sentence splits stored under ``d`` as lists of words.

    ``d`` must contain a ``vocab`` file (one token per line; the zero-based
    line number is the token id) and the files ``pos_pos``, ``pos_neg``,
    ``neg_pos`` and ``neg_neg`` (one sentence per line, written as
    whitespace-separated integer token ids).

    Args:
        d: Directory prefix. File names are appended to it verbatim, so it
            has to end with a path separator.
        limit: Maximum number of sentences read from each split. Defaults to
            the notebook-level ``total``.

    Returns:
        Tuple ``(pos_pos, pos_neg, neg_pos, neg_neg)``. Each element is a
        list of sentences, and each sentence is a list of vocabulary tokens.

    Raises:
        ValueError: if a sentence line contains a non-integer field.
        IndexError: if a token id is outside the vocabulary.
    """
    if limit is None:
        limit = total

    with open(d + 'vocab', 'r') as f:
        # A real list (not a lazy ``map``) so ids can index it on Python 3 too.
        vocab = [line.strip() for line in f]

    def _load_split(name):
        # Truncate before parsing so at most ``limit`` lines are converted.
        with open(d + name, 'r') as f:
            lines = f.readlines()[:limit]
        # ``split()`` without an argument tolerates trailing or repeated
        # whitespace, which ``split(' ')`` turned into int('') errors.
        return [[vocab[int(idx)] for idx in line.split()] for line in lines]

    return tuple(_load_split(name)
                 for name in ('pos_pos', 'pos_neg', 'neg_pos', 'neg_neg'))

In [None]:
# Rebuild the tokenizer with a larger second argument for the dataset
# sentences. This overwrites the `maxlen` and `st` globals created for the
# TEST_SENTENCES demo earlier in the notebook.
maxlen = 150
# NOTE(review): batch_size is assigned but never read in the visible cells.
batch_size = 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
# The vocabulary JSON is read again, so this cell does not depend on the
# earlier demo cell having been run.
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)

In [None]:
def sent_join(sents):
    """Join tokenized sentences into space-separated text strings.

    Args:
        sents: Iterable of sentences, each an iterable of tokens. Byte-string
            tokens are decoded as UTF-8; tokens that are already text are
            used unchanged.

    Returns:
        List of joined sentences in input order. A sentence containing a
        token that cannot be decoded is printed and left out, so the result
        may be shorter than ``sents``.
    """
    joined = []
    for s in sents:
        try:
            words = [x.decode('utf-8') if isinstance(x, bytes) else x
                     for x in s]
        except UnicodeError:
            # Best effort: report the undecodable sentence and skip it.
            # Any other exception type now propagates instead of being hidden.
            print(s)
            continue
        joined.append(' '.join(words))
    return joined

In [None]:
# Load the four splits stored under sent_race_dir as lists of token lists.
pos_pos, pos_neg, neg_pos, neg_neg = get_sentences(sent_race_dir)

In [None]:
# Replace each split's token lists with joined sentence strings.
# NOTE(review): the names are overwritten in place, so running this cell a
# second time feeds already-joined strings back into sent_join. Re-run the
# get_sentences cell first if this cell needs to be repeated.
pos_pos = sent_join(pos_pos)
pos_neg = sent_join(pos_neg)
neg_pos = sent_join(neg_pos)
neg_neg = sent_join(neg_neg)

In [None]:
# Destination directory for the .txt and .npy files written further down.
# Nothing in the visible cells creates it, so it has to exist already.
out_dir = '../data/orig_sent_race'

In [None]:
def batch_encode(in_data, bs_size):
    """Encode ``in_data`` chunk by chunk with the notebook's tokenizer and model.

    Each chunk of ``bs_size`` sentences is tokenized with the notebook-level
    ``st`` and passed through the notebook-level ``model``. The per-sentence
    outputs are collected in input order.

    Args:
        in_data: Sequence of sentences; must support ``len()`` and slicing.
        bs_size: Number of sentences per chunk (positive integer).

    Returns:
        ``np.ndarray`` built from the list of per-sentence encodings.
    """
    encoded_data = []
    # `tqdm_notebook` is the progress bar this notebook imports. The bare name
    # `tqdm` is never imported, so using it raised NameError.
    for i in tqdm_notebook(range(0, len(in_data), bs_size)):
        tokenized, _, _ = st.tokenize_sentences(in_data[i: i + bs_size])
        encoded_data.extend(model(tokenized))
    return np.array(encoded_data)

In [None]:
# Peek at the first two joined sentences of the pos_pos split.
pos_pos[:2]

In [None]:
# Trial run: encode the first 5000 pos_pos sentences in chunks of 1000.
temp = batch_encode(pos_pos[:5000], 1000)

In [None]:
# Shape of the array returned by the trial run.
temp.shape

In [None]:
# Encoding of the first sentence from the trial run.
temp[0]

In [None]:
# Number of sentences left in pos_pos after joining.
len(pos_pos)

In [None]:
# Encode every split and store the result as <out_dir>/<name>.npy.
for d, name in zip([pos_pos, pos_neg, neg_neg, neg_pos], ['pos_pos', 'pos_neg', 'neg_neg', 'neg_pos']):
    encoded_data = batch_encode(d, bs_size=1000)
    # Save the encodings just computed for this split. The code used to save
    # `encoding` (the TEST_SENTENCES demo output), so all four files held the
    # same unrelated array.
    np.save(out_dir + '/{}.npy'.format(name), encoded_data)

In [None]:
def save_file(data, file_name):
    """Write ``data`` to ``file_name`` as UTF-8 text, one entry per line.

    Args:
        data: Iterable of sentences (text, or UTF-8 encoded byte strings).
        file_name: Path of the file to create or overwrite.

    Returns:
        List of the entries that were written, in order. Entries that cannot
        be represented as UTF-8 are skipped in both the file and the list, so
        the two always stay aligned line for line.

    NOTE(review): the previous version wrote ``line.encode('utf-8') + '\\n'``
    to a text-mode file inside a bare ``except: pass``. On Python 2 that
    silently dropped every line containing a non-ASCII character, and on
    Python 3 it dropped every line. If ASCII-only output was intended, filter
    for it explicitly at the call site.
    """
    import io
    written = []
    with io.open(file_name, "w", encoding="utf-8") as my_file:
        for line in data:
            try:
                # The file object does the UTF-8 encoding, so hand it text.
                text = line.decode('utf-8') if isinstance(line, bytes) else line
                my_file.write(text + u'\n')
            except UnicodeError:
                # Best effort: skip entries that are not valid UTF-8 text.
                continue
            written.append(line)
    return written

In [None]:
# Write each split to a text file. save_file returns the lines it actually
# wrote; those are kept so the encodings below match the files line for line.
new_pos_pos = save_file(pos_pos, out_dir + '/pos_pos.txt')
new_pos_neg = save_file(pos_neg, out_dir + '/pos_neg.txt')
new_neg_pos = save_file(neg_pos, out_dir + '/neg_pos.txt')
new_neg_neg = save_file(neg_neg, out_dir + '/neg_neg.txt')

In [None]:
# Encode the lines that were written to the text files and store one .npy
# array per split. neg_pos is processed last, so `encoded_data` holds its
# encodings once the loop finishes.
for name, d in (
    ('pos_pos', new_pos_pos),
    ('pos_neg', new_pos_neg),
    ('neg_neg', new_neg_neg),
    ('neg_pos', new_neg_pos),
):
    encoded_data = batch_encode(d, bs_size=1000)
    np.save(out_dir + '/{}.npy'.format(name), encoded_data)

In [None]:
# First encoding of the last split handled by the loop above (neg_pos).
encoded_data[0]