In [1]:
from __future__ import print_function, division
import example_helper
import json
import csv
import numpy as np
from deepmoji.sentence_tokenizer import SentenceTokenizer
from deepmoji.model_def import deepmoji_emojis
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
import sys

Using Theano backend.


# Get Tweets

In [25]:
import configparser
import warnings
from pymongo import MongoClient

warnings.filterwarnings("ignore", category=DeprecationWarning)

config = configparser.ConfigParser()
config.read('config.ini')
host = config['DEFAULT']['IP']
port = config['DEFAULT']['MongoDB-Port']

client = MongoClient(host, int(port))
col = client['Twitter']['twitter-richard-2017']

In [31]:
cursor = col.find({'$text': {'$search': '"family violence"'}}, no_cursor_timeout=True)

In [None]:
TEST_SENTENCES = []
print('Retrieving Tweets...\n')
n=0
for tweet in cursor:
    try:
        TEST_SENTENCES.append(tweet['doc']['text'].strip().decode('utf-8'))
        n += 1
        if n % 100 == 0:
            print(n, 'finished')
    except:
        continue

In [33]:
# remove meaningless tweets
for s in TEST_SENTENCES:
    tokens = s.split(' ')
    for token in tokens:
        if token.startswith('@') or token.startswith('#') or token.startswith('http'):
            del token
    if len(tokens) == 0:
        del s

# Predict

In [36]:
import emoji

OUTPUT_PATH = 'output/twitter-scores.csv'


def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


maxlen = 30
batch_size = 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
model.summary()

print('Running predictions.')
prob = model.predict(tokenized)

# Find top emojis for each sentence. Emoji ids (0-63)
# correspond to the mapping in emoji_overview.png
# at the root of the DeepMoji repo.
print('Writing results to {}'.format(OUTPUT_PATH))
scores = []
for i, t in enumerate(TEST_SENTENCES):
    t_tokens = tokenized[i]
    t_score = [t]
    t_prob = prob[i]
    ind_top = top_elements(t_prob, 5)
    t_score.append(sum(t_prob[ind_top]))
    t_score.extend(ind_top)
    t_score.extend([t_prob[ind] for ind in ind_top])
    scores.append(t_score)

with open(OUTPUT_PATH, 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', lineterminator='\n')
    writer.writerow(['Text', 'Top5%',
                     'Emoji_1', 'Emoji_2', 'Emoji_3', 'Emoji_4', 'Emoji_5',
                     'Pct_1', 'Pct_2', 'Pct_3', 'Pct_4', 'Pct_5'])
    for i, row in enumerate(scores):
        try:
            writer.writerow(row)
        except Exception:
            print("Exception at row {}!".format(i))

Tokenizing using dictionary from /Users/wky/Desktop/projects/media-cloud/DeepMoji/model/vocabulary.json
Loading model from /Users/wky/Desktop/projects/media-cloud/DeepMoji/model/deepmoji_weights.hdf5.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 30, 256)      12800000    input_8[0][0]                    
__________________________________________________________________________________________________
activation_8 (Activation)       (None, 30, 256)      0           embedding[0][0]                  
__________________________________________________________________________________________________
bi_lstm

In [35]:
len(TEST_SENTENCES)

1280