In [16]:
import numpy as np
import pandas as pd
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import matplotlib.pyplot as plt
import emoji

from sys import path as pylib #im naming it as pylib so that we won't get confused between os.path and sys.path 
import os
pylib += [os.path.abspath(r'/home/Jay/Notebooks/Group9_emotion_detection/torchMoji')]

In [40]:
data = pd.read_csv('../../deepmoji/data/train.txt', sep = '\t')

In [3]:
label2emotion = {0: "others", 1: "happy", 2: "sad", 3: "angry"}
emotion2label = {"others": 0, "happy": 1, "sad": 2, "angry": 3}

emoticons_additional = {
    '(^・^)': '<happy>', ':‑c': '<sad>', '=‑d': '<happy>', ":'‑)": '<happy>', ':‑d': '<laugh>',
    ':‑(': '<sad>', ';‑)': '<happy>', ':‑)': '<happy>', ':\\/': '<sad>', 'd=<': '<annoyed>',
    ':‑/': '<annoyed>', ';‑]': '<happy>', '(^�^)': '<happy>', 'angru': 'angry', "d‑':":
        '<annoyed>', ":'‑(": '<sad>', ":‑[": '<annoyed>', '(�?�)': '<happy>', 'x‑d': '<laugh>',
}

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
              'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter",
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons, emoticons_additional]
)


def tokenize(text):
    text = " ".join(text_processor.pre_process_doc(text))
    return text

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [4]:
x1=[]
x2=[]
x3=[]
y=[]
labels={}
i=0
for ind, row in data.iterrows():
    if row['label'] not in labels:
        labels[row['label']]=i
        i+=1
    y.append(labels[row['label']])
    x1.append(tokenize(row['turn1']))
    x2.append(tokenize(row['turn2']))
    x3.append(tokenize(row['turn3']))

In [5]:
test_data = pd.read_csv('../../deepmoji/data/test.txt', sep = '\t')
test_x1=[]
test_x2=[]
test_x3=[]
test_y=[]
test_labels={}
i=0
for ind, row in test_data.iterrows():
    if row['label'] not in test_labels:
        test_labels[row['label']]=i
        i+=1
    test_y.append(test_labels[row['label']])
    test_x1.append(tokenize(row['turn1']))
    test_x2.append(tokenize(row['turn2']))
    test_x3.append(tokenize(row['turn3']))

In [6]:
from __future__ import print_function, division, unicode_literals
import json

from torchmoji.sentence_tokenizer import SentenceTokenizer
from torchmoji.model_def import torchmoji_feature_encoding
from torchmoji.model_def import torchmoji_emojis
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

def encode_emoji(x):
    model = torchmoji_emojis(PRETRAINED_PATH)
    maxlen = 30
    batch_size = 32

    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)
    tokenized, _, _ = st.tokenize_sentences(x)
    print('Loading model from {}.'.format(PRETRAINED_PATH))
    print(model)
    print('Running predictions.')
    
    encoding = np.zeros((len(x), 64))
    for i in range(0, len(x), 300):    
        encoding[i:i+300] = model(tokenized[i:i+300])
    return encoding

In [11]:
y=np.array(y)
idx_others=np.where(y==0)[0]
x1 = np.array(x1)
x2 = np.array(x2)
x3 = np.array(x3)

In [61]:
x_all = np.c_[x1,x2,x3]
x_all = [' <eos> \n '.join(x) for x in x_all]

In [62]:
x_all[0]

"do not worry i am girl <eos> \n hmm how do i know if you are <eos> \n what ' s ur name ?"

In [12]:
x1_emoji = encode_emoji(x1[idx_others])
x2_emoji = encode_emoji(x2[idx_others])
x3_emoji = encode_emoji(x3[idx_others])

Tokenizing using dictionary from /home/Jay/Notebooks/Group9_emotion_detection/torchMoji/model/vocabulary.json
Loading model from /home/Jay/Notebooks/Group9_emotion_detection/torchMoji/model/pytorch_model.bin.
TorchMoji(
  (embed): Embedding(50000, 256)
  (embed_dropout): Dropout2d(p=0)
  (lstm_0): LSTMHardSigmoid(256, 512, batch_first=True, bidirectional=True)
  (lstm_1): LSTMHardSigmoid(1024, 512, batch_first=True, bidirectional=True)
  (attention_layer): Attention(2304, return attention=False)
  (final_dropout): Dropout(p=0)
  (output_layer): Sequential(
    (0): Linear(in_features=2304, out_features=64, bias=True)
    (1): Softmax()
  )
)
Running predictions.


  input = module(input)


Tokenizing using dictionary from /home/Jay/Notebooks/Group9_emotion_detection/torchMoji/model/vocabulary.json
Loading model from /home/Jay/Notebooks/Group9_emotion_detection/torchMoji/model/pytorch_model.bin.
TorchMoji(
  (embed): Embedding(50000, 256)
  (embed_dropout): Dropout2d(p=0)
  (lstm_0): LSTMHardSigmoid(256, 512, batch_first=True, bidirectional=True)
  (lstm_1): LSTMHardSigmoid(1024, 512, batch_first=True, bidirectional=True)
  (attention_layer): Attention(2304, return attention=False)
  (final_dropout): Dropout(p=0)
  (output_layer): Sequential(
    (0): Linear(in_features=2304, out_features=64, bias=True)
    (1): Softmax()
  )
)
Running predictions.
Tokenizing using dictionary from /home/Jay/Notebooks/Group9_emotion_detection/torchMoji/model/vocabulary.json
Loading model from /home/Jay/Notebooks/Group9_emotion_detection/torchMoji/model/pytorch_model.bin.
TorchMoji(
  (embed): Embedding(50000, 256)
  (embed_dropout): Dropout2d(p=0)
  (lstm_0): LSTMHardSigmoid(256, 512, batc

In [14]:
def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]

EMOJIS = ":joy: :unamused: :weary: :sob: :heart_eyes: \
:pensive: :ok_hand: :blush: :heart: :smirk: \
:grin: :notes: :flushed: :100: :sleeping: \
:relieved: :relaxed: :raised_hands: :two_hearts: :expressionless: \
:sweat_smile: :pray: :confused: :kissing_heart: :heartbeat: \
:neutral_face: :information_desk_person: :disappointed: :see_no_evil: :tired_face: \
:v: :sunglasses: :rage: :thumbsup: :cry: \
:sleepy: :yum: :triumph: :hand: :mask: \
:clap: :eyes: :gun: :persevere: :smiling_imp: \
:sweat: :broken_heart: :yellow_heart: :musical_note: :speak_no_evil: \
:wink: :skull: :confounded: :smile: :stuck_out_tongue_winking_eye: \
:angry: :no_good: :muscle: :facepunch: :purple_heart: \
:sparkling_heart: :blue_heart: :grimacing: :sparkles:".split(' ')

In [47]:
data_others = data[data['label']=='others']

In [48]:
turn1=[]
turn2=[]
turn3=[]

for i, text in enumerate(data_others['turn1'].values):
    emoji_ids = top_elements(x1_emoji[i], 5)
    emojis = map(lambda x: EMOJIS[x], emoji_ids)
    turn1.append(emoji.emojize("{} {}".format(text,' '.join(emojis)), use_aliases=True))

for i, text in enumerate(data_others['turn2'].values):
    emoji_ids = top_elements(x2_emoji[i], 5)
    emojis = map(lambda x: EMOJIS[x], emoji_ids)
    turn2.append(emoji.emojize("{} {}".format(text,' '.join(emojis)), use_aliases=True))

for i, text in enumerate(data_others['turn3'].values):
    emoji_ids = top_elements(x3_emoji[i], 5)
    emojis = map(lambda x: EMOJIS[x], emoji_ids)
    turn3.append(emoji.emojize("{} {}".format(text,' '.join(emojis)), use_aliases=True))
    #print(emoji.emojize("{} {}".format(x1[i],' '.join(emojis)), use_aliases=True))

data_others['turn1'] = turn1
data_others['turn2'] = turn2
data_others['turn3'] = turn3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [51]:
data_others[['turn1','turn2','turn3']]

Unnamed: 0,turn1,turn2,turn3
0,Don't worry I'm girl 💁 ✋ 🙅 😉 👊,hmm how do I know if you are 👀 😏 😕 😐 🙊,What's ur name? 👀 😳 😏 😂 😕
2,By ✌ 🎵 💔 💜 💓,by Google Chrome 👍 👌 🎵 😍 🔫,Where you live 👀 🎵 😈 😕 😩
4,Just for time pass 🙏 😌 😏 😈 😅,wt do u do 4 a living then ☺ ✌ 🎵 🎶 💔,Maybe 😏 🎵 🙊 😌 😕
5,I'm a dog person 😈 💁 😊 😎 😌,youre so rude ✋ 😠 😡 😤 😒,Whaaaat why 🎵 ☺ 🎶 💔 😢
6,So whatsup 😎 😏 ✌ 😈 😜,Nothing much. Sitting sipping and watching TV....,What are you watching on tv? 😳 👀 😐 😕 😂
7,Ok 😐 👌 👍 ✌ 😕,ok im back!! ☺ 💓 💛 😉 ✌,"So, how are u 👀 ☺ 😊 😄 😌"
8,Really? 😑 😒 😐 😠 😡,really really really really really 🎵 🎶 🙊 🙈 ☺,Y saying so many times...i can hear you ☺ 👀 💓 😉 💔
9,Bay ✌ ✋ 😘 💓 💛,in the bay 🎵 😎 ✌ 🎶 😌,😘 love you 💓 💛 ❤ 😘 💜
11,I will do night. 😴 ✌ 😌 👌 ✨,Alright. Keep me in loop. ✌ 😐 👌 😌 😬,Not giving WhatsApp no. 🙅 ✋ 😠 😡 😑
12,Sure go ahead 👍 👌 ✌ 😉 😌,Many thanks once again! 😊 👍 😄 🙏 ☺,Love you too 😘 💓 💛 💕 ❤
