In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('Comment-Emoji Dataset.csv')
data

Unnamed: 0.1,Unnamed: 0,TEXT,Label
0,0,Vacation wasted ! #vacation2017 #photobomb #ti...,0
1,1,"Oh Wynwood, you’re so funny! : @user #Wynwood ...",1
2,2,Been friends since 7th grade. Look at us now w...,2
3,3,This is what it looks like when someone loves ...,3
4,4,RT @user this white family was invited to a Bl...,3
...,...,...,...
69995,69995,"Yes, I call Galina ""my Bubie"" Go follow my bea...",3
69996,69996,"I SEA you, Seattle @ Ballard Seafood Festival\n",16
69997,69997,If one of my daughters is wearing this and ask...,2
69998,69998,Guess who whoop people on THEIR homecoming?! #...,3


In [3]:
from sklearn.model_selection import train_test_split
X = data['TEXT']
Y = data['Label']

In [4]:
X_train , X_test , Y_train , Y_test = train_test_split(X,Y, random_state=5,test_size=0.3)

In [7]:
X.shape, Y.shape, mappings.shape

((70000,), (70000,), (20, 3))

In [6]:
mappings = pd.read_csv("emoji-mapping.csv")
mappings.head()

Unnamed: 0.1,Unnamed: 0,emoticons,number
0,0,😜,0
1,1,📸,1
2,2,😍,2
3,3,😂,3
4,4,😉,4


In [8]:
from nltk.corpus import stopwords

In [9]:
stop_words = stopwords.words("english")
stop_words[:5]

['i', 'me', 'my', 'myself', 'we']

In [10]:
def tokenize(tweets):
    stop_words = stopwords.words("english")
    tokenized_tweets = []
    for tweet in tweets:
        words = tweet.split(" ")
        tokenized_string = ""
        for word in words:
            if word[0] != '@' and word not in stop_words:
                if word[0] == "#":
                    word = word[1:]
                tokenized_string += word + " "
        tokenized_tweets.append(tokenized_string)
    return tokenized_tweets

In [11]:
def encod_tweets(tweets):
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=" ", lower=True)
    tokenizer.fit_on_texts(tweets)
    return tokenizer, tokenizer.texts_to_sequences(tweets)

In [12]:
def format_data(encoded_tweets, max_length, labels):
    x = pad_sequences(encoded_tweets, maxlen= max_length, padding='post')
    y = []
    for emoji in labels:
        bit_vec = np.zeros(20)
        bit_vec[emoji] = 1
        y.append(bit_vec)
    y = np.asarray(y)
    return x, y

In [13]:
def create_weight_matrix(vocab, raw_embeddings):
    vocab_size = len(vocab) + 1
    weight_matrix = np.zeros((vocab_size, 300))
    for word, idx in vocab.items():
        if word in raw_embeddings:
            weight_matrix[idx] = raw_embeddings[word]
    return weight_matrix

In [14]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
def final_model(weight_matrix, vocab_size, max_length, x, y, epochs = 5):
    embedding_layer = Embedding(vocab_size, 300, weights=[weight_matrix], input_length=max_length, trainable=True, mask_zero=True)
    model = Sequential()
    model.add(embedding_layer)
    model.add(Bidirectional(LSTM(128, dropout=0.2, return_sequences=True)))
    model.add(Bidirectional(LSTM(128, dropout=0.2)))
    model.add(Dense(20, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x, y, epochs = epochs, validation_split = 0.25)
    score, acc = model.evaluate(x_test, y_test)
    return model, score, acc

In [16]:
import math

In [17]:
tokenized_tweets = tokenize(X_train)
tokenized_tweets += tokenize(X_test)
max_length = math.ceil(sum([len(s.split(" ")) for s in tokenized_tweets])/len(tokenized_tweets))
tokenizer, encoded_tweets = encod_tweets(tokenized_tweets)
max_length, len(tokenized_tweets)

(10, 70000)

In [18]:
train_length = X_train.shape[0]
test_length = X_test.shape[0]
train_length, test_length

(49000, 21000)

In [19]:
x, y = format_data(encoded_tweets[:train_length], max_length, Y_train)
len(x), len(y)

(49000, 49000)

In [20]:
x_test, y_test = format_data(encoded_tweets[train_length:], max_length, Y_test)
len(x_test), len(y_test)

(21000, 21000)

In [21]:
vocab = tokenizer.word_index
vocab, len(vocab)

({'i': 1,
  '️': 2,
  'love': 3,
  'the': 4,
  '…': 5,
  'new': 6,
  'amp': 7,
  'happy': 8,
  'day': 9,
  'my': 10,
  'one': 11,
  'night': 12,
  'york': 13,
  'time': 14,
  'this': 15,
  'beach': 16,
  'today': 17,
  'good': 18,
  'park': 19,
  'christmas': 20,
  'you': 21,
  'best': 22,
  'like': 23,
  "i'm": 24,
  'california': 25,
  'get': 26,
  'we': 27,
  'a': 28,
  'birthday': 29,
  'last': 30,
  'got': 31,
  'city': 32,
  'beautiful': 33,
  'great': 34,
  'little': 35,
  'see': 36,
  'family': 37,
  'university': 38,
  'thanks': 39,
  'back': 40,
  'thank': 41,
  'life': 42,
  'so': 43,
  'fun': 44,
  'it': 45,
  'much': 46,
  'me': 47,
  'favorite': 48,
  'center': 49,
  'first': 50,
  'when': 51,
  'morning': 52,
  'home': 53,
  'always': 54,
  'tonight': 55,
  'go': 56,
  'us': 57,
  'friends': 58,
  'weekend': 59,
  'florida': 60,
  'amazing': 61,
  'texas': 62,
  'year': 63,
  "it's": 64,
  'lake': 65,
  'girl': 66,
  'know': 67,
  'come': 68,
  'school': 69,
  'look': 70

In [22]:
from gensim.models.keyedvectors import KeyedVectors

In [23]:
embeddings_index = {}
f = open('glove.6B.300d.txt','r',encoding='utf-8')
for line in f:
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray([float(val) for val in values[1:]])
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [24]:
weight_matrix = create_weight_matrix(vocab, embeddings_index)
len(weight_matrix)

85500

In [None]:
model, score, acc = final_model(weight_matrix, len(vocab)+1, max_length, x, y, epochs = 5)
model, score, acc

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

In [None]:
model.summary