In [51]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, f1_score
import sys
import matplotlib.pyplot as plt
import json
import sys
import math
# A list of all emojis
from emojiList import emoji
from gensim.models import Word2Vec as w2v
import multiprocessing
import nltk
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM
from keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers.wrappers import Bidirectional

In [10]:
class word2vec:
    def __init__(self, tweet_file):
        self.file = tweet_file

    def preprocess_tweets(self):
        """ Tokenises all tweets to get words"""

        raw_sentences = []
        tweets = open(self.file, "r")
        for tweet in tweets:
            raw_sentences.append(nltk.word_tokenize(tweet))
        self.sentences = raw_sentences


    def make_model(self):
        """ Model and train the word2vec model on words from tweets"""

        # Define parameters for the w2v model
        num_features = 100
        min_word_count = 3
        num_workers = multiprocessing.cpu_count()
        context_size = 7
        downsampling = 1e-3
        seed = 1

        # Build the model
        self.tweet2vec = w2v(
            sg = 1,
            seed = seed,
            workers = num_workers,
            size = num_features,
            min_count = min_word_count,
            window = context_size,
            sample = downsampling
        )

        # Build the vocabulary
        self.tweet2vec.build_vocab(self.sentences)
        # Train the model
        self.tweet2vec.train(self.sentences, epochs = 10, total_examples = len(self.sentences))

    def run(self):
        self.preprocess_tweets()
        self.make_model()

In [11]:
class getEmojis:
    
    """Class to get Emojis from tweets"""
    def __init__(self, tweet_file):
        self.file = tweet_file

    def read_data(self):
        """Reads all the tweets from the json file"""
        with open(self.file, encoding='utf-8') as data_file:
            self.data = json.loads(data_file.read())

    @staticmethod
    def extract_emojis(s):
        """Given a tweet, returns emjois in it"""
        return ' '.join(c for c in s if c in emoji)

    def find_emojis(self):
        """Find all the emojis in the given data and unique emojis"""
        # Extract the emoji from each tweet and save the unique emoji
        # There is only one unique emoji per tweet
        self.emoji_labels = []
        for i, d in enumerate(self.data):
#             if i > 20000:
#                 break
            emoji_label = self.extract_emojis(d)
            li = np.asarray(list(emoji_label.split(" ")))
            self.emoji_labels.append(np.unique(li))
            
        self.unique_emojis = np.unique(self.emoji_labels)
        self.unique_emojis = (np.array(self.unique_emojis.tolist())[1:]).tolist()
        le = LabelEncoder()
        encoded_labels = le.fit_transform(self.emoji_labels)
        
        f = open('train.text.labels', 'w+')
        for label in encoded_labels:
            f.write("%d\n" % label)
        f.close()
        
    def run(self):
        """Helper function to run all required functions"""
        self.read_data()
        self.find_emojis()

In [12]:
# em = getEmojis("resultdata.json")
w2vec = word2vec("train.txt")
# em.run()
w2vec.run()
# labels = em.emoji_labels

In [15]:
text = open('train.txt')
label = open('train.labels')
train_tweets = []
train_labels = []
for line in text:
    train_tweets.append(line)
for lab in label:
    train_labels.append(lab)
train_labels = list(map(int, train_labels))
text.close()
label.close()

In [16]:
test_tweets = []
test_labels = []
text = open('us_test.text')
label = open('us_test.labels')
for line in text:
    test_tweets.append(line)
for line in label:
    test_labels.append(line)
test_labels = list(map(int, test_labels))

In [17]:
def get_vector(li):
    features = []
    max_len = 0
    for counter, tweet in enumerate(li):
        avg_vec = np.zeros(w2vec.tweet2vec.wv.vector_size)
        max_len = max(max_len, len(tweet))
        for word in tweet:
            if word not in w2vec.tweet2vec.wv.vocab or word in emoji:
                continue
            avg_vec = np.add(avg_vec, w2vec.tweet2vec.wv[word])
        features.append(np.true_divide(avg_vec, len(tweet)))
    return np.asarray(features), max_len

In [18]:
# Get vector representation for each tweet
X_train, max_len_train = get_vector(train_tweets)
X_test, max_len_test = get_vector(test_tweets) 

In [32]:
all_tweets = train_tweets + test_tweets
max_length = math.ceil(sum([len(s.split(" ")) for s in all_tweets])/len(all_tweets))

In [33]:
def encode_docs(tweets):
    #Translate tweets to sequence of numbers
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=" ", lower=True)
    tokenizer.fit_on_texts(tweets)
    return tokenizer, tokenizer.texts_to_sequences(tweets)

In [34]:
def populate_weight_matrix(vocab, raw_embedding):
    # Create weight matrix from pre-trained embeddings
    vocab_size = len(vocab) + 1
    weight_matrix = np.zeros((vocab_size, 100))
    for word, i in vocab.items():
        if word in raw_embedding:
            weight_matrix[i] = raw_embedding[word]
    return weight_matrix

In [42]:
vocab = tokenizer.word_index
weight_matrix = populate_weight_matrix(vocab, w2vec.tweet2vec.wv)

In [41]:
tokenizer, encoded_docs = encode_docs(all_tweets)
temp_train = pad_sequences(encoded_docs[:len(train_tweets)], maxlen=max_length, padding='post')
temp_test = pad_sequences(encoded_docs[-len(test_tweets):], maxlen=max_length, padding='post')

In [53]:
y_train = np_utils.to_categorical(train_labels, 20)
y_test = np_utils.to_categorical(test_labels, 20)
embedding_layer = Embedding(len(vocab) + 1, 100, weights=[weight_matrix], input_length=max_length, trainable=True, mask_zero=True)
model_rnn = Sequential()
model_rnn.add(embedding_layer)
model_rnn.add(Bidirectional(LSTM(128, dropout=0.2, return_sequences=True)))
model_rnn.add(Bidirectional(LSTM(128, dropout=0.2)))
model_rnn.add(Dense(400, activation='relu', input_dim=256))
model_rnn.add(Dense(200, activation='relu'))
model_rnn.add(Dense(20, activation='softmax'))
# model.add(Dense(20, activation='softmax'))
model_rnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_rnn.fit(temp_train, y_train, epochs=1, validation_data=(temp_test, y_test))

Train on 77360 samples, validate on 50000 samples
Epoch 1/1


<keras.callbacks.History at 0x7f8f0948b518>

In [71]:
pred = model_rnn.predict_classes(temp_test)
acc = accuracy_score(test_labels, pred)
f1 = f1_score(test_labels, pred, average='micro')

In [72]:
acc

0.38972

In [73]:
f1

0.38971999999999996

In [77]:
model_rnn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 13, 100)           13255300  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 13, 256)           234496    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense_10 (Dense)             (None, 400)               102800    
_________________________________________________________________
dense_11 (Dense)             (None, 200)               80200     
_________________________________________________________________
dense_12 (Dense)             (None, 20)                4020      
Total params: 14,071,056
Trainable params: 14,071,056
Non-trainable params: 0
________________________________________________________________

In [48]:
model = Sequential()
model.add(Dense(400, activation='relu', input_dim=100))
model.add(Dense(200, activation='relu'))
model.add(Dense(20, activation='softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
y_train = np_utils.to_categorical(train_labels, 20)
y_test = np_utils.to_categorical(test_labels, 20)
model.fit(X_train, y_train,
          epochs=20,
          batch_size=128)
score = model.evaluate(X_test, y_test, batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [74]:
# Predict On Training Set
pred = model.predict_classes(X_test)
acc = accuracy_score(test_labels, pred)
f1 = f1_score(test_labels, pred, average='micro')

In [75]:
acc

0.31286

In [76]:
f1

0.31286