In [1]:
import tensorflow.python.platform
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
import sys
import matplotlib.pyplot as plt
import json
import sys
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# A list of all emojis
from emojiList import emoji
from gensim.models import Word2Vec as w2v
import multiprocessing
import nltk
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD

Using TensorFlow backend.


In [2]:
class word2vec:
    def __init__(self, tweet_file):
        self.file = tweet_file

    def preprocess_tweets(self):
        """ Tokenises all tweets to get words"""

        # Tokenize the sentences
        raw_sentences = []
        tweets = open(self.file, "r")
        for tweet in tweets:
            raw_sentences.append(nltk.word_tokenize(tweet))
        self.sentences = raw_sentences


    def make_model(self):
        """ Model and train the word2vec model on words from tweets"""

        # Define parameters for the w2v model
        num_features = 300
        min_word_count = 3
        num_workers = multiprocessing.cpu_count()
        context_size = 7
        downsampling = 1e-3
        seed = 1

        # Build the model
        self.tweet2vec = w2v(
            sg = 1,
            seed = seed,
            workers = num_workers,
            size = num_features,
            min_count = min_word_count,
            window = context_size,
            sample = downsampling
        )

        # Build the vocabulary
        self.tweet2vec.build_vocab(self.sentences)
        # Train the model
        self.tweet2vec.train(self.sentences, epochs = 10, total_examples = len(self.sentences))

    def run(self):
        self.preprocess_tweets()
        self.make_model()

In [3]:
class getEmojis:
    
    """Class to get Emojis from tweets"""
    def __init__(self, tweet_file):
        self.file = tweet_file

    def read_data(self):
        """Reads all the tweets from the json file"""
        with open(self.file, encoding='utf-8') as data_file:
            self.data = json.loads(data_file.read())

    @staticmethod
    def extract_emojis(s):
        """Given a tweet, returns emjois in it"""
        return ' '.join(c for c in s if c in emoji)

    def find_emojis(self):
        """Find all the emojis in the given data and unique emojis"""
        # Extract the emoji from each tweet and save the unique emoji
        # There is only one unique emoji per tweet
        self.emoji_labels = []
        for i, d in enumerate(self.data):
            if i > 20000:
                break
            emoji_label = self.extract_emojis(d)
            li = np.asarray(list(emoji_label.split(" ")))
            self.emoji_labels.append(np.unique(li))

        self.unique_emojis = np.unique(self.emoji_labels)
        self.unique_emojis = (np.array(self.unique_emojis.tolist())[1:]).tolist()
    
    def run(self):
        """Helper function to run all required functions"""
        self.read_data()
        self.find_emojis()

In [4]:
class MLP:

    def __init__(self, labels, tweet2vec):
        self.labels = labels
        self.tweet2vec = tweet2vec

    def make_data(self):
        en_stopwords = set(stopwords.words('english'))
        snowball_stemmer = SnowballStemmer('english')
        
        # Average w2v of every tweet to be used as a feature
        features = []
        tweets = open("tweet_file", "r")
        for counter, tweet in enumerate(tweets):
            if counter > 20000:
                break
#             without_stopwords = [w for w in tweet.split() if w not in en_stopwords]
#             stemmed = [snowball_stemmer.stem(w) for w in without_stopwords]
#             stemmed = ' '.join(stemmed)
            avg_vec = np.zeros(self.tweet2vec.wv.vector_size)
            for word in tweet:
                if word not in self.tweet2vec.wv.vocab or word in emoji:
                    continue
                avg_vec = np.add(avg_vec, self.tweet2vec.wv[word])
            features.append(np.true_divide(avg_vec, len(tweet)))
        self.features = np.asarray(features)
        tweets.close()

        # One hot encode the labels
        encoder = LabelEncoder()
        encoded_labels = encoder.fit_transform(self.labels)
        # Reshaping into a 2D vector
#         encoded_labels = np.reshape(encoded_labels, (-1, encoded_labels.shape[0]))
#         print(encoded_labels)

#         encoder = OneHotEncoder(handle_unknown='ignore')
        self.one_hot = self.one_hot_labelling(encoded_labels)

    def one_hot_labelling(self, encoded_labels):
        hot = np.zeros((encoded_labels.size, encoded_labels.max()+1))
        hot[np.arange(encoded_labels.size), encoded_labels] = 1
        return hot
    
    def train(self):

        num_epochs = 100
        learningRate = 0.3
        num_labels = np.unique(self.labels).shape[0]
        num_features = self.features.shape[1]
        x = tf.placeholder("float", shape=[None, num_features])
        W = tf.Variable(tf.zeros([num_features, num_labels]))
        b = tf.Variable(tf.zeros([num_labels]))
        y_ = tf.placeholder("float", shape=[None, num_labels])
        features, labels = shuffle(self.features, self.one_hot, random_state = 1)
        # Split the data into testing and training sets
        train_features, test_features, train_labels, test_labels = train_test_split(
            features, labels, test_size=0.2, random_state=42)
        print(train_features.shape)
        print(train_labels.shape)
        print(num_labels)
        weights = {
            'h1': tf.Variable(tf.truncated_normal([num_features, 60])),
            'h2': tf.Variable(tf.truncated_normal([60, 60])),
            'h3': tf.Variable(tf.truncated_normal([60, 60])),
            'h4': tf.Variable(tf.truncated_normal([60, 60])),
            'out': tf.Variable(tf.truncated_normal([60, num_labels]))
        }

        biases = {
            'b1': tf.Variable(tf.truncated_normal([60])),
            'b2': tf.Variable(tf.truncated_normal([60])),
            'b3': tf.Variable(tf.truncated_normal([60])),
            'b4': tf.Variable(tf.truncated_normal([60])),
            'out': tf.Variable(tf.truncated_normal([num_labels]))
        }

        model_path = "./model"
        init = tf.initialize_all_variables()

        saver = tf.train.Saver()

        y = self.multilayer_perceptron(x, weights, biases)
        # y = _generateTensorLayers(x, weights, biases)

        cost_function = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
        training_step = tf.train.GradientDescentOptimizer(
            learningRate).minimize(cost_function)

        sess = tf.Session()
        sess.run(init)
        mse_history = []
        accuracy_history = []
        cost_history = np.empty(shape = [1], dtype=float)
        
        for epoch in range(num_epochs):
            sess.run(training_step, feed_dict={
                     x: train_features, y_: train_labels})
            cost = sess.run(cost_function, feed_dict={
                            x: train_features, y_: train_labels})
            cost_history = np.append(cost_history, cost)
            correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
            # tf.Print(y, [y])
            # sess.run(y, feed_dict={x: train_features})
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
            pred_y = sess.run(y, feed_dict={x: test_features})
            mse = tf.reduce_mean(tf.square(pred_y - test_labels))
            mse_history.append(sess.run(mse))
            accuracy = (sess.run(accuracy, feed_dict={
                        x: train_features, y_: train_labels}))
            accuracy_history.append(accuracy)
            print('epoch: ', epoch, ' - cost: ', cost,
                  " - MSE: ", mse, "- Train Accuracy: ", accuracy)
        save_path = saver.save(sess, model_path)
        print(" Model Saved in file: {}".format(save_path))
        pred_y = sess.run(y, feed_dict={x: test_features})
        mse = tf.reduce_mean(tf.square(pred_y - test_labels))
        print("MSE : {}".format(sess.run(mse)))

    @staticmethod
    def multilayer_perceptron(x, weights, biases):

        layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
        layer_1 = tf.nn.relu(layer_1)

        layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
        layer_2 = tf.nn.sigmoid(layer_2)

        layer_3 = tf.add(tf.matmul(layer_2, weights['h3']), biases['b3'])
        layer_3 = tf.nn.sigmoid(layer_3)

        layer_4 = tf.add(tf.matmul(layer_3, weights['h4']), biases['b4'])
        layer_4 = tf.nn.relu(layer_4)

        out_layer = tf.matmul(layer_4, weights['out']) + biases['out']
        print ("Hello")
        return out_layer

    def run(self):
        self.make_data()
        self.train()
        

In [21]:
# em = getEmojis("resultdata.json")
w2vec = word2vec("train.txt.text")
# em.run()
w2vec.run()
# labels = em.emoji_labels

In [11]:
mlp = MLP(labels, w2vec.tweet2vec)
# print(mlp.labels)
mlp.run()

  y = column_or_1d(y, warn=True)


(16000, 300)
(16000, 20)
20
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Hello
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

epoch:  0  - cost:  65.73263  - MSE:  Tensor("Mean_2:0", shape=(), dtype=float64) - Train Accuracy:  0.0253125
epoch:  1  - cost:  102.848656  - MSE:  Tensor("Mean_4:0", shape=(), dtype=float64) - Train Accuracy:  0.2149375
epoch:  2  - cost:  135.37259  - MSE:  Tensor("Mean_6:0", shape=(), dtype=float64) - Train Accuracy:  0.104125
epoch:  3  - cost:  26.358427  - MSE:  Tensor("Mean_8:0", shape=(), dtype=float64) - Train Accuracy:  0.1011875
epoch:  4  - cost:  11.80418  - MSE:  Tensor("Mean_10:0", shape=(), dtype=float64) - Train Accuracy:  0.0446875
epoch:  5  - cost:  4.1159706  - MSE:  Tensor("Mean_12:0", shape=(), dtype=float64) - Train Accuracy:  0.0456875
epoch:  6  - cost:  3.29377  

epoch:  67  - cost:  2.8534403  - MSE:  Tensor("Mean_136:0", shape=(), dtype=float64) - Train Accuracy:  0.2149375
epoch:  68  - cost:  2.8503525  - MSE:  Tensor("Mean_138:0", shape=(), dtype=float64) - Train Accuracy:  0.2149375
epoch:  69  - cost:  2.8473477  - MSE:  Tensor("Mean_140:0", shape=(), dtype=float64) - Train Accuracy:  0.2149375
epoch:  70  - cost:  2.8444107  - MSE:  Tensor("Mean_142:0", shape=(), dtype=float64) - Train Accuracy:  0.2149375
epoch:  71  - cost:  2.841547  - MSE:  Tensor("Mean_144:0", shape=(), dtype=float64) - Train Accuracy:  0.2149375
epoch:  72  - cost:  2.8387105  - MSE:  Tensor("Mean_146:0", shape=(), dtype=float64) - Train Accuracy:  0.2149375
epoch:  73  - cost:  2.8360074  - MSE:  Tensor("Mean_148:0", shape=(), dtype=float64) - Train Accuracy:  0.2149375
epoch:  74  - cost:  2.8333135  - MSE:  Tensor("Mean_150:0", shape=(), dtype=float64) - Train Accuracy:  0.2149375
epoch:  75  - cost:  2.8306894  - MSE:  Tensor("Mean_152:0", shape=(), dtype=floa

In [22]:
text = open('train.txt.text')
label = open('train.txt.labels')
tweets = []
labels = []
for line in text:
    tweets.append(line)
for lab in label:
    labels.append(lab)
labels = list(map(int, labels))
text.close()
label.close()

In [10]:
# tweets[21342]

'For the next month keep pancakes away from me #XiHop #txstaxid @ Texas State Alpha Xi Delta \n'

In [23]:
features = []
# print (type(features))
for counter, tweet in enumerate(tweets):
    avg_vec = np.zeros(w2vec.tweet2vec.wv.vector_size)
    for word in tweet:
        if word not in w2vec.tweet2vec.wv.vocab or word in emoji:
            continue
        avg_vec = np.add(avg_vec, w2vec.tweet2vec.wv[word])
    features.append(np.true_divide(avg_vec, len(tweet)))
features = np.asarray(features)

In [24]:
print(features.shape)
from sklearn import svm
# encoder = LabelEncoder()
# encoded_labels = encoder.fit_transform(labels)
train_features, train_labels = shuffle(features, labels, random_state = 1)
# train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2, random_state=42)
test_features = []
test_labels = []
text = open('us_test.text')
label = open('us_test.labels')
for line in text:
    test_features.append(line)
for line in labels:
    test_labels.append(line)
test_labels = list(map(int, labels))

(77360, 300)


In [33]:
from keras.utils import np_utils
model = Sequential()
# Dense(64) is a fully-connected layer with 64 hidden units.
# in the first layer, you must specify the expected input data shape:
# here, 20-dimensional vectors.
model.add(Dense(400, activation='relu', input_dim=300))
model.add(Dropout(40.5))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(20, activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
              optimizer=sgd,
              metrics=['accuracy'])
y_train = np_utils.to_categorical(train_labels, 20)
y_test = np_utils.to_categorical(test_labels, 20)
model.fit(train_features, y_train,
          epochs=40,
          batch_size=128)
score = model.evaluate(test_features, y_test, batch_size=128)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [35]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=100)
neigh.fit(train_features, train_labels)
pred = neigh.predict(test_features)


In [36]:
from sklearn.metrics import accuracy_score
acc =accuracy_score(test_labels, pred)


In [37]:
acc

0.234552740434333