In [1]:
import numpy as np
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from helpers import *

# Contruct the word embeddings using GloVe algorithm

In [27]:
#!/usr/bin/env python3
from scipy.sparse import *
import numpy as np
import pickle
import random


print("loading cooccurrence matrix")
with open('cooc.pkl', 'rb') as f:
    cooc = pickle.load(f)
print("{} nonzero entries".format(cooc.nnz))

nmax = 100
print("using nmax =", nmax, ", cooc.max() =", cooc.max())

print("initializing embeddings")
embedding_dim = 250
xs = np.random.normal(size=(cooc.shape[0], embedding_dim))
ys = np.random.normal(size=(cooc.shape[1], embedding_dim))

eta = 0.001
alpha = 3 / 4

epochs = 15

for epoch in range(epochs):
    print("epoch {}".format(epoch))
    for ix, jy, n in zip(cooc.row, cooc.col, cooc.data):
        logn = np.log(n)
        fn = min(1.0, (n / nmax) ** alpha)
        x, y = xs[ix, :], ys[jy, :]
        scale = 2 * eta * fn * (logn - np.dot(x, y))
        xs[ix, :] += scale * y
        ys[jy, :] += scale * x
np.save('embeddings_sam', xs)



loading cooccurrence matrix
6496907 nonzero entries
using nmax = 100 , cooc.max() = 207302
initializing embeddings
epoch 0
epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6
epoch 7
epoch 8
epoch 9
epoch 10
epoch 11
epoch 12
epoch 13
epoch 14


In [28]:
word_embeddings = np.load("embeddings_sam.npy")
word_embeddings.shape

(21161, 250)

Now we have a representation for each word as a vector of dimension 20.

We will load each tweet and construct a representation for this tweet as a mean of the vector of each word contained in the tweet.

In [29]:
words_list = []
f = open("vocab_cut.txt", "r")
for l in f.readlines():
    l = l.strip()
    words_list.append(l)
words_list = np.array(words_list)
f.close()

In [41]:
np.argmax(words_list=="cucumber")

12404

# Remove common small words

In [42]:
common_words = ["a", "the", "of", "he", "she", "I", "we", "you", "they", "to", "out", "in",\
                "oi", "it", "this", "that", "these", "there"]

In [None]:
tweets_pos_txt = []
f = open("Datasets/twitter-datasets/train_pos.txt")
for l in f.readlines():
    tweets_pos_txt.append(l.strip())
tweets_pos_txt = np.array(tweets_pos_txt)
f.close()
tweets_pos_vec = []
for tw in tweets_pos_txt:
    words_in_tweet = tw.split(" ")
    acc = np.zeros(word_embeddings[0].shape[0])
    n = 0
    for w in words_in_tweet:
        if w not in common_words:
            vec = word_embeddings[np.argmax(words_list==w)]
            acc += vec
            n += 1
    acc = 1.0*acc/n
    tweets_pos_vec.append(acc)
tweets_pos_vec = np.array(tweets_pos_vec)

In [None]:
tweets_neg_txt = []
f = open("Datasets/twitter-datasets/train_neg.txt")
for l in f.readlines():
    tweets_neg_txt.append(l.strip())
tweets_neg_txt = np.array(tweets_neg_txt)
f.close()
tweets_neg_vec = []
for tw in tweets_neg_txt:
    words_in_tweet = tw.split(" ")
    acc = np.zeros(word_embeddings[0].shape[0])
    n = 0
    for w in words_in_tweet:
        if w not in common_words:
            vec = word_embeddings[np.argmax(words_list==w)]
            acc += vec
            n += 1
    acc = 1.0*acc/n
    tweets_neg_vec.append(acc)
tweets_neg_vec = np.array(tweets_neg_vec)

# Save the vectors to gain time

In [None]:
np.save('tweets_pos_vec_sam', tweets_pos_vec)
np.save('tweets_neg_vec_sam', tweets_neg_vec)

In [None]:
tweets_pos_vec = np.load('tweets_pos_vec_sam.npy')
tweets_neg_vec = np.load('tweets_neg_vec_sam.npy')

In [None]:
X = np.concatenate((tweets_pos_vec, tweets_neg_vec))
y = np.concatenate((np.ones((tweets_pos_vec.shape[0])), -1*np.ones((tweets_neg_vec.shape[0]))))

#X, y = remove_duplicated_tweets(X, y)

indices = np.random.permutation(len(y))

X = X[indices]
y = y[indices]



In [None]:
np.save("tweets_X_sam", X)
np.save("tweets_y_sam", y)

# Load data from files (generated by code above) EXECUTE FROM THERE TO TEST

In [5]:
X = np.load("tweets_X.npy")
y = np.load("tweets_y.npy")

In [6]:
j = 0.9

train_X = X[0:int(j*len(y))]
train_y = y[0:int(j*len(y))]
test_X = X[int(j*len(y)):]
test_y = y[int(j*len(y)):]

train_y[train_y == -1.] = 0.
test_y[test_y == -1.] = 0.

# Try with simple SVM

In [7]:
def accuracy(y1, y2):
    return np.mean(y1==y2)
def normalize(X):
    means = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    return means, std, (X-means)/std

In [8]:
lsvc = svm.LinearSVC(max_iter=10000)
lsvc.fit(train_X, train_y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [9]:
accuracy(test_y, lsvc.predict(test_X))

0.60695

In [10]:
sgdcl = SGDClassifier()
sgdcl.fit(train_X, train_y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [11]:
y_pred = sgdcl.predict(test_X)
accuracy(test_y, y_pred)

0.61145

In [12]:
lr = LogisticRegression()
lr.fit(train_X, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
y_pred = lr.predict(test_X)
accuracy(test_y, y_pred)

0.60645

# Let's try with MLP

In [14]:
mlp = MLPClassifier(hidden_layer_sizes=(200,), max_iter=1000)
mlp.fit(train_X, train_y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(200,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=1000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [15]:
y_pred = mlp.predict(test_X)
accuracy(test_y, y_pred)

0.6747

In [21]:
mlp = MLPClassifier(hidden_layer_sizes=(400,200,100), max_iter=1000, activation='relu')
mlp.fit(train_X, train_y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(400, 200, 100), learning_rate='constant',
              learning_rate_init=0.001, max_iter=1000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [22]:
y_pred = mlp.predict(test_X)
accuracy(test_y, y_pred)

0.67835

# Try with combination

In [18]:
def combine_output(y1, y2):
    """
    take +1 if at least one of the classifier outputs +1
    """
    y = y1+y2
    y[y>0] = 1.
    return y

In [19]:
y_final = combine_output(combine_output(mlp.predict(test_X), lsvc.predict(test_X)), lr.predict(test_X))

In [20]:
accuracy(y_final, test_y)

0.63785