In [25]:
import numpy as np
from sklearn import svm
from sklearn.linear_model import SGDClassifier

# Contruct the word embeddings using GloVe algorithm

In [None]:
#!/usr/bin/env python3
from scipy.sparse import *
import numpy as np
import pickle
import random


print("loading cooccurrence matrix")
with open('cooc.pkl', 'rb') as f:
    cooc = pickle.load(f)
print("{} nonzero entries".format(cooc.nnz))

nmax = 100
print("using nmax =", nmax, ", cooc.max() =", cooc.max())

print("initializing embeddings")
embedding_dim = 20
xs = np.random.normal(size=(cooc.shape[0], embedding_dim))
ys = np.random.normal(size=(cooc.shape[1], embedding_dim))

eta = 0.001
alpha = 3 / 4

epochs = 10

for epoch in range(epochs):
    print("epoch {}".format(epoch))
    for ix, jy, n in zip(cooc.row, cooc.col, cooc.data):
        logn = np.log(n)
        fn = min(1.0, (n / nmax) ** alpha)
        x, y = xs[ix, :], ys[jy, :]
        scale = 2 * eta * fn * (logn - np.dot(x, y))
        xs[ix, :] += scale * y
        ys[jy, :] += scale * x
np.save('embeddings', xs)



In [None]:
word_embeddings = np.load("embeddings.npy")
word_embeddings.shape

Now we have a representation for each word as a vector of dimension 20.

We will load each tweet and construct a representation for this tweet as a mean of the vector of each word contained in the tweet.

In [None]:
words_list = []
f = open("vocab_cut.txt", "r")
for l in f.readlines():
    l = l.strip()
    words_list.append(l)
words_list = np.array(words_list)
f.close()

In [None]:
np.argmax(words_list=="paul")

In [None]:
tweets_pos_txt = []
f = open("Datasets/twitter-datasets/train_pos.txt")
for l in f.readlines():
    tweets_pos_txt.append(l.strip())
tweets_pos_txt = np.array(tweets_pos_txt)
f.close()
tweets_pos_vec = []
for tw in tweets_pos_txt:
    words_in_tweet = tw.split(" ")
    acc = np.zeros(word_embeddings[0].shape[0])
    for w in words_in_tweet:
        vec = word_embeddings[np.argmax(words_list==w)]
        acc += vec
    acc = acc/len(words_in_tweet)
    tweets_pos_vec.append(acc)
tweets_pos_vec = np.array(tweets_pos_vec)

In [None]:
tweets_neg_txt = []
f = open("Datasets/twitter-datasets/train_neg.txt")
for l in f.readlines():
    tweets_neg_txt.append(l.strip())
tweets_neg_txt = np.array(tweets_neg_txt)
f.close()
tweets_neg_vec = []
for tw in tweets_neg_txt:
    words_in_tweet = tw.split(" ")
    acc = np.zeros(word_embeddings[0].shape[0])
    for w in words_in_tweet:
        vec = word_embeddings[np.argmax(words_list==w)]
        acc += vec
    acc = acc/len(words_in_tweet)
    tweets_neg_vec.append(acc)
tweets_neg_vec = np.array(tweets_neg_vec)

# Save the vectors to gain time

In [None]:
np.save('tweets_pos_vec', tweets_pos_vec)
np.save('tweets_neg_vec', tweets_neg_vec)

# Load data from files (generated by code above) EXECUTE FROM THERE TO TEST

In [2]:
tweets_pos_vec = np.load('tweets_pos_vec.npy')
tweets_neg_vec = np.load('tweets_neg_vec.npy')

In [44]:
X = np.concatenate((tweets_pos_vec, tweets_neg_vec))
y = np.concatenate((np.ones((tweets_pos_vec.shape[0])), -1*np.ones((tweets_pos_vec.shape[0]))))

indices = np.random.permutation(len(y))

X = X[indices]
y = y[indices]

j = 0.9

train_X = X[0:int(j*len(y))]
train_y = y[0:int(j*len(y))]
test_X = X[int(j*len(y)):]
test_y = y[int(j*len(y)):]

train_y[train_y == -1.] = 0.
test_y[test_y == -1.] = 0.

# Try with simple SVM

In [49]:
def accuracy(y1, y2):
    return np.mean(y1==y2)
def normalize(X):
    means = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    return means, std, (X-means)/std

In [60]:
lsvc = svm.LinearSVC(max_iter=10000)
lsvc.fit(train_X, train_y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [61]:
accuracy(test_y, lsvc.predict(test_X))

0.5937

In [62]:
sgdcl = SGDClassifier()
sgdcl.fit(train_X, train_y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [63]:
y_pred = sgdcl.predict(test_X)
accuracy(test_y, y_pred)

0.5944