In [2]:
import numpy as np
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from helpers import *
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors

# Construct the word embeddings using the GloVe algorithm

In [None]:
#!/usr/bin/env python3
from scipy.sparse import *
import numpy as np
import pickle
import random


# Load the pickled co-occurrence matrix. The training loop reads its
# .row / .col / .data attributes, so it is presumably a scipy.sparse COO
# matrix — TODO confirm against the script that wrote cooc.pkl.
# NOTE(review): pickle.load can execute arbitrary code; only load a file you produced yourself.
print("loading cooccurrence matrix")
with open('cooc.pkl', 'rb') as f:
    cooc = pickle.load(f)
print("{} nonzero entries".format(cooc.nnz))

# Count at which the weighting fn below saturates at 1.0.
nmax = 100
print("using nmax =", nmax, ", cooc.max() =", cooc.max())

# One random-normal vector per row index (xs) and one per column index (ys).
# No random seed is set, so the initialisation differs between runs.
print("initializing embeddings")
embedding_dim = 250
xs = np.random.normal(size=(cooc.shape[0], embedding_dim))
ys = np.random.normal(size=(cooc.shape[1], embedding_dim))

# eta scales every update step; alpha is the exponent of the weighting fn.
eta = 0.001
alpha = 3 / 4

epochs = 15

# For every stored entry (ix, jy, n), move xs[ix] and ys[jy] so that their dot
# product gets closer to log(n), weighted by fn = min(1, (n / nmax) ** alpha).
for epoch in range(epochs):
    print("epoch {}".format(epoch))
    for ix, jy, n in zip(cooc.row, cooc.col, cooc.data):
        logn = np.log(n)
        fn = min(1.0, (n / nmax) ** alpha)
        # x and y are views into xs / ys (basic indexing), not copies.
        x, y = xs[ix, :], ys[jy, :]
        scale = 2 * eta * fn * (logn - np.dot(x, y))
        xs[ix, :] += scale * y
        # x aliases xs[ix, :], so this update sees the row as modified on the line above.
        ys[jy, :] += scale * x
# Only xs is persisted; ys is discarded. np.save appends ".npy" to the name.
np.save('embeddings_sam', xs)



In [3]:
# Load the embedding matrix written by the GloVe training cell and display its shape.
word_embeddings = np.load("embeddings_sam.npy")
word_embeddings.shape

(21161, 250)

Now we have a representation for each word as a vector of dimension 250.

We will load each tweet and represent it as the mean of the embedding vectors of the words it contains.

In [9]:
# Read the vocabulary, one token per line, stripping surrounding whitespace.
# Presumably line i corresponds to row i of word_embeddings — TODO confirm.
# The context manager guarantees the file is closed even if reading fails.
with open("vocab_cut.txt", "r") as vocab_file:
    words_list = np.array([line.strip() for line in vocab_file])
# Persist the vocabulary array so it can be reloaded without re-parsing the text file.
np.save("words_list_small_dataset.npy", words_list)

In [5]:
# Position of "cucumber" in the vocabulary.
# NOTE: np.argmax over an all-False mask returns 0, so a word that is absent
# from words_list would also yield 0 here.
np.argmax(words_list=="cucumber")

12404

# Remove common small words

In [None]:
# Frequent short words and punctuation tokens (candidates for removal, per the heading above).
# NOTE(review): no visible cell uses this list — confirm whether the filtering step is missing.
common_words = ["a", "the", "of", "he", "she", "I", "we", "you", "they", "to", "out", "in",\
                "oi", "it", "this", "that", "these", "there", "!", ".", "?", "...", "(", ")", "*"]

In [6]:
# Load the positive tweets, one tweet per line; the context manager closes the
# file even if reading fails.
with open("Datasets/twitter-datasets/train_pos_clean.txt") as f:
    tweets_pos_txt = np.array([l.strip() for l in f])

# Map every vocabulary word to its row in word_embeddings once, instead of
# scanning the whole vocabulary for every token. setdefault keeps the first
# occurrence of a duplicated word, matching what np.argmax would select.
word_to_index = {}
for idx, word in enumerate(words_list):
    word_to_index.setdefault(str(word), idx)

# Represent each tweet as the mean embedding of its in-vocabulary words.
tweets_pos_vec = []
for tw in tweets_pos_txt:
    acc = np.zeros(word_embeddings.shape[1])
    n = 0
    for w in tw.split(" "):
        idx = word_to_index.get(w)
        if idx is None:
            # Out-of-vocabulary token: skip it. np.argmax(words_list == w)
            # would return 0 here and silently use the embedding of word 0.
            continue
        acc += word_embeddings[idx]
        n += 1
    if n > 0:
        acc /= n
    # A tweet without any known word keeps the all-zero vector.
    tweets_pos_vec.append(acc)
tweets_pos_vec = np.array(tweets_pos_vec)

KeyboardInterrupt: 

In [0]:
# Load the negative tweets, one tweet per line; the context manager closes the
# file even if reading fails.
with open("Datasets/twitter-datasets/train_neg_clean.txt") as f:
    tweets_neg_txt = np.array([l.strip() for l in f])

# Map every vocabulary word to its row in word_embeddings once, instead of
# scanning the whole vocabulary for every token. setdefault keeps the first
# occurrence of a duplicated word, matching what np.argmax would select.
word_to_index = {}
for idx, word in enumerate(words_list):
    word_to_index.setdefault(str(word), idx)

# Represent each tweet as the mean embedding of its in-vocabulary words.
tweets_neg_vec = []
for tw in tweets_neg_txt:
    acc = np.zeros(word_embeddings.shape[1])
    n = 0
    for w in tw.split(" "):
        idx = word_to_index.get(w)
        if idx is None:
            # Out-of-vocabulary token: skip it. np.argmax(words_list == w)
            # would return 0 here and silently use the embedding of word 0.
            continue
        acc += word_embeddings[idx]
        n += 1
    if n > 0:
        acc /= n
    # A tweet without any known word keeps the all-zero vector.
    tweets_neg_vec.append(acc)
tweets_neg_vec = np.array(tweets_neg_vec)

# Save the tweet vectors to save time on later runs

In [None]:
# Cache the per-tweet mean vectors on disk; np.save appends ".npy" to each name.
np.save('tweets_pos_vec_sam', tweets_pos_vec)
np.save('tweets_neg_vec_sam', tweets_neg_vec)

In [7]:
# Reload the cached tweet vectors so the slow vectorisation cells can be skipped.
tweets_pos_vec = np.load('tweets_pos_vec_sam.npy')
tweets_neg_vec = np.load('tweets_neg_vec_sam.npy')

In [8]:
# Sanity check: (number of positive tweets, embedding dimension).
tweets_pos_vec.shape

(100000, 250)

In [None]:
# Stack positive and negative tweet vectors into one feature matrix, with
# labels +1 for positive and -1 for negative tweets. Each label block is sized
# from its own array so X and y stay aligned even if the two classes contain a
# different number of tweets.
X = np.concatenate((tweets_pos_vec, tweets_neg_vec))
y = np.concatenate((np.ones(tweets_pos_vec.shape[0]), -1*np.ones(tweets_neg_vec.shape[0])))

#X, y = remove_duplicated_tweets(X, y)

# Shuffle samples and labels with the same permutation so that the later
# positional train/test split mixes both classes. The permutation is unseeded,
# so the split differs between runs.
indices = np.random.permutation(len(y))

X = X[indices]
y = y[indices]



In [None]:
# Persist the shuffled dataset; np.save appends ".npy", producing
# tweets_X_sam.npy and tweets_y_sam.npy.
np.save("tweets_X_sam", X)
np.save("tweets_y_sam", y)

# Load data from files (generated by the code above). EXECUTE FROM HERE TO TEST

In [4]:
# Load the shuffled dataset written by the save cell above. That cell writes
# tweets_X_sam.npy / tweets_y_sam.npy, so those are the names read here;
# tweets_X.npy / tweets_y.npy are not produced anywhere in this notebook.
X = np.load("tweets_X_sam.npy")
y = np.load("tweets_y_sam.npy")

In [5]:
# Fraction of the samples used for training; the remainder is held out for testing.
j = 0.9

# Position of the first held-out sample.
split_at = int(j * len(y))

train_X, test_X = X[:split_at], X[split_at:]
train_y, test_y = y[:split_at], y[split_at:]

# Recode the negative class from -1 to 0. The label arrays are slices of y,
# so the recoding is written through to y as well.
for labels in (train_y, test_y):
    labels[labels == -1.] = 0.

# Try with simple SVM

In [5]:
def accuracy(y1, y2):
    """Return the fraction of positions at which y1 and y2 hold equal labels.

    Both arguments may be NumPy arrays or plain sequences. They are converted
    to arrays first so that `==` compares element-wise; two Python lists would
    otherwise be compared as a whole, yielding only 0.0 or 1.0.
    """
    y1 = np.asarray(y1)
    y2 = np.asarray(y2)
    return np.mean(y1 == y2)
def normalize(X):
    """Standardise the columns of X to zero mean and unit standard deviation.

    Returns a tuple (means, std, X_normalized). means and std are computed
    along axis 0 and can be reused to apply the same transform to other data.

    A constant column has a standard deviation of 0; dividing by it would
    produce NaN/inf. Such entries of std are replaced by 1.0 (also in the
    returned std), so a constant column is mapped to all zeros.
    """
    X = np.asarray(X)
    means = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    std = np.where(std == 0, 1.0, std)
    return means, std, (X - means) / std

In [None]:
# Linear support-vector classifier fitted on the training split, with the
# iteration cap set to 10000.
lsvc = svm.LinearSVC(max_iter=10000)
lsvc.fit(train_X, train_y)

In [None]:
# Held-out accuracy of the linear SVM.
accuracy(test_y, lsvc.predict(test_X))

In [None]:
# Linear classifier trained with stochastic gradient descent, default settings.
# No random_state is passed, so the fitted model can differ between runs.
sgdcl = SGDClassifier()
sgdcl.fit(train_X, train_y)

In [None]:
# Held-out accuracy of the SGD classifier.
y_pred = sgdcl.predict(test_X)
accuracy(test_y, y_pred)

In [None]:
# Logistic regression with default settings, fitted on the training split.
lr = LogisticRegression()
lr.fit(train_X, train_y)

In [None]:
# Held-out accuracy of the logistic regression.
y_pred = lr.predict(test_X)
accuracy(test_y, y_pred)

# Let's try with MLP

In [6]:
# Multi-layer perceptron with two hidden layers of 250 units each and an
# iteration cap of 1000. No random_state is passed, so results can differ between runs.
mlp = MLPClassifier(hidden_layer_sizes=(250, 250), max_iter=1000)
mlp.fit(train_X, train_y)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(250, 250), learning_rate='constant',
              learning_rate_init=0.001, max_iter=1000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [0]:
# Held-out accuracy of the MLP currently bound to `mlp`.
y_pred = mlp.predict(test_X)
accuracy(test_y, y_pred)

In [None]:
# Deeper MLP with hidden layers of 400, 200 and 100 units.
# NOTE: this rebinds `mlp`, so the two-layer model fitted earlier is no longer reachable.
mlp = MLPClassifier(hidden_layer_sizes=(400,200,100), max_iter=1000, activation='relu')
mlp.fit(train_X, train_y)

In [None]:
# Held-out accuracy of the MLP currently bound to `mlp`.
y_pred = mlp.predict(test_X)
accuracy(test_y, y_pred)

## Random forest

In [None]:
# Random forest of 100 trees, each limited to depth 3; random_state=0 fixes the seed.
clf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)

In [None]:
# Fit the random forest on the training split.
clf.fit(train_X, train_y)

In [None]:
# Held-out accuracy of the random forest.
y_pred = clf.predict(test_X)
accuracy(test_y, y_pred)

## KNN

In [6]:
# k-nearest-neighbours classifier: 15 neighbours, each with an equal vote.
n_neighbors = 15
neigh = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')

In [7]:
# Fit the k-NN classifier on the training split.
neigh.fit(train_X, train_y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')

In [8]:
# Held-out accuracy of the k-NN classifier.
y_pred = neigh.predict(test_X)
accuracy(test_y, y_pred)

# Try with combination

In [None]:
def combine_output(y1, y2):
    """Combine two binary prediction vectors with a logical OR.

    An element of the result is the positive label (+1) if at least one of the
    two classifiers outputs +1 at that position; otherwise it is the negative
    label.

    The element-wise maximum implements this for both the 0/1 and the -1/+1
    label encodings. Summing and thresholding would give 0 or -2 for -1/+1
    labels, which are not valid labels. Inputs may be arrays or sequences and
    are not modified; a new array is returned.
    """
    return np.maximum(np.asarray(y1), np.asarray(y2))

In [None]:
# OR-combine the test-set predictions of the MLP, the linear SVM and the
# logistic regression. `mlp` is whichever MLP was fitted last.
y_final = combine_output(combine_output(mlp.predict(test_X), lsvc.predict(test_X)), lr.predict(test_X))

In [None]:
# Held-out accuracy of the combined prediction.
accuracy(y_final, test_y)