# Introduction and instruction to run

This is the notebook used to train the neural networks on Google Colab. To run it, you need to install TensorFlow. Note that loading the Stanford GloVe embedding takes around 6 hours and computing the vectors for the full dataset takes around 10 hours; this is why intermediate NumPy arrays are saved to disk as checkpoints.

To run the notebook, you need data that was too big to be included with the rest of the submission. We therefore uploaded it to our Google Drive; here is the download link:

https://drive.google.com/drive/folders/1r8tVqsL2PJ8VaUk7AHsEbeiwoqkxO3Qc?usp=sharing


You need to extract the two folders contained in the archive ('data' and 'glove_from_stanford') next to this notebook file.

To prove that we have not modified the archive since the deadline, here is the MD5 hash of the .zip file:

38c149f74806e5c65288825267ecd12a



# Helpers

In [None]:
import numpy as np
import os

def words_list(file_name):
    """Read a text file and return its lines as a NumPy array of strings.

    Parameters
    ----------
    file_name : str
        Path of the text file to read (opened in text mode).

    Returns
    -------
    numpy.ndarray
        One entry per line of the file, with leading and trailing
        whitespace (including the newline) stripped.
    """
    # The context manager closes the handle even if reading raises.
    with open(file_name, "r") as f:
        return np.array([line.strip() for line in f])

def tweets_txt(file_name):
    """Read a file containing one tweet per line.

    Parameters
    ----------
    file_name : str
        Path of the text file to read (opened in text mode).

    Returns
    -------
    numpy.ndarray
        One string per line of the file, stripped of leading and trailing
        whitespace (including the newline).
    """
    # The context manager closes the handle even if reading raises.
    with open(file_name, "r") as f:
        return np.array([line.strip() for line in f])

def tweet_means(tweets_txt, word_embeddings, words_list, embedding_size):
    """Represent each tweet by the mean embedding of its in-vocabulary words.

    Parameters
    ----------
    tweets_txt : iterable of str
        Tweets, one string each; words are separated by single spaces.
    word_embeddings : numpy.ndarray
        Embedding matrix; row ``k`` is the vector of ``words_list[k]``.
    words_list : sequence of str
        Vocabulary aligned with the rows of ``word_embeddings``.
    embedding_size : int
        Length of one embedding vector.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number of tweets, embedding_size)``. Row ``i`` is
        the average of the vectors of the words of tweet ``i`` that are in
        ``words_list``. A tweet with no known word gets the zero vector.

    Notes
    -----
    Words missing from ``words_list`` are skipped. Looking them up with
    ``np.argmax(words_list == w)`` would return index 0 for an unknown word
    and silently add the vector of the first vocabulary word.
    """
    word_embeddings = np.asarray(word_embeddings)

    # Build the word -> row index once instead of scanning the whole
    # vocabulary for every word. setdefault keeps the first occurrence of a
    # duplicated word, which is what np.argmax would have selected.
    word_to_row = {}
    for row, word in enumerate(words_list):
        word_to_row.setdefault(str(word), row)

    tweets_vec = []
    for i, tw in enumerate(tweets_txt):
        rows = [word_to_row[w] for w in tw.split(" ") if w in word_to_row]
        # Accumulate into a buffer of the declared size so that a mismatch
        # with the real embedding dimension raises instead of passing silently.
        acc = np.zeros(embedding_size)
        if rows:
            acc += word_embeddings[rows].sum(axis=0)
            acc = acc/len(rows)
        tweets_vec.append(acc)
        if i%1000 == 0:
            print(i, " done")

    # The explicit reshape only matters for an empty input, which would
    # otherwise come back with shape (0,) instead of (0, embedding_size).
    return np.array(tweets_vec).reshape(len(tweets_vec), embedding_size)

# Imports for Colab

In [None]:
import tensorflow as tf
import numpy as np
# Ask TensorFlow for the name of the GPU device and print a warning when it is
# not '/device:GPU:0'. Nothing is aborted: the check only prints a message.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print('No GPU Found')

# Load GloVe embedding from Stanford

In [None]:
# Parse the pre-trained GloVe Twitter vectors. Every line is expected to be
# "<word> <v1> ... <v200>"; lines whose vector does not have exactly 200
# components are reported and skipped.
words = []
embeddings = []

# Iterate over the file object directly (instead of readlines()) so the whole
# file is never held in memory at once, and let the context manager close it.
with open("glove_from_stanford/glove.twitter.27B.200d.txt", "r") as f:
    for i, l in enumerate(f):
        li = l.split()
        if not li:
            # Blank line: there is no word to read, so skip it.
            continue
        w = li[0]
        vec = np.array([float(e) for e in li[1:]])
        if i%10000 == 0:
            print("done: ", i )
        if vec.shape[0] == 200:
            words.append(w)
            embeddings.append(vec)
        else:
            print(w, " was not the right shape. The shape was: ", vec.shape)

In [None]:
# Stack the per-word vectors collected in `embeddings` along a new first axis,
# giving one array with one row per word.
embedding_stacked = np.stack(embeddings, axis=0)

In [None]:
# Convert the collected words into a NumPy array (row k of the stacked
# embedding corresponds to words[k], since both were appended together).
words = np.array(words)

In [None]:
# Checkpoint: write the embedding matrix and its word list to disk so the
# parsing step does not have to be repeated.
np.save("embedding_stanford.npy", embedding_stacked)
np.save("words_stanford.npy", words)

# Reduce stanford embedding for this particular dataset

In [None]:
# Read the dataset vocabulary (one word per line) and checkpoint it as .npy.
# Only the newline is removed from each line. rstrip("\n") is used instead of
# l[:-1] so the last word is not truncated when the file does not end with a
# newline character.
with open("data/vocab_cut_clean_kim_yoon.txt", 'r') as f:
    words = np.array([l.rstrip("\n") for l in f])
np.save("words_full_list_clean_kim_yoon.npy", words)

In [None]:
# Reload the checkpoints written by the previous cells: the Stanford embedding
# matrix, its word list, and the vocabulary of this dataset.
embedding_stanford = np.load("embedding_stanford.npy")
word_list_stanford = np.load("words_stanford.npy")

words_list_full_dataset = np.load("words_full_list_clean_kim_yoon.npy")


In [None]:
# Boolean mask over the Stanford word list: True where the word also appears
# in the dataset vocabulary.
words_needed = np.isin(word_list_stanford, words_list_full_dataset)

In [None]:
# Positions (along the first axis) of the True entries of the mask, i.e. the
# rows of the Stanford embedding that must be kept.
word_wanted_indices = np.where(words_needed)[0]

In [None]:
# Keep only the selected rows; the same indices are applied to the matrix and
# to the word list so that they stay aligned.
reduced_embedding_stanford = embedding_stanford[word_wanted_indices]
reduced_words_stanford = word_list_stanford[word_wanted_indices]

# Compute vectors for tweets FULL dataset

In [None]:
# Select the embedding used to vectorise the tweets; n_features is the number
# of columns of the embedding matrix.
embedding = reduced_embedding_stanford
word_list = reduced_words_stanford
n_features = embedding.shape[1]

In [None]:
# Positive tweets of the full training set: read them with the tweets_txt
# helper (which closes the file), average the word vectors of each tweet and
# checkpoint the result.
tweets_pos_full_txt = tweets_txt("data/train_pos_full_clean_kim_yoon.txt")
tweets_vecs_pos_full = tweet_means(tweets_pos_full_txt, embedding, word_list, n_features)
np.save("tweets_pos_full_clean_stanford.npy", tweets_vecs_pos_full)

In [None]:
# Negative tweets of the full training set: read them with the tweets_txt
# helper (which closes the file), average the word vectors of each tweet and
# checkpoint the result.
tweets_neg_full_txt = tweets_txt("data/train_neg_full_clean_kim_yoon.txt")
tweets_vecs_neg_full = tweet_means(tweets_neg_full_txt, embedding, word_list, n_features)
np.save("tweets_neg_full_clean_stanford.npy", tweets_vecs_neg_full)


# Load saved data FULL (FROM HERE to test models)

In [None]:
# Fixed seed: the shuffle below decides which tweets land in the test split.
# Without a seed, every kernel restart produces a different split, so a model
# reloaded from a checkpoint would be evaluated on tweets it was trained on.
SPLIT_SEED = 42

tweets_pos = np.load("tweets_pos_full_clean_stanford.npy")
tweets_neg = np.load("tweets_neg_full_clean_stanford.npy")

#because of protobuf limit of 2GB
# (cast before shuffling so the permuted copy is already float32)
X = np.vstack((tweets_pos, tweets_neg)).astype(np.float32)
# Labels: 1. for a positive tweet, 0. for a negative tweet
y = np.concatenate((np.ones(tweets_pos.shape[0], dtype=np.float32),
                    np.zeros(tweets_neg.shape[0], dtype=np.float32)))

#free some ram
del tweets_pos, tweets_neg

indices = np.random.default_rng(SPLIT_SEED).permutation(y.shape[0])

X = X[indices]
y = y[indices]

print(X.shape)
print(y.shape)

n_features=X.shape[1]

# Fraction of the shuffled data used for training; the rest is the test split
j = 0.9
split_at = int(j*len(y))

train_X = X[:split_at]
train_y = y[:split_at]
test_X = X[split_at:]
test_y = y[split_at:]

# Work with data GPU

In [None]:
# Fully connected classifier on the averaged tweet vectors:
# 4 x (Dense 1000 + Dropout 0.1), then Dense 500 + Dropout 0.05, then a
# 2-way softmax trained with sparse categorical cross-entropy and SGD.
model = tf.keras.Sequential()
# The input shape is passed as a tuple, the form every Keras version accepts.
model.add(tf.keras.layers.InputLayer((n_features,)))
model.add(tf.keras.layers.Dense(1000, activation='relu'))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(1000, activation='relu'))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(1000, activation='relu'))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(1000, activation='relu'))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(500, activation='relu'))
model.add(tf.keras.layers.Dropout(0.05))
model.add(tf.keras.layers.Dense(2, activation='softmax'))
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
                loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                metrics=['accuracy'])
# Saving fails if the target directory is missing, so create it first.
os.makedirs('models', exist_ok=True)
model.save('models/model_temp.h5')

In [None]:
# Batching / shuffling parameters of the input pipelines
BATCH_SIZE = 32
SHUFFLE_BUFFER_SIZE = 100

# Training pipeline: slice the arrays row by row, shuffle, then batch
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_X, train_y))
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE)
)
# Test pipeline: same slicing and batching, but no shuffling
test_dataset = tf.data.Dataset.from_tensor_slices((test_X, test_y)).batch(BATCH_SIZE)

Here we repeatedly save the model and reload it because the Colab notebook crashed several times; with this technique, we do not lose too much work.

In [None]:
# Epochs trained per round. The same value is passed to fit() and written to
# the progress log, so the log reflects what was really trained (3 epochs per
# round, as before).
n_epochs = 3
n_rounds = 15
# Single checkpoint file: every round saves here, and a new session resumes
# from here.
checkpoint_path = '/content/drive/My Drive/Colab Notebooks/ML-MA1/Project02/models/model_temp-1.h5'

# Resume from the checkpoint when it exists, otherwise start from the local
# copy. Loading and saving must target the same file, or each reload would
# throw away the training done so far.
if os.path.exists(checkpoint_path):
  model = tf.keras.models.load_model(checkpoint_path)
else:
  model = tf.keras.models.load_model('models/model_temp-1.h5')

for _ in range(n_rounds):
  model.fit(train_dataset, epochs=n_epochs)
  # Save first, log second: a crash between the two steps then never leaves
  # the log claiming epochs that are not in the checkpoint.
  model.save(checkpoint_path)
  with open("models/number-of-it-done-on-model-temp-1.txt", 'a') as f:
    f.write(str(n_epochs) + '\n')
  model.evaluate(test_dataset)

In [None]:
# Reload the checkpoint written by the training loop and evaluate it on the
# held-out test split.
model = tf.keras.models.load_model('/content/drive/My Drive/Colab Notebooks/ML-MA1/Project02/models/model_temp-1.h5')
model.evaluate(test_dataset)

# Load test data and output for AIcrowd

In [None]:
embedding = reduced_embedding_stanford
word_list = reduced_words_stanford
n_features = embedding.shape[1]

# Keep only what follows the first comma of each line (presumably the lines
# look like "<id>,<tweet>" - verify against the data file). A line without a
# comma is kept whole. The context manager closes the file.
with open("data/test_data_clean_kim_yoon.txt") as f:
    tweets_test_txt = np.array([l.strip().split(',', 1)[-1].strip() for l in f])
print("begin means")
tweets_vecs_test = tweet_means(tweets_test_txt, embedding, word_list, n_features)

np.save("tweets_test_clean_stanford.npy", tweets_vecs_test)

# Make CSV

In [None]:
# Predict with the trained checkpoint written by the training loop.
# 'models/model_temp.h5' only holds the freshly compiled, untrained network.
model = tf.keras.models.load_model('/content/drive/My Drive/Colab Notebooks/ML-MA1/Project02/models/model_temp-1.h5')
tweets_vecs_test = np.load("tweets_test_clean_stanford.npy")

# Index of the most probable class for every tweet (0 or 1)
predicted_classes = np.argmax(model.predict(tweets_vecs_test), axis=1)
# The submission uses -1 where the network predicts class 0
predictions = np.where(predicted_classes == 0, -1, predicted_classes)

# make csv
with open("submission.csv", "w") as f:
  f.write("Id,Prediction\n")
  # Ids start at 1 and follow the order of the test vectors
  for tweet_id, prediction in enumerate(predictions, start=1):
    f.write(str(tweet_id) + "," + str(prediction) + "\n")