# Introduction

This notebook was used to train the neural networks on Google Colab. To run it, you will need to install TensorFlow. Moreover, loading the Stanford embedding takes around 6 hours and computing the vectors for the full dataset takes around 10 hours. This is why some NumPy arrays are saved along the way as checkpoints.

# Helpers

In [ ]:
import numpy as np
import os
from pattern import *

# Informal spellings and the spelling that replaces each of them
spelling_dict = {
    "u": "you",
    "dont": "don't",
    "cant": "can't",
    "r": "are",
    "wont": "won't",
}

# Punctuation, placeholder tokens and very frequent words to remove
common = [
    '"', ',', '.', ')', '(', '-',
    "<url>", "a", "the", "of", "to",
    "it", "this", "that", "these", "there",
]

def words_list(file_name):
    """Read a vocabulary file and return its lines as a NumPy array.

    Each line of the file is stripped of leading/trailing whitespace and
    becomes one element of the result, in file order.

    Args:
        file_name: path of the text file to read.

    Returns:
        A 1-D ``np.ndarray`` holding one stripped string per line.
    """
    # The context manager closes the file even if reading fails, and
    # iterating the handle avoids building the intermediate readlines() list.
    with open(file_name, "r") as f:
        lines = [line.strip() for line in f]
    return np.array(lines)

def tweets_txt(file_name):
    """Read a file of tweets (one per line) and return them as a NumPy array.

    Args:
        file_name: path of the text file to read.

    Returns:
        A 1-D ``np.ndarray`` with one whitespace-stripped tweet per line,
        in file order.
    """
    # The context manager guarantees the handle is released even when
    # reading raises; the file is streamed instead of loaded via readlines().
    with open(file_name, "r") as f:
        return np.array([line.strip() for line in f])

def tweet_means(tweets_txt, word_embeddings, words_list, embedding_size,
                spelling=False, spelling_dict=None,
                negation=False,
                clean=False, common=None):
    """Represent each tweet by the mean embedding of its words.

    Args:
        tweets_txt: iterable of tweets, each a string of space-separated words.
        word_embeddings: 2-D array; row ``k`` is the vector of ``words_list[k]``.
        words_list: sequence of vocabulary words aligned with ``word_embeddings``.
        embedding_size: length of one embedding vector.
        spelling: if true, apply ``transform_spelling`` with ``spelling_dict``.
        spelling_dict: replacement spellings (treated as empty when ``None``).
        negation: if true, apply ``transform_negation``.
        clean: if true, drop the words listed in ``common`` and a leading "!!!".
        common: words to drop when ``clean`` is set (treated as empty when ``None``).

    Returns:
        An array with one row per tweet. A row is the mean of the embeddings
        of the tweet's words that are present in ``words_list``; it is the
        zero vector when none of them is.
    """
    spelling_dict = {} if spelling_dict is None else spelling_dict
    common = [] if common is None else common

    # Build the word -> row lookup once instead of scanning the whole
    # vocabulary for every word. setdefault keeps the first occurrence of a
    # duplicated word, which is the row np.argmax used to select.
    row_of = {}
    for row, word in enumerate(words_list):
        row_of.setdefault(str(word), row)

    tweets_vec = []
    for i, tw in enumerate(tweets_txt):
        words_in_tweet = tw.split(" ")
        if spelling:
            words_in_tweet = transform_spelling(words_in_tweet, spelling_dict)
        if negation:
            words_in_tweet = transform_negation(words_in_tweet)
        if clean:
            # remove_words expects (words_to_remove, tweet) in that order
            words_in_tweet = remove_words(common, words_in_tweet)
            words_in_tweet = remove_exclamation(words_in_tweet)

        acc = np.zeros(embedding_size)
        n_known = 0
        for w in words_in_tweet:
            row = row_of.get(w)
            if row is None:
                # Out-of-vocabulary word: skip it rather than silently adding
                # the vector of the first vocabulary entry.
                continue
            acc += word_embeddings[row]
            n_known += 1
        if n_known:
            # Guarded so a tweet without any known word yields zeros, not NaN
            acc = acc / n_known
        tweets_vec.append(acc)

        if i % 1000 == 0:
            print(i, " done")
    return np.array(tweets_vec)

def remove_duplicated_tweets_txt(tweets):
    """Return the distinct entries of <tweets> along axis 0, as sorted by np.unique."""
    distinct = np.unique(tweets, axis=0)
    return distinct

def remove_duplicated_tweets(X, y):
    """Drop duplicated (features, label) rows.

    The labels <y> are appended to <X> as a last column so that a row counts
    as a duplicate only when both its features and its label match another row.
    Returns the de-duplicated features and labels, in np.unique's sorted order.
    """
    label_column = y.reshape((len(y), 1))
    distinct_rows = np.unique(np.hstack((X, label_column)), axis=0)

    # Split the joined rows back into features and the trailing label column
    return distinct_rows[:, :-1], distinct_rows[:, -1]

def remove_words(words, tweet):
    """ Return the tokens of <tweet>, in order, that do not appear in <words> """
    kept = []
    for token in tweet:
        if token in words:
            continue
        kept.append(token)
    return kept

def remove_exclamation(tweet):
    """ Strip a leading run of three "!" tokens from a tokenized tweet, if present """
    leading = tweet[:3]
    if leading != ['!', '!', '!']:
        return tweet
    return tweet[3:]

def transform_negation(tweet):
    """Split negated verbs into the conjugated verb followed by "not".

    A token is expanded only when it contains "n't" and conjugate() returns
    something different from the token itself; every other token is copied
    unchanged.
        ex: don't -> do not
    """
    expanded = []
    for token in tweet:
        # Nothing to do for tokens without "n't" or that conjugate() leaves as-is
        if "n't" not in token or conjugate(token) == token:
            expanded.append(token)
            continue
        expanded.extend([conjugate(token), "not"])

    return expanded

def transform_spelling(tweet, spelling_dict):
    """ Swap each token of <tweet> for its entry in <spelling_dict>; tokens without an entry are kept """
    respelled = []
    for token in tweet:
        respelled.append(spelling_dict.get(token, token))
    return respelled

# Imports for Colab

In [ ]:
import tensorflow as tf
%tensorflow_version 2.x
import numpy as np
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print('No GPU Found')

# Load GloVe embedding from Stanford

In [ ]:
# Parse the pre-trained GloVe Twitter vectors: every line holds a word
# followed by its vector components, separated by whitespace.
words = []
embeddings = []

# The file is streamed line by line inside a context manager: readlines()
# would load the whole file into memory at once and the handle was never
# closed. GloVe files are UTF-8, so the encoding is stated explicitly.
with open("glove_from_stanford/glove.twitter.27B.200d.txt", "r", encoding="utf-8") as f:
    for i, l in enumerate(f):
        li = l.split()
        if not li:
            # Blank line: there is no word to read
            continue
        w = li[0]
        vec = np.array([float(e) for e in li[1:]])
        if i%10000 == 0:
            print("done: ", i )
        # Keep only entries that carry exactly 200 components
        if vec.shape[0] == 200:
            words.append(w)
            embeddings.append(vec)
        else:
            print(w, " was not the right shape. The shape was: ", vec.shape)

In [ ]:
# Turn the list of per-word vectors into one 2-D matrix (one row per word)
embedding_stacked = np.stack(embeddings)

In [ ]:
# Convert the vocabulary list to a NumPy array so it can be saved with np.save
words = np.array(words)

In [ ]:
# Checkpoint the parsed embedding matrix and its vocabulary on disk
np.save("embedding_stanford.npy", embedding_stacked)
np.save("stanford_words.npy", words)

# Reduce stanford embedding for this particular dataset

In [ ]:
# Reload the checkpointed embedding matrix and its vocabulary. The vocabulary
# file name matches the one written by np.save in this notebook
# ("stanford_words.npy").
embedding_stanford = np.load("embedding_stanford.npy")
word_list_stanford = np.load("stanford_words.npy")

# Words of the cleaned full dataset
words_list_full_dataset = np.load("words_full_list_clean_kim_yoon.npy")


In [ ]:
# Boolean mask over the Stanford vocabulary: True where the word also appears in the dataset word list
words_needed = np.isin(word_list_stanford, words_list_full_dataset)

In [ ]:
# Positions of the Stanford words that the dataset actually uses
word_wanted_indices = np.nonzero(words_needed)[0]

In [ ]:
# Keep only the embedding rows and vocabulary entries selected by word_wanted_indices
reduced_embedding_stanford = embedding_stanford[word_wanted_indices]
reduced_words_stanford = word_list_stanford[word_wanted_indices]

# Compute vectors for tweets FULL dataset

In [ ]:
# Work with the reduced embedding and vocabulary; n_features is the number of embedding columns
embedding = reduced_embedding_stanford
word_list = reduced_words_stanford
n_features = embedding.shape[1]

In [ ]:
# Read the cleaned positive tweets through the tweets_txt helper, which
# strips every line and closes the file (the inline open() never did).
tweets_pos_full_txt = tweets_txt("data/train_pos_full_clean.txt")

# Mean embedding of every tweet, with no spelling/negation/cleaning transform
tweets_vecs_pos_full = tweet_means(tweets_pos_full_txt, embedding, word_list, n_features, False, {}, False, False, [])
np.save("tweets_pos_full_clean_stanford.npy", tweets_vecs_pos_full)

In [ ]:
# Read the cleaned negative tweets through the tweets_txt helper, which
# strips every line and closes the file (the inline open() never did).
tweets_neg_full_txt = tweets_txt("data/train_neg_full_clean.txt")

# Mean embedding of every tweet, with no spelling/negation/cleaning transform
tweets_vecs_neg_full = tweet_means(tweets_neg_full_txt, embedding, word_list, n_features, False, {}, False, False, [])
np.save("tweets_neg_full_clean_stanford.npy", tweets_vecs_neg_full)


# Load saved data FULL (FROM HERE to test models)

In [ ]:

# Load the saved tweet vectors and label them: +1 for positive, -1 for negative
tweets_pos = np.load("tweets_pos_full_clean_stanford.npy")
tweets_neg = np.load("tweets_neg_full_clean_stanford.npy")
n_pos, n_neg = tweets_pos.shape[0], tweets_neg.shape[0]
X = np.vstack((tweets_pos, tweets_neg))
y = np.array([1] * n_pos + [-1] * n_neg)

# Shuffle features and labels with the same random permutation
indices = np.random.permutation(y.shape[0])
X, y = X[indices], y[indices]

print(X.shape)
print(y.shape)

#because of protobuf limit of 2GB
X = X.astype(np.float32)
y = y.astype(np.float32)

n_features = X.shape[1]

# Fraction of the samples used for training; the remainder is held out
j = 0.9
split = int(j*len(y))

train_X, test_X = X[:split], X[split:]
train_y, test_y = y[:split], y[split:]

# Recode the negative label from -1 to 0 (in place)
train_y[train_y == -1.] = 0.
test_y[test_y == -1.] = 0.

#free some ram
tweets_pos = tweets_neg = 0.0

# Work with data GPU

In [ ]:
# Fully connected classifier: four 1000-unit ReLU layers and one 500-unit
# ReLU layer, each followed by dropout, then a 2-way softmax output.
model = tf.keras.Sequential([
    # input_shape is given as an explicit 1-tuple keyword: a bare int passed
    # positionally is not a valid shape.
    tf.keras.layers.InputLayer(input_shape=(n_features,)),
    tf.keras.layers.Dense(1000, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(1000, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(1000, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(1000, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(500, activation='relu'),
    tf.keras.layers.Dropout(0.05),
    tf.keras.layers.Dense(2, activation='softmax'),
])
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
# Save the freshly compiled (still untrained) model
model.save('models/model_temp.h5')

In [ ]:
BATCH_SIZE = 32
SHUFFLE_BUFFER_SIZE = 100

# Training batches are drawn through a shuffle buffer; the held-out batches keep their order
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_X, train_y))
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE)
)
test_dataset = tf.data.Dataset.from_tensor_slices((test_X, test_y)).batch(BATCH_SIZE)

Here I often save the model and reload it because the Colab notebook crashed several times; with this technique, we do not lose too much work.

In [ ]:
# Epochs trained per round. The same value drives model.fit and the progress
# log, so the log records what was actually run (the loop trained 3 epochs
# per round while logging 5).
n_epochs = 3

# One checkpoint location for both reloading and saving, so every round
# resumes from the model written by the previous one.
# NOTE(review): the reload used the relative path 'models/model_temp-1.h5';
# confirm it pointed at this same Drive file.
checkpoint_path = '/content/drive/My Drive/Colab Notebooks/ML-MA1/Project02/models/model_temp-1.h5'

for _ in range(15):
  model = tf.keras.models.load_model(checkpoint_path)
  model.fit(train_dataset, epochs=n_epochs)
  # Save before logging so the log never counts epochs lost to a crash
  model.save(checkpoint_path)
  with open("models/number-of-it-done-on-model-temp-1.txt", 'a') as f:
    f.write(str(n_epochs) + '\n')
  model.evaluate(test_dataset)

In [ ]:
# Reload the checkpoint stored on Drive and evaluate it on test_dataset
model = tf.keras.models.load_model('/content/drive/My Drive/Colab Notebooks/ML-MA1/Project02/models/model_temp-1.h5')
model.evaluate(test_dataset)

# Load test data and output for AIcrowd

In [ ]:
embedding = reduced_embedding_stanford
word_list = reduced_words_stanford
n_features = embedding.shape[1]

tweets_test_txt = []
# The context manager closes the file (it was left open before) and the
# handle is iterated directly instead of going through readlines().
with open("data/test_data_clean_kim_yoon.txt") as f:
    for l in f:
        l = l.strip()
        # Drop everything up to and including the first comma; a line
        # without a comma is kept whole.
        l = l[l.find(',')+1:]
        tweets_test_txt.append(l.strip())
tweets_test_txt = np.array(tweets_test_txt)
print("begin means")
tweets_vecs_test = tweet_means(tweets_test_txt, embedding, word_list, n_features, False, {}, False, False, [])

# Checkpoint the test vectors so the CSV step can run without recomputing them
np.save("tweets_test_clean_stanford.npy", tweets_vecs_test)

# Make CSV

In [ ]:
# Predict with the trained checkpoint written by the training loop.
# 'models/model_temp.h5' is only ever saved right after model.compile in this
# notebook, i.e. before any training.
# NOTE(review): confirm this Drive path holds the model meant for submission.
model = tf.keras.models.load_model('/content/drive/My Drive/Colab Notebooks/ML-MA1/Project02/models/model_temp-1.h5')
tweets_vecs_test = np.load("tweets_test_clean_stanford.npy")
y = model.predict(tweets_vecs_test)
# Index of the most probable of the two softmax outputs
y = np.argmax(y, axis=1)

# make csv
with open("submission.csv", "w") as f:
  f.write("Id,Prediction\n")
  # Ids start at 1; enumerate avoids shadowing the builtin id()
  for row_id, label in enumerate(y, start=1):
    # Class 0 is written as -1, class 1 stays 1
    prediction = -1 if label == 0 else label
    f.write(str(row_id) + "," + str(prediction) + "\n")