# Word2Vec


## Embedding model - training  

In [None]:
# Word2Vec hyperparameters, assigned together in one statement:
#   vec_size - dimensionality of the word vectors
#   window   - context window size
#   epochs   - number of training epochs
vec_size, window, epochs = 100, 7, 15

In [None]:
# Load (labels, corpus) for the train, dev and test splits, in that order.
print('Preprocessing train data...', end=' ', flush=True)
(
    (labels_train, corpus_train),
    (labels_valid, corpus_valid),
    (labels_test, corpus_test),
) = [get_data(split) for split in ('train', 'dev', 'test')]
print('done.', flush=True)

In [None]:
# Train the Word2Vec embedding on the training corpus and store it on disk.
import os

print('Training w2v model...', end=' ', flush=True)
model = Word2Vec(sentences=corpus_train, vector_size=vec_size,
                 window=window, min_count=1, workers=4, epochs=epochs)
# Saving fails if the target folder is missing, so create it first.
os.makedirs("trained_models", exist_ok=True)
# The file name encodes the hyperparameters: word2vec_<vec_size>_<window>_<epochs>.model
model.save(f"trained_models/word2vec_{vec_size}_{window}_{epochs}.model")
print('done.', flush=True)

## Embedding model - evaluation

We chose the hyperparameters for the w2v model based on the following word-pair evaluation, which yielded the parameters above.

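This was the output from the best model (Pearson correlation with its p-value, Spearman correlation with its p-value, and the percentage of word pairs containing out-of-vocabulary words):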

word2vec_100_7_15.model: 

((0.3767671252007464, 3.123816278482829e-12), SpearmanrResult(correlation=0.3488502741199042, pvalue=1.3738505633123715e-10), 9.34844192634561)

In [None]:
# Evaluate the trained embedding on the word pairs in wordsim353.tsv and
# print the resulting metrics.
wordsim_path = datapath('wordsim353.tsv')
word_sim = model.wv.evaluate_word_pairs(wordsim_path)
print('output from word pairs: ', word_sim, sep='\n')

## Classification with word2vec embedding - preprocessing


We first need to preprocess the data by embedding the words of each sentence and averaging them into a single vector. Sentences that cannot be embedded, because they are empty or contain only words that are not in the dictionary, are filtered out together with their labels.

In [None]:
# map text to vectors and average each sentence to one vector:
def _embed_split(sentences, labels, keyed_vectors, unknown_words):
    """Average the word vectors of every sentence in one data split.

    Args:
        sentences: iterable of tokenised sentences (each a sequence of words).
        labels: labels aligned one-to-one with `sentences`.
        keyed_vectors: word -> vector lookup that raises KeyError for words
            it does not know (here the `wv` attribute of the Word2Vec model).
        unknown_words: list that collects every word missing from
            `keyed_vectors`; it is extended in place.

    Returns:
        A `(vectors, kept_labels)` pair of equally long lists. Sentences with
        no embeddable word (empty, or only unknown words) are left out of
        both lists, so vectors and labels stay aligned.
    """
    vectors = []
    kept_labels = []
    for sentence, label in zip(sentences, labels):
        word_vectors = []
        for word in sentence:
            try:
                word_vectors.append(keyed_vectors[word])
            except KeyError:
                unknown_words.append(word)

        # Nothing to average: skip instead of producing a NaN mean.
        if not word_vectors:
            continue

        vectors.append(np.array(word_vectors).mean(axis=0))
        kept_labels.append(label)
    return vectors, kept_labels


all_vector_texts = []
all_labels = []
not_in_model = []
for text, labels in zip([corpus_train, corpus_valid, corpus_test],
                        [labels_train, labels_valid, labels_test]):
    # `model` is the Word2Vec model trained above. The original label lists
    # are not modified, so this cell can safely be re-run.
    vector_text, kept_labels = _embed_split(text, labels, model.wv, not_in_model)
    all_vector_texts.append(vector_text)
    all_labels.append(kept_labels)

train_vector_text, val_vector_text, test_vector_text = all_vector_texts
train_labels, val_labels, test_labels = all_labels

## Classification with word2vec embedding - Logistic Regression



We tried logistic regression on top of the averaged embeddings and did a grid search to find the parameters that worked best (l2 penalty, C=5, class weights = None; F1_weighted score for val and test respectively: 0.746, 0.749). Sadly, we did not manage to achieve better results here than with the baseline.

We also tried varying the word2vec model, finding that vec_size=300 worked best for classification, although it performed worse on the word pairs task.

In [None]:
# Fit a logistic regression classifier on the averaged training embeddings.
train_features = np.array(train_vector_text)
train_targets = np.array(train_labels)
log_reg = LogisticRegression(
    penalty='l2',
    C=5,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
)
log_reg.fit(train_features, train_targets)


In [None]:
# evaluate on val and test:
def evaluate(model, X, y):
    """Print the micro-, macro- and weighted-averaged F1 score of `model`.

    Args:
        model: fitted classifier exposing `predict(X)`.
        X: feature vectors to predict on.
        y: true labels for `X`.

    Returns:
        None; the three scores are only printed.
    """
    predictions = model.predict(X)
    micro, macro, weighted = (
        f1_score(y, predictions, average=averaging)
        for averaging in ('micro', 'macro', 'weighted')
    )
    print(f'F1 Score: micro {micro}, macro {macro}, weighted {weighted}')

In [None]:
# Report the F1 scores of the logistic regression on validation and test.
evaluate(log_reg, val_vector_text, val_labels)
evaluate(log_reg, test_vector_text, test_labels)

# save the model; the context manager closes the file even if pickling fails
model_name_logreg = 'log_reg.sav'
with open(model_name_logreg, 'wb') as model_file:
    pickle.dump(log_reg, model_file)


In [None]:
# Load the pickled logistic regression and evaluate it, so the results can be
# reproduced without retraining.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(model_name_logreg, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
evaluate(loaded_model, val_vector_text, val_labels)
evaluate(loaded_model, test_vector_text, test_labels)

## Classification with word2vec embedding - SVC

We also tried out SVCs on top of the averaged embeddings. They looked very promising when trained on the validation set and evaluated on the test set, but we did not manage to train them on the full dataset due to computational and time limits.
For training on the validation set, we also did a small hyperparameter search, yielding the following parameters:
kernel = 'rbf', C=2
F1_weighted score on test set: 0.77

In [None]:
# Fit an RBF-kernel SVC; note that it is trained on the validation split.
svc_features = np.array(val_vector_text)
svc_targets = np.array(val_labels)
svc = SVC(
    kernel='rbf',
    C=2,
    gamma='auto',
    random_state=0,
    verbose=True,
)
svc.fit(svc_features, svc_targets)

In [None]:
# evaluate as above (test split only, since the SVC was fitted on validation)
evaluate(svc, test_vector_text, test_labels)

# save the model; the context manager closes the file even if pickling fails
model_name_svc = 'svc_val.sav'
with open(model_name_svc, 'wb') as model_file:
    pickle.dump(svc, model_file)

In [None]:
# Load the pickled SVC and evaluate it on the test split, so the results can
# be reproduced without retraining.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(model_name_svc, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
evaluate(loaded_model, test_vector_text, test_labels)

## Classification with word2vec embedding - Fully Connected Neural Network

Our last attempt was to feed the averaged embeddings into a neural network. This seemed very promising, but the results were not good. We suspect that further fine-tuning of the model would have been needed.
The resulting F1_weighted score was 0.29. We would have liked to investigate this issue further, which was not possible due to time constraints.

In [None]:
# custom f1_weighted:
def f1_weighted(label, pred):
    """Compute a support-weighted F1 score with Keras backend operations.

    `label` is flattened, cast to int32 and one-hot encoded. `pred` is reduced
    to its argmax along the last axis and one-hot encoded the same way. The
    number of classes is read from `num_classes` in the enclosing scope.

    Returns:
        Scalar tensor: the per-class F1 scores weighted by each class's share
        of the true labels and summed.
    """
    eps = K.epsilon()

    actual = K.one_hot(K.cast(K.flatten(label), 'int32'), num_classes)
    predicted = K.one_hot(K.argmax(pred, axis=-1), num_classes)

    # Per-class counts (one entry per class); epsilon is added to each so the
    # divisions below never divide by zero.
    support = K.sum(actual, axis=0) + eps              # = TP + FN
    predicted_count = K.sum(predicted, axis=0) + eps   # = TP + FP
    hits = K.sum(actual * predicted, axis=0) + eps     # = TP

    precision = hits / predicted_count
    recall = hits / support
    f1 = 2 * (precision * recall) / (precision + recall + eps)

    # Weight every class by its share of the true labels and add up.
    return K.sum(f1 * support / K.sum(support))


In [None]:
# defining the nn: two ReLU hidden layers (64 and 16 units) followed by a
# 5-unit softmax output; the input size equals the embedding size.
fc_model = Sequential([
    Dense(64, input_dim=vec_size, activation='relu'),
    Dense(16, activation='relu'),
    Dense(5, activation='softmax'),
])
fc_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=[f1_weighted],
)

In [None]:
# training the nn with callbacks and checkpoints
model_name = 'fc_w2v_model'
timestr = time.strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=f'./logs/{model_name}_{timestr}', update_freq='batch')
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath='fc_w2v_checkpoint', save_best_only=True, monitor='val_loss',
                                                save_format='tf')

# Reshape with vec_size instead of a hard-coded 100, so the inputs keep
# matching the network's input layer when the embedding size changes.
fc_train_features = np.array(train_vector_text).reshape(-1, vec_size)
fc_val_features = np.array(val_vector_text).reshape(-1, vec_size)

fc_model.fit(fc_train_features, np.array(train_labels), epochs=50,
             validation_data=(fc_val_features, np.array(val_labels)),
             batch_size=64, callbacks=[tensorboard_callback, checkpoint])
fc_model.save(model_name, save_format='tf')


In [None]:
# load the trained model and evaluate:
# The model was compiled with the custom metric `f1_weighted`, so it is
# passed to the loader for deserialization.
loaded_model = tf.keras.models.load_model(
    model_name, custom_objects={'f1_weighted': f1_weighted})

# evaluate the loaded network (not `model`, which is the Word2Vec embedding);
# reshape with vec_size so the inputs match the network's input layer.
val_results = loaded_model.evaluate(
    np.array(val_vector_text).reshape(-1, vec_size), np.array(val_labels))
test_results = loaded_model.evaluate(
    np.array(test_vector_text).reshape(-1, vec_size), np.array(test_labels))

print(val_results)
print(test_results)