# Word2Vec


## Embedding model - training  

In [None]:
# Word2Vec settings, passed to the model below as vector_size, window and epochs.
vec_size, window, epochs = 100, 7, 15

In [None]:
print('Preprocessing train data...', end=' ', flush=True)
# get_data is called once per split, in the order train, dev, test; each call
# yields a (labels, corpus) pair.
((labels_train, corpus_train),
 (labels_valid, corpus_valid),
 (labels_test, corpus_test)) = map(get_data, ('train', 'dev', 'test'))
print('done.', flush=True)

In [None]:
# Train a Word2Vec model on the training corpus and save it to disk; the file
# name encodes vec_size, window and epochs.
print('Training w2v model...', end=' ', flush=True)
model = Word2Vec(sentences=corpus_train, vector_size=vec_size,
                 window=window, min_count=1, workers=4, epochs=epochs)
model.save(
    "trained_models/word2vec_{}_{}_{}.model".format(vec_size, window, epochs))
print('done.', flush=True)

## Embedding model - evaluation

We chose the hyperparameters for the w2v model based on the following word-pair evaluation, which yielded the values set above.

In [None]:
# Score the trained embeddings on the word pairs in wordsim353.tsv and show the result.
word_sim = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))
print('output from word pairs: ', word_sim, sep='\n')

## Classification with word2vec embedding - preprocessing


We first need to preprocess the data by embedding each word and averaging the word vectors of every sentence into a single vector.

In [None]:
# Map every sentence to the mean of its word vectors, one list of vectors per
# split. Sentences that are empty or consist only of out-of-vocabulary words
# have no embedding; they are skipped together with their label so that
# vectors and labels stay aligned.
#
# NOTE(review): the original read `w2v_model`, which no visible cell defines;
# `model` is the name the training and evaluation cells use.
word_vectors = model.wv

all_vector_texts = []
all_labels = []
not_in_model = []  # every out-of-vocabulary word encountered, kept for inspection
for text, labels in zip([corpus_train, corpus_valid, corpus_test],
                        [labels_train, labels_valid, labels_test]):
    vector_text = []
    # Build a fresh label list instead of deleting from `labels` in place:
    # mutating the input would misalign labels and sentences if this cell
    # were run a second time.
    kept_labels = []
    for sentence, label in zip(text, labels):
        feature_vec = []
        for word in sentence:
            try:
                feature_vec.append(word_vectors[word])
            except KeyError:
                # word is not in the model's vocabulary
                not_in_model.append(word)

        # Nothing to average (empty sentence or only unknown words): drop it
        # rather than produce a NaN mean.
        if not feature_vec:
            continue

        vector_text.append(np.mean(feature_vec, axis=0))
        kept_labels.append(label)

    all_vector_texts.append(vector_text)
    all_labels.append(kept_labels)

train_vector_text, val_vector_text, test_vector_text = all_vector_texts
train_labels, val_labels, test_labels = all_labels

## Classification with word2vec embedding - Logistic Regression



We try logistic regression on top of the averaged embeddings. A grid search found the parameters that worked best (L2 penalty, C=5, class weights = None; weighted F1 score on val and test respectively: 0.746, 0.749). Sadly, we did not manage to achieve better results here than with the baseline.

We also tried varying the word2vec model, finding that vec_size=300 worked best for classification, although it performed worse on the word-pairs task.

In [None]:
# Logistic regression (liblinear solver, L2 penalty, C=5) fitted on the
# averaged sentence embeddings of the training split.
log_reg = LogisticRegression(
    penalty='l2',
    C=5,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
)
log_reg.fit(X=np.array(train_vector_text), y=np.array(train_labels))


In [None]:
# evaluate on val and test: 
def evaluate(model, X, y):
    """Print the micro-, macro- and weighted-averaged F1 score of ``model`` on ``X``/``y``.

    ``model.predict(X)`` is called once and the predictions are compared
    against ``y`` with ``f1_score`` for each averaging mode. Nothing is
    returned; the scores are only printed.
    """
    predictions = model.predict(X)
    scores = {
        averaging: f1_score(y, predictions, average=averaging)
        for averaging in ('micro', 'macro', 'weighted')
    }
    print('F1 Score: micro {micro}, macro {macro}, weighted {weighted}'.format(**scores))

# Report F1 scores on the validation and test splits.
evaluate(log_reg, val_vector_text, val_labels)
evaluate(log_reg, test_vector_text, test_labels)

# Save the fitted classifier with pickle; the context manager closes the file
# handle even if dumping fails.
model_name_logreg = 'log_reg.sav'
with open(model_name_logreg, 'wb') as model_file:
    pickle.dump(log_reg, model_file)
