In [1]:
import datasets
import numpy as np
import logging
import os
import pprint

# import pipeline
# from lxml.etree import tostring
# from lxml.builder import E
from collections import defaultdict
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from collections import defaultdict
import time
import random
import dynet as dy

In [2]:
logging.basicConfig(level=logging.DEBUG)

In [3]:
logging.info("loading dataset...")
df = datasets.load_pan17("../data/training/")
corpus = df.corpus
corpus['text'] = corpus.text.apply(lambda x: '\n'.join(x))  # join tweets

INFO:root:loading dataset...


In [7]:
corpus.head()

Unnamed: 0,author,lang,text,gender,variety
0,2660ba148fdb6d40d45a2cde1ebc0938,ar,وين راحوا لفتاتك المليانه حب و غيره\nشو ال غير...,female,levantine
1,8190b6c35e3bfc0a7a5b606987f12088,ar,أعوذ بكلمات الله التامات من شر ما خلق\n♻️ http...,male,levantine
2,beb2267b45907af46c677b57b181d33b,ar,700 مقاتل مغربي مدرب خارج سيطرة الأجهزة الأمني...,male,maghrebi
3,febcaf1e3821d7a53cf440d772307d6c,ar,أبرز #تصريحات #النجوم في العام 2016 \n...,female,levantine
4,3ba53c8b59303a381e058d11a6678221,ar,اللَّهم يَا خَالق الحُبّ والنَّوَى ..\nأعْطِ ل...,male,levantine


In [8]:
# split the dataset in train-dev-test
train_df = corpus[corpus.lang == 'en'].loc[3600:5600]
dev_df = corpus[corpus.lang == 'en'].loc[5601:6100]
test_df = corpus[corpus.lang == 'en'].loc[6101:6600]

In [9]:
# Functions to read in the corpus
w2i = defaultdict(lambda: len(w2i))
t2i_m = defaultdict(lambda: len(t2i_m)) # main labels
t2i_a = defaultdict(lambda: len(t2i_a)) # aux labels
UNK = w2i["<unk>"]


def read_dataset(dataframe):
    for index, row in dataframe.iterrows():
        main_label, aux_label, text = row.gender, row.variety, row.text
        yield ([w2i[x] for x in text.split(" ")], t2i_m[main_label], t2i_a[aux_label])


# Read in the data
train = list(read_dataset(train_df))[:30]
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset(dev_df))[:30]
test = list(read_dataset(test_df))
nwords = len(w2i)
ntags1 = len(t2i_m)
ntags2 = len(t2i_a)

In [10]:
# Start DyNet and defin trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

# Define the model
EMB_SIZE = 64
HID_SIZE = 64
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))  # Word embeddings

fwdLSTM = dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model)  # Forward LSTM
bwdLSTM = dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model)

H_sm_main = model.add_parameters((64, 2 * HID_SIZE))  # Softmax weights
O_sm_main = model.add_parameters((1,64))  # Softmax bias

H_sm_aux = model.add_parameters((64, 2 * HID_SIZE))  # Softmax weights
O_sm_aux = model.add_parameters((ntags2,64))  # Softmax bias


# A function to calculate scores for one value
def calc_scores(words, main_tag, aux_tag):
    dy.renew_cg()
    word_embs = [dy.lookup(W_emb, x) for x in words]
    fwd_init = fwdLSTM.initial_state()
    bwd_init = bwdLSTM.initial_state()

    fwd_embs = fwd_init.transduce(word_embs)
    bwd_embs = bwd_init.transduce(reversed(word_embs))
    
    repr = dy.concatenate([fwd_embs[-1], bwd_embs[-1]]) # use last step as representation
    
    H_m = dy.parameter(H_sm_main)
    O_m = dy.parameter(O_sm_main)
    
    H_a = dy.parameter(H_sm_aux)
    O_a = dy.parameter(O_sm_aux)
    
    
    final_m = O_m*dy.tanh(H_m * repr) # MLP for main task
    final_a = O_a*dy.tanh(H_a * repr) # MLP for auxiliary task
    
    main = dy.binary_log_loss(final_m, dy.scalarInput(main_tag)) # gender
    aux = dy.pickneglogsoftmax(final_a, aux_tag) #variety
    return main, aux


for ITER in range(1):
    # Perform training
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    for words, main_tag, aux_tag in train:
        loss = sum(calc_scores(words, main_tag, aux_tag))
        train_loss += loss.value()
        loss.backward()
        trainer.update()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start))

iter 0: train loss/sent=nan, time=4.56s


In [32]:
def predict_tag(words, task):
    dy.renew_cg()
    word_embs = [dy.lookup(W_emb, x) for x in words]
    fwd_init = fwdLSTM.initial_state()
    bwd_init = bwdLSTM.initial_state()
    ## Q2: run the forward pass of the LSTM
    fwd_embs = fwd_init.transduce(word_embs)
    bwd_embs = bwd_init.transduce(reversed(word_embs))
    repr = dy.concatenate([fwd_embs[-1], bwd_embs[-1]]) # use last step as representation
    
    H_m = dy.parameter(H_sm_main)
    O_m = dy.parameter(O_sm_main)
    
    H_a = dy.parameter(H_sm_aux)
    O_a = dy.parameter(O_sm_aux)
    
    final_m = O_m * dy.tanh(H_m*repr)
    main = dy.logistic(final_m)
    
    final_a = O_a*dy.tanh(H_a * repr) # MLP for auxiliary task
    aux = dy.softmax(final_a) #variety
    
    if task == 'main':
        if main.value() > 0.5:
            return 1
        else:
            return 0
    elif task =='aux':
        return np.argmax(aux.npvalue())

In [33]:
# Eval
eval_correct = 0.0
for words, main, aux in dev:
    scores = predict_tag(words, 'main')
    predict = scores
    print(predict, main)
    if predict == main:
        eval_correct += 1
print("iter %r: eval acc=%.4f" % (ITER, eval_correct / len(dev)))

0 1
0 1
0 0
0 1
0 1
0 0
0 1
0 1
0 0
0 0
0 0
0 0
0 1
0 0
0 1
0 0
0 0
0 1
0 0
0 0
0 1
0 0
0 1
0 1
0 0
0 1
0 0
0 1
0 0
0 0
iter 0: eval acc=0.5333
