In [924]:
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import FastText, Word2Vec
import numpy as np

In [925]:
import os
# Presumably a debugging aid: CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernel
# launches synchronously, so a device-side error surfaces at the call that
# caused it instead of at some later, unrelated line.
# NOTE(review): this slows down GPU work; confirm it is still wanted for real runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

### Read training, dev and unlabeled test data

The following provides starter code (Python 3) showing how to read the labeled training and dev ciphertext, and the unlabeled test ciphertext, into lists.

In [926]:
# Empty containers for the raw rows; each one is filled by its own loading cell.
train = []
dev = []
test = []

In [927]:
# Load the labeled training set: one "<label>\t<ciphertext sentence>" row per line.
# The context manager closes the file handle as soon as reading finishes
# (a bare open() inside the for statement leaves it open until garbage collection).
# Rows are appended to the existing list, so re-running this cell adds them again.
with open('./train_enc.tsv', encoding='utf-8') as train_file:
    for line in train_file:
        fields = line.rstrip('\n\r').split('\t')
        # fields[0] is the label (0 or 1); fields[1] is the ciphertext sentence.
        fields[0] = int(fields[0])
        train.append(fields)
print (len(train))

16220


In [928]:
# Load the labeled dev set: one "<label>\t<ciphertext sentence>" row per line.
# The context manager closes the file handle as soon as reading finishes
# (a bare open() inside the for statement leaves it open until garbage collection).
# Rows are appended to the existing list, so re-running this cell adds them again.
with open('./dev_enc.tsv', encoding='utf-8') as dev_file:
    for line in dev_file:
        fields = line.rstrip('\n\r').split('\t')
        # fields[0] is the label (0 or 1); fields[1] is the ciphertext sentence.
        fields[0] = int(fields[0])
        dev.append(fields)
print (len(dev))

2027


#### Unlike 'train' and 'dev', which are both lists of [label, sentence] pairs, 'test' is just a list of sentences.

In [929]:
# Load the unlabeled test set: every line is one ciphertext sentence (no label column).
# The context manager closes the file handle as soon as reading finishes
# (a bare open() inside the for statement leaves it open until garbage collection).
# Sentences are appended to the existing list, so re-running this cell adds them again.
with open('./test_enc_unlabeled.tsv', encoding='utf-8') as test_file:
    for line in test_file:
        test.append(line.rstrip('\n\r'))
print (len(test))

2028


#### You can split every sentence into a list of words on whitespace.

In [930]:
# Tokenise on single spaces. Labeled rows keep the label in slot 0 and get the
# token list in slot 1.
train_split = [[row[0], row[1].split(' ')] for row in train]
dev_split = [[row[0], row[1].split(' ')] for row in dev]
# Unlabeled sentences become one-element lists that wrap the token list.
test_split = [[sentence.split(' ')] for sentence in test]

In [931]:
# Sanity check: show the first tokenised training row, laid out as [label, tokens].
print(train_split[0])

[0, ['lkêcê', 'yoúc', 'cêêö', 'y#êjl', 'lw', 'mówám', 'Újám', 'j', 'Úêê#', 'ütlk', 'Úol', 'lkêú', 'z#ê', 'ctöé8ú', 'ówl', 'xoóóú', 'éê#xw#öê#c', '.']]


### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes in this section. We will use your predictions on the unlabeled test data for grading, and will check this part to understand how your method produced the predictions.

In [932]:
# Placeholder for the final predictions: it must eventually hold one integer
# (0 or 1) per unlabeled test sentence, i.e. 2028 entries.
# NOTE(review): none of the cells shown here fills this list — TODO confirm where
# the test-set inference happens.
results = []

In [933]:
# Token lists of every training sentence, taken from the [label, tokens] rows of
# train_split as it stands at this point in the notebook.
full_train_words = [tokens for _label, tokens in train_split]

In [934]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using {device} device")

Using cuda device


In [935]:
# Hyperparameters shared by the embedding training, the classifier and the training loop.
batch_size = 128       # sentences per optimisation step in the training loop
vector_size = 400      # Word2Vec embedding width; also passed as the LSTM input size
hidden_size = 128      # LSTM hidden size (per direction)
num_layers = 2         # number of stacked LSTM layers
bidirectional = True   # whether the LSTM also reads each sentence backwards
dropout = 0.2          # dropout value handed to nn.LSTM

In [936]:
# Learn ciphertext word embeddings with gensim's Word2Vec from the token lists in
# full_train_words.
model = Word2Vec(vector_size=vector_size, window=5, min_count=1)
model.build_vocab(full_train_words)
model.train(full_train_words, total_examples=len(full_train_words), epochs=10)
word_vectors = model.wv
# Copy the learned vectors into a float tensor on the target device.
weight_vectors = torch.FloatTensor(word_vectors.vectors).to(device)
# NOTE(review): the classifier loads these weights through
# nn.Embedding.from_pretrained without a `freeze` argument; per the PyTorch docs
# that defaults to freeze=True, so this flag presumably does not make the
# embeddings trainable — confirm the intent.
weight_vectors.requires_grad = True
# token -> row index; copied into a plain dict so the extra entry added below
# is not written into gensim's own mapping.
vocab = dict(word_vectors.key_to_index)
# Append one all-zero row that acts as the embedding of out-of-vocabulary tokens...
weight_vectors = torch.cat((weight_vectors, torch.zeros(1, vector_size).to(device)), 0)
# ...and register it under 'UKN'; len(vocab) before the insert is the index of that new last row.
vocab['UKN'] = len(vocab)
# Drop the gensim model; the name `model` is reused later for the classifier.
del model
print(weight_vectors.size())
print(len(vocab))

torch.Size([20861, 400])
20861


In [937]:
# Hold out the last 800 training rows for validation and keep the rest for training.
# Both slices are taken from the same `train` list before it is rebound; because
# `train` is overwritten, running this cell again removes a further 800 rows.
validate, train = train[-800:], train[:-800]
# Re-tokenise both parts into [label, tokens] rows.
train_split = [[row[0], row[1].split(' ')] for row in train]
val_split = [[row[0], row[1].split(' ')] for row in validate]

In [938]:
# Labels (slot 0 of every [label, tokens] row) for each data split...
train_tags = [row[0] for row in train_split]
val_tags = [row[0] for row in val_split]
dev_tags = [row[0] for row in dev_split]
# ...and the matching token lists (slot 1).
train_words = [row[1] for row in train_split]
val_words = [row[1] for row in val_split]
dev_words = [row[1] for row in dev_split]

In [955]:
def transferDataset(words, labels, vocabulary, isTrain):
    """Turn tokenised sentences into padded index tensors plus label/length tensors.

    Args:
        words: list of sentences, each a list of token strings.
        labels: numeric labels, one per sentence (anything torch.FloatTensor accepts).
        vocabulary: dict mapping token -> embedding row index. It must contain the
            key 'UKN' whenever ``isTrain`` is false and an unseen token occurs.
        isTrain: if true, every token must be present in ``vocabulary`` (a missing
            token raises ``KeyError``). If false, missing tokens are mapped to
            ``vocabulary['UKN']`` and the number of such tokens is printed.

    Returns:
        Tuple ``(word_idx, labels, lengths)``:
            word_idx: LongTensor of shape (n_sentences, max_len) on the global
                ``device``; rows are right-padded with index 0.
            labels: FloatTensor of shape (n_sentences,) on the global ``device``.
            lengths: IntTensor of shape (n_sentences,) kept on the CPU, holding the
                unpadded length of every sentence.

    An empty ``words`` list yields a (0, 0) index tensor instead of raising.
    The function also prints the sentence count, the padded array shape and the
    label tensor size.
    """
    if isTrain:
        # Strict lookup: an unknown token is a programming error here.
        idx = [[vocabulary[token] for token in sentence] for sentence in words]
    else:
        unknown = 0
        idx = []
        for sentence in words:
            row = []
            for token in sentence:
                if token in vocabulary:
                    row.append(vocabulary[token])
                else:
                    # 'UKN' is only looked up when an unseen token is actually met.
                    row.append(vocabulary['UKN'])
                    unknown += 1
            idx.append(row)
        print("Unknown Token: %d" % unknown)

    lengths = np.array([len(row) for row in idx], dtype=np.int64)
    print(lengths.size)
    # np.amax raises on an empty array, so an empty dataset gets max_len 0.
    max_len = int(lengths.max()) if lengths.size else 0
    word_idx = np.zeros((len(idx), max_len), dtype=np.int64)
    print(word_idx.shape)
    for row_number, row in enumerate(idx):
        if row:
            # Copy the whole sentence at once; the tail of the row stays 0 (padding).
            word_idx[row_number, :len(row)] = row
    word_idx = torch.from_numpy(word_idx).to(device)
    labels = torch.FloatTensor(labels).to(device)
    print(labels.size())
    # Lengths stay on the CPU (they are later handed to pack_padded_sequence).
    lengths = torch.from_numpy(lengths).to(torch.int32).to('cpu')
    return word_idx, labels, lengths

In [940]:
# Convert the training and validation splits to tensors. Both use isTrain=True,
# i.e. the strict lookup that raises KeyError on a token missing from `vocab`
# and never substitutes 'UKN'.
train_idx, train_labels, train_length = transferDataset(train_words, train_tags, vocab, True)
val_idx, val_labels, val_length = transferDataset(val_words, val_tags, vocab, True)

15420
(15420, 56)
torch.Size([15420])
800
(800, 54)
torch.Size([800])


In [941]:
class CipherClf(nn.Module):
    """LSTM sentence classifier that outputs one probability per sentence.

    Pipeline: pretrained embedding lookup -> (optionally bidirectional, multi-layer)
    LSTM -> final hidden state of the last layer -> Linear(.., 64) -> Linear(64, 1)
    -> sigmoid. The two linear layers are applied back to back with no activation
    in between.

    Args:
        input_size: embedding width fed to the LSTM (must match ``weight_vector``'s
            second dimension).
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        weight_vector: 2-D float tensor of pretrained embeddings, one row per
            vocabulary index. It is loaded with ``nn.Embedding.from_pretrained``
            using that method's default arguments (per the PyTorch docs the
            embeddings are then frozen).
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout value passed to ``nn.LSTM``.
    """

    def __init__(self, input_size, hidden_size, num_layers, weight_vector, bidirectional, dropout):
        super(CipherClf, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding.from_pretrained(weight_vector)
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout=dropout,
            batch_first=True,
        )
        # The classifier head consumes one final state per direction.
        num_directions = 2 if bidirectional else 1
        self.linear1 = nn.Linear(num_directions * hidden_size, 64)
        self.linear3 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, sent, sent_length):
        """Return probabilities of shape (batch, 1) for a padded batch.

        Args:
            sent: LongTensor (batch, max_len) of vocabulary indices, padded on the right.
            sent_length: unpadded length of every sentence, as accepted by
                ``pack_padded_sequence`` (which expects lengths on the CPU).
        """
        embedded = self.embedding(sent)
        # Packing makes the LSTM stop at each sentence's true end, so padding
        # positions never influence the final hidden state.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, sent_length, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed_embedded)
        if self.lstm.bidirectional:
            # Per the nn.LSTM docs, the last two slices of h_n are the final
            # layer's forward and backward states.
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), 1)
        else:
            # Unidirectional: only the final layer's state exists for the last
            # layer; concatenating hidden[-2] here would mix in another layer
            # (or fail outright with a single layer) and break linear1's input size.
            hidden = hidden[-1, :, :]
        first_linear = self.linear1(hidden)
        third_linear = self.linear3(first_linear)
        final_output = self.sigmoid(third_linear)

        return final_output

In [942]:
# Build the classifier from the shared hyperparameters and the pretrained
# embedding matrix, then move it to the selected device.
model = CipherClf(
    input_size=vector_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    weight_vector=weight_vectors,
    bidirectional=bidirectional,
    dropout=dropout,
).to(device)
print(model)

CipherClf(
  (embedding): Embedding(20861, 400)
  (lstm): LSTM(400, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (linear1): Linear(in_features=256, out_features=64, bias=True)
  (linear3): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [943]:
# Adam over all model parameters with a fixed learning rate of 0.005.
optimizer = optim.Adam(model.parameters(), lr=0.005)
# Binary cross-entropy on probabilities: the model already ends in a sigmoid,
# so BCELoss (not a logits-based loss) is the matching criterion.
loss_func = nn.BCELoss().to(device)

In [944]:
# def validation(nn_model, loss_function, idx, labels, length, size):
#     nn_model.eval()
#     sum_loss = 0.0
#     num_batch = 0
#
#     with torch.no_grad():
#         for i in range(0, len(idx), size):
#             if i + size < len(idx):
#                 val_output = model(idx[i:i+size], length[i:i+size]).view(-1)
#                 val_loss = loss_function(val_output, labels[i:i+size])
#             else:
#                 val_output = model(idx[i:], length[i:]).view(-1)
#                 val_loss = loss_function(val_output, labels[i:])
#             sum_loss += val_loss.item()
#             num_batch += 1
#
#     return sum_loss / num_batch

In [945]:
def validation(nn_model, loss_function, idx, labels, length):
    """Evaluate ``nn_model`` on one full batch and return (loss, accuracy).

    Args:
        nn_model: module called as ``nn_model(idx, length)`` and returning one
            probability per example.
        loss_function: callable ``loss_function(probabilities, labels)`` returning
            a scalar tensor.
        idx: padded index tensor for the whole evaluation set.
        labels: 1-D tensor of 0/1 labels, one per example.
        length: unpadded sentence lengths matching ``idx``.

    Returns:
        ``(loss, accuracy)`` as Python floats; accuracy is the fraction of
        examples whose rounded probability equals the label.

    Side effect: switches ``nn_model`` to eval mode and leaves it there.
    """
    nn_model.eval()

    with torch.no_grad():
        # Use the model that was passed in, not whatever global `model` exists.
        val_output = nn_model(idx, length).view(-1)
        val_loss = loss_function(val_output, labels)
        val_pred = torch.round(val_output)
        # One vectorised comparison instead of a per-example Python loop.
        correct = (val_pred == labels.reshape(-1)).sum().item()

    return val_loss.item(), correct / len(val_pred)

In [956]:
# Convert the dev set with isTrain=False so that tokens missing from `vocab`
# are mapped to the 'UKN' index (and counted) instead of raising KeyError.
dev_idx, dev_labels, dev_length = transferDataset(dev_words, dev_tags, vocab, False)

Unknown Token: 653
2027
(2027, 53)
torch.Size([2027])


In [947]:
# def test(nn_model, idx, labels, length, size):
#     nn_model.eval()
#     correct = 0.0
#
#     with torch.no_grad():
#         for i in range(0, len(idx), size):
#             if i + size < len(idx):
#                 test_output = nn_model(idx[i:i+size], length[i:i+size])
#             else:
#                 test_output = nn_model(idx[i:], length[i:])
#             pred = torch.round(test_output)
#             # pred = test_output.argmax(1)
#
#             for j in range(len(pred)):
#                 if pred[j] == labels[i+j]:
#                     correct += 1.0
#         accuracy = correct / len(labels)
#         print("Accuracy: %1.4f" % accuracy)

In [948]:
def test(nn_model, idx, labels, length):
    nn_model.eval()
    correct = 0.0

    with torch.no_grad():
        test_output = nn_model(idx, length)
        pred = torch.round(test_output)
        # pred = test_output.argmax(1)

        for i in range(len(pred)):
            if pred[i] == labels[i]:
                correct += 1.0
        accuracy = correct / len(labels)
        print("Test accuracy: %1.4f" % accuracy)

In [949]:
# Train for at most `epoch` passes over the training tensors, in fixed-order
# mini-batches. After every pass the model is scored on the validation split
# and on the dev set. Training stops early once the validation loss has been
# higher than the previous epoch's value for 10 epochs in a row.
epoch = 200
last_val_loss = 1000
overfitting = 0

for i in range(epoch):
    model.train()
    total_loss = 0.0
    n_batch = 0

    for j in range(0, len(train_idx), batch_size):
        # Slicing past the end is clamped, so the final short batch needs no special case.
        batch_end = j + batch_size
        optimizer.zero_grad()
        output = model(train_idx[j:batch_end], train_length[j:batch_end]).view(-1)
        loss = loss_func(output, train_labels[j:batch_end])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        n_batch += 1

    avg_loss = total_loss / n_batch
    if i%5 == 0:
        print('Epoch: %d, Loss: %1.6f' %(i, avg_loss))

    validation_loss, validation_accuracy = validation(model, loss_func, val_idx, val_labels, val_length)
    print("val_loss: %1.4f, val_accuracy: %1.4f" % (validation_loss, validation_accuracy))

    # Count consecutive epochs whose validation loss rose versus the previous epoch.
    got_worse = validation_loss > last_val_loss
    overfitting = overfitting + 1 if got_worse else 0
    if got_worse:
        print("Overfitting " + str(overfitting) + " times: train_loss " + "" + str(avg_loss) + " val_loss " + str(validation_loss))
        if overfitting > 9:
            print("Overfitting! Early Stop at epoch " + str(i))
            break
    last_val_loss = validation_loss

    test(model, dev_idx, dev_labels, dev_length)

Epoch: 0, Loss: 0.659541
val_loss: 0.6378, val_accuracy: 0.6275
Test accuracy: 0.6290
val_loss: 0.6029, val_accuracy: 0.6475
Test accuracy: 0.6719
val_loss: 0.6078, val_accuracy: 0.6562
Overfitting 1 times: train_loss 0.5735602341900187 val_loss 0.6078209280967712
Test accuracy: 0.6872
val_loss: 0.6082, val_accuracy: 0.6562
Overfitting 2 times: train_loss 0.5381329374865067 val_loss 0.608181357383728
Test accuracy: 0.6897
val_loss: 0.6455, val_accuracy: 0.6312
Overfitting 3 times: train_loss 0.5013592014135408 val_loss 0.6454907059669495
Test accuracy: 0.6887
Epoch: 5, Loss: 0.465462
val_loss: 0.6619, val_accuracy: 0.6438
Overfitting 4 times: train_loss 0.4654621624749554 val_loss 0.6618862748146057
Test accuracy: 0.6946
val_loss: 0.7383, val_accuracy: 0.6225
Overfitting 5 times: train_loss 0.4360882839388099 val_loss 0.7383458614349365
Test accuracy: 0.6838
val_loss: 0.6854, val_accuracy: 0.6987
Test accuracy: 0.7242
val_loss: 0.6626, val_accuracy: 0.6987
Test accuracy: 0.7435
val_los

In [950]:
# dev_idx, dev_labels, dev_length = transferDataset(dev_words, dev_tags, vocab, False)

In [951]:
# Final check: print the accuracy of the trained model on the labeled dev set.
test(model, dev_idx, dev_labels, dev_length)

Test accuracy: 0.8278


### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines; each line should be either 0 or 1, giving your model's prediction for the corresponding test set instance.

In [952]:
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
#those results are in the list called 'results'
# assert (len(results) == 2028)

In [953]:
# make sure the results are not float numbers, but integers 0 and 1
# results = [int(x) for x in results]

In [954]:
# write your prediction results to 'upload_predictions.txt' and upload that later
# with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
#     for x in results:
#         fp.write(str(x) + '\n')