### Read training, dev and unlabeled test data

The following provides a starting code (Python 3) of how to read the labeled training and dev cipher text, and unlabeled test cipher text, into lists.

In [212]:
train, dev, test = [], [], []

In [213]:
for x in open('./train_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    train.append(x)
print (len(train))
print (train[:3])

16220
[[0, 'lkêcê yoúc cêêö y#êjl lw mówám Újám j Úêê# ütlk Úol lkêú z#ê ctöé8ú ówl xoóóú éê#xw#öê#c .'], [0, '6êcétlê jolêot8 zc éê#xw#öjóáê , tl zc j #jlkê# 8tcl8êcc jöÚ8ê 6wüó lkê öt668ê wx lkê #wj6 , ükê#ê lkê lkêöjltá t#wótêc j#ê lww wÚ2twoc jó6 lkê cê+oj8 éw8tltác lww cöoy .'], [0, 'tx lktc kw8t6jú öw2tê tc coééwcê6 lw Úê j ytxl , cwöêÚw6ú oóü#jééê6 tl êj#8ú , lwwm wol j88 lkê yww6 cloxx , jó6 8êxl Úêktó6 lkê á#jé ( 8tlê#j88ú ) .']]


In [214]:
for x in open('./dev_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    dev.append(x)
print (len(dev))
print (dev[:3])

2027
[[1, 'ów8jó Ú#j2ê8ú l#êj6c ükê#ê xêü jöê#tájó xt8öc 6j#ê lw 6ê82ê 77 tólw lkê üw#86 wx jöÚt2j8êóáê jó6 jöÚtyotlú <<<'], [0, 'ê2êó öo#ékú zc ê+éê#l áwötá ltötóy jó6 xjöê6 ákj#tcöj áj ózl #êcáoê lktc êxxw#l .'], [1, 'üt88 jcco#ê68ú #jóm jc wóê wx lkê á8ê2ê#êcl , öwcl 6êáêélt2ê8ú jöoctóy áwöê6têc wx lkê úêj# .']]


#### Different from 'train' and 'dev' that are both list of tuples, 'test' will be just a list.

In [215]:
for x in open('./test_enc_unlabeled.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r')
    test.append(x)
print (len(test))
print (test[:3])

2028
['j 6t6jáltá jó6 6o88 6wáoöêólj#ú y8w#txútóy cwxlüj#ê jój#ákú .', 'ówlktóy cltámc , #êj88ú , ê+áêél j 8tóyê#tóy á#êêétóêcc wóê xêê8c x#wö Úêtóy 6#jyyê6 lk#woyk j cj6 , cw#6t6 oót2ê#cê wx yoóc , 6#oyc , j2j#táê jó6 6jöjyê6 6#êjöc .', 'öo#ékú jó6 üt8cwó jáloj88ú öjmê j é#êllú yww6 lêjö <<< Úol lkê é#wvêál co##woó6tóy lkêö tc 6tcl#êcctóy8ú #wlê .']


#### You can split every sentence into lists of words by white spaces.

In [216]:
train_split = [[x[0], x[1].split(' ')] for x in train]
dev_split = [[x[0], x[1].split(' ')] for x in dev]
test_split = [[x.split(' ')] for x in test]

### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes at here. We will use your prediction on the unlabeled test data to grade, while checking this part to understand how your method has produced the predictions.

In [217]:
# Eventually, results need to be a list of 2028 0 or 1's
results = []

In [218]:
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub

from itertools import islice
import statistics

In [219]:
from keras.callbacks import EarlyStopping
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM

In [220]:
embed = hub.load("https://tfhub.dev/google/Wiki-words-250-with-normalization/2")

In [221]:
def get_padded_encoded_ciphers(encoded_ciphers, my_max_length):
    
    my_shorter_length = (9/10) * my_max_length
    my_shorter_length = int(my_shorter_length)
    
    padded_ciphers_encoding = []
    
    for enc_cipher in encoded_ciphers:
        
        zero_padding_cnt = my_max_length - enc_cipher.shape[0]
        #zero_padding_cnt = my_shorter_length - enc_cipher.shape[0]
        
        if zero_padding_cnt > 0:
            #Pad array list by appropriate amount
            pad = np.zeros((1, 250))
            for i in range(zero_padding_cnt):
                enc_cipher = np.concatenate((pad, enc_cipher), axis=0)
        else:
            #Trim the array list down by appropriate amount
            cipher_length = len(enc_cipher)
            enc_cipher = enc_cipher[:cipher_length+zero_padding_cnt]
            
        padded_ciphers_encoding.append(enc_cipher)
    return padded_ciphers_encoding

In [222]:
#Find max number of columns
X_train = [x[1] for x in train_split]
m = len(X_train)
n = len(max(X_train, key=len)) #Max number of columns in training data

lengths = [len(x) for x in X_train]
med = int(statistics.median(lengths))

x_val = [x[1] for x in dev_split]
m_prime = len(x_val)
n_prime = len(max(x_val, key=len)) #Max number of columns in development data

x_test = [x[0] for x in test_split]
m_double_prime = len(x_test)
n_double_prime = len(max(x_test, key=len)) #Max number of columns in test data

max_length = max(n, n_prime, n_double_prime)

max_length = int(med)

In [223]:
###################
###TRAINING DATA###
###################

In [305]:
X_train_plain = [x[1] for x in train_split]

#Remove stop words from Training Data
countOfWord = {} #Dictionaries for corpus

#Adds each unseen word into countOfWord vocabulary
for row in X_train_plain:
    for idx, word in enumerate(row):
        
        if word in countOfWord:
            #Increment the count for that word in vocab
            countOfWord[word] += 1
        else:
            countOfWord[word] = 1
            
    #Pad the end of this row with EOS
    #length = len(row)
    #np.pad(row, (0, max_length - length), 'constant', constant_values="\s")
            
#Sort countOfWord vocabulary by frequency
countOfWord = dict(sorted(countOfWord.items(), key=lambda item: item[1], reverse=True))

k = 8
#Remove top k frequent words (possibly stop words)
for itr in range(k):
    countOfWord.popitem()
    
#Translate each stop word into an EOS
for row in X_train_plain:
    for idx, word in enumerate(row):
        #Replace stop words with EOS
        if word not in countOfWord:
            row[idx] = "0"

In [306]:
#Training data X
X_train = [embed(x) for x in X_train_plain]
X_train = get_padded_encoded_ciphers(X_train, max_length)

X_train = tf.convert_to_tensor(X_train)

###################
#Training data Y
Y_train = [float(y[0]) for y in train_split]

Y_train = tf.convert_to_tensor(Y_train)
Y_train = tf.cast(Y_train, dtype=tf.float64)

In [226]:
###################
##DEVELOPMENT DATA#
###################

In [307]:
x_val_plain = [x[1] for x in dev_split]

#Remove stop words from Development Data
countOfWordDev = {} #Dictionaries for corpus

#Adds each unseen word into countOfWord vocabulary
for row in x_val_plain:
    for idx, word in enumerate(row):
        
        if word in countOfWordDev:
            #Increment the count for that word in vocab
            countOfWordDev[word] += 1
        else:
            countOfWordDev[word] = 1
            
    #Pad the end of this row with EOS
    #length = len(row)
    #np.pad(row, (0, max_length - length), 'constant', constant_values=0)
            
#Sort countOfWordDev vocabulary by frequency
countOfWordDev = dict(sorted(countOfWordDev.items(), key=lambda item: item[1], reverse=True))

k = 8
#Remove top k frequent words (possibly stop words)
for itr in range(k):
    countOfWordDev.popitem()
    
#Translate each stop word into an EOS
for row in x_val_plain:
    for idx, word in enumerate(row):
        #Replace stop words with EOS
        if word not in countOfWordDev:
            row[idx] = "0"

In [308]:
#Development data X
x_val = [embed(x) for x in x_val_plain]
x_val = get_padded_encoded_ciphers(x_val, max_length)

x_val = tf.convert_to_tensor(x_val)

###################
#Development data Y
y_val = [x[0] for x in dev_split]

y_val = tf.convert_to_tensor(y_val)
y_val = tf.cast(y_val, dtype=tf.float64)

In [309]:
###################
#####TEST DATA#####
###################

In [310]:
x_test_plain = [x[0] for x in test_split]

#Remove stop words from Test Data
countOfWordTest = {} #Dictionaries for corpus

#Adds each unseen word into countOfWord vocabulary
for row in x_test_plain:
    for idx, word in enumerate(row):
        
        if word in countOfWordTest:
            #Increment the count for that word in vocab
            countOfWordTest[word] += 1
        else:
            countOfWordTest[word] = 1
            
    #Pad the end of this row with EOS
    #length = len(row)
    #np.pad(row, (0, max_length - length), 'constant', constant_values=0)
            
#Sort countOfWordTest vocabulary by frequency
countOfWordTest = dict(sorted(countOfWordTest.items(), key=lambda item: item[1], reverse=True))

k = 8
#Remove top k frequent words (possibly stop words)
for itr in range(k):
    countOfWordTest.popitem()
    
#Translate each stop word into an EOS
for row in x_test_plain:
    for idx, word in enumerate(row):
        #Replace stop words with EOS
        if word not in countOfWordTest:
            row[idx] = "0"

In [311]:
#Test data X
x_test = [embed(x) for x in x_test_plain]
x_test = get_padded_encoded_ciphers(x_test, max_length)

x_test = tf.convert_to_tensor(x_test)

In [312]:
###################
#######MODEL#######
###################

In [361]:
#Create model
model = Sequential()

model.add(LSTM(units=128, activation='tanh', return_sequences=True))
model.add(Dropout(0.5))

model.add(Dense(units=8))

model.add(LSTM(units=128, activation='tanh', return_sequences=True))
model.add(Dropout(0.5))

model.add(Dense(units=8))

model.add(LSTM(units=128, activation='tanh', return_sequences=True))
model.add(Dropout(0.5))

model.add(Dense(units=8))

model.add(LSTM(units=128, activation='tanh', return_sequences=False))
model.add(Dropout(0.2))

model.add(Dense(units=1, activation='sigmoid'))

In [402]:
#Compile model
model.compile(
    loss='binary_crossentropy', 
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0025), 
    metrics='accuracy'
)
callback = EarlyStopping(monitor='loss', patience=2, mode='min')     #Training
callback_two = EarlyStopping(monitor='loss', patience=2, mode='min') #Validation

In [406]:
#Train model
history = model.fit(
    X_train, Y_train, batch_size=27, epochs=100, 
    callbacks=callback, validation_data=(x_val, y_val)
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [405]:
#Train model on validation data as well
history_two = model.fit(
    x_val, y_val, batch_size=33, 
    epochs=100, callbacks=callback_two
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


In [394]:
#Save Model
#my_model_path = 'saved_models/seven_layer_eighty_eight_model'
#model.save(my_model_path)



INFO:tensorflow:Assets written to: saved_models/seven_layer_eighty_seven_model/assets


INFO:tensorflow:Assets written to: saved_models/seven_layer_eighty_seven_model/assets


In [395]:
#Load Model
#model = tf.keras.models.load_model('saved_models/seven_layer_eighty_six_model')
model.summary()

Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_83 (LSTM)              (None, 20, 128)           194048    
                                                                 
 dropout_83 (Dropout)        (None, 20, 128)           0         
                                                                 
 dense_72 (Dense)            (None, 20, 8)             1032      
                                                                 
 lstm_84 (LSTM)              (None, 20, 128)           70144     
                                                                 
 dropout_84 (Dropout)        (None, 20, 128)           0         
                                                                 
 dense_73 (Dense)            (None, 20, 8)             1032      
                                                                 
 lstm_85 (LSTM)              (None, 20, 128)         

In [396]:
#Evaluate on development data
loss, accuracy = model.evaluate(x_val, y_val)
print('loss: ', loss)
print('accuracy: ', accuracy)

loss:  0.3303125500679016
accuracy:  0.8746916651725769


In [397]:
#Predict on test data
y_hat = model.predict(x_test)
y_hat_modified = [0 if val <0.5 else 1 for val in y_hat]
results = y_hat_modified

### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [398]:
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
#those results are in the list called 'results'
assert (len(results) == 2028)

In [399]:
# make sure the results are not float numbers, but intergers 0 and 1
results = [int(x) for x in results]

In [400]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')