In [12]:
train, dev, test = [], [], []

In [13]:
for x in open('./train_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    train.append(x)

In [14]:
for x in open('./dev_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    dev.append(x)

In [15]:
for x in open('./test_enc_unlabeled.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r')
    test.append(x)

In [16]:
train_split = [[x[0], x[1].split(' ')] for x in train]
dev_split = [[x[0], x[1].split(' ')] for x in dev]
test_split = [[x.split(' ')] for x in test]

X_train = [x[1] for x in train_split]

from collections import defaultdict
vocab = defaultdict(lambda : 999999)

count = 3
for sent in X_train:
    for word in sent:
        if word not in vocab:
            vocab[word] = count
            count+=1

def prep(lister):
    for i, sent in enumerate(lister):
        for j, word in enumerate(sent):
            try:
                lister[i][j] = vocab[word]
            except:
                print(i, lister[i])

for i, sent in enumerate(X_train):
    for j, word in enumerate(sent):
        X_train[i][j] = vocab[word]

y_train = [x[0] for x in train_split]



X_val = [x[1] for x in dev_split]

for i, sent in enumerate(X_val):
    for j, word in enumerate(sent):
        X_val[i][j] = vocab[word]

y_val = [x[0] for x in dev_split]


X_test = [x[0] for x in test_split]
for i, sent in enumerate(X_test):
    for j, word in enumerate(sent):
        X_test[i][j] = vocab[word]

In [17]:
import numpy as np
X_train = np.array([np.array(x) for x in X_train])
y_train = np.array(y_train)
X_val = np.array([np.array(x) for x in X_val])
y_cv = np.array(y_val)
X_test = np.array([np.array(x) for x in X_test])

In [33]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense,Conv1D,MaxPooling1D,LSTM,Dropout,GRU
from keras.callbacks import ModelCheckpoint

np.random.seed(42)
import warnings
warnings.filterwarnings('ignore')

In [54]:
print('Train data shape - ', X_train.shape)
print('Test data shape - ', X_test.shape)
print('Val data shape - ', X_val.shape)

MAX_SENT_LEN = 300
X_train = sequence.pad_sequences(X_train, maxlen=MAX_SENT_LEN)
X_test = sequence.pad_sequences(X_test, maxlen=MAX_SENT_LEN)
X_val = sequence.pad_sequences(X_val,maxlen=MAX_SENT_LEN)

Train data shape -  (16220, 300)
Test data shape -  (2028, 300)
Val data shape -  (2027, 300)


In [55]:
EMBEDDING_SIZE = 64
TOP_WORDS = 10000

Model Part 1

In [61]:
model = Sequential()
model.add(Embedding(TOP_WORDS, EMBEDDING_SIZE, input_length=MAX_SENT_LEN))
model.add(GRU(64, return_sequences=True))
model.add(GRU(128, return_sequences=True))
model.add(Dropout(0.01))
model.add(GRU(256, return_sequences=True))
model.add(Dropout(0.01))
model.add(GRU(128, return_sequences=True))
model.add(Dropout(0.02))
model.add(GRU(64))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
filepath="./weights_best_own.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model.fit(X_train, y_train, epochs=15, batch_size=128, verbose = 1,callbacks = callbacks_list,validation_data=(X_val,y_cv))


Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_21 (Embedding)    (None, 300, 64)           640000    
                                                                 
 gru_18 (GRU)                (None, 300, 64)           24960     
                                                                 
 gru_19 (GRU)                (None, 300, 128)          74496     
                                                                 
 dropout_11 (Dropout)        (None, 300, 128)          0         
                                                                 
 gru_20 (GRU)                (None, 300, 256)          296448    
                                                                 
 dropout_12 (Dropout)        (None, 300, 256)          0         
                                                                 
 gru_21 (GRU)                (None, 300, 128)        

<keras.callbacks.History at 0x2589ab78b50>

In [25]:
model = Sequential()
model.add(Embedding(TOP_WORDS, EMBEDDING_SIZE, input_length=MAX_SENT_LEN))
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.1))
# model.add(LSTM(64, dropout=0.1, recurrent_dropout=0.1))
model.add(Dense(1, activation='sigmoid'))
model.load_weights("weights_best_own.hdf5")
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
scores = model.evaluate(X_val, y_cv, verbose=1,batch_size = 256)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 86.48%


In [63]:
# create the model
model = Sequential()
model.add(Embedding(TOP_WORDS, EMBEDDING_SIZE, input_length=MAX_SENT_LEN))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(Conv1D(filters=256, kernel_size=3, padding='same', activation='relu'))
model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(256))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
filepath="weights_best_cnn_own.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model.fit(X_train, y_train, epochs=15, batch_size=128,verbose = 1,callbacks = callbacks_list,validation_data=(X_val,y_cv))


Model: "sequential_23"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_23 (Embedding)    (None, 300, 64)           640000    
                                                                 
 conv1d_55 (Conv1D)          (None, 300, 64)           12352     
                                                                 
 conv1d_56 (Conv1D)          (None, 300, 64)           12352     
                                                                 
 conv1d_57 (Conv1D)          (None, 300, 128)          24704     
                                                                 
 conv1d_58 (Conv1D)          (None, 300, 256)          98560     
                                                                 
 conv1d_59 (Conv1D)          (None, 300, 128)          98432     
                                                                 
 conv1d_60 (Conv1D)          (None, 300, 64)         

<keras.callbacks.History at 0x258abf88fd0>

Model Part 2

In [45]:
model = Sequential()
model.add(Embedding(TOP_WORDS, EMBEDDING_SIZE, input_length=MAX_SENT_LEN))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(Conv1D(filters=256, kernel_size=3, padding='same', activation='relu'))
model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(GRU(256))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.load_weights("weights_best_cnn_own.hdf5")
scores = model.evaluate(X_val, y_cv, verbose=0)
print("Accuracy: %.7f%%" % (scores[1]*100))

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_15 (Embedding)    (None, 600, 64)           640000    
                                                                 
 conv1d_20 (Conv1D)          (None, 600, 64)           12352     
                                                                 
 conv1d_21 (Conv1D)          (None, 600, 128)          24704     
                                                                 
 conv1d_22 (Conv1D)          (None, 600, 256)          98560     
                                                                 
 conv1d_23 (Conv1D)          (None, 600, 128)          98432     
                                                                 
 conv1d_24 (Conv1D)          (None, 600, 64)           24640     
                                                                 
 max_pooling1d_6 (MaxPooling  (None, 200, 64)        

In [46]:
predictions = model.predict(X_test)

In [47]:
print(predictions[0])

[0.00049794]


In [48]:
new_pred = [1 if x>=0.5 else 0 for x in predictions]

In [49]:
print(set(new_pred))

{0, 1}


In [64]:
from collections import Counter
print(Counter(new_pred))

Counter({0: 1028, 1: 1000})


In [None]:
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    for x in new_pred:
        fp.write(str(x) + '\n')

In [51]:
print(scores[1]*100)

88.75185251235962


In [52]:
with open(f'upload_predictions_{scores[1]*100}.txt', 'w', encoding = 'utf-8') as fp:
    for x in new_pred:
        fp.write(str(x) + '\n')
