In [None]:
# --- Text / embedding settings ---
maxlen = 831             # max number of words in a discourse text
embedding_size = 300     # how big each word vector is
max_features = 120000    # how many unique words to use (i.e. number of rows in the embedding matrix)

# --- Training settings ---
n_epochs = 10            # how many times to iterate over all samples

In [None]:
import pandas as pd

import json

train_df = pd.read_csv("../data/train.csv")

# Essays removed from the training data, grouped by the reason for exclusion.
exclude_list = [
    # infinite execution
    '25F9B9BAA02A',
    '49586CD6A649',
    '6F896BABB13C',
    'BECA14914CFB',
    'E7A3DBC919C1',
    'FC9BC150809F',
    'FFC43F453EF6',
    'EFCA46E0BF9F',

    # spelling errors
    'C30B2AD4AF0A',
    '718800CC3C50',
    '9C2E6F09CC73',

    # nan in argumentation rankings
    '8D4A0D4CD2C2',
    '129497C3E0FC']

train_df = train_df.loc[~train_df.essay_id.isin(exclude_list)].copy().reset_index(drop=True)

# Keep only essays that have fewer than 15 discourse elements.
train_df["discourse_elements_number"] = train_df.groupby(train_df.essay_id).discourse_id.transform('count')
train_df = train_df.loc[train_df.discourse_elements_number < 15].reset_index(drop=True).copy()

# Expected line format (inferred from the slicing below):
#     [[discourse_id,rank],[discourse_id,rank],...]
# Lines are stripped before slicing so the parse does not depend on how a line
# is terminated: a fixed-width slice would cut a character off the final rank
# whenever the last line of the file has no trailing newline.
with open('rankings.txt', 'r') as f:
    ranking_lines = [line.strip() for line in f]

rankings = {}
for line in ranking_lines:
    if not line:
        # Blank lines carry no pairs.
        continue
    pairs = line[1:-1].split('],[')   # drop the outer brackets, split the pairs
    pairs[0] = pairs[0][1:]           # leading '[' of the first pair
    pairs[-1] = pairs[-1][:-1]        # trailing ']' of the last pair
    for pair in pairs:
        fields = pair.split(',')
        rankings[fields[0]] = fields[1]

# `__getitem__` (rather than the dict itself) makes a discourse_id without a
# ranking raise KeyError instead of silently becoming NaN.
train_df['bwaf_rank'] = train_df.discourse_id.map(rankings.__getitem__)
train_df['bwaf_rank'] = train_df['bwaf_rank'].astype('float64')

In [None]:
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

from sklearn.preprocessing import LabelEncoder

# Raw inputs: the discourse text and the numeric rank feature.
texts = train_df["discourse_text"]
features = np.array(train_df["bwaf_rank"])

# Fit the vocabulary on the texts, then turn each text into a list of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(texts))
sequences = tokenizer.texts_to_sequences(texts)

# Bring every sequence to length `maxlen`.
X = pad_sequences(sequences, maxlen=maxlen)

# Integer-encode the effectiveness labels.
enc = LabelEncoder()
y = enc.fit_transform(train_df.discourse_effectiveness.values)

# Apply one random permutation to inputs, features and targets alike.
trn_idx = np.random.permutation(len(X))
X, features, y = X[trn_idx], features[trn_idx], y[trn_idx]

In [None]:
import gensim
import gensim.downloader

# Pretrained word vectors, fetched by name through gensim's downloader.
w2v_name = "word2vec-google-news-300"
model_w2v = gensim.downloader.load(w2v_name)

In [None]:
import torch

def initialize_embeddings(embeddings, tokenizer):
    """Build the embedding matrix for the tokenizer's vocabulary.

    Row ``i`` receives the pretrained vector of the word whose tokenizer index
    is ``i``. Rows of words that are missing from ``embeddings`` stay zero, as
    does row 0 (tokenizer indices are assumed to start at 1).

    Args:
        embeddings: Object supporting ``word in embeddings`` and
            ``embeddings[word]`` (e.g. gensim ``KeyedVectors`` or a dict).
        tokenizer: Fitted tokenizer exposing ``index_word``, a dict mapping
            integer index -> word.

    Returns:
        ``np.float32`` array of shape
        ``(len(tokenizer.index_word) + 1, embedding_size)``.
    """
    embedding_matrix = np.zeros((len(tokenizer.index_word) + 1, embedding_size), dtype=np.float32)

    # Iterate over (index, word) pairs. Iterating the dict directly yields only
    # its integer keys, so the lookup would be done with an index, not a word.
    for idx, word in tokenizer.index_word.items():
        if word in embeddings:
            embedding_matrix[idx, :] = embeddings[word]

    return embedding_matrix

# Build the pretrained weight matrix and move it to the GPU as a tensor.
embedding_matrix = torch.from_numpy(initialize_embeddings(model_w2v, tokenizer)).cuda()

In [None]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable

class BiLSTM(nn.Module):
    """Two-layer bidirectional LSTM text classifier with optional numeric features.

    Token indices are embedded with the pretrained ``embedding_matrix``, run
    through the LSTM, mean- and max-pooled over the sequence dimension and
    classified into 3 classes. When ``feats`` is true, the numeric features are
    concatenated to the pooled representation before the hidden layer.

    Args:
        input_size: Number of numeric feature columns per sample.
        feats: Whether ``forward`` uses the numeric features.
    """

    def __init__(self, input_size=8, feats=False):
        super(BiLSTM, self).__init__()

        # A call such as ``BiLSTM(use_feats)`` binds the boolean flag to
        # ``input_size``. A boolean is never a meaningful feature count, so it
        # is interpreted as the ``feats`` flag and the default width is kept.
        if isinstance(input_size, bool):
            feats = feats or input_size
            input_size = 8

        self.feats = feats

        # MLP for numerical features (``fc1`` is not used by ``forward``).
        self.input_size = input_size
        self.mlp_hidden_size = 64
        self.fc1 = torch.nn.Linear(self.input_size, self.mlp_hidden_size)
        self.relu = torch.nn.ReLU()

        # biLSTM for text
        self.hidden_size = 256
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix)
        # batch_first=True: inputs are (batch, seq, emb), and forward() pools
        # over dim 1, which must therefore be the sequence dimension.
        self.lstm = nn.LSTM(embedding_size, self.hidden_size, bidirectional=True,
                            num_layers=2, batch_first=True)

        # Mean-pool + max-pool of a bidirectional output -> 4 * hidden_size.
        pooled_size = 4 * self.hidden_size
        if feats:
            pooled_size += self.input_size
        # Always project to 64 units: that is what ``self.out`` consumes.
        self.linear = nn.Linear(pooled_size, 64)

        drp = 0.3
        self.dropout = nn.Dropout(drp)
        self.out = nn.Linear(64, 3)

        self.log_softmax = nn.LogSoftmax(dim=1)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, numeric_feats=None):
        """Classify a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices, shape ``(batch, seq_len)``.
            numeric_feats: Numeric features with ``input_size`` values per
                sample; any trailing layout (e.g. ``(batch, F)`` or
                ``(batch, 1, F)``) is flattened to ``(batch, F)``. Ignored
                unless the model was built with ``feats=True``.

        Returns:
            ``(log_probs, probs)``, each of shape ``(batch, 3)``.

        Raises:
            ValueError: If the model uses features and none are given.
        """
        embeddings = self.embedding(x)
        lstm_output, _ = self.lstm(embeddings)

        # Pool over the sequence dimension.
        avg_pool = torch.mean(lstm_output, 1)
        max_pool, _ = torch.max(lstm_output, 1)
        conc = torch.cat((avg_pool, max_pool), 1)

        if self.feats:
            if numeric_feats is None:
                raise ValueError("numeric_feats is required when the model is built with feats=True")
            # Flatten to (batch, F) so the concatenation along dim 1 is valid.
            numeric_feats = numeric_feats.reshape(numeric_feats.size(0), -1).to(conc.dtype)
            conc = torch.cat((conc, numeric_feats), 1)

        hidden = self.relu(self.linear(conc))
        hidden = self.dropout(hidden)

        logits = self.out(hidden)

        return self.log_softmax(logits), self.softmax(logits)

In [None]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    """Dataset wrapper whose items also carry their own position.

    Each item of the wrapped dataset is expected to be a ``(data, target)``
    pair; it is returned as ``(data, target, index)`` so that callers can look
    up per-sample side information by index.
    """

    def __init__(self,dataset):
        # The wrapped dataset; must support indexing and len().
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self,index):
        sample, label = self.dataset[index]
        return sample, label, index

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the discourse type of every row of `train_df`.
oh_enc = OneHotEncoder(handle_unknown='ignore')
oh_enc.fit(train_df.discourse_type.values.reshape(-1, 1))
discourse_type = oh_enc.transform(train_df.discourse_type.values.reshape(-1, 1)).toarray()

# `X`, `y` and `features` were shuffled with `trn_idx`, whereas `train_df` is
# still in its original order. Apply the same permutation so that each one-hot
# row is stacked next to the rank of the same sample.
discourse_type = discourse_type[trn_idx]

# Keep only the first column (the rank) of `features` before stacking, so that
# re-running this cell does not append the one-hot columns a second time.
features = np.hstack([features.reshape(len(features), -1)[:, :1], discourse_type])

In [None]:
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.model_selection import train_test_split
import copy

def do_train(epoch, train, train_loader, features_train, model, loss_fn, optimizer, pbar):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        epoch: Epoch number, only shown in the progress bar.
        train: Training dataset (unused; kept for signature compatibility).
        train_loader: Iterable of ``(x_batch, y_batch, index)`` batches, where
            ``index`` holds the positions of the samples in ``features_train``.
        features_train: Array of numeric features, one row (or one scalar) per
            training sample.
        model: Module called as ``model(x_batch, feats)`` and returning
            ``(log_probs, probs)``.
        loss_fn: Loss applied to ``(log_probs, y_batch)``.
        optimizer: Optimizer stepping the model parameters.
        pbar: Progress bar exposing ``set_postfix`` and ``update``.

    Returns:
        Mean batch loss of the epoch (``0.0`` if the loader yields no batch).
    """
    model.train()

    running_loss = 0.
    n_batches = 0
    for iteration, (x_batch, y_batch, index) in enumerate(train_loader):
        # Features go to the same device as the inputs and are laid out as
        # (batch, n_features): 1-D features become one column, 2-D features
        # keep one row per sample.
        feats = torch.as_tensor(features_train[index], dtype=torch.float32, device=x_batch.device)
        feats = feats.reshape(len(index), -1)

        # Predict/Forward Pass
        y_pred_log, _ = model(x_batch, feats)

        # Compute loss and update the parameters
        loss = loss_fn(y_pred_log, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        n_batches = iteration + 1
        pbar.set_postfix(epoch=epoch, loss = running_loss / n_batches)
        pbar.update()

    return running_loss / n_batches if n_batches else 0.0

def do_eval(epoch, val, val_loader, features_val, model, loss_fn, optimizer):
    """Predict class probabilities for every sample of ``val``.

    The model is switched to evaluation mode (so dropout is inactive) and run
    without gradient tracking; its previous train/eval mode is restored before
    returning.

    Args:
        epoch: Unused; kept for signature compatibility.
        val: Evaluated dataset; only its length is used.
        val_loader: Iterable of ``(x_batch, y_batch, index)`` batches, where
            ``index`` holds the positions of the samples in ``val`` and in
            ``features_val``.
        features_val: Array of numeric features, one row (or one scalar) per
            sample.
        model: Module called as ``model(x_batch, feats)`` and returning
            ``(log_probs, probs)``.
        loss_fn: Unused; kept for signature compatibility.
        optimizer: Unused; kept for signature compatibility.

    Returns:
        Array of shape ``(len(val), len(enc.classes_))`` whose row ``i`` holds
        the predicted probabilities of sample ``i``.
    """
    val_preds = np.zeros((len(val), len(enc.classes_)))

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x_batch, _, index in val_loader:
                # Same layout as in training: (batch, n_features) on the
                # device of the inputs.
                feats = torch.as_tensor(features_val[index], dtype=torch.float32, device=x_batch.device)
                feats = feats.reshape(len(index), -1)

                # Predict/Forward Pass
                _, y_pred = model(x_batch, feats)
                # Batches may come in any order; rows are placed by index.
                val_preds[index] = y_pred.cpu().numpy()
    finally:
        model.train(was_training)

    return val_preds

def do_fold(X, y, features):
    """Select hyper-parameters on a hold-out split of one training fold.

    Every ``(batch_size, learning_rate, use_feats)`` combination is trained
    with early stopping on the same stratified 80/20 split, and the
    combination with the lowest validation log loss wins.

    Args:
        X: Array of padded token-index sequences, one row per sample.
        y: Array of integer class labels.
        features: Array of numeric features, one row (or scalar) per sample.

    Returns:
        ``(best_param, best_epoch)``: the winning parameter tuple and the
        0-based epoch at which it reached its best validation log loss.
    """
    params = [
        (32, 0.0005, False), (32, 0.005, False), (64, 0.0005, False), (64, 0.005, False), (128, 0.0005, False), (128, 0.005, False),
        (32, 0.0005, True), (32, 0.005, True), (64, 0.0005, True), (64, 0.005, True), (128, 0.0005, True), (128, 0.005, True)]
    max_patience = 2  # epochs without improvement before training stops

    # One split shared by all configurations: drawing a new random split per
    # configuration would make their validation losses incomparable.
    train_idx, val_idx, _, _ = train_test_split(range(0, X.shape[0]), y, stratify=y, test_size=0.2)

    X_train = torch.tensor(X[train_idx], dtype=torch.long).cuda()
    y_train = torch.tensor(y[train_idx], dtype=torch.long).cuda()

    X_val = torch.tensor(X[val_idx], dtype=torch.long).cuda()
    y_val = torch.tensor(y[val_idx], dtype=torch.long).cuda()
    y_val_np = y_val.cpu().numpy()

    features_train = features[train_idx]
    features_val = features[val_idx]

    train = MyDataset(torch.utils.data.TensorDataset(X_train, y_train))
    val = MyDataset(torch.utils.data.TensorDataset(X_val, y_val))

    log_losses = []
    best_epochs = []
    for batch_size, lr, use_feats in params:
        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
        # Predictions are stored by sample index, so the order is irrelevant.
        val_loader = torch.utils.data.DataLoader(val, batch_size=batch_size, shuffle=False)

        # NOTE(review): `use_feats` is passed positionally, so it binds to the
        # constructor's first parameter — confirm the model receives the flag.
        model = BiLSTM(use_feats)
        model.cuda()

        loss_fn = nn.NLLLoss()
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

        best_model = model
        best_log_loss = np.inf
        best_epoch = -1
        patience = 0
        with tqdm(desc=f'Train params ({batch_size}, {lr}, {use_feats})', unit='iteration', total=len(train_loader) * n_epochs) as pbar:
            for epoch in range(0, n_epochs):
                model.train()
                do_train(epoch, train, train_loader, features_train, model, loss_fn, optimizer, pbar)

                # Validate with dropout disabled and without gradient tracking.
                model.eval()
                with torch.no_grad():
                    val_preds = do_eval(epoch, val, val_loader, features_val, model, loss_fn, optimizer)
                model.train()

                log_loss = metrics.log_loss(y_val_np, val_preds)
                if log_loss < best_log_loss:
                    best_log_loss = log_loss
                    best_model = copy.deepcopy(model)
                    best_epoch = epoch
                    patience = 0
                else:
                    patience += 1

                if patience >= max_patience:
                    break

        # Score the best checkpoint of this configuration on the validation set.
        best_model.eval()
        val_preds = np.zeros((len(val), len(enc.classes_)))
        with torch.no_grad(), tqdm(desc='Final evaluation', unit='iteration', total=len(val_loader)) as pbar:
            for x_batch, _, index in val_loader:
                # (batch, n_features) layout on the device of the inputs.
                feats = torch.as_tensor(features_val[index], dtype=torch.float32, device=x_batch.device)
                feats = feats.reshape(len(index), -1)

                # Predict/Forward Pass
                _, y_pred = best_model(x_batch, feats)
                pbar.update()
                val_preds[index] = y_pred.cpu().numpy()

        log_losses.append(metrics.log_loss(y_val_np, val_preds))
        best_epochs.append(best_epoch)

    min_idx = log_losses.index(min(log_losses))
    return params[min_idx], best_epochs[min_idx]

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Nested evaluation: for each outer fold, hyper-parameters are selected on the
# training part only (`do_fold`), the model is retrained on the whole training
# part for the selected number of epochs, and then scored on the test part.
splits = list(StratifiedKFold(n_splits=5, shuffle=True).split(X, y))
log_losses = []
params = []
best_epochs = []
for i, (train_idx, test_idx) in enumerate(splits):
    print(f'Fold {i}')
    X_train = X[train_idx.astype(int)]
    y_train = y[train_idx.astype(int)]

    X_test = X[test_idx.astype(int)]
    y_test = y[test_idx.astype(int)]

    features_train = features[train_idx.astype(int)]
    features_test = features[test_idx.astype(int)]

    param, best_epoch = do_fold(X_train, y_train, features_train)
    params.append(param)
    best_epochs.append(best_epoch)

    batch_size, lr, use_feats = param

    X_train = torch.tensor(X_train, dtype=torch.long).cuda()
    y_train = torch.tensor(y_train, dtype=torch.long).cuda()

    X_test = torch.tensor(X_test, dtype=torch.long).cuda()
    y_test = torch.tensor(y_test, dtype=torch.long).cuda()

    train = MyDataset(torch.utils.data.TensorDataset(X_train, y_train))
    test = MyDataset(torch.utils.data.TensorDataset(X_test, y_test))

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=True)

    # NOTE(review): `use_feats` is passed positionally, so it binds to the
    # constructor's first parameter — confirm the model receives the flag.
    model = BiLSTM(use_feats)
    model.cuda()

    loss_fn = nn.NLLLoss()
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

    # `best_epoch` is 0-based, hence `best_epoch + 1` training epochs.
    with tqdm(desc=f'Train with best params: ({batch_size}, {lr}, {use_feats}, {best_epoch})', unit='iteration', total=len(train_loader) * (best_epoch + 1)) as pbar:
        for epoch in range(0, best_epoch + 1):
            model.train()
            do_train(epoch, train, train_loader, features_train, model, loss_fn, optimizer, pbar)

    # Score with dropout disabled and without gradient tracking. `best_epoch`
    # is passed instead of the loop variable, which is undefined (or stale from
    # the previous fold) when the training loop above does not run.
    model.eval()
    with torch.no_grad():
        test_preds = do_eval(best_epoch, test, test_loader, features_test, model, loss_fn, optimizer)

    log_loss = metrics.log_loss(y_test.cpu().numpy(), test_preds)
    print(f'Log loss at fold {i}: {log_loss}')
    log_losses.append(log_loss)


In [None]:
# Per-fold results: selected parameters, selected epochs, test log losses.
for fold_summary in (params, best_epochs, log_losses):
    print(fold_summary)