In [1]:
import torch
import os
import csv
import random
import numpy as np
import pandas as pd
import torch.nn.functional as F
import time
import copy
import re
import string

from torch.nn.utils.rnn import pad_sequence
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, ParameterGrid

In [2]:
SEED = 5566
MAX_POSITIONS_LEN = 500
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
random.seed(SEED)
np.random.seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
def parsing_text(text):
    #punctuation_remove = string.punctuation.replace("!", "")
    # text = re.sub(r'([a-z])\1\1\1+', r'\1\1\1', text)
    # text = re.sub(r'http : bit . ly', '', text)
    # text = re.sub(r'http : tinyurl . com', '', text)
    # text = re.sub(r'http : cli . gs', '', text)
    # text = re.sub(r'http : www', '', text)
    # text = "".join([i for i in text if i not in punctuation_remove]).strip()
    # text = " ".join(text.split()) #remove abundant white space
    return text


def load_train_label(path='train_label.csv'):
    tra_lb_pd = pd.read_csv(path)
    label = torch.FloatTensor(tra_lb_pd['label'].values)
    idx = tra_lb_pd['id'].tolist()
    text = [parsing_text(s).split(' ') for s in tra_lb_pd['text'].tolist()]
    return idx, text, label

def load_test(path='test.csv'):
    tst_pd = pd.read_csv(path)
    idx = tst_pd['id'].tolist()
    text = [parsing_text(s).split(' ') for s in tst_pd['text'].tolist()]
    return idx, text

In [4]:
class Preprocessor:
    def __init__(self, sentences, w2v_config):
        self.sentences = sentences
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []
        self.w2v_model = None
        self.build_word2vec(sentences, **w2v_config)
        
        
    def build_word2vec(self, x, path, dim, window, min_count, iter, sg, hs, negative):
        if os.path.isfile(path):
            print("loading word2vec model ...")
            w2v_model = Word2Vec.load(path)
        else:
            print("training word2vec model ...")
            w2v_model = Word2Vec(
                x, size=dim, window=window, min_count=min_count, 
                workers=12, iter=iter, sg=sg, hs = hs,
                negative = negative)
        self.w2v_model = w2v_model
            
        self.embedding_dim = w2v_model.vector_size
        for i, word in enumerate(w2v_model.wv.vocab):
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(w2v_model.wv[word])
        
        self.embedding_matrix = torch.tensor(np.array(self.embedding_matrix))
        self.add_embedding('<PAD>')
        self.add_embedding('<UNK>')
        print("total words: {}".format(len(self.embedding_matrix)))
        
    def add_embedding(self, word):
        # 把 word 加進 embedding，並賦予他一個隨機生成的 representation vector
        # word 只會是 "<PAD>" 或 "<UNK>"
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)   
        
    def sentence2idx(self, sentence):
        sentence_idx = []
        for word in sentence:
            if word in self.word2idx.keys():
                sentence_idx.append(self.word2idx[word])
            else:
                sentence_idx.append(self.word2idx["<UNK>"])
        return torch.LongTensor(np.array(sentence_idx))

In [5]:
class TwitterDataset(torch.utils.data.Dataset):
    def __init__(self, id_list, sentences, labels, preprocessor):
        self.id_list = id_list
        self.sentences = sentences
        self.labels = labels
        self.preprocessor = preprocessor
    
    def __getitem__(self, idx):
        if self.labels is None: 
            return self.id_list[idx], self.preprocessor.sentence2idx(self.sentences[idx])
        return self.id_list[idx], self.preprocessor.sentence2idx(self.sentences[idx]), self.labels[idx]
    
    def __len__(self):
        return len(self.sentences)
    
    def collate_fn(self, data):
        id_list = torch.LongTensor([d[0] for d in data])
        #lengths: sentence lengths of each sentence
        lengths = torch.LongTensor([len(d[1]) for d in data])
        #pad_sequence to maximum length in each batch
        texts = pad_sequence(
            [d[1] for d in data], 
            batch_first=True).contiguous()

        if self.labels is None: 
            return id_list, lengths, texts
        
        labels = torch.FloatTensor([d[2] for d in data])
        return id_list, lengths, texts, labels

In [6]:
class LSTM_Backbone(torch.nn.Module):
    def __init__(self, embedding, hidden_dim, num_layers, bidirectional, fix_embedding=True):
        super(LSTM_Backbone, self).__init__()
        self.embedding = torch.nn.Embedding(embedding.size(0),embedding.size(1))
        self.embedding.weight = torch.nn.Parameter(embedding)
        self.embedding.weight.requires_grad = False if fix_embedding else True
        
        self.lstm = torch.nn.LSTM(embedding.size(1), hidden_dim, num_layers=num_layers, \
                                  bidirectional=bidirectional, batch_first=True)

    def forward(self, inputs):
        inputs = self.embedding(inputs)
        #output: output_state, (hidden_state, cell_state)
        #output_state shape: (N,L,D∗H​)
        output, (hidden, cell) = self.lstm(inputs)
        #concat two hidden state of the bi-LSTM in the last layer
        a,b,c,d = hidden[0], hidden[1], hidden[2], hidden[3]
        out = torch.cat((c,d),1) #size = (N, hidden_size * 2)
        return out
    
class Header(torch.nn.Module):
    def __init__(self, dropout, hidden_dim):
        super(Header, self).__init__()
        self.fc1 = torch.nn.Sequential(torch.nn.Dropout(dropout),
                                        torch.nn.Linear(hidden_dim, hidden_dim),
                                        torch.nn.LeakyReLU(negative_slope=0.05),
                                        torch.nn.BatchNorm1d(hidden_dim),
                                        torch.nn.Dropout(dropout))

        self.fc2 = torch.nn.Sequential(torch.nn.Linear(hidden_dim, 1),
                                         torch.nn.Sigmoid())
    
    def forward(self, inputs, lengths):
        out = self.fc1(inputs)
        out = self.fc2(out).squeeze()
        return out

In [7]:
def train(train_loader, backbone, header, optimizer, criterion, device, epoch):

    total_loss = []
    total_acc = []
    for i, (idx_list, lengths, texts, labels) in enumerate(train_loader):
        lengths, inputs, labels = lengths.to(device), texts.to(device), labels.to(device)
        
        optimizer.zero_grad()
        if not backbone is None:
            #output shape: (batch_size, max_sequence_length, hidden layer size)
            inputs = backbone(inputs)
        soft_predicted = header(inputs, lengths)
        loss = criterion(soft_predicted, labels)
        total_loss.append(loss.item())

        loss.backward()
        optimizer.step()
        
        with torch.no_grad():
            hard_predicted = (soft_predicted >= 0.5).int()
            correct = sum(hard_predicted == labels).item()
            acc = correct * 100 / len(labels)
            total_acc.append(acc)

    return np.mean(total_loss), np.mean(total_acc)

def valid(valid_loader, backbone, header, criterion, device, epoch):
    backbone.eval()
    header.eval()
    with torch.no_grad():
        total_loss = []
        total_acc = []
        
        for i, (idx_list, lengths, texts, labels) in enumerate(valid_loader):
            lengths, inputs, labels = lengths.to(device), texts.to(device), labels.to(device)

            if not backbone is None:
                inputs = backbone(inputs)
            soft_predicted = header(inputs, lengths)
            loss = criterion(soft_predicted, labels)
            total_loss.append(loss.item())
            
            hard_predicted = (soft_predicted >= 0.5).int()
            correct = sum(hard_predicted == labels).item()
            acc = correct * 100 / len(labels)
            total_acc.append(acc)
            
            #print('[Validation in epoch {:}] loss:{:.3f} acc:{:.3f}'.format(epoch+1, np.mean(total_loss), np.mean(total_acc)), end='\r')
    backbone.train()
    header.train()
    return np.mean(total_loss), np.mean(total_acc)

            
def run_training(train_loader, valid_loader, backbone, header, epoch_num, lr, patience, device, model_dir): 

    def no_improve(best_acc, best_loss, valid_loss, valid_acc):
        return (best_acc > valid_acc) and (best_loss < valid_loss)
    
    best_acc = 0
    best_loss = np.inf
    best_epoch = 0
    cnt_no_improve = 0
    change_embedding = False
    patience = patience

    best_backbone = None
    best_header = None

    if backbone is None:
        trainable_paras = header.parameters()
    else:
        trainable_paras = list(backbone.parameters()) + list(header.parameters())
        
    optimizer = torch.optim.Adam(trainable_paras, lr=lr)
    
    backbone.train()
    header.train()
    backbone = backbone.to(device)
    header = header.to(device)
    criterion = torch.nn.BCELoss()

    start = time.time()
    for epoch in range(epoch_num):
        train_loss, train_acc = train(train_loader, backbone, header, optimizer, criterion, device, epoch)
        print('Epoch {:} [Train] loss:{:.3f} acc:{:.3f} /'.\
            format(epoch+1, train_loss, train_acc), end = " ")
        valid_loss, valid_acc = valid(valid_loader, backbone, header, criterion, device, epoch)
        end = time.time()
        elapsed_minutes = (end - start) / 60
        print('[Valid] loss:{:.3f} acc:{:.3f}, {:.2f} minutes elapsed'.\
            format(valid_loss, valid_acc, elapsed_minutes))
        
        if no_improve(best_acc, best_loss, valid_loss, valid_acc):
            cnt_no_improve += 1
        else:
            best_acc = valid_acc
            best_loss = valid_loss
            best_epoch = epoch
            cnt_no_improve = 0
            best_backbone = copy.deepcopy(backbone)
            best_header = copy.deepcopy(header)

        if cnt_no_improve == patience:
            if change_embedding is False:
                print("Embedding now become tunable")
                #change embedding one time
                backbone.embedding.weight.requires_grad = True
                change_embedding = True
                cnt_no_improve = 0
                patience = 3
            else:
                print("Model result: Epoch {:}, [Valid] loss:{:.3f} acc:{:.3f}".\
                    format(best_epoch+1, best_loss, best_acc))
                return best_backbone, best_header, best_acc, best_loss
    return best_backbone, best_header, best_acc, best_loss
    
def run_testing(test_loader, backbone, header, device, output_path):
    with open(output_path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'label'])
        backbone.eval()
        header.eval()
        with torch.no_grad():
            for i, (idx_list, lengths, texts) in enumerate(test_loader):
                lengths, inputs = lengths.to(device), texts.to(device)
                if not backbone is None:
                    inputs = backbone(inputs)
                soft_predicted = header(inputs, lengths)
                hard_predicted = (soft_predicted >= 0.5).int()
                for i, p in zip(idx_list, hard_predicted):
                    writer.writerow([str(i.item()), str(p.item())])

# Inference

In [8]:
MODEL_DIR = 'best_RNN_model.pth'
W2V_DIR = 'best_w2v_model.model'
BATCH_SIZE = 128

checkpoint = torch.load(MODEL_DIR)
best_backbone = checkpoint["backbone"]
best_header = checkpoint["header"]
best_backbone = best_backbone.to(device)
best_header = best_header.to(device)

#prepare dataset
test_idx, test_text = load_test('test.csv')

param_grid = {
    'dim': 256, 'hs': 0, 
    'iter': 2, 'min_count': 2, 
    'negative': 5, 'path': W2V_DIR, 
    'sg': 1, 'window': 5}

preprocessor = Preprocessor(None, param_grid)
test_dataset = TwitterDataset(test_idx, test_text, None, preprocessor)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                            batch_size = BATCH_SIZE,
                                            shuffle = False,
                                            collate_fn = test_dataset.collate_fn,
                                            num_workers = 2)
print("Predicting testing dataset...")
run_testing(test_loader, best_backbone, best_header, device, 'submission.csv')

loading word2vec model ...
total words: 30885
Predicting testing dataset...


# Training Record

In [22]:
w2v_param_grid = {
    'dim': [256], 'hs': [0], 
    'iter': [2], 'min_count': [5], 
    'negative': [5], 'path': ["-"], 
    'sg': [1], 'window': [5]}

lstm_config = {
    'hidden_dim': 256, 'num_layers': 2, 
    'bidirectional': True, 'fix_embedding': True}

header_config = {
    'dropout': 0.5, 
    'hidden_dim': 512}

BATCH_SIZE = 128
MAX_POSITIONS_LEN = 500
MODEL_DIR = "best_RNN_model.pth"
W2V_DIR = "best_w2v_model.model"
EPOCH_NUM = 100

lr = 0.001
patience = 1

global_best_backbone = None
global_best_header = None
global_best_acc = 0
global_best_loss = np.inf
global_best_preprocessor = None
global_best_params = None

for param_grid in ParameterGrid(w2v_param_grid):
    print(param_grid)
    train_idx, train_label_text, label = load_train_label('train_label.csv')
    test_idx, test_text = load_test('test.csv')
    preprocessor = Preprocessor(train_label_text, param_grid)
    train_idx, valid_idx, train_label_text, valid_label_text, train_label, valid_label = train_test_split(train_idx, train_label_text, label, test_size=0.12)
    train_dataset, valid_dataset = TwitterDataset(train_idx, train_label_text, train_label, preprocessor), TwitterDataset(valid_idx, valid_label_text, valid_label, preprocessor)
    test_dataset = TwitterDataset(test_idx, test_text, None, preprocessor)

    train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                                batch_size = BATCH_SIZE,
                                                shuffle = True,
                                                collate_fn = train_dataset.collate_fn,
                                                num_workers = 2)
    valid_loader = torch.utils.data.DataLoader(dataset = valid_dataset,
                                                batch_size = BATCH_SIZE,
                                                shuffle = False,
                                                collate_fn = valid_dataset.collate_fn,
                                                num_workers = 2)
    test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                                batch_size = BATCH_SIZE,
                                                shuffle = False,
                                                collate_fn = test_dataset.collate_fn,
                                               num_workers = 2)

    print("Building LSTM model...")
    backbone = LSTM_Backbone(preprocessor.embedding_matrix, **lstm_config)
    header = Header(**header_config)
    local_best_backbone, local_best_header, local_best_acc, local_best_loss = \
        run_training(
        train_loader, valid_loader, backbone, header, 
        EPOCH_NUM, lr, patience, device, MODEL_DIR)
    if local_best_acc > global_best_acc and local_best_loss < global_best_loss:
        global_best_backbone = local_best_backbone
        global_best_header = local_best_header
        global_best_preprocessor = preprocessor
        global_best_params = param_grid
        
        #update global loss and acc
        global_best_acc = local_best_acc
        global_best_loss = local_best_loss 


print("Tuning result:")
print(f"Best param sets: {global_best_params}")
print(f"Best loss {global_best_loss}, best acc {global_best_acc}")
torch.save(
    {'backbone': global_best_backbone, 'header': global_best_header}, 
    MODEL_DIR)
global_best_preprocessor.w2v_model.save(W2V_DIR)

{'dim': 256, 'hs': 0, 'iter': 2, 'min_count': 5, 'negative': 5, 'path': '-', 'sg': 1, 'window': 5}
training word2vec model ...
total words: 14826
Building LSTM model...
Epoch 1 [Train] loss:0.521 acc:74.027 / [Valid] loss:0.483 acc:76.717, 0.38 minutes elapsed
Epoch 2 [Train] loss:0.473 acc:77.456 / [Valid] loss:0.466 acc:77.562, 0.77 minutes elapsed
Epoch 3 [Train] loss:0.454 acc:78.582 / [Valid] loss:0.448 acc:78.760, 1.17 minutes elapsed
Epoch 4 [Train] loss:0.442 acc:79.258 / [Valid] loss:0.437 acc:79.588, 1.57 minutes elapsed
Epoch 5 [Train] loss:0.428 acc:79.948 / [Valid] loss:0.433 acc:79.891, 1.97 minutes elapsed
Epoch 6 [Train] loss:0.415 acc:80.770 / [Valid] loss:0.435 acc:79.741, 2.40 minutes elapsed
Embedding now become non-tunable
Epoch 7 [Train] loss:0.395 acc:82.137 / [Valid] loss:0.404 acc:81.613, 2.84 minutes elapsed
Epoch 8 [Train] loss:0.319 acc:86.150 / [Valid] loss:0.443 acc:81.030, 3.27 minutes elapsed
Epoch 9 [Train] loss:0.237 acc:90.103 / [Valid] loss:0.514 acc