In [1]:
import os
import re
from os import path
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from tensorflow import keras
import sys

sys.path.extend(['../input/smartevent/'])
from tokenization import FullTokenizer
import random
import torch
import torch.nn as nn

In [2]:
def seed_torch(seed=1029):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    auto-tuner, so a benchmark flag enabled earlier in the session cannot
    reintroduce run-to-run variation.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # NOTE: the running interpreter fixed its hash seed at start-up; this
    # assignment is only seen by interpreters launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not just the current one (ignored without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [3]:
class BertTokenizer():
    """Convert a piece of text into a fixed-length array of vocabulary ids.

    The wrapped ``tokenizer`` object must provide ``tokenize_chinese(text)``
    and ``convert_tokens_to_ids(tokens)``.

    Attributes:
        tokenizer: The wrapped tokenizer object.
        maxLength: Length every id sequence is truncated or padded to.
        labels: Stored as given; not read by any method of this class.
    """

    # Matches any run of consecutive whitespace characters.
    _WHITESPACE_RUN = re.compile(r"\s+")
    # Id used to fill sequences that are shorter than ``maxLength``.
    _PAD_ID = 0

    def __init__(self, tokenizer, max_length=30, labels=None):
        self.labels = labels
        self.maxLength = max_length
        self.tokenizer = tokenizer

    def clean_whitespace(self, token):
        """Collapse each whitespace run to one space and trim both ends."""
        collapsed = self._WHITESPACE_RUN.sub(" ", token)
        return collapsed.strip()

    def to_lowercase(self, text):
        """Return ``text`` lower-cased.

        Text mixing Chinese and English has to be lower-cased before it is
        tokenized.
        """
        return text.lower()

    def get_id(self, token):
        """Map a token list to exactly ``maxLength`` ids.

        Sequences with at least ``maxLength`` tokens are cut off; shorter ones
        are filled up with the pad id. The amount of padding is derived from
        the number of tokens.
        """
        limit = self.maxLength
        ids = self.tokenizer.convert_tokens_to_ids(token)
        if len(token) >= limit:
            return ids[:limit]
        missing = limit - len(token)
        return ids + [self._PAD_ID] * missing

    def fit(self, text):
        """Encode one text as a NumPy array of ``maxLength`` ids.

        The text is lower-cased, prefixed with the literal ``[CLS]`` marker,
        whitespace-normalised, tokenized and finally converted to ids.
        """
        lowered = self.to_lowercase(text)
        normalized = self.clean_whitespace('[CLS]' + lowered)
        pieces = self.tokenizer.tokenize_chinese(normalized)
        return np.array(self.get_id(pieces))

In [4]:
class CommentsDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes two columns of a DataFrame on access.

    Args:
        df: DataFrame holding the rows to serve.
        feature_col: Name of the column encoded as the model input.
        label_col: Name of the column encoded as the target.
        tokenizer: Object whose ``fit(value)`` returns an array of ids.
        device: Device every returned tensor is moved to.
    """

    def __init__(self, df, feature_col, label_col, tokenizer, device=None):
        self.df = df
        self.feature_col = feature_col
        self.label_col = label_col
        self.tokenizer = tokenizer
        self.device = device

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(feature, label)`` long tensors for the row at position ``idx``."""
        row = self.df.iloc[idx]
        # Feature first, then label; both go through the same tokenizer.
        feature, label = (
            torch.tensor(self.tokenizer.fit(row[column]), dtype=torch.long).to(self.device)
            for column in (self.feature_col, self.label_col)
        )
        return feature, label

In [5]:
# Path of the vocabulary file handed to FullTokenizer (a file, despite the _DIR suffix).
VOCAB_DIR = '/kaggle/input/smartevent/vocab.txt'

def convert_data(df):
    """Yield, for every row of ``df``, a list with the values of its first seven columns."""
    leading_columns = df.columns[:7].tolist()
    for _, record in df.iterrows():
        yield [record[name] for name in leading_columns]

def _load_dataframe(file_dir):
    """Read every ``.csv`` file in ``file_dir`` into one shuffled DataFrame.

    Args:
        file_dir: Directory that is scanned (non-recursively) for CSV files.

    Returns:
        The concatenated rows of all CSV files, shuffled, with a fresh
        0..n-1 index.

    Raises:
        ValueError: If ``file_dir`` contains no ``.csv`` file.
    """
    # os.listdir gives no ordering guarantee; sort so the concatenation order
    # does not depend on the file system.
    files = sorted(path.join(file_dir, f) for f in os.listdir(file_dir) if f.endswith('.csv'))
    if not files:
        raise ValueError('No .csv files found in {}'.format(file_dir))
    # Pass the path rather than an open handle: pandas then opens the file
    # with the requested encoding and closes it again. The previous
    # open(f, 'rU') leaked the handle, ignored the encoding argument and is
    # rejected outright by Python 3.11+, where the 'U' mode was removed.
    frames = [pd.read_csv(f, encoding='utf-8', engine='c') for f in files]
    df = pd.concat(frames, ignore_index=True)
    shuffled_df = shuffle(df).reset_index(drop=True)
    return shuffled_df

def load_comments(train_dir='/kaggle/input/smartevent', max_length=50, train_fraction=0.8):
    """Load the comment CSVs and split them into training and validation sets.

    Both datasets use the ``text`` column as input and as target.

    Args:
        train_dir: Directory containing the CSV files.
        max_length: Number of token ids every text is truncated/padded to.
        train_fraction: Share of the rows that goes into the training split.

    Returns:
        A tuple ``(h_train, h_val, train_dataset, val_dataset)``: the
        per-row lists produced by ``convert_data`` for each split, followed
        by the matching ``CommentsDataset`` objects.
    """
    df = _load_dataframe(train_dir)
    df['text'] = df['text'].astype('str')
    print('text length: {}'.format(len(df['text'])))
    split_length = int(np.ceil(len(df) * train_fraction))
    h_data = list(convert_data(df))
    h_train = h_data[:split_length]
    h_val = h_data[split_length:]
    bertTokenizer = BertTokenizer(tokenizer=FullTokenizer(VOCAB_DIR), max_length=max_length)
    train_dataset = CommentsDataset(df.iloc[:split_length], 'text', 'text', bertTokenizer, device)
    val_dataset = CommentsDataset(df.iloc[split_length:], 'text', 'text', bertTokenizer, device)
    # Return the training slice; the full h_data used to be returned here even
    # though the caller unpacks this position as h_train.
    return h_train, h_val, train_dataset, val_dataset

In [6]:
class Encoder(nn.Module):
    """Embedding + multi-layer LSTM that encodes a batch of id sequences.

    Args:
        input_dim: Number of rows of the embedding table.
        emb_dim: Size of each embedding vector.
        hid_dim: Hidden size of the LSTM.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` of shape [batch size, src len].

        Returns:
            ``(outputs, hidden, cell)`` where
            outputs = [batch size, src len, hid dim * n directions] (taken
            from the top LSTM layer) and hidden / cell =
            [n layers * n directions, batch size, hid dim].
        """
        # [batch size, src len] -> [batch size, src len, emb dim]
        token_vectors = self.embedding(src)
        token_vectors = self.dropout(token_vectors)
        outputs, state = self.lstm(token_vectors)
        hidden, cell = state
        return outputs, hidden, cell
    

class Decoder(nn.Module):
    """Single-step LSTM decoder producing a score for every output token.

    Args:
        output_dim: Number of output tokens (also the embedding table size).
        emb_dim: Size of each embedding vector.
        hid_dim: Hidden size of the LSTM.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inp, hidden, cell):
        """Run one decoding step.

        Args:
            inp: Token ids of the current step, shape [batch size].
            hidden: LSTM hidden state, [n layers * n directions, batch size, hid dim].
            cell: LSTM cell state, same shape as ``hidden``.

        Returns:
            ``(prediction, hidden, cell)`` with prediction =
            [batch size, output dim] and the updated LSTM state.
        """
        # Add a length-1 sequence axis: [batch size] -> [1, batch size]
        step_ids = inp.unsqueeze(0)
        # [1, batch size] -> [1, batch size, emb dim]
        step_vectors = self.dropout(self.embedding(step_ids))
        output, state = self.rnn(step_vectors, (hidden, cell))
        hidden, cell = state
        # Drop the sequence axis again before projecting onto the vocabulary.
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell


class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper with optional teacher forcing.

    Args:
        encoder: Module returning ``(outputs, hidden, cell)`` for a source
            batch and exposing ``hid_dim`` and ``n_layers``.
        decoder: Optional module performing one decoding step and exposing
            ``hid_dim``, ``n_layers`` and ``output_dim``. When omitted the
            model only runs the encoder.
        device: Device on which the output buffer is allocated. When
            ``None`` the device of the target batch is used.
    """

    def __init__(self, encoder, decoder=None, device=None):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # The compatibility checks only apply when a decoder is attached;
        # running them unconditionally made the ``decoder=None`` default fail.
        if decoder is not None:
            assert encoder.hid_dim == decoder.hid_dim, \
                "Hidden dimensions of encoder and decoder must be equal!"
            assert encoder.n_layers == decoder.n_layers, \
                "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Encode ``src`` and decode one step per target position.

        Args:
            src: Source batch passed to the encoder.
            trg: Target ids, shape [batch size, trg len]; column 0 is the
                first decoder input.
            teacher_forcing_ratio: Probability, drawn once per step, of
                feeding the ground-truth token instead of the model's own
                prediction into the next step.

        Returns:
            Without a decoder: the encoder outputs. Otherwise a tensor of
            shape [trg len, batch size, output dim] (time-major) holding the
            decoder scores of every step.
        """
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        result, hidden, cell = self.encoder(src)
        # Check before touching any decoder attribute so encoder-only use works.
        if self.decoder is None:
            return result

        trg_vocab_size = self.decoder.output_dim
        batch_size = trg.size(0)
        target_length = trg.size(1)
        out_device = self.device if self.device is not None else trg.device
        #tensor to store decoder outputs
        outputs = torch.zeros(target_length, batch_size, trg_vocab_size, device=out_device)

        #first input to the decoder is the [CLS] tokens
        inp = trg[:, 0]

        for t in range(target_length):
            output, hidden, cell = self.decoder(inp, hidden, cell)
            outputs[t] = output
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio

            #get the highest predicted token from our predictions
            top1 = output.argmax(1)
            # Next input is the target at position t (so trg[:, 0] is fed twice
            # under teacher forcing) or the model's own best guess.
            inp = trg[:, t] if teacher_force else top1

        return outputs

In [7]:
class Trainer():
    """Minimal training loop for a model called as ``model(x_batch, y_batch)``.

    The model is expected to return time-major scores of shape
    [seq len, batch size, n classes], which is what ``Seq2Seq.forward`` in
    this notebook produces, while the targets arrive batch-major as
    [batch size, seq len].

    Args:
        batch_size: Batch size of the data loaders built in ``fit``.
        model: Module to train; it is moved to ``device``.
        optimizer: Optimizer already bound to the model's parameters.
        epochs: Number of passes over the training data.
        device: Device the model is moved to.
    """

    def __init__(self, batch_size, model, optimizer, epochs, device):
        self.batch_size = batch_size
        self.epochs = epochs
        self.device = device
        self.model = model.to(device)
        self.optimizer = optimizer
        self.loss_fn = nn.CrossEntropyLoss()

    def _save_model(self, epoch):
        """Write the model's state dict to a file named after the epoch number."""
        model_path = str(epoch)
        torch.save(self.model.state_dict(), model_path)

    def _batch_loss(self, x_batch, y_batch):
        """Run the model on one batch and return the cross-entropy loss."""
        y_pred = self.model(x_batch, y_batch)
        pred_dim = y_pred.shape[-1]
        output = y_pred.reshape(-1, pred_dim)
        # Predictions flatten in (step, sample) order, so the batch-major
        # targets must be transposed first; flattening them as-is paired
        # each prediction with the wrong token.
        target = y_batch.transpose(0, 1).reshape(-1)
        return self.loss_fn(output, target)

    def train_epoch(self, train_loader):
        """Train for one pass over ``train_loader`` and return the mean batch loss."""
        self.model.train()
        avg_loss = 0.
        for x_batch, y_batch in train_loader:
            loss = self._batch_loss(x_batch, y_batch)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            avg_loss += loss.item() / len(train_loader)
        return avg_loss

    def train_val(self, val_loader):
        """Return the mean batch loss over ``val_loader`` without updating weights."""
        # Evaluation mode switches dropout off; no_grad avoids building
        # autograd graphs that are never used. train_epoch switches back to
        # training mode at its start.
        self.model.eval()
        avg_loss = 0.
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                avg_loss += self._batch_loss(x_batch, y_batch).item() / len(val_loader)
        return avg_loss

    def fit(self, train_dataset, valid_dataset):
        """Train for ``epochs`` epochs, reporting losses and saving the model after each."""
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=self.batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(
            valid_dataset, batch_size=self.batch_size, shuffle=False)
        for epoch in range(self.epochs):
            print('training on epoch: {}'.format(epoch))
            avg_train_loss = self.train_epoch(train_loader)
            avg_val_loss = self.train_val(valid_loader)
            print('Epoch {} train loss: {}, val loss: {}'.format(
                epoch, avg_train_loss, avg_val_loss))
            self._save_model(epoch)

In [8]:
def main():
    """Build the seq2seq model and train it on the comment data."""
    vocab_size = 21128
    encoder_config = dict(
        input_dim=vocab_size, emb_dim=256, hid_dim=512, n_layers=2, dropout=0.5)
    decoder_config = dict(
        output_dim=vocab_size, emb_dim=256, hid_dim=512, n_layers=2, dropout=0.5)

    # Only the two datasets are needed here; the row lists are discarded.
    _, _, train_dataset, valid_dataset = load_comments()

    # Encoder first, then decoder, so parameter initialisation order is fixed.
    enc = Encoder(**encoder_config)
    dec = Decoder(**decoder_config)
    model = Seq2Seq(enc, dec, device)

    trainer = Trainer(
        batch_size=128,
        model=model,
        optimizer=torch.optim.Adam(model.parameters()),
        epochs=30,
        device=device)
    trainer.fit(train_dataset, valid_dataset)

In [10]:
# Seed the random number generators before running the training pipeline.
seed_torch(1029)
main()

  # This is added back by InteractiveShellApp.init_path()


text length: 1655429
training on epoch: 0


KeyboardInterrupt: 