In [2]:
cfg = {
    'dev_train_len': 5*10**3,
    'dev_validation_len': 1*10**3,
    'learning_rate': 0.001,
    'epochs': 100,
    'embedding_dim': 16,
    'batch_size': 32,
    'dropout': 0.1,
    'optimizer': 'Adam',
    'num_layers': 2
}

cfg['hidden_dim'] = cfg['embedding_dim']


In [15]:
DEV_TRAIN_LEN = cfg['dev_train_len']
DEV_VALIDATION_LEN = cfg['dev_validation_len']
LEARNING_RATE = cfg['learning_rate']
EPOCHS = cfg['epochs']
EMBEDDING_DIM = cfg['embedding_dim']
BATCH_SIZE = cfg['batch_size']
DROPOUT = cfg['dropout']
OPTIMIZER = cfg['optimizer']
NUM_LAYERS = cfg['num_layers']
HIDDEN_DIM = cfg['hidden_dim']

DIR = '/scratch/shu7bh/RES/PRE'

In [6]:
import os
if not os.path.exists(DIR):
    os.makedirs(DIR)

In [4]:
import torch
import os

if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" # ":4096:2"
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
else:
    DEVICE = torch.device('cpu')
print(DEVICE)

cuda


In [7]:
import unicodedata
import random
import re
import pandas as pd

def normalize_unicode(text: str) -> str:
    return unicodedata.normalize('NFD', text)

unique_chars = set()

def normalize_string(text: str) -> str:
    text = normalize_unicode(text.lower().strip())
    text = re.sub(r"([.!?])", r" \1", text)
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)
    for char in text:
        unique_chars.add(char)
    return text

def read_data(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    df = df.sample(frac=1, random_state=0).reset_index(drop=True)
    df['Description'] = df['Description'].apply(normalize_string)
    return df

In [10]:
df = read_data('data/train.csv')
df

Unnamed: 0,Class Index,Description
0,4,london british airline magnate richard branson...
1,4,leave it to amazon .com nasdaq amzn . apparent...
2,4,regardless space competitions are poised to be...
3,1,cbs millions of folded paper cranes fluttered...
4,3,struggling under the weight of a bloated trade...
...,...,...
119995,4,com september am pt . there s no doubt the int...
119996,3,new york reuters fedex corp . lt a href http w...
119997,1,pakistani security forces have arrested more s...
119998,4,palmsource finally unveiled its new os version...


In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CharCNN(nn.Module):
    def __init__(
            self, 
            char_vocab: int,
            char_embed_dim: int, 
            char_out_channels: list, 
            char_kernel_sizes: list, 
            dropout: float
        ) -> None:

        super(CharCNN, self).__init__()
        self.char_embed = nn.Embedding(char_vocab, char_embed_dim)
        self.char_conv = nn.ModuleList([
            nn.Conv1d(char_embed_dim, char_out_channels[i], char_kernel_sizes[i]) 
            for i in range(len(char_out_channels))
        ])

        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.char_embed(x)
        x = x.transpose(1, 2)
        x = [F.relu(char_conv(x)) for char_conv in self.char_conv]
        x = [F.max_pool1d(char_conv, char_conv.shape[2]).squeeze(2) for char_conv in x]
        x = torch.cat(x, dim=1)
        x = self.dropout(x)
        return x

In [14]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class ELMo(nn.Module):
    def __init__(
            self, 
            char_vocab: int, 
            char_embed_dim: int, 
            char_out_channels: list, 
            char_kernel_sizes: list, 
            dropout: float, 
            num_layers: int, 
            hidden_dim: int, 
            filename: str = None
        ) -> None:

        super(ELMo, self).__init__()

        self.char_cnn = CharCNN(
            char_vocab=char_vocab, 
            char_embed_dim=char_embed_dim, 
            char_out_channels=char_out_channels, 
            char_kernel_sizes=char_kernel_sizes, 
            dropout=dropout
        )

        self.lstm = nn.LSTM(
            input_size=sum(char_out_channels), 
            hidden_size=hidden_dim, 
            num_layers=num_layers, 
            bidirectional=True, 
            dropout=dropout
        )

        self.dropout = nn.Dropout(dropout)
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        if filename:
            self.load_state_dict(torch.load(filename))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.char_cnn(x)
        x = x.view(x.shape[1], x.shape[0], x.shape[2])
        x = pack_padded_sequence(x, lengths=[x.shape[0]]*x.shape[1])
        x, _ = self.lstm(x)
        x, _ = pad_packed_sequence(x)
        x = x.view(x.shape[1], x.shape[0], 2, self.hidden_dim)
        x = x[:, :, 0, :] + x[:, :, 1, :]
        x = self.dropout(x)
        return x

In [72]:
class LM(nn.Module):
    def __init__(self, hidden_dim: int, vocab_size: int, filename: str) -> None:
        super(LM, self).__init__()
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.elmo = ELMo(
            char_vocab=len(unique_chars), 
            char_embed_dim=EMBEDDING_DIM, 
            char_out_channels=[32, 32, 32, 32, 32], 
            char_kernel_sizes=[3, 4, 5, 6, 7], 
            dropout=DROPOUT, 
            num_layers=NUM_LAYERS, 
            hidden_dim=HIDDEN_DIM
        )
        self.linear = nn.Linear(hidden_dim, vocab_size)

        if filename:
            self.load_state_dict(torch.load(filename))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.elmo(x)
        x = self.linear(x)
        return x

In [73]:
# Create a dictionary of all characters
char_to_idx = {char: idx + 1 for idx, char in enumerate(unique_chars)}

# Add special tokens
char_to_idx['<pad>'] = 0
char_to_idx['<sos>'] = len(char_to_idx)
char_to_idx['<eos>'] = len(char_to_idx)

# Create a dictionary of all characters
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

# print the character to index mapping
print(char_to_idx)
print(idx_to_char)

{'n': 1, 'c': 2, 'w': 3, 'f': 4, ' ': 5, 'x': 6, '?': 7, '!': 8, 'u': 9, 's': 10, 'm': 11, 'l': 12, 'r': 13, 't': 14, 'p': 15, 'j': 16, 'v': 17, 'a': 18, 'y': 19, 'z': 20, 'o': 21, 'g': 22, 'q': 23, '.': 24, 'e': 25, 'b': 26, 'h': 27, 'i': 28, 'd': 29, 'k': 30, '<pad>': 0, '<sos>': 31, '<eos>': 32}
{1: 'n', 2: 'c', 3: 'w', 4: 'f', 5: ' ', 6: 'x', 7: '?', 8: '!', 9: 'u', 10: 's', 11: 'm', 12: 'l', 13: 'r', 14: 't', 15: 'p', 16: 'j', 17: 'v', 18: 'a', 19: 'y', 20: 'z', 21: 'o', 22: 'g', 23: 'q', 24: '.', 25: 'e', 26: 'b', 27: 'h', 28: 'i', 29: 'd', 30: 'k', 0: '<pad>', 31: '<sos>', 32: '<eos>'}


In [33]:
dev_train_raw = df[:DEV_TRAIN_LEN]
dev_validation_raw = df[DEV_TRAIN_LEN:DEV_TRAIN_LEN+DEV_VALIDATION_LEN]

In [58]:
# Create a DataSet class
from torch.utils.data import Dataset

class Sentences(Dataset):
    def __init__(self, df: pd.DataFrame, char_to_idx: dict) -> None:
        self.X = df['Description'].tolist()
        self.Y = df['Class Index'].tolist()
        self.X = [[char_to_idx[char] for char in sentence] for sentence in self.X]

    def __len__(self) -> int:
        return len(self.X)

    def __getitem__(self, idx: int) -> tuple:
        return torch.tensor(self.X[idx]), torch.tensor(self.Y[idx]), torch.tensor(len(self.X[idx]))

In [59]:
dev_train_dataset = Sentences(dev_train_raw, char_to_idx)
dev_validation_dataset = Sentences(dev_validation_raw, char_to_idx)

In [69]:
def collate_fn(batch: list) -> tuple:
    x, y, l = zip(*batch)
    x = torch.nn.utils.rnn.pad_sequence(x, padding_value=char_to_idx['<pad>'])
    return x, torch.stack(y), torch.stack(l)

In [70]:
from torch.utils.data import DataLoader

dev_train_loader = DataLoader(dev_train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
dev_validation_loader = DataLoader(dev_validation_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

In [None]:
lm = LM(hidden_dim=HIDDEN_DIM, vocab_size=len(char_to_idx))

In [71]:
for X, Y, L in dev_train_loader:

torch.Size([537, 32])
torch.Size([32])
torch.Size([32])
