In [2]:
cfg = {
    'dev_train_len': 5*10**3,
    'dev_validation_len': 1*10**3,
    'learning_rate': 0.001,
    'epochs': 100,
    'embedding_dim': 16,
    'batch_size': 32,
    'dropout': 0.1,
    'optimizer': 'Adam',
    'num_layers': 2,
    'word_emb_dim': 300,
}

cfg['hidden_dim'] = cfg['embedding_dim']


In [3]:
DEV_TRAIN_LEN = cfg['dev_train_len']
DEV_VALIDATION_LEN = cfg['dev_validation_len']
LEARNING_RATE = cfg['learning_rate']
EPOCHS = cfg['epochs']
EMBEDDING_DIM = cfg['embedding_dim']
BATCH_SIZE = cfg['batch_size']
DROPOUT = cfg['dropout']
OPTIMIZER = cfg['optimizer']
NUM_LAYERS = cfg['num_layers']
HIDDEN_DIM = cfg['hidden_dim']
WORD_EMB_DIM = cfg['word_emb_dim']

DIR = '/scratch/shu7bh/RES/PRE'

In [4]:
import os
if not os.path.exists(DIR):
    os.makedirs(DIR)

In [5]:
import torch
import os

if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" # ":4096:2"
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
else:
    DEVICE = torch.device('cpu')
print(DEVICE)

cuda


In [6]:
from nltk.tokenize import word_tokenize
import pandas as pd
import unicodedata
import re

def normalize_unicode(text: str) -> str:
    return unicodedata.normalize('NFD', text)

unique_chars = set()

def clean_data(text: str) -> str:
    text = normalize_unicode(text.lower().strip())
    text = re.sub(r"([.!?])", r" \1", text)
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)
    for char in text:
        unique_chars.add(char)
    return text

unique_words = set()
def tokenize_data(text: str) -> list:
    text = word_tokenize(text)
    for word in text:
        unique_words.add(word)
    return text

def read_data(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    df = df.sample(frac=1, random_state=0).reset_index(drop=True)
    df['Description'] = df['Description'].apply(clean_data)
    df['Description'] = df['Description'].apply(tokenize_data)
    return df

In [7]:
df = read_data('data/train.csv')
df

Unnamed: 0,Class Index,Description
0,4,"[london, british, airline, magnate, richard, b..."
1,4,"[leave, it, to, amazon, .com, nasdaq, amzn, .,..."
2,4,"[regardless, space, competitions, are, poised,..."
3,1,"[cbs, millions, of, folded, paper, cranes, flu..."
4,3,"[struggling, under, the, weight, of, a, bloate..."
...,...,...
119995,4,"[com, september, am, pt, ., there, s, no, doub..."
119996,3,"[new, york, reuters, fedex, corp, ., lt, a, hr..."
119997,1,"[pakistani, security, forces, have, arrested, ..."
119998,4,"[palmsource, finally, unveiled, its, new, os, ..."


In [101]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CharCNN(nn.Module):
    def __init__(
            self, 
            char_vocab: int,         # the length of the character vocabulary
            char_embed_dim: int,     # the size of each character embedding vector
            char_out_channels: list, # the number of output channels for each convolutional layer 
            char_kernel_sizes: list, # the kernel size of each convolutional layer
            dropout: float,          # the dropout probability
            word_embed_dim: int      # the size of the word embedding vector which we will output
        ) -> None:

        super(CharCNN, self).__init__()

        self.char_embed = nn.Embedding(char_vocab, char_embed_dim) # the character embedding layer
        self.dropout = nn.Dropout(dropout)

        # the convolutional layers

        self.char_conv = nn.ModuleList([
            nn.Conv1d(char_embed_dim, char_out_channels[i], char_kernel_sizes[i]) 
            for i in range(len(char_out_channels))
        ])

        self.max_pool = nn.AdaptiveMaxPool1d(1) # to check what to do over here
        self.fc = nn.Linear(sum(char_out_channels), word_embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        print(1, x.shape)
        x = self.char_embed(x)
        print(2, x.shape)
        x = x.transpose(1, 2)
        print(3, x.shape)
        x = [F.relu(char_conv(x)) for char_conv in self.char_conv]
        print(4, [i.shape for i in x])
        x = [torch.flatten(self.max_pool(i), 1) for i in x]
        print(5, [i.shape for i in x])
        x = torch.cat(x, dim=1)
        x = self.dropout(x)
        x = self.fc(x)
        return x

In [102]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class ELMo(nn.Module):
    def __init__(
            self, 
            char_vocab: int, 
            char_embed_dim: int, 
            char_out_channels: list, 
            char_kernel_sizes: list, 
            dropout: float, 
            num_layers: int, 
            hidden_dim: int, 
            word_embed_dim: int,
            filename: str = None
        ) -> None:

        super(ELMo, self).__init__()

        self.char_cnn = CharCNN(
            char_vocab=char_vocab, 
            char_embed_dim=char_embed_dim, 
            char_out_channels=char_out_channels, 
            char_kernel_sizes=char_kernel_sizes, 
            dropout=dropout,
            word_embed_dim=word_embed_dim
        )

        self.lstm = nn.LSTM(
            input_size=sum(char_out_channels), 
            hidden_size=hidden_dim, 
            num_layers=num_layers, 
            bidirectional=True, 
            dropout=dropout
        )

        self.dropout = nn.Dropout(dropout)
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        if filename:
            self.load_state_dict(torch.load(filename))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.char_cnn(x)
        x = x.view(x.shape[1], x.shape[0], x.shape[2])
        x = pack_padded_sequence(x, lengths=[x.shape[0]]*x.shape[1])
        x, _ = self.lstm(x)
        x, _ = pad_packed_sequence(x)
        x = x.view(x.shape[1], x.shape[0], 2, self.hidden_dim)
        x = x[:, :, 0, :] + x[:, :, 1, :]
        x = self.dropout(x)
        return x

In [103]:
class LM(nn.Module):
    def __init__(self, 
            char_vocab: int,
            hidden_dim: int, 
            vocab_size: int, 
            filename: str = None
        ) -> None:

        super(LM, self).__init__()
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.elmo = ELMo(
            char_vocab=char_vocab, 
            char_embed_dim=EMBEDDING_DIM, 
            char_out_channels=[32, 32, 32, 32, 32], 
            char_kernel_sizes=[3, 4, 5, 6, 7], 
            dropout=DROPOUT, 
            num_layers=NUM_LAYERS, 
            hidden_dim=HIDDEN_DIM,
            word_embed_dim=WORD_EMB_DIM
        )
        self.linear = nn.Linear(hidden_dim, vocab_size)

        if filename:
            self.load_state_dict(torch.load(filename))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.elmo(x)
        x = self.linear(x)
        return x

In [104]:
# Create a dictionary of all characters
char_to_idx = {char: idx + 1 for idx, char in enumerate(unique_chars)}

# Add special tokens
char_to_idx['<pad>'] = 0
char_to_idx['<sos>'] = len(char_to_idx)
char_to_idx['<eos>'] = len(char_to_idx)

# Create a dictionary of all characters
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

# print the character to index mapping
print(char_to_idx)
print(idx_to_char)

# Create a dictionary of all words
word_to_idx = {word: idx + 1 for idx, word in enumerate(unique_words)}

# Add special tokens
word_to_idx['<pad>'] = 0
word_to_idx['<sos>'] = len(word_to_idx)
word_to_idx['<eos>'] = len(word_to_idx)
word_to_idx['<unk>'] = len(word_to_idx)

# Create a dictionary of all words
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# print the length of the word to index mapping
print(len(word_to_idx))

{'!': 1, 'n': 2, 'j': 3, '?': 4, 'u': 5, 'y': 6, 'l': 7, 'e': 8, 'p': 9, 'g': 10, 'k': 11, ' ': 12, 'i': 13, '.': 14, 'z': 15, 'd': 16, 's': 17, 'h': 18, 'x': 19, 'v': 20, 'a': 21, 'b': 22, 'c': 23, 'f': 24, 'm': 25, 'w': 26, 'o': 27, 't': 28, 'q': 29, 'r': 30, '<pad>': 0, '<sos>': 31, '<eos>': 32}
{1: '!', 2: 'n', 3: 'j', 4: '?', 5: 'u', 6: 'y', 7: 'l', 8: 'e', 9: 'p', 10: 'g', 11: 'k', 12: ' ', 13: 'i', 14: '.', 15: 'z', 16: 'd', 17: 's', 18: 'h', 19: 'x', 20: 'v', 21: 'a', 22: 'b', 23: 'c', 24: 'f', 25: 'm', 26: 'w', 27: 'o', 28: 't', 29: 'q', 30: 'r', 0: '<pad>', 31: '<sos>', 32: '<eos>'}
58325


In [105]:
dev_train_raw = df[:DEV_TRAIN_LEN]
dev_validation_raw = df[DEV_TRAIN_LEN:DEV_TRAIN_LEN+DEV_VALIDATION_LEN]

In [106]:
# Create a DataSet class
from torch.utils.data import Dataset

class Sentences(Dataset):
    def __init__(self, df: pd.DataFrame, char_to_idx: dict) -> None:
        self.X = []
        for sentence in df['Description'].tolist():
            sent = [char_to_idx['<sos>']]
            for word in sentence:
                for char in word:
                    sent += [char_to_idx[char]]
            sent += [char_to_idx['<eos>']]
            self.X += [sent]
        self.Y = df['Class Index'].tolist()
        # self.X = [char_to_idx[char] for char in word for word in sentence for sentence in self.X]

    def __len__(self) -> int:
        return len(self.X)

    def __getitem__(self, idx: int) -> tuple:
        return torch.tensor(self.X[idx]), torch.tensor(self.Y[idx]), torch.tensor(len(self.X[idx]))

In [107]:
dev_train_dataset = Sentences(dev_train_raw, char_to_idx)
dev_validation_dataset = Sentences(dev_validation_raw, char_to_idx)

In [108]:
def collate_fn(batch: list) -> tuple:
    x, y, l = zip(*batch)
    x = torch.nn.utils.rnn.pad_sequence(x, padding_value=char_to_idx['<pad>'])
    return x, torch.stack(y), torch.stack(l)

In [109]:
from torch.utils.data import DataLoader

dev_train_loader = DataLoader(dev_train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
dev_validation_loader = DataLoader(dev_validation_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

In [110]:
lm = LM(char_vocab=len(char_to_idx), hidden_dim=HIDDEN_DIM, vocab_size=len(word_to_idx)).to(DEVICE)
print(lm)

LM(
  (elmo): ELMo(
    (char_cnn): CharCNN(
      (char_embed): Embedding(33, 16)
      (dropout): Dropout(p=0.1, inplace=False)
      (char_conv): ModuleList(
        (0): Conv1d(16, 32, kernel_size=(3,), stride=(1,))
        (1): Conv1d(16, 32, kernel_size=(4,), stride=(1,))
        (2): Conv1d(16, 32, kernel_size=(5,), stride=(1,))
        (3): Conv1d(16, 32, kernel_size=(6,), stride=(1,))
        (4): Conv1d(16, 32, kernel_size=(7,), stride=(1,))
      )
      (max_pool): AdaptiveMaxPool1d(output_size=3)
      (fc): Linear(in_features=160, out_features=300, bias=True)
    )
    (lstm): LSTM(160, 16, num_layers=2, dropout=0.1, bidirectional=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (linear): Linear(in_features=16, out_features=58325, bias=True)
)


In [111]:
for X, Y, L in dev_train_loader:
    print(X.shape, Y.shape, L.shape)
    X = X.to(DEVICE)
    Y = Y.to(DEVICE)
    print(lm(X).shape)
    break

torch.Size([230, 32]) torch.Size([32]) torch.Size([32])
1 torch.Size([230, 32])
2 torch.Size([230, 32, 16])
3 torch.Size([230, 16, 32])
4 [torch.Size([230, 32, 30]), torch.Size([230, 32, 29]), torch.Size([230, 32, 28]), torch.Size([230, 32, 27]), torch.Size([230, 32, 26])]
5 [torch.Size([230, 96]), torch.Size([230, 96]), torch.Size([230, 96]), torch.Size([230, 96]), torch.Size([230, 96])]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (230x480 and 160x300)