# 🚀 Neural Network-Based Text Compression

#### 🖋️ Authors
- Feidnand Eide
- Seran Shanmugathas


### Install libraries

In [11]:
%pip install pytorch-lightning lightning-transformers --quiet

Note: you may need to restart the kernel to use updated packages.


### Import Dependencies

In [17]:
from enum import Enum
import ast

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.utilities.types import TRAIN_DATALOADERS
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, random_split
import transformers
from transformers import AutoTokenizer

import pandas as pd


### Config

In [None]:
# Central place for every tunable used by the data pipeline, the model and
# the trainer further down in this notebook.
config: dict = dict(
    # --- data ---
    data_path="data/uncompressed_and_compressed.csv",  # CSV read by the dataset
    batch_size=32,  # samples per DataLoader batch
    max_length=512,  # sequences longer than this are cut by the dataset
    # --- model ---
    vocab_size=None,  # no value chosen here; forwarded to the model as-is
    embedding_dim=128,
    hidden_dim=256,
    num_layers=2,  # stacked LSTM layers
    dropout_rate=0.5,  # dropout between stacked LSTM layers
    # --- trainer ---
    max_epochs=10,
    gpus=1,
)

### Load and Preprocess the Dataset

In [16]:
class Columns(Enum):
    """Names of the dataset columns, so the rest of the code never spells
    the raw strings out by hand."""

    # Column holding the original (uncompressed) text.
    UNCOMPRESSED = "uncompressed"
    # Column holding the compressed representation of that text.
    COMPRESSED = "compressed"


def tokenize(text: str) -> list:
    """Split *text* into tokens on runs of whitespace.

    Leading/trailing whitespace is ignored, so an empty or blank string
    yields an empty list.
    """
    tokens = text.split()
    return tokens

def hex_to_int(hex_string):
    r"""Convert backslash-separated hex bytes into a list of integers.

    The string is split on single backslashes and every non-empty chunk is
    parsed as a base-16 number. A chunk may carry the ``x`` marker used by
    escape notation, so both ``78\9c`` and ``\x78\x9c`` give ``[120, 156]``.

    Args:
        hex_string: Text made of hex values separated by backslashes.

    Returns:
        The parsed integers, in order. An empty string yields ``[]``.

    Raises:
        ValueError: If a chunk is not valid hexadecimal.
    """
    values = []
    for chunk in hex_string.split('\\'):
        if not chunk:
            # Leading, trailing or doubled separators produce empty chunks.
            continue
        if chunk[0] in "xX":
            # int("x78", 16) is rejected by Python; only "0x78" or "78" are
            # accepted, so drop the escape-style marker before parsing.
            chunk = chunk[1:]
        values.append(int(chunk, 16))
    return values

class CompressionDataset(Dataset):
    """Pairs of (uncompressed token ids, compressed values) read from a CSV.

    The CSV must contain the two columns named by ``Columns``. On
    construction the uncompressed text is tokenized and every distinct token
    is assigned an integer id; the compressed column is parsed with
    ``hex_to_int``. Each sample is a dict of two ``torch.long`` tensors keyed
    by the column names.

    Attributes set on the instance:
        df: The loaded frame; both columns hold lists of ints after
            preprocessing.
        max_length: Upper bound on the length of either returned sequence.
        vocab: Mapping from token string to integer id (ids start at 1).
    """

    # Id 0 is never assigned to a token so that zero-padding of batches
    # (the default of ``pad_sequence``) cannot be confused with a real token.
    PAD_INDEX = 0

    def __init__(self, csv_file: str, delimiter: str = ";", max_length: int = 512):
        """Load *csv_file* and preprocess both columns.

        Args:
            csv_file: Path of the CSV file to read.
            delimiter: Field separator used in the file.
            max_length: Sequences longer than this are truncated per sample.
        """
        self.df = pd.read_csv(csv_file, delimiter=delimiter)
        self.max_length = max_length
        self.vocab: dict = {}

        # Preprocessing steps
        self.tokenize_and_encode()

    @property
    def vocab_size(self) -> int:
        """Number of distinct input ids, including the reserved padding id."""
        return len(self.vocab) + 1

    def tokenize_and_encode(self):
        """Turn both columns into lists of integers, in place.

        Tokens are strings and cannot be placed in a ``torch.long`` tensor,
        so each one is replaced by its id from ``self.vocab``, which is
        (re)built here in first-seen order.
        """
        source_col = Columns.UNCOMPRESSED.value
        target_col = Columns.COMPRESSED.value

        token_lists = self.df[source_col].apply(tokenize)

        self.vocab = {}
        for tokens in token_lists:
            for token in tokens:
                # len(...) + 1 keeps PAD_INDEX (0) free.
                self.vocab.setdefault(token, len(self.vocab) + 1)

        self.df[source_col] = token_lists.apply(
            lambda tokens: [self.vocab[token] for token in tokens]
        )

        # Conversion of compressed data
        self.df[target_col] = self.df[target_col].apply(hex_to_int)

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx: int):
        """Return the sample at *idx* as a dict of two ``torch.long`` tensors.

        Both sequences are truncated to ``max_length``; no padding happens
        here.
        """
        row = self.df.iloc[idx]

        # Truncation: slicing past the end of a list is a no-op, so short
        # sequences are returned unchanged.
        uncompressed = row[Columns.UNCOMPRESSED.value][: self.max_length]
        compressed = row[Columns.COMPRESSED.value][: self.max_length]

        return {
            Columns.UNCOMPRESSED.value: torch.tensor(uncompressed, dtype=torch.long),
            Columns.COMPRESSED.value: torch.tensor(compressed, dtype=torch.long),
        }

def collate_fn(batch):
    """Merge a list of dataset samples into two padded batch tensors.

    Each sample is a dict keyed by the ``Columns`` values. The sequences of
    every key are stacked batch-first and padded (``pad_sequence`` default)
    to the longest sequence in the batch.

    Returns:
        A ``(uncompressed, compressed)`` tuple of padded tensors.
    """
    source_key = Columns.UNCOMPRESSED.value
    target_key = Columns.COMPRESSED.value

    # Collect the per-sample tensors of each column.
    sources = [sample[source_key] for sample in batch]
    targets = [sample[target_key] for sample in batch]

    # Pad each column independently; the two may end up with different widths.
    padded_sources = pad_sequence(sources, batch_first=True)
    padded_targets = pad_sequence(targets, batch_first=True)

    return padded_sources, padded_targets

### The model

In [None]:
class LSTMCompressor(pl.LightningModule):
    """Embedding -> encoder LSTM -> decoder LSTM -> linear projection.

    For every input position the model emits ``vocab_size`` logits, which
    are trained with cross-entropy against the compressed sequence.

    Args:
        vocab_size: Number of embedding rows and of output classes.
        embedding_dim: Embedding width; also the decoder LSTM's hidden size.
        hidden_dim: Hidden size of the encoder LSTM.
        num_layers: Number of stacked layers in each LSTM.
        dropout_rate: Dropout applied between stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder_lstm = nn.LSTM(
            embedding_dim, hidden_dim, num_layers, dropout=dropout_rate, batch_first=True
        )
        self.decoder_lstm = nn.LSTM(
            hidden_dim, embedding_dim, num_layers, dropout=dropout_rate, batch_first=True
        )
        # The decoder's hidden size is embedding_dim, so that is the feature
        # width reaching the projection (not hidden_dim).
        self.fc = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        """Map token ids to per-position logits.

        Args:
            x: Integer ids; assumed to be (batch, seq) since both LSTMs are
                built with ``batch_first=True``.

        Returns:
            Logits with ``vocab_size`` entries in the last dimension.
        """
        # Embedding
        x = self.embedding(x)

        # Encoding (final hidden/cell states are not used)
        x, _ = self.encoder_lstm(x)

        # Decoding
        x, _ = self.decoder_lstm(x)

        # Fully connected layer
        return self.fc(x)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters (lr=0.001)."""
        return torch.optim.Adam(self.parameters(), lr=0.001)

    def _sequence_loss(self, logits, targets):
        """Cross-entropy between per-position logits and integer targets."""
        # NOTE(review): input and target sequences are padded independently,
        # so their lengths generally differ; only the positions both share
        # are scored. Confirm this alignment is the intended objective.
        steps = min(logits.size(1), targets.size(1))
        logits = logits[:, :steps, :]
        targets = targets[:, :steps]

        # cross_entropy wants the class dimension second; flattening batch
        # and time into one axis gives the plain (N, C) / (N,) layout.
        return F.cross_entropy(
            logits.reshape(-1, logits.size(-1)), targets.reshape(-1)
        )

    def _shared_step(self, batch):
        """Run the forward pass on one batch and return its loss."""
        uncompressed, compressed = batch
        predicted_compressed = self(uncompressed)
        return self._sequence_loss(predicted_compressed, compressed)

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the training loss."""
        loss = self._shared_step(batch)

        # Logging to TensorBoard by default
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss as ``val_loss``."""
        loss = self._shared_step(batch)
        self.log('val_loss', loss)


### Training

In [None]:
# Load the data and hold out 20% of it for validation.
dataset = CompressionDataset(
    csv_file=config["data_path"], max_length=config["max_length"]
)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(
    train_dataset, batch_size=config["batch_size"], collate_fn=collate_fn, shuffle=True
)
val_loader = DataLoader(
    val_dataset, batch_size=config["batch_size"], collate_fn=collate_fn
)

# config["vocab_size"] is None by default, which nn.Embedding cannot accept.
# When it is unset, size the model from the data: one slot for every integer
# id that occurs in either sequence of any sample (inputs index the embedding,
# targets index the output classes).
vocab_size = config["vocab_size"]
if vocab_size is None:
    largest_id = max(
        (
            int(ids.max())
            for index in range(len(dataset))
            for ids in dataset[index].values()
            if ids.numel() > 0
        ),
        default=0,
    )
    vocab_size = largest_id + 1

model = LSTMCompressor(
    vocab_size,
    config["embedding_dim"],
    config["hidden_dim"],
    config["num_layers"],
    config["dropout_rate"],
)

# Keep only the checkpoint with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    save_top_k=1,
    mode="min",
    filename="lstm-compressor-{epoch:02d}-{val_loss:.2f}",
)

# Initialize the Trainer.
# The `gpus=` argument no longer exists in Lightning 2.x; `accelerator` plus
# `devices` is the supported spelling. Fall back to CPU when no GPU was
# requested or CUDA is unavailable, instead of failing at start-up.
use_gpu = bool(config["gpus"]) and torch.cuda.is_available()
trainer = pl.Trainer(
    max_epochs=config["max_epochs"],
    accelerator="gpu" if use_gpu else "cpu",
    devices=config["gpus"] if use_gpu else 1,
    callbacks=[checkpoint_callback],
)

# Train the model
trainer.fit(model, train_loader, val_loader)