# 🚀 Neural Network-Based Text Compression

#### 🖋️ Authors
- Feidnand Eide
- Seran Shanmugathas


## 📚 Install Libraries
We will need the following libraries:
- `pytorch-lightning`
- `lightning-transformers`

In [7]:
%pip install pytorch-lightning lightning-transformers --quiet

Note: you may need to restart the kernel to use updated packages.


## 📌 Import Dependencies
The following libraries are used in this project:
- Standard libraries: `enum`, `ast`
- PyTorch and PyTorch Lightning for model building and training
- Transformers from Hugging Face for NLP tasks
- Pandas for data handling

In [8]:
from enum import Enum
import zipfile
import io
import ast
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.utilities.types import TRAIN_DATALOADERS
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, random_split
import transformers
from transformers import AutoTokenizer
import pandas as pd

## 🔧 Configuration
Set up the configuration for the model training.

In [9]:
config: dict = {
    "data_path": "data/uncompressed_and_compressed.csv",
    "batch_size": 32,
    "max_length": 512,
    "vocab_size": 256,
    "embedding_dim": 128,
    "hidden_dim": 256,
    "num_layers": 2,
    "dropout_rate": 0.5,
    "max_epochs": 10,
    "gpus": 0
}

## 🗂️ Load and Preprocess the Dataset
We define a custom dataset class for handling our text compression data.

In [10]:
class Columns(Enum):
    """
    Enum containing the columns of the dataset.
    """
    UNCOMPRESSED = "uncompressed"
    COMPRESSED = "compressed"

def tokenize(text: str) -> list[str]:
    return text.split()

def zip_to_int(zip_string: str) -> list[int]:
    # Assuming the zip_string is a binary string starting with "b'PK", 
    # we first need to convert it into actual binary data.
    # Example input: "b'PK...'"
    # Remove the leading "b'" and trailing "'" to get the raw binary content
    raw_binary = zip_string[2:-1]

    # Convert the raw binary string to bytes
    zip_bytes = bytes(raw_binary, 'utf-8')

    # Now, you can process the zip_bytes as needed, for example:
    # 1. Unzip and then process the data
    # 2. Convert directly to a list of integers (byte values)

    # Example: Converting to a list of integers
    int_values = list(zip_bytes)
    return int_values

class CompressionDataset(Dataset):
    def __init__(self, csv_file: str, delimiter: str = ";", max_length: int = 512):
        self.df = pd.read_csv(csv_file, delimiter=delimiter)
        self.max_length = max_length
        
        self.tokenize_and_encode()

    def tokenize_and_encode(self):
        self.df[Columns.UNCOMPRESSED.value] = self.df[Columns.UNCOMPRESSED.value].apply(tokenize)
        self.df[Columns.COMPRESSED.value] = self.df[Columns.COMPRESSED.value].apply(zip_to_int)

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
        uncompressed = self.df.iloc[idx][Columns.UNCOMPRESSED.value]
        compressed = self.df.iloc[idx][Columns.COMPRESSED.value]
        
        if len(uncompressed) > self.max_length:
            uncompressed = uncompressed[:self.max_length]
        if len(compressed) > self.max_length:
            compressed = compressed[:self.max_length]
        
        return {
            Columns.UNCOMPRESSED.value: torch.tensor(uncompressed, dtype=torch.long),
            Columns.COMPRESSED.value: torch.tensor(compressed, dtype=torch.long)
        }

def collate_fn(batch):
    uncompressed_batch = [item[Columns.UNCOMPRESSED.value] for item in batch]
    compressed_batch = [item[Columns.COMPRESSED.value] for item in batch]
    
    uncompressed_batch = pad_sequence(uncompressed_batch, batch_first=True)
    compressed_batch = pad_sequence(compressed_batch, batch_first=True)
    
    return uncompressed_batch, compressed_batch


## 🤖 The Model
Here we define our LSTM-based compression model.

In [11]:
class LSTMCompressor(pl.LightningModule):
    def __init__(
        self, 
        vocab_size: int, 
        embedding_dim: int, 
        hidden_dim: int, 
        num_layers: int, 
        dropout_rate: float
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder_lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            dropout=dropout_rate,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, 256)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.embedding(x)
        x, (hidden, cell) = self.encoder_lstm(x)
        x = self.fc(x)
        return x

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        return optimizer

    def training_step(self, batch, batch_idx):
        uncompressed, compressed = batch
        predicted_compressed = self.forward(uncompressed)
        loss = F.cross_entropy(predicted_compressed, compressed)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        uncompressed, compressed = batch
        predicted_compressed = self.forward(uncompressed)
        loss = F.cross_entropy(predicted_compressed, compressed)
        self.log("val_loss", loss)


## 🏋️‍♂️ Training
Setting up the training environment and initiating the training process.

In [13]:
dataset = CompressionDataset(
    csv_file=config["data_path"], max_length=config["max_length"]
)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(
    train_dataset, batch_size=config["batch_size"], collate_fn=collate_fn, shuffle=True
)
val_loader = DataLoader(
    val_dataset, batch_size=config["batch_size"], collate_fn=collate_fn
)

model = LSTMCompressor(
    config["vocab_size"],
    config["embedding_dim"],
    config["hidden_dim"],
    config["num_layers"],
    config["dropout_rate"],
)

# Initialize the Trainer
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    save_top_k=1,
    mode="min",
    filename="lstm-compressor-{epoch:02d}-{val_loss:.2f}",
)

trainer = pl.Trainer(
    max_epochs=config["max_epochs"],
    # gpus=config["gpus"],
    callbacks=[checkpoint_callback],
)

# Train the model
trainer.fit(model, train_loader, val_loader)


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /Users/seranshanmugathas/Documents/Skole/NTNU/Høst 2023/Deep learning/Neural-Network-Based-Text-Compression/lightning_logs

  | Name         | Type      | Params
-------------------------------------------
0 | embedding    | Embedding | 32.8 K
1 | encoder_lstm | LSTM      | 921 K 
2 | fc           | Linear    | 65.8 K
-------------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.081     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/seranshanmugathas/anaconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


ValueError: too many dimensions 'str'