# 🚀 Neural Network-Based Text Compression

#### 🖋️ Authors
- Feidnand Eide
- Seran Shanmugathas


## 📚 Install Libraries
We will need the following libraries:
- `pytorch`
- `pytorch-lightning`

In [7]:
%pip install pytorch-lightning --quiet

Note: you may need to restart the kernel to use updated packages.


## 📌 Import Dependencies
The following libraries are used in this project:
- Standard libraries: `enum`, `ast`
- PyTorch and PyTorch Lightning for model building and training
- Transformers from Hugging Face for NLP tasks
- Pandas for data handling

In [8]:
import ast
from enum import Enum

import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, T5ForConditionalGeneration, AdamW

## 🔧 Configuration
Set up the configuration for the model training.

In [9]:
# Central place for every tunable value of the run: data location, batching,
# model size and training length. Cells below look entries up by key.
config: dict = dict(
    # --- data ---
    data_path="data/uncompressed_and_compressed.csv",
    batch_size=32,
    max_length=512,
    # --- model ---
    vocab_size=256,  # one id per possible byte value
    embedding_dim=128,
    hidden_dim=256,
    num_layers=2,
    dropout_rate=0.5,
    # --- training ---
    max_epochs=10,
    gpus=0,
)

## 🗂️ Load and Preprocess the Dataset
We define a custom dataset class for handling our text compression data.

In [10]:
class Columns(Enum):
    """
    Names of the two columns of the text-compression dataset.

    Each member's ``value`` is the column header string, so a column can be
    selected with e.g. ``df[Columns.UNCOMPRESSED.value]`` instead of repeating
    the raw string.
    """
    UNCOMPRESSED = "uncompressed"
    COMPRESSED = "compressed"

class TextCompressionDataset(Dataset):
    """Dataset of (uncompressed text, compressed payload) pairs as integer tensors.

    Args:
        uncompressed_texts: Indexable collection of ``str`` items.
        compressed_texts: Indexable collection, aligned with
            ``uncompressed_texts``, whose items are either ``bytes`` or ``str``.
            A ``str`` that is the textual form of a bytes literal (for example
            ``"b'x\\x9c...'"``, which is what a bytes value turns into when it
            is written to CSV) is parsed back into the bytes it denotes; any
            other ``str`` is encoded as Latin-1, one byte per character.

    Each item is a tuple ``(uncompressed, compressed)`` of 1-D ``torch.long``
    tensors: the code points of the text and the byte values of the payload.
    """

    def __init__(self, uncompressed_texts, compressed_texts):
        self.uncompressed_texts = uncompressed_texts
        self.compressed_texts = compressed_texts

    def __len__(self):
        """Return the number of (uncompressed, compressed) pairs."""
        return len(self.uncompressed_texts)

    @staticmethod
    def _to_bytes(value):
        """Turn one compressed entry into an iterable of byte values.

        Non-string values are returned unchanged (``bytes`` already iterates
        as integers).
        """
        if not isinstance(value, str):
            return value

        candidate = value.strip()
        if candidate[:2] in ("b'", 'b"'):
            # Looks like repr(bytes). Encoding the repr characters themselves
            # would feed the model the literal's quotes and backslashes rather
            # than the payload, so recover the real bytes instead.
            # ast.literal_eval only evaluates literals; it never runs code.
            try:
                parsed = ast.literal_eval(candidate)
            except (ValueError, SyntaxError):
                parsed = None
            if isinstance(parsed, bytes):
                return parsed

        # Plain string: one byte per character.
        # Raises UnicodeEncodeError for characters above U+00FF.
        return bytes(value, encoding='latin1')

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two 1-D ``torch.long`` tensors."""
        # Each character becomes its Unicode code point.
        # NOTE(review): code points above 255 would not fit a 256-entry
        # embedding table — confirm the corpus is limited to that range.
        uncompressed_ids = [ord(c) for c in self.uncompressed_texts[idx]]
        uncompressed_text_tensor = torch.tensor(uncompressed_ids, dtype=torch.long)

        compressed_ids = list(self._to_bytes(self.compressed_texts[idx]))
        compressed_text_tensor = torch.tensor(compressed_ids, dtype=torch.long)

        return uncompressed_text_tensor, compressed_text_tensor


In [11]:
def collate_batch(batch):
    """Collate (uncompressed, compressed) tensor pairs into two padded batches.

    The common sequence length is the smaller of the two per-side maxima
    (longest uncompressed item vs. longest compressed item). Every sequence is
    cut to that length and shorter ones are right-padded with 0, so both
    returned tensors have the same ``(batch, length)`` shape.

    Args:
        batch: Sequence of ``(uncompressed_tensor, compressed_tensor)`` pairs
            of 1-D tensors.

    Returns:
        Tuple ``(uncompressed_padded, compressed_padded)``.
    """
    sources, targets = zip(*batch)

    longest_source = max(map(len, sources))
    longest_target = max(map(len, targets))
    limit = min(longest_source, longest_target)

    def _clip_and_pad(sequences):
        # Truncate first, then pad so every row ends up with the same length.
        clipped = [seq[:limit] for seq in sequences]
        return pad_sequence(clipped, batch_first=True, padding_value=0)

    return _clip_and_pad(sources), _clip_and_pad(targets)

In [12]:
class TextDataModule(pl.LightningDataModule):
    """Lightning data module that serves a single dataset as the training set.

    Args:
        dataset: Dataset handed to the training ``DataLoader``.
        batch_size: Number of items per batch.
        collate_fn: Optional function used by the ``DataLoader`` to assemble a
            batch; ``None`` selects the ``DataLoader`` default.
    """

    def __init__(self, dataset, batch_size=32, collate_fn=None):
        super().__init__()
        self.collate_fn = collate_fn
        self.batch_size = batch_size
        self.dataset = dataset

    def setup(self, stage=None):
        """Expose the full dataset as ``self.train``; no split is performed."""
        # To add validation/test data, split here (e.g. with random_split) into
        # self.train / self.val / self.test and add the matching
        # val_dataloader / test_dataloader methods (shuffle=False).
        self.train = self.dataset

    def train_dataloader(self):
        """Return a shuffling ``DataLoader`` over the training dataset."""
        loader_options = {
            "batch_size": self.batch_size,
            "shuffle": True,
            "collate_fn": self.collate_fn,
        }
        return DataLoader(self.train, **loader_options)

## 🤖 The Model
Here we define our LSTM-based compression model.

In [13]:
# Preferred device of this machine. The model below does not read this global
# any more: it follows whatever device its own parameters are placed on.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class LSTMTextCompressor(pl.LightningModule):
    """Embedding -> LSTM -> linear model predicting one byte id per input position.

    Args:
        vocab_size: Number of distinct input ids and of output classes.
        hidden_dim: Size of the embedding vectors and of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, hidden_dim, num_layers):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        # No manual .to(device) here: previously only the embedding and linear
        # layers were moved while the LSTM stayed on the CPU, so the sub-modules
        # could end up on different devices. The whole module is moved at once
        # by the Trainer or by an explicit model.to(...).
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Map a ``(batch, seq)`` tensor of ids to ``(batch, seq, vocab_size)`` logits."""
        # Follow the module's own device (a LightningModule property) rather
        # than a global, so the input always lands where the weights are, no
        # matter which accelerator the Trainer selected.
        x = x.to(self.device)
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        return self.linear(lstm_out)

    def training_step(self, batch, batch_idx):
        """Return the cross-entropy between per-position logits and target bytes."""
        uncompressed_text, compressed_text = batch
        output = self(uncompressed_text)

        # Flatten batch and time so each position is one classification sample.
        # reshape (unlike view) also accepts non-contiguous tensors.
        # NOTE(review): padded target positions (value 0) are included in the loss.
        loss = F.cross_entropy(
            output.reshape(-1, self.vocab_size),
            compressed_text.to(output.device).reshape(-1),
        )
        return loss

    def configure_optimizers(self):
        """Use Adam with a learning rate of 1e-3 over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=0.001)

    def train_dataloader(self, dataset):
        """Build a shuffling ``DataLoader`` (batch size 32) over ``dataset``.

        NOTE(review): Lightning's ``train_dataloader`` hook is called without
        arguments, so this helper only works when invoked manually; a Trainer
        needs the loaders from a data module or passed to ``fit``.
        """
        return DataLoader(
            dataset, batch_size=32, shuffle=True, collate_fn=collate_batch
        )

## 🏋️‍♂️ Training
Setting up the training environment and initiating the training process.

In [16]:
# Load the ';'-separated dataset of paired uncompressed / compressed texts.
df = pd.read_csv(config["data_path"], delimiter=";")

# Select the columns through the Columns enum so the header names live in one place.
dataset = TextCompressionDataset(
    uncompressed_texts=df[Columns.UNCOMPRESSED.value].values,
    compressed_texts=df[Columns.COMPRESSED.value].values,
)

# Batch size comes from the config cell instead of a repeated literal.
text_datamodule = TextDataModule(
    dataset, batch_size=config["batch_size"], collate_fn=collate_batch
)

model = LSTMTextCompressor(
    vocab_size=config["vocab_size"],
    hidden_dim=config["hidden_dim"],
    num_layers=config["num_layers"],
)

trainer = pl.Trainer(
    max_epochs=config["max_epochs"],
    enable_progress_bar=True,
    # Train on CUDA when present, otherwise on the CPU.
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices="auto"
)

trainer.fit(model, text_datamodule)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 65.5 K
1 | lstm      | LSTM      | 1.1 M 
2 | linear    | Linear    | 65.8 K
----------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.736     Total estimated model params size (MB)
/Users/seranshanmugathas/anaconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]