# 🚀 Neural Network-Based Text Compression

#### 🖋️ Authors
- Feidnand Eide
- Seran Shanmugathas


### Install libraries

In [11]:
%pip install pytorch-lightning lightning-transformers --quiet

Note: you may need to restart the kernel to use updated packages.


### Import Dependencies

In [17]:
from enum import Enum
import ast

import pytorch_lightning as pl
from pytorch_lightning.utilities.types import TRAIN_DATALOADERS
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import AutoTokenizer

import pandas as pd


### Config

In [None]:
# Notebook-wide settings read by the data-loading cells.
config: dict = dict(
    # CSV file the training dataset is read from
    data_path="data/uncompressed_and_compressed.csv",
    # number of samples per batch requested from the training DataLoader
    batch_size=32,
)

### Load and Preprocess the Dataset

In [16]:
class Columns(Enum):
    """
    Names of the dataset columns accessed by the notebook

    Each member's value is the column label as it appears in the csv file.
    """

    # column labelled "uncompressed"
    UNCOMPRESSED = "uncompressed"
    # column labelled "compressed"
    COMPRESSED = "compressed"


class CompressionDataset(Dataset):
    """
    Dataset of uncompressed/compressed pairs backed by a csv file
    """

    def __init__(self, csv_file: str, delimiter: str = ";"):
        """
        Load the csv file and parse the compressed column

        Parameters
        ----------

        csv_file: str
            path to the csv file containing the dataset
        delimiter: str
            delimiter of the csv file

        Raises
        ------
        ValueError, SyntaxError
            if a string cell of the compressed column is not a valid
            Python literal
        """
        self.df = pd.read_csv(csv_file, delimiter=delimiter)
        compressed = Columns.COMPRESSED.value
        # String cells of the compressed column hold Python literals; parse
        # them once up front. Non-string cells are left unchanged.
        parsed = self.df[compressed].apply(
            lambda x: ast.literal_eval(x) if isinstance(x, str) else x
        )
        # NOTE(review): the parsed values are also mirrored into a "text"
        # column; nothing in this class reads it - confirm it is still needed.
        self.df["text"] = parsed
        self.df[compressed] = parsed

    def __len__(self) -> int:
        """
        Returns the length of the dataset

        Returns
        -------
        int
            number of rows in the csv file
        """
        return len(self.df)

    def __getitem__(self, idx: int) -> dict:
        """
        Returns the sample at index idx

        Parameters
        ----------
        idx: int
            positional index of the row to load

        Returns
        -------
        dict
            the row's "uncompressed" and "compressed" values, keyed by
            column name
        """
        # Return the cell values themselves rather than one-row Series:
        # the DataLoader's default collate function cannot batch pandas
        # objects.
        return {
            column.value: self.df[column.value].iloc[idx]
            for column in (Columns.UNCOMPRESSED, Columns.COMPRESSED)
        }

### Custom Data Module

In [None]:
class CompressionDataModule(pl.LightningDataModule):
    """
    Lightning data module that serves the compression dataset for training
    """

    def train_dataloader(self) -> TRAIN_DATALOADERS:
        """
        Build the dataloader used for training

        The dataset is created from ``config["data_path"]`` on every call and
        is served shuffled, in batches of ``config["batch_size"]``.

        Returns
        -------
        TRAIN_DATALOADERS
            shuffled dataloader over the compression dataset
        """
        return DataLoader(
            CompressionDataset(config["data_path"]),
            batch_size=config["batch_size"],
            shuffle=True,
        )

### The transformer

In [None]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        """
        One encoder layer: self-attention followed by a feed-forward network.

        Parameters
        ----------
        d_model : int
            Size of the feature dimension of the input (required).
        nhead : int
            Number of attention heads (required).
        dim_feedforward : int, optional
            Hidden size of the feed-forward network (default=2048).
        dropout : float, optional
            Dropout probability used throughout the layer (default=0.1).
        """
        super().__init__()

        # Submodules are created in a fixed order so that parameter
        # initialisation under a given random seed stays reproducible.
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        # Position-wise feed-forward network: linear1 -> ReLU -> dropout -> linear2
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        # One LayerNorm / Dropout pair per sub-layer (attention, feed-forward)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = nn.ReLU()

    def forward(self, src):
        """
        Run the input through the attention and feed-forward sub-layers.

        Each sub-layer normalises its input first, and its (dropped-out)
        output is added back onto the un-normalised input.

        Parameters
        ----------
        src : torch.Tensor
            The sequence fed to the encoder layer (required).
            NOTE(review): the attention module is built with its default
            layout, presumably (seq_len, batch, d_model) - confirm.

        Returns
        -------
        torch.Tensor
            The output of the encoder layer.
        """
        # Self-attention sub-layer: query, key and value are the same tensor
        normed = self.norm1(src)
        attn_out, _ = self.self_attn(normed, normed, normed)
        src = src + self.dropout1(attn_out)

        # Feed-forward sub-layer
        normed = self.norm2(src)
        hidden = self.activation(self.linear1(normed))
        ff_out = self.linear2(self.dropout(hidden))
        return src + self.dropout2(ff_out)

class TransformerEncoder(nn.Module):
    def __init__(self, d_model, nhead, num_layers, dim_feedforward=2048, dropout=0.1):
        """
        The Transformer Encoder consisting of a stack of N encoder layers.

        Parameters
        ----------
        d_model : int
            The number of expected features in the encoder input (required).
        nhead : int
            The number of heads in the multiheadattention mechanism (required).
        num_layers : int
            The number of sub-encoder-layers in the encoder (required).
        dim_feedforward : int, optional
            The dimension of the feedforward network model (default=2048).
        dropout : float, optional
            The dropout value (default=0.1).
        """
        # nn.Module must be initialised before any submodule is assigned.
        super().__init__()
        # Build a separate layer for every position in the stack: repeating
        # one instance would make all positions share the same parameters.
        self.layers = nn.ModuleList(
            [
                TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout)
                for _ in range(num_layers)
            ]
        )
        self.num_layers = num_layers