# 🚀 Neural Network-Based Text Compression

#### 🖋️ Authors
- Feidnand Eide
- Seran Shanmugathas


### Install libaries

In [1]:
%pip install pandas transformers torch sentencepiece

Note: you may need to restart the kernel to use updated packages.


### Import Dependencies

In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    AdamW
)
from sklearn.model_selection import train_test_split

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load and Preprocess the Dataset

In [3]:
file_path = "data/uncompressed_and_compressed.csv"
dataset = pd.read_csv(file_path, sep=";")

dataset.head()

Unnamed: 0,uncompressed,compressed
0,Budget to set scene for election\n \n Gordon B...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x9d\x86\'...
1,Army chiefs in regiments decision\n \n Militar...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x9d\x86\'...
2,Howard denies split over ID cards\n \n Michael...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x9d\x86\'...
3,Observers to monitor UK election\n \n Minister...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x9d\x86\'...
4,Kilroy names election seat target\n \n Ex-chat...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x9d\x86\'...


### Define a Custom Dataset Class

In [4]:
class TextCompressionDataset(Dataset):
    """
    Dataset for text compression
    """

    def __init__(self, uncompressed, compressed, tokenizer, max_length=512):
        self.uncompressed = uncompressed
        self.compressed = compressed
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """
        Returns the length of the dataset

        Returns
        -------
        int
            Length of the dataset
        """
        return len(self.uncompressed)

    def __getitem__(self, idx):
        """
        Returns a sample from the dataset

        Parameters
        ----------
        idx : int
            Index of the sample to return

        Returns
        -------
        dict
            Sample from the dataset
        """
        uncompressed_text = self.uncompressed[idx]
        compressed_text = self.compressed[idx]

        # Tokenize texts
        source = self.tokenizer.encode_plus(
            uncompressed_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        target = self.tokenizer.encode_plus(
            compressed_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            "input_ids": source["input_ids"].squeeze(),
            "attention_mask": source["attention_mask"].squeeze(),
            "labels": target["input_ids"].squeeze(),
        }

### Initialize the Tokenizer and Model

In [5]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")  # TODO: Change to t5-base
model = T5ForConditionalGeneration.from_pretrained("t5-small")

compression_dataset = TextCompressionDataset(
    dataset["uncompressed"], dataset["compressed"], tokenizer
)

### Create Data Loaders

In [6]:
from sklearn.model_selection import train_test_split

# Split data into training and validation
train_data, val_data = train_test_split(compression_dataset, test_size=0.1)

# Create data loaders
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
val_loader = DataLoader(val_data, batch_size=16)

### Defining the Training Loop

In [8]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)


def train(model, loader):
    """
    Training loop for the model

    Parameters
    ----------
    model : transformers.T5ForConditionalGeneration
        Model to train
    loader : torch.utils.data.DataLoader
        Data loader for the training data

    Returns
    -------
    float
        Average loss of the epoch
    """
    model.train()
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        outputs = model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(loader)


# Example training loop
for epoch in range(3):  # number of epochs
    loss = train(model, train_loader)
    print(f"Epoch {epoch} Loss: {loss}")



KeyboardInterrupt: 

### Evaluate the Model

In [None]:
def evaluate(model, loader):
    """
    Evaluation loop for the model

    Parameters
    ----------
    model : transformers.T5ForConditionalGeneration
        Model to evaluate
    loader : torch.utils.data.DataLoader
        Data loader for the validation data

    Returns
    -------
    float
        Average loss of the epoch
    """
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            labels = batch["labels"]

            outputs = model(
                input_ids=input_ids, attention_mask=attention_mask, labels=labels
            )
            loss = outputs.loss
            total_loss += loss.item()
    return total_loss / len(loader)


# Evaluate the model
val_loss = evaluate(model, val_loader)
print(f"Validation Loss: {val_loss}")

Validation Loss: 3.7860815014157976


### Save the Model

In [None]:
model.save_pretrained('./compression_model')