# Problem Statement:
Develop a machine learning model that takes a concatenated text string without spaces (e.g., `"helloworld"`) as input and outputs the correctly spaced version (e.g., `"hello world"`). The goal is to accurately predict word boundaries and insert spaces in the appropriate positions.

# Let's create a dataset

In [35]:
from datasets import load_dataset, load_from_disk

In [2]:
# Request the "train" split explicitly so `book_corpus` is a single `Dataset`
# (not a `DatasetDict`); the later `.select(...)` call needs a `Dataset`.
# NOTE(review): this dataset repo ships a loading script, so `load_dataset`
# prompts interactively unless `trust_remote_code=True` is passed — only add
# that flag after inspecting the script.
book_corpus = load_dataset("bookcorpus/bookcorpus", split="train")

README.md:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

bookcorpus.py:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

The repository for bookcorpus/bookcorpus contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/bookcorpus/bookcorpus.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/74004228 [00:00<?, ? examples/s]

In [27]:
book_corpus

Dataset({
    features: ['text'],
    num_rows: 74004228
})

In [13]:
def process_text(example):
    """Add a space-free copy of ``example['text']`` under ``'text_no_space'``.

    The record is updated in place and the same object is returned, which is
    the shape `Dataset.map` expects from a per-example function.

    Args:
        example: mapping with a string stored under the key ``'text'``.

    Returns:
        The same ``example`` object, now carrying the extra ``'text_no_space'`` key.
    """
    spaced = example['text']
    # Splitting on the single-space character and re-joining drops every ' '
    # (and only ' '; tabs/newlines are left untouched).
    example['text_no_space'] = "".join(spaced.split(" "))
    return example

In [30]:
# Keep only the first 1,000,000 rows, then add the `text_no_space` column to
# each of them via `process_text`.
text_data = book_corpus.select(range(0, 1_000_000)).map(process_text)

In [31]:
text_data

Dataset({
    features: ['text', 'text_no_space'],
    num_rows: 1000000
})

In [32]:
# Persist the processed subset so later sessions can reload it instead of
# re-running the map step.
text_data.save_to_disk("data/processed_bookcorpus_0")

Saving the dataset (0/1 shards):   0%|          | 0/1000000 [00:00<?, ? examples/s]

In [36]:
# Reload the processed subset from disk; this rebinds `text_data` to the
# on-disk copy.
text_data = load_from_disk("data/processed_bookcorpus_0")

In [43]:
text_data[0:10]

{'text': ['usually , he would be tearing around the living room , playing with his toys .',
  'but just one look at a minion sent him practically catatonic .',
  "that had been megan 's plan when she got him dressed earlier .",
  "he 'd seen the movie almost by mistake , considering he was a little young for the pg cartoon , but with older cousins , along with her brothers , mason was often exposed to things that were older .",
  'she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .',
  "`` are n't you being a good boy ? ''",
  'she said .',
  'mason barely acknowledged her .',
  'instead , his baby blues remained focused on the television .',
  'since the movie was almost over , megan knew she better slip into the bedroom and finish getting ready .'],
 'text_no_space': ['usually,hewouldbetearingaroundthelivingroom,playingwithhistoys.',
  'butjustonelookataminionsenthimpracticallycatatonic.',
  "thathadbeenmegan'splan

In [45]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [66]:
class WordSplitterDataset(Dataset):
    """Turns spaced sentences into per-character word-boundary examples.

    Each item is a pair ``(chars, labels)`` built from the ``"text"`` field of
    the underlying record: ``chars`` holds every non-space character in order,
    and ``labels[k]`` is 1 when ``chars[k]`` was immediately followed by a
    space in the original text, otherwise 0.
    """

    def __init__(self, data):
        """Store ``data``, an indexable collection of records with a ``"text"`` key."""
        self.data = data

    def __len__(self):
        """Return the number of records in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(chars, labels)`` for the record at ``idx``.

        Both results are plain Python lists of equal length (characters and
        0/1 integers respectively); no tensor conversion is done here.
        """
        sentence = self.data[idx]["text"]
        last = len(sentence) - 1

        # Positions of the characters we keep (everything except ' ').
        kept = [pos for pos, ch in enumerate(sentence) if ch != ' ']

        chars = [sentence[pos] for pos in kept]
        # A kept character is a word end when the very next character is a
        # space; the final character has no successor, so it is labelled 0.
        labels = [
            1 if pos < last and sentence[pos + 1] == ' ' else 0
            for pos in kept
        ]
        return chars, labels

In [49]:
# Fixed seed so the shuffled train/val/test partition is identical on every
# run (without it `train_test_split` shuffles differently each time).
SPLIT_SEED = 42

# Hold out 20% of the rows, then cut that held-out part in half to get
# validation and test sets (80% / 10% / 10% overall).
data_splits = text_data.train_test_split(test_size=0.2, seed=SPLIT_SEED)
test_split = data_splits["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)

train_data = data_splits["train"]
val_data = test_split["train"]
test_data = test_split["test"]

In [51]:
train_data, val_data, test_data

(Dataset({
     features: ['text', 'text_no_space'],
     num_rows: 800000
 }),
 Dataset({
     features: ['text', 'text_no_space'],
     num_rows: 100000
 }),
 Dataset({
     features: ['text', 'text_no_space'],
     num_rows: 100000
 }))

In [67]:
# Wrap the train and validation splits so indexing yields (chars, labels)
# pairs; no wrapper is created for the test split here.
word_splitter_train = WordSplitterDataset(train_data)
word_splitter_val =  WordSplitterDataset(val_data)

In [68]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class LSTMWordSplitterModel(nn.Module):
    """Bidirectional LSTM tagger that emits two logits per input position.

    Pipeline: embedding -> bidirectional LSTM -> Linear + ReLU -> Linear(2).
    The two output logits are presumably the 0/1 "space follows this
    character" classes produced by ``WordSplitterDataset`` — confirm against
    the training loop.

    Args:
        n_vocab: number of rows in the embedding table (distinct token ids).
        emd_dim: size of each embedding vector.
        inp_size: feature size the LSTM expects. The embedding output is fed
            straight into the LSTM, so this must equal ``emd_dim``.
        hidden_size: LSTM hidden size *per direction*.
        num_lstm_layers: number of stacked LSTM layers.
    """

    def __init__(self, n_vocab, emd_dim, inp_size, hidden_size,
                 num_lstm_layers):
        super(LSTMWordSplitterModel, self).__init__()

        # embedding layer
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=emd_dim,
        )

        # LSTM layer
        # Inter-layer dropout only exists between stacked layers; passing a
        # non-zero value with a single layer has no effect and makes PyTorch
        # emit a warning, so it is disabled in that case.
        self.lstm = nn.LSTM(input_size=inp_size, hidden_size=hidden_size,
                            num_layers=num_lstm_layers,
                            dropout=0.1 if num_lstm_layers > 1 else 0.0,
                            bidirectional=True, batch_first=True)

        # FCN layer
        # The LSTM is bidirectional, so each position carries the forward and
        # backward hidden states concatenated: 2 * hidden_size features.
        self.first_fcn = nn.Linear(2 * hidden_size, hidden_size)
        self.second_fcn = nn.Linear(hidden_size, 2)

        self.relu = nn.ReLU()

    def forward(self, x, prev_state=None):
        """Run the tagger on a batch of token-id sequences.

        Args:
            x: integer tensor of token ids; with ``batch_first=True`` the
                expected layout is (batch, seq_len).
            prev_state: optional ``(h_0, c_0)`` tuple passed to the LSTM;
                ``None`` lets the LSTM start from zero states.

        Returns:
            ``(logits, state)`` where ``logits`` has a trailing dimension of 2
            (one pair per input position) and ``state`` is the LSTM's final
            ``(h_n, c_n)`` tuple.
        """
        embed = self.embedding(x)
        out, state = self.lstm(embed, prev_state)
        out = self.first_fcn(out)
        out = self.relu(out)
        logits = self.second_fcn(out)
        return logits, state