In [8]:
import pandas as pd

# Login using e.g. `huggingface-cli login` to access this dataset
df = pd.read_csv("hf://datasets/prithivMLmods/Math-IIO-68K-Mini/mathematics_68K.csv")

In [10]:
import csv
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import pandas as pd

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Step 1: Data Preparation

class TextDataset(Dataset):
    def __init__(self, data, max_length=50):
        self.data = []
        self.max_length = max_length
        self.token_to_idx = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
        self.idx_to_token = {v: k for k, v in self.token_to_idx.items()}
        self.vocab_size = len(self.token_to_idx)

        for _, row in data.iterrows():
            input_text = row['input']
            output_text = row['output']
            self.data.append((input_text, output_text))

        self.build_vocab()
        self.data = self.preprocess_data()

    def build_vocab(self):
        token_counter = Counter()
        for input_text, output_text in self.data:
            token_counter.update(input_text.split())
            token_counter.update(output_text.split())

        for token, count in token_counter.items():
            if token not in self.token_to_idx:
                self.token_to_idx[token] = self.vocab_size
                self.idx_to_token[self.vocab_size] = token
                self.vocab_size += 1

    def preprocess_data(self):
        processed_data = []
        for input_text, output_text in self.data:
            input_tokens = ['<SOS>'] + input_text.split() + ['<EOS>']
            output_tokens = ['<SOS>'] + output_text.split() + ['<EOS>']

            input_indices = [self.token_to_idx.get(token, self.token_to_idx['<UNK>']) for token in input_tokens]
            output_indices = [self.token_to_idx.get(token, self.token_to_idx['<UNK>']) for token in output_tokens]

            input_indices = input_indices[:self.max_length]
            output_indices = output_indices[:self.max_length]

            input_indices += [self.token_to_idx['<PAD>']] * (self.max_length - len(input_indices))
            output_indices += [self.token_to_idx['<PAD>']] * (self.max_length - len(output_indices))

            processed_data.append((torch.tensor(input_indices), torch.tensor(output_indices)))
        return processed_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Step 2: Model Architecture

class LSTMTextGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(LSTMTextGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)  # _ is a tuple of (hidden, cell)
        output = self.fc(lstm_out)
        return output

# Step 3: Training the Model

def train(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)  # Move data to GPU
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader)}')

# Step 4: Generating Text

def generate_text(model, input_text, max_length=50):
    model.eval()
    input_tokens = ['<SOS>'] + input_text.split() + ['<EOS>']
    input_indices = [dataset.token_to_idx.get(token, dataset.token_to_idx['<UNK>']) for token in input_tokens]
    input_indices = input_indices[:max_length]
    input_indices += [dataset.token_to_idx['<PAD>']] * (max_length - len(input_indices))
    input_tensor = torch.tensor([input_indices]).to(device)  # Move data to GPU

    with torch.no_grad():
        output_sequence = []
        for _ in range(max_length):
            output = model(input_tensor)
            output_token_idx = torch.argmax(output[0, -1]).item()
            output_sequence.append(output_token_idx)
            input_tensor = torch.tensor([[output_token_idx]]).to(device)  # Move data to GPU
            if output_token_idx == dataset.token_to_idx['<EOS>']:
                break

    output_tokens = [dataset.idx_to_token[idx] for idx in output_sequence]
    return ' '.join(output_tokens)

# Main Execution

# Example DataFrame

max_length = 50
batch_size = 32
embedding_dim = 256
hidden_dim = 512
num_layers = 2
learning_rate = 0.001
num_epochs = 1

dataset = TextDataset(df, max_length)
vocab_size = dataset.vocab_size
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = LSTMTextGenerator(vocab_size, embedding_dim, hidden_dim, num_layers).to(device)  # Move model to GPU
criterion = nn.CrossEntropyLoss(ignore_index=dataset.token_to_idx['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

train(model, train_loader, criterion, optimizer, num_epochs)

# Example of generating text
input_text = "example input sentence"
generated_text = generate_text(model, input_text)
print(f'Generated Text: {generated_text}')

Using device: cuda
Epoch [1/1], Loss: 1.0585394299030304
Generated Text: <EOS>
