In [None]:
!pip install torch transformers numpy



In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
import random

# Load a pre-trained tokenizer and model for embeddings.
# One constant names the checkpoint so the tokenizer and the encoder cannot
# drift apart (a tokenizer from one checkpoint paired with weights from
# another would feed the model token ids it was not trained on).
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# The model is used for inference only; make evaluation mode (dropout off)
# explicit rather than relying on the loader's default.
model.eval()

# Text embedding utility
def get_text_embedding(text):
    """Embed text with the module-level ``tokenizer``/``model`` by mean pooling.

    Args:
        text: A string or a list of strings; passed unchanged to ``tokenizer``.

    Returns:
        A NumPy array holding, for each input text, the average of the
        model's ``last_hidden_state`` vectors over the token axis (dim 1).
        Positions whose ``attention_mask`` is 0 (padding) are left out of the
        average, so the embedding of a text does not depend on how long the
        other texts in the same batch are.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: skip autograd bookkeeping instead of building a graph
    # and detaching from it afterwards.
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
        # Broadcast the mask over the hidden dimension so padded token
        # vectors contribute nothing to the sum.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled = (hidden * mask).sum(dim=1) / token_counts
    return pooled.cpu().numpy()

class TextSequenceEnvironment:
    """Environment in which an agent reorders text sequences by pairwise swaps.

    The state is a permutation of ``range(len(sequences))``. An action is a
    pair of positions ``(i, j)`` to swap. The reward after each swap is the
    sum of dot products between the embeddings of adjacent sequences in the
    current ordering.

    Args:
        sequences: The texts to order. Embeddings are cached per index, so
            the list is expected not to change after construction.
        max_steps: Number of swaps allowed per episode. ``None`` (default)
            means ``len(sequences)``, with a minimum of 1.

    Raises:
        ValueError: If ``max_steps`` is given and is smaller than 1.
    """

    def __init__(self, sequences, max_steps=None):
        self.sequences = sequences
        self.n = len(sequences)
        self.current_state = list(range(self.n))  # Start with default order
        self.target_order = None  # Optional: Define a "perfect" target order for evaluation
        self.max_steps = max(1, self.n) if max_steps is None else max_steps
        if self.max_steps < 1:
            raise ValueError("max_steps must be at least 1")
        self.steps_taken = 0
        # index into ``sequences`` -> flat embedding vector
        self._embedding_cache = {}

    def reset(self):
        """Shuffle the ordering, restart the step counter, and return the state.

        Returns:
            A copy of the current ordering. A copy is returned so that later
            swaps do not silently rewrite states the caller has stored.
        """
        random.shuffle(self.current_state)
        self.steps_taken = 0
        return list(self.current_state)

    def step(self, action):
        """Swap two positions of the current ordering.

        Args:
            action: A pair ``(i, j)`` of positions in the ordering to swap.

        Returns:
            ``(state, reward, done)`` where ``state`` is a copy of the new
            ordering, ``reward`` is a Python float, and ``done`` is True once
            ``max_steps`` swaps have been made since the last reset.
        """
        i, j = action
        self.current_state[i], self.current_state[j] = self.current_state[j], self.current_state[i]
        self.steps_taken += 1
        reward = self._evaluate_sequence()
        done = self._is_terminal()
        return list(self.current_state), reward, done

    def _embedding(self, index):
        """Return the flattened embedding of ``sequences[index]``, computing it once."""
        if index not in self._embedding_cache:
            vector = np.asarray(get_text_embedding(self.sequences[index]))
            self._embedding_cache[index] = vector.reshape(-1)
        return self._embedding_cache[index]

    def _evaluate_sequence(self):
        """Reward: sum of dot products between embeddings of adjacent items."""
        vectors = [self._embedding(i) for i in self.current_state]
        # float() so callers get a scalar rather than a nested (1, 1) array;
        # with fewer than two items the sum is empty and the reward is 0.0.
        return float(sum(np.dot(a, b) for a, b in zip(vectors, vectors[1:])))

    def _is_terminal(self):
        """The episode ends when the per-episode swap budget is used up."""
        return self.steps_taken >= self.max_steps

    def valid_actions(self):
        """Return every unordered pair of positions ``(i, j)`` with ``i < j``."""
        size = len(self.current_state)
        return [(i, j) for i in range(size) for j in range(i + 1, size)]


In [None]:
class DQN(nn.Module):
    """Fully connected Q-network with two hidden layers of 128 units.

    Takes an input whose last dimension is ``state_size`` and produces
    ``action_size`` outputs, one per action.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Apply ReLU after each hidden layer; the output layer stays linear."""
        activations = x
        for hidden_layer in (self.fc1, self.fc2):
            activations = nn.functional.relu(hidden_layer(activations))
        return self.fc3(activations)


In [None]:
class DQNAgent:
    """Epsilon-greedy Q-learning agent backed by a ``DQN`` and a replay memory.

    Args:
        state_size: Length of the state vector fed to the network.
        action_size: Number of discrete actions (network outputs).
        lr: Learning rate of the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial probability of choosing a random action.
        epsilon_decay: Factor applied to ``epsilon`` after each training pass.
        epsilon_min: Lower bound for ``epsilon``.
        batch_size: Number of transitions sampled per ``replay`` call.
        memory_size: Maximum number of stored transitions; the oldest one is
            dropped when the limit is exceeded. ``None`` disables the limit.

    Raises:
        ValueError: If ``memory_size`` is smaller than ``batch_size`` (the
            agent could then never train).
    """

    def __init__(self, state_size, action_size, lr=0.001, gamma=0.99, epsilon=1.0, epsilon_decay=0.995,
                 epsilon_min=0.01, batch_size=32, memory_size=10000):
        if memory_size is not None and memory_size < batch_size:
            raise ValueError("memory_size must be at least batch_size")
        self.state_size = state_size
        self.action_size = action_size
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.gamma = gamma
        self.memory = []
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.q_network = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay memory.

        States are copied and the reward is reduced to a Python float, so a
        caller that keeps mutating its state list or reward array afterwards
        cannot change what was recorded.
        """
        reward_value = float(np.asarray(reward).item())
        self.memory.append((list(state), action, reward_value, list(next_state), bool(done)))
        if self.memory_size is not None and len(self.memory) > self.memory_size:
            del self.memory[0]

    def act(self, state):
        """Pick an action index: random with probability ``epsilon``, else greedy."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state_tensor = torch.as_tensor(state, dtype=torch.float32)
        # Action selection never backpropagates, so skip gradient tracking.
        with torch.no_grad():
            q_values = self.q_network(state_tensor)
        return torch.argmax(q_values).item()

    def replay(self):
        """Train on a random batch of stored transitions, one sample at a time.

        Does nothing until at least ``batch_size`` transitions are stored.
        After training, ``epsilon`` is decayed but never below ``epsilon_min``.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in batch:
            # Work on a fresh float: an in-place "+=" on an array reward would
            # rewrite the value held in the replay memory.
            target = float(np.asarray(reward).item())
            if not done:
                next_state_tensor = torch.as_tensor(next_state, dtype=torch.float32)
                with torch.no_grad():
                    target += self.gamma * torch.max(self.q_network(next_state_tensor)).item()
            state_tensor = torch.as_tensor(state, dtype=torch.float32)
            q_values = self.q_network(state_tensor)
            # clone() is required: detach() alone shares storage with
            # q_values, and editing it would make prediction == target.
            target_f = q_values.detach().clone()
            target_f[action] = target
            self.optimizer.zero_grad()
            loss = self.loss_fn(q_values, target_f)
            loss.backward()
            self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


In [None]:
# Sample text sequences
sequences = [
    "The cat sat on the mat.",
    "It was a sunny day.",
    "The dog barked loudly.",
    "A bird flew across the sky."
]

# Seed every RNG the run draws from (shuffling, exploration, weight init)
# so that Restart & Run All gives comparable results.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Initialize environment and agent
env = TextSequenceEnvironment(sequences)
state_size = len(sequences)
# The list of swaps depends only on the number of sequences, so build it once
# instead of on every step.
swap_actions = env.valid_actions()
action_size = len(swap_actions)
agent = DQNAgent(state_size, action_size)

# Training loop
episodes = 500
log_every = 50  # print a progress line every this many episodes
for episode in range(episodes):
    # Copy the state: the environment may hand out its own mutable list, and
    # the transition stored below must keep the ordering as it was before the swap.
    state = list(env.reset())
    done = False
    total_reward = 0.0
    while not done:
        action = agent.act(state)
        next_state, reward, done = env.step(swap_actions[action])
        next_state = list(next_state)
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        # The reward may arrive as a nested one-element array; accumulate a scalar.
        total_reward += float(np.asarray(reward).item())
    agent.replay()
    if (episode + 1) % log_every == 0 or episode + 1 == episodes:
        print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward:.4f}")

print("Training complete!")


Episode 1/500, Total Reward: [[187.0863]]
Episode 2/500, Total Reward: [[196.15091]]
Episode 3/500, Total Reward: [[202.6315]]
Episode 4/500, Total Reward: [[194.81343]]
Episode 5/500, Total Reward: [[196.15091]]
Episode 6/500, Total Reward: [[206.80132]]
Episode 7/500, Total Reward: [[202.6315]]
Episode 8/500, Total Reward: [[199.43164]]
Episode 9/500, Total Reward: [[199.43164]]
Episode 10/500, Total Reward: [[214.97684]]
Episode 11/500, Total Reward: [[187.0863]]
Episode 12/500, Total Reward: [[196.15091]]
Episode 13/500, Total Reward: [[196.15091]]
Episode 14/500, Total Reward: [[205.55478]]
Episode 15/500, Total Reward: [[195.26183]]
Episode 16/500, Total Reward: [[195.26183]]
Episode 17/500, Total Reward: [[205.55478]]
Episode 18/500, Total Reward: [[205.91223]]
Episode 19/500, Total Reward: [[205.55478]]
Episode 20/500, Total Reward: [[205.91223]]
Episode 21/500, Total Reward: [[196.50836]]
Episode 22/500, Total Reward: [[207.24971]]
Episode 23/500, Total Reward: [[214.97684]]
E

In [None]:
# Evaluate the learned policy greedily. agent.act() picks a random action
# with probability agent.epsilon, so exploration is switched off for the
# rollout and the previous value is restored afterwards.
trained_epsilon = agent.epsilon
agent.epsilon = 0.0
try:
    eval_actions = env.valid_actions()
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state)
        state, _, done = env.step(eval_actions[action])
finally:
    agent.epsilon = trained_epsilon

# Output the ordered sequences
ordered_sequences = [sequences[i] for i in state]
print("Ordered Sequences:", ordered_sequences)


Ordered Sequences: ['A bird flew across the sky.', 'The cat sat on the mat.', 'It was a sunny day.', 'The dog barked loudly.']
