# Purpose
The goal of this is to explore the current setup and find an efficient way to run simulations during our search process

### Recreate Main
We want to start up the board as main does, but run multiple plays rather than just doing one and terminating

In [1]:
# Import
import scrabbler as sc
import random 

# Q learning with value function approximation

In [None]:
import random
import numpy as np
import tensorflow as tf
from scrabble_environment import ScrabbleEnvironment

# Define hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.1  # Exploration rate
num_episodes = 1000  # Number of training episodes
batch_size = 32  # Batch size for experience replay
num_hidden_units = 32  # Number of hidden units in the neural network

# Define the function approximator
inputs = tf.keras.Input(shape=(15, 15, 27), name='board')  # Input layer
flatten = tf.keras.layers.Flatten()(inputs)  # Flatten the input
hidden = tf.keras.layers.Dense(num_hidden_units, activation='relu')(flatten)  # Hidden layer
outputs = tf.keras.layers.Dense(7)(hidden)  # Output layer
model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Define the loss function and optimizer
loss_fn = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam()

# Define the Scrabble environment
env = ScrabbleEnvironment()

# Define the replay buffer
replay_buffer = []

# Main loop for training
for episode in range(num_episodes):
    state = env.reset()
    done = False
    total_reward = 0
    
    while not done:
        # Choose an action
        if random.uniform(0, 1) < epsilon:
            action = random.randint(0, 6)
        else:
            q_values = model.predict(np.array([state]))
            action = np.argmax(q_values[0])
        
        # Take the action and observe the next state and reward
        next_state, reward, done = env.step(action)
        total_reward += reward
        
        # Add the experience to the replay buffer
        replay_buffer.append((state, action, reward, next_state, done))
        
        # Update the Q-values using experience replay
        if len(replay_buffer) >= batch_size:
            # Sample a batch of experiences from the replay buffer
            batch = random.sample(replay_buffer, batch_size)
            
            # Compute the target Q-values
            targets = []
            states = []
            for experience in batch:
                state, action, reward, next_state, done = experience
                if done:
                    target = reward
                else:
                    q_values = model.predict(np.array([next_state]))
                    target = reward + gamma * np.max(q_values[0])
                targets.append(target)
                states.append(state)
            targets = np.array(targets)
            states = np.array(states)
            
            # Update the Q-values using gradient descent
            with tf.GradientTape() as tape:
                q_values = model(states, training=True)
                actions_one_hot = tf.one_hot(batch[:, 1], depth=7)
                q_values = tf.reduce_sum(actions_one_hot * q_values, axis=1)
                loss = loss_fn(targets, q_values)
            grads = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))
            
    # Decay the exploration rate
    epsilon = max(0.1, epsilon * 0.99)
    
    # Print the total reward for the episode
    print(f'Episode {episode}: Total reward = {total_reward}')


# Other
Vectorize the leaves

In [2]:
letters_left =['V','N','A','A'] # Example leave


def vectorizedLetterCombos(letters):
    vectorized = [0]*26 # Our vectorized version of the leaves which we will update to represent the leaves below
    for letter in letters:
        index = ord(letter) - 65
        vectorized[index] += 1
    return vectorized
vectorizedLetterCombos(letters_left)

    

[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]

Now, let's vectorized what hasn't been seen yet, ie what's left in the bag + the person's hand

In [6]:
# BAG example:

#We will initialize this at the start of the game, and as the game progress, update this list
unseen = [
    "A","A",
    "B",
    "K",
    "L","L","L","L",
    "M", "M", 
    "N","N","N","N","N","N",
    "O","O","O",
    "P", "P",
    "Q",
    "R","R","R","R","R","R",
    "S","S","S","S",
    "T","T","T","T",
    "U",
    "V", "V",
    "W", "W",
    "X",
    "Y", "Y"
    ]
player_rack = ["A","C","D","A","E","M","N"]

vec_bag = vectorizedLetterCombos(unseen) + vectorizedLetterCombos(player_rack) # the vectorized version of unseen tiles
vec_bag



[2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 2, 6, 3, 2, 1, 6, 4, 4, 1, 2, 2, 1, 2, 0]