# All imports

In [1]:
import os
import numpy as np
import random
from tqdm import tqdm
import gymnasium as gym
from collections import deque, namedtuple
from collections import defaultdict
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from gymnasium.envs.toy_text.frozen_lake import generate_random_map

# w2v required
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics.pairwise import cosine_similarity

# DQN required
import argparse
import numpy as np
# import logging
# from matplotlib import animation # will be needed for rendering

# All global variables needed
Since some or all of these variables are needed for each cell below, it's difficult to put these inside the main function.\
Perhaps, using some args method might work?

## Common Functions needed

### Function to generate custom map

In [2]:
# ============================================================================================
# Function to generate custom map
# ============================================================================================
def make_env(env_name, env_dim = 4, seed = 42, stochastic = False):
    env = gym.make(env_name, desc=generate_random_map(size=env_dim, seed=seed), 
                   is_slippery = stochastic, render_mode = 'rgb_array')
    return env

## Global Variables

In [10]:
'''
env_name: str
env_dim: int --> Dimension of the game: 4x4 or 8x8
seed
stochastic = boolean --> Whether we use is_slippery = True or False
 '''
env_name = "FrozenLake-v1"
env_dim = 4
stochastic = False
seed = 42
gamma = 0.99 # discount factor in Q computation
alpha = 0.1 # learning rate in the table
num_episodes_q_table = 100_000
convergence_threshold = 1e-4
epsilon_start = 1
epsilon_decay_q_table = 0.99995
epsilon_end = 0.01
check_env_details = True

# Creating the environment
env = make_env(env_name=env_name, env_dim=env_dim, seed = seed, stochastic=stochastic)
state_dim = env.observation_space.n
action_dim = env.action_space.n
print("State space: ", env.observation_space.n)
print("Action space: ", env.action_space.n)

# state and trajectories related variables
num_episodes_trajectories = 10_000
num_states = state_dim
num_actions = action_dim
max_eps_len = 100
modified = "medium" # perfect - for perfect trajectories, random - for purely random trajectories or "medium" for the combined trajs

# w2v related variables
''' Potential values for embedding dimensions = {4, 8, 12, 16, 20, 32, 64} '''
# w2v hyperparameters
embed_dim = 32
window_size = 2
batch_size = 16
w2v_epochs = 60
w2v_lr = 0.01
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# DQN variables
parser = argparse.ArgumentParser()
parser.add_argument("--env", default="FrozenLake-v1")          # Gymnasium environment name # Default = MountainCar-v0
parser.add_argument("--seed", default=42, type=int)              # sets Gym, PyTorch and Numpy seeds
parser.add_argument("--n-episodes", default=2500, type=int)     # maximum number of training episodes
parser.add_argument("--batch-size", default=64, type=int)       # training batch size
parser.add_argument("--discount", default=0.99)                 # discount factor
parser.add_argument("--lr", default=5e-4)                       # learning rate
parser.add_argument("--tau", default=0.001)                     # soft update of target network
parser.add_argument("--max-size", default=int(1e5),type=int)    # experience replay buffer length
parser.add_argument("--update-freq", default=4, type=int)       # update frequency of target network
parser.add_argument("--gpu-index", default=0,type=int)		      # GPU index
parser.add_argument("--max-esp-len", default=100, type=int)    # maximum time of an episode
#exploration strategy
parser.add_argument("--epsilon-start", default=1)               # start value of epsilon
parser.add_argument("--epsilon-end", default=0.01)              # end value of epsilon
parser.add_argument("--epsilon-decay", default=0.995)           # decay value of epsilon
parser.add_argument("--save-filename", default = "dqn_w2v_FrozenLake-v1" )
args, unknown = parser.parse_known_args()

# Filename for saving purposes - common part for all files
filename = f"{env_name}_map_size_{env_dim}_stochastic_{stochastic}_seed_{seed}"

# This folder path will be used to store all the saved models and the associated data
runs_folder_path = "mdp/runs_frozen"

# Check if the folder exists
if not os.path.exists(runs_folder_path):
    # Create the folder
    os.makedirs(runs_folder_path)
    print(f"Folder '{runs_folder_path}' created successfully.")
else:
    print(f"Folder '{runs_folder_path}' already exists.")


State space:  16
Action space:  4
Folder 'mdp/runs_frozen' already exists.


# Tabular Q learning

### Q-Learning Agent class

In [None]:
# ============================================================================================
# Q-Learning Agent class
# ============================================================================================
class QLearningAgent:
    """Q-learning agent."""
    def __init__(self, num_states, num_actions, gamma=0.99, epsilon=0.1, alpha=0.1):
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.q_table = np.zeros((num_states, num_actions))  # Initialize Q-table

    def choose_action(self, state, epsilon):
        """Epsilon-greedy action selection."""
        if random.uniform(0, 1) < epsilon:
            return random.randint(0, self.num_actions - 1)  # Explore
        else:
            return np.argmax(self.q_table[state, :])  # Exploit

    def update_q_value(self, state, action, reward, next_state):
        """Q-learning update rule."""
        best_next_action = np.argmax(self.q_table[next_state, :])  # Greedy action for next state
        td_target = reward + self.gamma * self.q_table[next_state, best_next_action]
        td_error = td_target - self.q_table[state, action]
        self.q_table[state, action] += self.alpha * td_error  # Update Q-table

    def get_optimal_policy(self):
        """Extracts the optimal policy after training."""
        return np.argmax(self.q_table, axis=1)

### Function to train the model using Q-learning for Frozen Lake

In [None]:
# ============================================================================================
# Function to train the model using Q-learning for Frozen Lake
# ============================================================================================
def run_tabular_q_frozen(env, agent, num_episodes=10, max_eps_len = 100, convergence_threshold=1e-4,
                         epsilon_start = 1, epsilon_decay = 0.995, epsilon_end = 0.01, seed=42):
    reward_curve = [] # this will store the moving avg of rewards
    moving_window = deque(maxlen=100)
    epsilon = epsilon_start
    prev_q_table = np.copy(agent.q_table)  # Store old Q-table

    for episode in tqdm(range(num_episodes)):
        state,_ = env.reset(seed=seed)
        # print(f"\nIn episode {episode}, After reset initial state = {state} and epsilon = {epsilon}")
        curr_reward = 0
        flag = False

        for _ in range(max_eps_len):
            action = agent.choose_action(state, epsilon)
            next_state, reward, done, _, _ = env.step(action)
            agent.update_q_value(state, action, reward, next_state)
            state = next_state
            curr_reward += reward

            # # Compute max Q-value change
            # q_change = np.max(np.abs(agent.q_table - prev_q_table))
            # prev_q_table = np.copy(agent.q_table)

            # # Check Q-value convergence
            # if q_change < convergence_threshold:
            #     print(f"Q-values converged at Episode {episode+1} with max Q-change: {q_change}")
            #     flag = True
            #     break

            if done:
                break
        # end while inside an episode
        
        # Epsilon decay performed at the end of each episode
        epsilon *= epsilon_decay
        epsilon = max(epsilon, epsilon_end)

        # Appending the smoothened reward
        moving_window.append(curr_reward)
        reward_curve.append(np.mean(moving_window))

        if episode % 1000 == 0:
            print(f"Tabular Q: Episode {episode}: epsilon = {epsilon}, avg reward = {np.mean(moving_window)}")
        # end if

        # if flag:
        #     break
    # end for num_episode

    return agent.q_table, agent.get_optimal_policy(), reward_curve

### Running the Tabular Q learning

In [None]:
# env = make_env(env_name=env_name, env_dim=env_dim, seed = seed, stochastic=stochastic)

# Setting seeds
np.random.seed(seed)
random.seed(seed)

if check_env_details:
    # Extract the environment description (grid layout)
    lake_grid = env.unwrapped.desc  # Gets the grid representation

    # Print state-to-symbol mapping
    print("Frozen Lake Grid Layout:")
    for row in lake_grid:
        print(" ".join(row.astype(str)))

    goal_state = None
    rows, cols = lake_grid.shape
    for i in range(rows):
        for j in range(cols):
            if lake_grid[i, j] == b'G':  # 'G' is stored as a byte-string
                goal_state = i * cols + j  # Convert (row, col) to state number
                break
        # end for j
    # end for i
    print(f"Goal State: {goal_state}")
# end if check_env

state_dim = env.observation_space.n
action_dim = env.action_space.n
print("State space: ", env.observation_space.n)
print("Action space: ", env.action_space.n)

learner = QLearningAgent(num_states=state_dim, num_actions=action_dim, gamma=gamma
                            , epsilon=epsilon_start, alpha=alpha) # Creating the learning Agent

final_q_table, final_policy, reward_curve = run_tabular_q_frozen(
                env, learner, num_episodes=num_episodes_q_table, max_eps_len=max_eps_len, convergence_threshold=convergence_threshold,
                epsilon_start = epsilon_start, epsilon_decay = epsilon_decay_q_table, epsilon_end = epsilon_end, seed=seed)

Val_f = np.max(final_q_table, axis=1)

state = 0
# Define action map
action_map = {
0: "Left",
1: "Down",
2: "Right",
3: "Up"
}
print("State: Type -    V(s),    action taken")
lake_grid = env.unwrapped.desc  # Gets the grid representation
for row in lake_grid:
    for cell in row:
        print(f"     {state}:   {cell.decode('utf-8')} - {Val_f[state]:.2f}, {final_policy[state]}-->{action_map[final_policy[state]]}")  # Convert byte to string
        state += 1
# assert False, "c1"

# Print the final table and policy
print("Final Q function: ", final_q_table)
# print("Final Policy: ", final_policy)
# print("Final Value function: ", Val_f)

# # Plot heatmap of the Value function
# plt.figure(figsize=(5,5))
# plt.imshow(Val_f.reshape(4,4), cmap="coolwarm", interpolation="nearest")
# for i in range(4):
#     for j in range(4):
#         plt.text(j, i, f"{Val_f[i*4+j]:.2f}", ha='center', va='center', color='black')


# Plot the reward curve



# Save the current Q-function
save_model = f"{runs_folder_path}/Q_table_{filename}.npy"
np.save(save_model, final_q_table)

# Collecting Trajectories
Here we may need to ensure that the trajectories sufficiently explore each state.\
Thus, we may want to start from specific states when reseting the environment during trajectory collection.

In [None]:
def choose_action(q_table, state, epsilon):
    """Epsilon-greedy action selection."""
    if random.uniform(0, 1) < epsilon:
        return random.randint(0, num_actions - 1)  # Explore
    else:
        return np.argmax(q_table[state, :])  # Exploit

In [None]:
q_table = np.load(f"{runs_folder_path}/Q_table_{filename}.npy")
# env = make_env(env_name=env_name, env_dim=env_dim, seed = seed, stochastic=stochastic)
flag = False
traj = []
tuple_traj = False
for e in tqdm(range(num_episodes_trajectories)):
    if modified=="perfect":
        _, _ = env.reset()
        epsilon = 0.01
        state = int(e%16)
        if flag==False:
            print(f"We're dealing with {modified} trajs!")
            flag = True
    # end if perfect

    elif modified=="random":
        epsilon = 1
        if random.uniform(0, 1) < 0.1:
            if random.uniform(0, 1) < 0.5:
                state = 3
            elif random.uniform(0, 1) < 0.65:
                state = 6
            else:
                state = 7
            # end if choosing state
        else:
            state, _ = env.reset(seed=seed)
        
        if flag==False:
            print(f"We're dealing with {modified} trajs!")
            flag = True
    # end if random trajectories
    
    else:
        # continue    
        if e<0.1*num_episodes_trajectories:
          epsilon = 1
        elif e<0.2*num_episodes_trajectories:
          epsilon = 0.8
        elif e<0.3*num_episodes_trajectories:
          epsilon = 0.6
        elif e<0.4*num_episodes_trajectories:
          epsilon = 0.4
        elif e<0.5*num_episodes_trajectories:
          epsilon = 0.2
        elif e<0.6*num_episodes_trajectories:
          epsilon = 0.1
        elif e<0.7*num_episodes_trajectories:
          epsilon = 0.05
        else:
          epsilon = 0.01
        # end epsilon if
        if random.uniform(0, 1) < 0.1:
            state, _ = env.reset(seed=seed)
            if random.uniform(0, 1) < 0.5:
                state = 3
            elif random.uniform(0, 1) < 0.65:
                state = 6
            else:
                state = 7
            # end if choosing state
        else:
            state, _ = env.reset(seed=seed)
        
        if flag==False:
            print(f"We're dealing with {modified} trajs!")
            flag = True
        # end flag if
    
    if not tuple_traj:
       traj.append('s_'+str(state))
    '''
    Action Stochasticity (is_slippery=True): The seed affects how the agent slips 
    (randomly moves instead of following the chosen action).
    Random Hole Placement (if map is generated dynamically): If the map has random 
    elements, different seeds can affect the placement of H (holes).
    '''
    curr_reward = 0
    rep_count = 0
    for t in range(max_eps_len):
      action = choose_action(q_table, state, epsilon) #To be implemented
      n_state,reward,done,_,_ = env.step(action)
      
      # We store the current tuple
      if tuple_traj:
        traj.append((state, action, reward, n_state, done))
      else:
        temp = 's_'+str(n_state)
        # traj.append('s_'+str(n_state))
        traj.append(temp)

      state = n_state
      curr_reward += reward
      if done:
        if rep_count>=5: # This forces repetitions to occur when done becomes True thereby repeating ending states
            if not tuple_traj:
                if temp=='s_15':
                    traj.append('s_'+str(16))
                else:
                    traj.append('s_'+str(17))
            break
        else:
            if not tuple_traj:
                traj.append(temp)
        rep_count+=1
    # end for
# end for
if tuple_traj:
    save_file_name = f"{runs_folder_path}/modified_tuple_trajectories_{filename}.npy"
    np.save(save_file_name, traj)
else:
    save_file_name = f"{runs_folder_path}/modified_{modified}_trajectories_{filename}.npy"
    np.save(save_file_name, traj)
print(f"Trajectories Saved in {save_file_name}!")

# assert False, "No w2v business here"

# Performing w2v

In [None]:
traj_file = f"{runs_folder_path}/modified_{modified}_trajectories_{filename}.npy"
print("The trajectories being used are: ", traj_file)

text = np.load(traj_file)
np.random.seed(seed=seed)
torch.manual_seed(seed)

### Now we define the functions for word2vec algorithm

In [None]:
def build_vocab(text): # Again we already have a vocabulary hence don't need to use this function directly
    word_counts = Counter(text)
    vocab = {word: i for i, word in enumerate(word_counts.keys())}
    reverse_vocab = {i: word for word, i in vocab.items()}
    return vocab, reverse_vocab, word_counts

def generate_skipgram_pairs(text, window_size=2): # This function gives the word and context pairs.
    # words = tokenize_text(text)
    words = text
    pairs = []
    for i, target_word in enumerate(words):
        window_start = max(i - window_size, 0)
        window_end = min(i + window_size + 1, len(words))
        for j in range(window_start, window_end):
            if i != j:
                pairs.append((words[i], words[j]))
    return pairs

vocab,_,wcounts = build_vocab(text)
print("vocab: ", vocab)
print("word counts: ", wcounts)
# assert False, "Checking the word counts!"

### Word2vec class definition

In [None]:
# ============================================================================================
# Classes: Word2vec Dataset creator and SkipGram model
# ============================================================================================

class Word2VecDataset(Dataset):
    def __init__(self, text, vocab, window_size=2):
        self.vocab = vocab
        self.data = generate_skipgram_pairs(text, window_size)
        self.vocab_size = len(vocab) # why is this needed here?

    def __len__(self): # what does this function do?
        return len(self.data)

    def __getitem__(self, idx):
        target, context = self.data[idx]
        ''' Doesn't the above idx reflect the idx numbered pair instead in pairs instead of the pairs
        corresponding to the word at idx? '''
        target_idx = torch.tensor(self.vocab[target], dtype=torch.long)
        context_idx = torch.tensor(self.vocab[context], dtype=torch.long)
        # target_idx = self.vocab[target]
        # context_idx = self.vocab[context]

        return target_idx, context_idx
    

    # ============================================================================================
# Class: SkipGram using softmax over entire vocabulary
# ============================================================================================

class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.embedding_dim = embedding_dim

        # Input word embedding
        self.in_embedding = nn.Embedding(vocab_size, embedding_dim)
        # Output word embedding (used for context words)
        self.out_embedding = nn.Embedding(vocab_size, embedding_dim)

        # Initialize weights (better stability)
        init_range = 0.5 / embedding_dim
        self.in_embedding.weight.data.uniform_(-init_range, init_range)
        self.out_embedding.weight.data.uniform_(-init_range, init_range)

    def forward(self, center_word_idx):
        """Compute word embeddings and softmax probabilities for the context words."""
        center_embed = F.relu(self.in_embedding(center_word_idx))  # Shape: (batch_size, embedding_dim)
        scores = torch.matmul(center_embed, self.out_embedding.weight.T)  # Compute dot product
        y_pred = torch.softmax(scores, dim=1)  # Apply softmax over output vocab
        return y_pred

    def get_word_vector(self, word_idx):
        """Return the learned embedding vector for a given word index."""
        return self.in_embedding(word_idx).detach().cpu().numpy()

### Training function

In [None]:
# ============================================================================================
# Function: to train the w2v skipgram model
# ============================================================================================

def train_skipgram(model, data_loader, epochs=6, lr=0.01, device = device):
    """Train the SkipGram model using Adam optimizer."""
    criterion = nn.CrossEntropyLoss()  # Cross-entropy for multi-class classification
    optimizer = optim.Adam(model.parameters(), lr)

    for epoch in tqdm(range(epochs)):
        total_loss = 0
        for center_word_idx, context_word_idx in data_loader:
            center_word_idx = center_word_idx.to(device)
            context_word_idx = context_word_idx.to(device)

            optimizer.zero_grad()
            y_pred = model(center_word_idx)  # Forward pass
            loss = criterion(y_pred, context_word_idx)  # Compute loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update weights

            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

    return model

### Main function for w2v

In [None]:
# ============================================================================================
#  Main function for calling the w2v agent
# ============================================================================================
# First we create the dataset and dataloader
vocab, _, _ = build_vocab(text)
dataset = Word2VecDataset(text, vocab, window_size)
dataloader = DataLoader(dataset, batch_size, shuffle=True)
model = SkipGram(vocab_size=len(vocab), embedding_dim=embed_dim).to(device)
model = train_skipgram(model, dataloader, epochs = w2v_epochs, lr = w2v_lr)

# ============================================================================================
#  Saving the w2v generated vector embeddings as a dictionary
# ============================================================================================
word_embeddings = {}
for word in vocab:
    word_idx = torch.tensor([vocab[word]], dtype=torch.long).to(device)
    updated_embedding = model.get_word_vector(word_idx)
    # print(f"Updated embedding for '{word}': {updated_embedding}")
    # Store the embedding in the dictionary
    word_embeddings[word] = updated_embedding.flatten()  # Flatten to 1D array

# Saving the w2v model
save_w2v_file = f"{runs_folder_path}/modified_{modified}_w2v_embed_dim_{embed_dim}_{filename}_epochs_{w2v_epochs}.npy"
np.save(save_w2v_file, word_embeddings)
print("W2v vetors stored in: ", save_w2v_file)

### Similarity check

In [None]:
# ============================================================================================
# Similarity checking and visualizing 
# ============================================================================================
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity

# Load saved embeddings
word_embeddings = np.load(f"{runs_folder_path}/modified_{modified}_w2v_embed_dim_{embed_dim}_{filename}_epochs_{w2v_epochs}.npy", 
                          allow_pickle=True).item()

# Convert to a NumPy array for fast computation
words = list(word_embeddings.keys())
vectors = np.array(list(word_embeddings.values()))

# Compute the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(vectors)

# Function to find top-N similar words
def find_similar_words(target_word, top_n=5):
    if target_word not in word_embeddings:
        print(f"Word '{target_word}' not found in vocabulary.")
        return []

    # Get index of target word
    target_idx = words.index(target_word)

    # Get similarity scores for the target word
    similarity_scores = cosine_sim_matrix[target_idx]

    # Get top-N most similar words (excluding itself)
    similar_indices = np.argsort(similarity_scores)[::-1][1:top_n+1]  # Sort in descending order

    # Return words with their similarity scores
    return [(words[i], similarity_scores[i]) for i in similar_indices]

# Example usage
for target_word in word_embeddings:
    top_similar_words = find_similar_words(target_word, top_n=5)

    print(f"Top 5 words similar to '{target_word}':")
    for word, score in top_similar_words:
        print(f"{word}: {score:.4f}")

# Now we need to ensure that the same states are sampled to compare the performance of w2v imbued DQN and Vanilla DQN. 
Thus we reset np, random and torch.manual_seed before starting both the following cells

# DQN common set up

### Defining all classes
We define the following classes:
1. Experience Replay
2. DQNAgent 

In [11]:
class ExperienceReplay:
	"""
	Based on the Replay Buffer implementation of TD3
	Reference: https://github.com/sfujim/TD3/blob/master/utils.py
	"""
	def __init__(self, state_dim, action_dim,max_size,batch_size,gpu_index=0):
		self.max_size = max_size
		self.ptr = 0
		self.size = 0
		self.state = np.zeros((max_size, state_dim))
		self.action = np.zeros((max_size, action_dim))
		self.next_state = np.zeros((max_size, state_dim))
		self.reward = np.zeros((max_size, 1))
		self.done = np.zeros((max_size, 1))
		self.batch_size = batch_size
		self.device = torch.device('cuda', index=gpu_index) if torch.cuda.is_available() else torch.device('cpu')


	def add(self, state, action,reward,next_state, done):
		self.state[self.ptr] = state
		self.action[self.ptr] = action
		self.next_state[self.ptr] = next_state
		self.reward[self.ptr] = reward
		self.done[self.ptr] = done
		self.ptr = (self.ptr + 1) % self.max_size
		self.size = min(self.size + 1, self.max_size)

	def sample(self):
		ind = np.random.randint(0, self.size, size=self.batch_size)

		return (
			torch.FloatTensor(self.state[ind]).to(self.device),
			torch.FloatTensor(self.action[ind]).long().to(self.device),
			torch.FloatTensor(self.reward[ind]).to(self.device),
			torch.FloatTensor(self.next_state[ind]).to(self.device),
			torch.FloatTensor(self.done[ind]).to(self.device)
		)



## Class of QNetwork with w2v
Here, the w2v input is given to a 2 layer NN. We remove the 64x64 layer since word vectors are already learnt and thus we don't require that layer

In [12]:
class w2v_QNetwork(nn.Module):
  """
  Q Network: designed to take state as input and give out Q values of actions as output
  """

  def __init__(self, state_dim, action_dim):
    """
      state_dim (int): state dimenssion
      action_dim (int): action dimenssion
    """
    super(w2v_QNetwork, self).__init__()
    self.l1 = nn.Linear(state_dim, 64)
    # self.l2 = nn.Linear(64, 64)
    self.l3 = nn.Linear(64, action_dim)

  def forward(self, state):
    q = F.relu(self.l1(state))
    # q = F.relu(self.l2(q))
    return self.l3(q)


### DQNAgent class

In [13]:
class w2v_DQNAgent():
  def __init__(self,
   embed_dim,
   action_dim,
   discount=0.99,
   tau=1e-3,
   lr=5e-4,
   update_freq=4,
   max_size=int(1e5),
   batch_size=64,
   gpu_index=0
   ):
    """
      state_size (int): dimension of each state
      action_size (int): dimension of each action
      discount (float): discount factor
      tau (float): used to update q-target
      lr (float): learning rate
      update_freq (int): update frequency of target network
      max_size (int): experience replay buffer size
      batch_size (int): training batch size
      gpu_index (int): GPU used for training
    """
    self.embed_dim = embed_dim
    self.action_dim = action_dim
    self.discount = discount
    self.tau = tau
    self.lr = lr
    self.update_freq = update_freq
    self.batch_size = batch_size
    self.device = torch.device('cuda', index=gpu_index) if torch.cuda.is_available() else torch.device('cpu')


    # Setting up the NNs
    self.Q = w2v_QNetwork(embed_dim, action_dim).to(self.device)
    self.Q_target = w2v_QNetwork(embed_dim, action_dim).to(self.device)
    self.optimizer = optim.Adam(self.Q.parameters(), lr=self.lr)

    # Experience Replay Buffer
    self.memory = ExperienceReplay(embed_dim,1,max_size,self.batch_size,gpu_index)

    self.t_train = 0

  def step(self, state, action, reward, next_state, done):
    """
    1. Adds (s,a,r,s') to the experience replay buffer, and updates the networks
    2. Learns when the experience replay buffer has enough samples
    3. Updates target netowork
    """
    self.memory.add(state, action, reward, next_state, done)
    self.t_train += 1

    if self.memory.size > self.batch_size:
      experiences = self.memory.sample()
      self.learn(experiences, self.discount) #To be implemented

    if (self.t_train % self.update_freq) == 0:
      self.target_update(self.Q, self.Q_target, self.tau) #To be implemented

  def select_action(self, state, epsilon):
    """
    TODO: Complete this block to select action using epsilon greedy exploration
    strategy
    Input: state, epsilon
    Return: Action
    Return Type: int
    """
    ###### TYPE YOUR CODE HERE ######
    # We generate a random number between 0 and 1
    rand_num = np.random.random()
    state = torch.from_numpy(state).to(self.device)
    a_opt = np.argmax(self.Q(state).cpu().detach().numpy())
    if rand_num<epsilon:
      a_list = [y for y in range(self.action_dim)]
      # print('a_list = ', a_list)
      a_list.remove(a_opt)
      # print('a_list = ', a_list)
      at = np.random.choice(np.array(a_list))
      return (at)
    else:
      return(a_opt)
    #################################

  def learn(self, experiences, discount):
    """
    TODO: Complete this block to update the Q-Network using the target network
    1. Compute target using  self.Q_target ( target = r + discount * max_b [Q_target(s,b)] )
    2. Compute Q(s,a) using self.Q
    3. Compute MSE loss between step 1 and step 2
    4. Update your network
    Input: experiences consisting of states,actions,rewards,next_states and discount factor
    Return: None
    """
    states, actions, rewards, next_states, dones = experiences
    ###### TYPE YOUR CODE HERE ######
    # Step 1:
    target = rewards + discount * torch.max(self.Q_target(next_states), axis = 1, keepdim = True).values * (1 - dones)# change Q_target to Q
    # Step 2:
    Q_sa = torch.take_along_dim(self.Q(states), actions, dim = 1)
    # Step 3:
    loss = nn.MSELoss()
    mse_loss = loss(target, Q_sa)
    # Step 4:
    self.optimizer.zero_grad()
    mse_loss.backward()
    self.optimizer.step()
    #################################

  def target_update(self, Q, Q_target, tau):
    """
    TODO: Update the target network parameters (param_target) using current Q parameters (param_Q)
    Perform the update using tau, this ensures that we do not change the target network drastically
    1. param_target = tau * param_Q + (1 - tau) * param_target
    Input: Q,Q_target,tau
    Return: None
    """
    ###### TYPE YOUR CODE HERE ######
    for target_param, param in zip(Q_target.parameters(), Q.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
    #################################

  def get_optimal_policy(self, states):
    """Extracts the optimal policy after training."""
    policy = []
    for state in states:
      state = torch.from_numpy(state).to(self.device)
      policy.append(np.argmax(self.Q(state).cpu().detach().numpy()))
    return policy


In [14]:
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

word_embeddings = np.load(f"{runs_folder_path}/modified_{modified}_w2v_embed_dim_{embed_dim}_{filename}_epochs_{w2v_epochs}.npy", allow_pickle=True).item()
print("Word embeddings loaded from: ", f"{runs_folder_path}/modified_{modified}_w2v_embed_dim_{embed_dim}_{filename}_epochs_{w2v_epochs}.npy")

state_embeddings = []
states = []
words = list(word_embeddings.keys())
#   print(word_embeddings['s_0'])
for i in range(state_dim):
  #  states.append(i)
  if 's_'+str(i) in words:
      state_embeddings.append(word_embeddings['s_'+str(i)])
      # print(f"s_{i}: {word_embeddings['s_'+str(i)]}")
  else:
      temp = np.random.randn(embed_dim).astype(np.float32)
      temp /= np.abs(np.max(temp))
      state_embeddings.append(temp)
      print(f"s_{i} not present in the w2v model, hence random vector initialized")
      # print(f"s_{i}: {temp}")

kwargs = {
    "embed_dim":embed_dim,
    "action_dim":action_dim,
    "discount":gamma,
    "tau":args.tau,
    "lr":args.lr,
    "update_freq":args.update_freq,
    "max_size":args.max_size,
    "batch_size":args.batch_size,
    "gpu_index":args.gpu_index
  }
learner = w2v_DQNAgent(**kwargs)

Word embeddings loaded from:  mdp/runs_frozen/modified_medium_w2v_embed_dim_32_FrozenLake-v1_map_size_4_stochastic_False_seed_42_epochs_60.npy


In [15]:
# one_hot_state = state_embeddings
print("Embeddings are loaded. Now we train dqn")
# print(one_hot_state)

# f = open('dqn_mountaincar.txt', 'w') # file to store the training log
temp_file = f"{runs_folder_path}/w2v_logger_{filename}.txt"
f = open(temp_file, 'w') # file to store the training log
reward_curve = [] # this will store the moving avg of rewards
moving_window = deque(maxlen=100)
epsilon = args.epsilon_start
count = 0
for e in tqdm(range(args.n_episodes)):
    state, _ = env.reset(seed=seed)
    curr_reward = 0
    for t in range(args.max_esp_len):
        action = learner.select_action(state_embeddings[state],epsilon)
        # n_state,reward,terminated,truncated,_ = env.step(action)
        # done = terminated or truncated
        n_state,reward,done,_,_ = env.step(action)
        learner.step(state_embeddings[state],action,reward,state_embeddings[n_state],done)
        state = n_state
        curr_reward += reward
        if done:
            break
    moving_window.append(curr_reward)
    reward_curve.append(np.mean(moving_window))

    """"
    TODO: Write code for decaying the exploration rate using args.epsilon_decay
    and args.epsilon_end. Note that epsilon has been initialized to args.epsilon_start
    1. You are encouraged to try new methods
    """
    ###### TYPE YOUR CODE HERE ######
    epsilon *= args.epsilon_decay
    epsilon = max(epsilon, args.epsilon_end)
    # print('current epsilon = ', epsilon)
    #################################

    if e % 100 == 0:
        print('Episode Number {} Average Episodic Reward (over 100 episodes): {:.2f}'.format(e, np.mean(moving_window)))

    f.write('Episode Number {} Average Episodic Reward (over 100 episodes): {:.2f} \n'.format(e, np.mean(moving_window)))
    #################################

f.close() # to close the file

# Now we save the trained model
rew_file = f"{runs_folder_path}/rewards_w2v_dqn_modified_{modified}_w2v_embed_dim_{embed_dim}_{filename}_epochs_{w2v_epochs}.npy"
np.save(rew_file, reward_curve)
model_saved_file = f"{runs_folder_path}/w2v_dqn_modified_{modified}_w2v_embed_dim_{embed_dim}_{filename}_epochs_{w2v_epochs}.pt"
torch.save(learner.Q.state_dict(), model_saved_file)
print("Model saved at: ", model_saved_file)

final_policy = learner.get_optimal_policy(state_embeddings)
print("Final policy: ", final_policy)
state = 0
# Define action map
action_map = {
0: "Left",
1: "Down",
2: "Right",
3: "Up"
}
print("State: Type -    action taken")
lake_grid = env.unwrapped.desc  # Gets the grid representation
for row in lake_grid:
    for cell in row:
        print(f"     {state}:   {cell.decode('utf-8')} - {final_policy[state]}-->{action_map[final_policy[state]]}")  # Convert byte to string
        state += 1

print('It was successful!')


Embeddings are loaded. Now we train dqn


  1%|          | 27/2500 [00:00<00:09, 261.63it/s]

Episode Number 0 Average Episodic Reward (over 100 episodes): 0.00


  5%|▌         | 132/2500 [00:00<00:10, 229.84it/s]

Episode Number 100 Average Episodic Reward (over 100 episodes): 0.01


  9%|▉         | 235/2500 [00:00<00:09, 235.12it/s]

Episode Number 200 Average Episodic Reward (over 100 episodes): 0.00


 12%|█▏        | 304/2500 [00:01<00:19, 115.32it/s]

Episode Number 300 Average Episodic Reward (over 100 episodes): 0.00


 16%|█▌        | 405/2500 [00:03<00:46, 45.19it/s] 

Episode Number 400 Average Episodic Reward (over 100 episodes): 0.04


 22%|██▏       | 542/2500 [00:04<00:10, 178.39it/s]

Episode Number 500 Average Episodic Reward (over 100 episodes): 0.46


 26%|██▌       | 647/2500 [00:05<00:07, 235.96it/s]

Episode Number 600 Average Episodic Reward (over 100 episodes): 0.88


 30%|███       | 750/2500 [00:05<00:07, 249.36it/s]

Episode Number 700 Average Episodic Reward (over 100 episodes): 0.90


 33%|███▎      | 830/2500 [00:06<00:06, 252.49it/s]

Episode Number 800 Average Episodic Reward (over 100 episodes): 0.97


 37%|███▋      | 937/2500 [00:06<00:06, 254.72it/s]

Episode Number 900 Average Episodic Reward (over 100 episodes): 0.98


 42%|████▏     | 1044/2500 [00:06<00:05, 262.04it/s]

Episode Number 1000 Average Episodic Reward (over 100 episodes): 0.97


 45%|████▌     | 1125/2500 [00:07<00:05, 257.75it/s]

Episode Number 1100 Average Episodic Reward (over 100 episodes): 0.96


 49%|████▉     | 1232/2500 [00:07<00:04, 258.01it/s]

Episode Number 1200 Average Episodic Reward (over 100 episodes): 0.96


 54%|█████▎    | 1340/2500 [00:08<00:04, 261.52it/s]

Episode Number 1300 Average Episodic Reward (over 100 episodes): 0.98


 58%|█████▊    | 1448/2500 [00:08<00:04, 259.17it/s]

Episode Number 1400 Average Episodic Reward (over 100 episodes): 0.98


 61%|██████    | 1529/2500 [00:08<00:03, 264.29it/s]

Episode Number 1500 Average Episodic Reward (over 100 episodes): 0.98


 66%|██████▌   | 1638/2500 [00:09<00:03, 262.74it/s]

Episode Number 1600 Average Episodic Reward (over 100 episodes): 0.99


 70%|██████▉   | 1746/2500 [00:09<00:02, 263.94it/s]

Episode Number 1700 Average Episodic Reward (over 100 episodes): 0.98


 73%|███████▎  | 1827/2500 [00:09<00:02, 262.89it/s]

Episode Number 1800 Average Episodic Reward (over 100 episodes): 0.99


 77%|███████▋  | 1934/2500 [00:10<00:02, 257.65it/s]

Episode Number 1900 Average Episodic Reward (over 100 episodes): 0.98


 82%|████████▏ | 2039/2500 [00:10<00:01, 256.82it/s]

Episode Number 2000 Average Episodic Reward (over 100 episodes): 0.97


 86%|████████▌ | 2149/2500 [00:11<00:01, 262.37it/s]

Episode Number 2100 Average Episodic Reward (over 100 episodes): 0.95


 89%|████████▉ | 2230/2500 [00:11<00:01, 263.50it/s]

Episode Number 2200 Average Episodic Reward (over 100 episodes): 1.00


 93%|█████████▎| 2337/2500 [00:11<00:00, 249.64it/s]

Episode Number 2300 Average Episodic Reward (over 100 episodes): 0.97


 98%|█████████▊| 2446/2500 [00:12<00:00, 262.90it/s]

Episode Number 2400 Average Episodic Reward (over 100 episodes): 0.97


100%|██████████| 2500/2500 [00:12<00:00, 199.28it/s]

Model saved at:  mdp/runs_frozen/w2v_dqn_modified_medium_w2v_embed_dim_32_FrozenLake-v1_map_size_4_stochastic_False_seed_42_epochs_60.pt
Final policy:  [1, 0, 2, 1, 1, 0, 1, 1, 2, 2, 1, 1, 3, 0, 2, 0]
State: Type -    action taken
     0:   S - 1-->Down
     1:   F - 0-->Left
     2:   H - 2-->Right
     3:   F - 1-->Down
     4:   F - 1-->Down
     5:   H - 0-->Left
     6:   F - 1-->Down
     7:   F - 1-->Down
     8:   F - 2-->Right
     9:   F - 2-->Right
     10:   F - 1-->Down
     11:   H - 1-->Down
     12:   F - 3-->Up
     13:   H - 0-->Left
     14:   F - 2-->Right
     15:   G - 0-->Left
It was successful!





In [None]:
if check_env_details:
    # Extract the environment description (grid layout)
    lake_grid = env.unwrapped.desc  # Gets the grid representation

    # Print state-to-symbol mapping
    print("Frozen Lake Grid Layout:")
    for row in lake_grid:
        print(" ".join(row.astype(str)))

    goal_state = None
    rows, cols = lake_grid.shape
    for i in range(rows):
        for j in range(cols):
            if lake_grid[i, j] == b'G':  # 'G' is stored as a byte-string
                goal_state = i * cols + j  # Convert (row, col) to state number
                break
        # end for j
    # end for i
    print(f"Goal State: {goal_state}")

## Vanilla DQN (without w2v)

In [None]:
class QNetwork(nn.Module):
  """
  Q Network: designed to take state as input and give out Q values of actions as output
  """

  def __init__(self, state_dim, action_dim):
    """
      state_dim (int): state dimenssion
      action_dim (int): action dimenssion
    """
    super(QNetwork, self).__init__()
    self.l1 = nn.Linear(state_dim, 64)
    self.l2 = nn.Linear(64, 64)
    self.l3 = nn.Linear(64, action_dim)

  def forward(self, state):
    q = F.relu(self.l1(state))
    q = F.relu(self.l2(q))
    return self.l3(q)


In [None]:

class DQNAgent():

  def __init__(self,
   state_dim,
   action_dim,
   discount=0.99,
   tau=1e-3,
   lr=5e-4,
   update_freq=4,
   max_size=int(1e5),
   batch_size=64,
   gpu_index=0
   ):
    """
      state_size (int): dimension of each state
      action_size (int): dimension of each action
      discount (float): discount factor
      tau (float): used to update q-target
      lr (float): learning rate
      update_freq (int): update frequency of target network
      max_size (int): experience replay buffer size
      batch_size (int): training batch size
      gpu_index (int): GPU used for training
    """
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.discount = discount
    self.tau = tau
    self.lr = lr
    self.update_freq = update_freq
    self.batch_size = batch_size
    self.device = torch.device('cuda', index=gpu_index) if torch.cuda.is_available() else torch.device('cpu')


    # Setting up the NNs
    self.Q = QNetwork(state_dim, action_dim).to(self.device)
    self.Q_target = QNetwork(state_dim, action_dim).to(self.device)
    self.optimizer = optim.Adam(self.Q.parameters(), lr=self.lr)

    # Experience Replay Buffer
    self.memory = ExperienceReplay(state_dim,1,max_size,self.batch_size,gpu_index)

    self.t_train = 0

  def step(self, state, action, reward, next_state, done):
    """
    1. Adds (s,a,r,s') to the experience replay buffer, and updates the networks
    2. Learns when the experience replay buffer has enough samples
    3. Updates target netowork
    """
    self.memory.add(state, action, reward, next_state, done)
    self.t_train += 1

    if self.memory.size > self.batch_size:
      experiences = self.memory.sample()
      self.learn(experiences, self.discount) #To be implemented

    if (self.t_train % self.update_freq) == 0:
      self.target_update(self.Q, self.Q_target, self.tau) #To be implemented

  def select_action(self, state, epsilon):
    """
    TODO: Complete this block to select action using epsilon greedy exploration
    strategy
    Input: state, epsilon
    Return: Action
    Return Type: int
    """
    ###### TYPE YOUR CODE HERE ######
    # We generate a random number between 0 and 1
    rand_num = np.random.random()
    state = torch.from_numpy(state).to(self.device)
    a_opt = np.argmax(self.Q(state).cpu().detach().numpy())
    if rand_num<epsilon:
      a_list = [y for y in range(self.action_dim)]
      # print('a_list = ', a_list)
      a_list.remove(a_opt)
      # print('a_list = ', a_list)
      at = np.random.choice(np.array(a_list))
      return (at)
    else:
      return(a_opt)
    #################################

  def learn(self, experiences, discount):
    """
    TODO: Complete this block to update the Q-Network using the target network
    1. Compute target using  self.Q_target ( target = r + discount * max_b [Q_target(s,b)] )
    2. Compute Q(s,a) using self.Q
    3. Compute MSE loss between step 1 and step 2
    4. Update your network
    Input: experiences consisting of states,actions,rewards,next_states and discount factor
    Return: None
    """
    states, actions, rewards, next_states, dones = experiences
    ###### TYPE YOUR CODE HERE ######
    # Step 1:
    target = rewards + discount * torch.max(self.Q_target(next_states), axis = 1, keepdim = True).values * (1 - dones)# change Q_target to Q
    # Step 2:
    Q_sa = torch.take_along_dim(self.Q(states), actions, dim = 1)
    # Step 3:
    loss = nn.MSELoss()
    mse_loss = loss(target, Q_sa)
    # Step 4:
    self.optimizer.zero_grad()
    mse_loss.backward()
    self.optimizer.step()
    #################################

  def target_update(self, Q, Q_target, tau):
    """
    TODO: Update the target network parameters (param_target) using current Q parameters (param_Q)
    Perform the update using tau, this ensures that we do not change the target network drastically
    1. param_target = tau * param_Q + (1 - tau) * param_target
    Input: Q,Q_target,tau
    Return: None
    """
    ###### TYPE YOUR CODE HERE ######
    for target_param, param in zip(Q_target.parameters(), Q.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
    #################################

  def get_optimal_policy(self, states):
    """Extracts the optimal policy after training."""
    policy = []
    for state in states:
      state = torch.from_numpy(state).to(self.device)
      policy.append(np.argmax(self.Q(state).cpu().detach().numpy()))
    return policy

### To run the DQN code

In [None]:
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

kwargs = {
    "state_dim":state_dim,
    "action_dim":action_dim,
    "discount":gamma,
    "tau":args.tau,
    "lr":args.lr,
    "update_freq":args.update_freq,
    "max_size":args.max_size,
    "batch_size":args.batch_size,
    "gpu_index":args.gpu_index
  }
learner = DQNAgent(**kwargs)

In [None]:
one_hot_state = np.float32(np.eye(state_dim))
print("One hot state defined. Now we train dqn")

temp_file = f"{runs_folder_path}/dqn_logger_{filename}.txt"
f = open(temp_file, 'w') # file to store the training log
reward_curve = [] # this will store the moving avg of rewards
moving_window = deque(maxlen=100)
epsilon = args.epsilon_start
count = 0
for e in tqdm(range(args.n_episodes)):
    state, _ = env.reset(seed=seed)
    curr_reward = 0
    for t in range(args.max_esp_len):
        action = learner.select_action(one_hot_state[state],epsilon)
        # n_state,reward,terminated,truncated,_ = env.step(action)
        # done = terminated or truncated
        n_state,reward,done,_,_ = env.step(action)
        learner.step(one_hot_state[state],action,reward,one_hot_state[n_state],done)
        state = n_state
        curr_reward += reward
        if done:
            break
    moving_window.append(curr_reward)
    reward_curve.append(np.mean(moving_window))
    
    epsilon *= args.epsilon_decay
    epsilon = max(epsilon, args.epsilon_end)

    if e % 100 == 0:
        print('Episode Number {} Average Episodic Reward (over 100 episodes): {:.2f}'.format(e, np.mean(moving_window)))

    f.write('Episode Number {} Average Episodic Reward (over 100 episodes): {:.2f} \n'.format(e, np.mean(moving_window)))
    #################################

f.close() # to close the file

# Now we save the trained model
rew_file = f"{runs_folder_path}/rewards_dqn_{filename}.npy"
np.save(rew_file, reward_curve)
model_saved_file = f"{runs_folder_path}/dqn_{filename}.pt"
torch.save(learner.Q.state_dict(), model_saved_file)
print("Model saved at: ", model_saved_file)

final_policy = learner.get_optimal_policy(one_hot_state)
print("Final policy: ", final_policy)
state = 0
# Define action map
action_map = {
0: "Left",
1: "Down",
2: "Right",
3: "Up"
}
print("State: Type -    action taken")
lake_grid = env.unwrapped.desc  # Gets the grid representation
for row in lake_grid:
    for cell in row:
        print(f"     {state}:   {cell.decode('utf-8')} - {final_policy[state]}-->{action_map[final_policy[state]]}")  # Convert byte to string
        state += 1

print('It was successful!')


In [None]:
if check_env_details:
    # Extract the environment description (grid layout)
    lake_grid = env.unwrapped.desc  # Gets the grid representation

    # Print state-to-symbol mapping
    print("Frozen Lake Grid Layout:")
    for row in lake_grid:
        print(" ".join(row.astype(str)))

    goal_state = None
    rows, cols = lake_grid.shape
    for i in range(rows):
        for j in range(cols):
            if lake_grid[i, j] == b'G':  # 'G' is stored as a byte-string
                goal_state = i * cols + j  # Convert (row, col) to state number
                break
        # end for j
    # end for i
    print(f"Goal State: {goal_state}")