# All imports

In [3]:
import os
import numpy as np
import random
from tqdm import tqdm
import gymnasium as gym
from collections import deque, namedtuple
from collections import defaultdict
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from gymnasium.envs.toy_text.frozen_lake import generate_random_map

# w2v required
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics.pairwise import cosine_similarity

# DQN required
import argparse
import numpy as np
import logging
from matplotlib import animation # will be needed for rendering

# All global variables needed
Since some or all of these variables are needed for each cell below, it's difficult to put these inside the main function.\
Perhaps, using some args method might work?

## Common Functions needed

### Function to generate custom map

In [4]:
# ============================================================================================
# Function to generate custom map
# ============================================================================================
def make_env(env_name, env_dim = 4, seed = 42, stochastic = False):
    env = gym.make(env_name, desc=generate_random_map(size=env_dim, seed=seed), 
                   is_slippery = stochastic, render_mode = 'rgb_array')
    return env

## Global Variables

In [5]:
'''
env_name: str
env_dim: int --> Dimension of the game: 4x4 or 8x8
seed
stochastic = boolean --> Whether we use is_slippery = True or False
 '''
env_name = "FrozenLake-v1"
env_dim = 4
stochastic = False
seed = 42
gamma = 0.99 # discount factor in Q computation
alpha = 0.1 # learning rate in the table
num_episodes_q_table = 100_000
convergence_threshold = 1e-4
epsilon_start = 1
epsilon_decay_q_table = 0.99995
epsilon_end = 0.01
check_env_details = True

# Creating the environment
env = make_env(env_name=env_name, env_dim=env_dim, seed = seed, stochastic=stochastic)
state_dim = env.observation_space.n
action_dim = env.action_space.n
print("State space: ", env.observation_space.n)
print("Action space: ", env.action_space.n)

# state and trajectories related variables
num_episodes_trajectories = 10_000
num_states = state_dim
num_actions = action_dim
max_eps_len = 100
modified = "perfect" # or random - for purely random trajectories or "False" for the combined trajs

# w2v related variables
''' Potential values for embedding dimensions = {4, 8, 12, 16, 20, 32, 64} '''
# w2v hyperparameters
embed_dim = 32
window_size = 2
batch_size = 16
w2v_epochs = 50
w2v_lr = 0.01
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Filename for saving purposes - common part for all files
filename = f"{env_name}_map_size_{env_dim}_stochastic_{stochastic}_seed_{seed}"

# This folder path will be used to store all the saved models and the associated data
runs_folder_path = "mdp/runs_frozen"

# Check if the folder exists
if not os.path.exists(runs_folder_path):
    # Create the folder
    os.makedirs(runs_folder_path)
    print(f"Folder '{runs_folder_path}' created successfully.")
else:
    print(f"Folder '{runs_folder_path}' already exists.")


State space:  16
Action space:  4
Folder 'mdp/runs_frozen' created successfully.


# Tabular Q learning

### Q-Learning Agent class

In [6]:
# ============================================================================================
# Q-Learning Agent class
# ============================================================================================
class QLearningAgent:
    """Q-learning agent."""
    def __init__(self, num_states, num_actions, gamma=0.99, epsilon=0.1, alpha=0.1):
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.q_table = np.zeros((num_states, num_actions))  # Initialize Q-table

    def choose_action(self, state, epsilon):
        """Epsilon-greedy action selection."""
        if random.uniform(0, 1) < epsilon:
            return random.randint(0, self.num_actions - 1)  # Explore
        else:
            return np.argmax(self.q_table[state, :])  # Exploit

    def update_q_value(self, state, action, reward, next_state):
        """Q-learning update rule."""
        best_next_action = np.argmax(self.q_table[next_state, :])  # Greedy action for next state
        td_target = reward + self.gamma * self.q_table[next_state, best_next_action]
        td_error = td_target - self.q_table[state, action]
        self.q_table[state, action] += self.alpha * td_error  # Update Q-table

    def get_optimal_policy(self):
        """Extracts the optimal policy after training."""
        return np.argmax(self.q_table, axis=1)

### Function to train the model using Q-learning for Frozen Lake

In [7]:
# ============================================================================================
# Function to train the model using Q-learning for Frozen Lake
# ============================================================================================
def run_tabular_q_frozen(env, agent, num_episodes=10, max_eps_len = 100, convergence_threshold=1e-4,
                         epsilon_start = 1, epsilon_decay = 0.995, epsilon_end = 0.01, seed=42):
    reward_curve = [] # this will store the moving avg of rewards
    moving_window = deque(maxlen=100)
    epsilon = epsilon_start
    prev_q_table = np.copy(agent.q_table)  # Store old Q-table

    for episode in tqdm(range(num_episodes)):
        state,_ = env.reset(seed=seed)
        # print(f"\nIn episode {episode}, After reset initial state = {state} and epsilon = {epsilon}")
        curr_reward = 0
        flag = False

        for _ in range(max_eps_len):
            action = agent.choose_action(state, epsilon)
            next_state, reward, done, _, _ = env.step(action)
            agent.update_q_value(state, action, reward, next_state)
            state = next_state
            curr_reward += reward

            # # Compute max Q-value change
            # q_change = np.max(np.abs(agent.q_table - prev_q_table))
            # prev_q_table = np.copy(agent.q_table)

            # # Check Q-value convergence
            # if q_change < convergence_threshold:
            #     print(f"Q-values converged at Episode {episode+1} with max Q-change: {q_change}")
            #     flag = True
            #     break

            if done:
                break
        # end while inside an episode
        
        # Epsilon decay performed at the end of each episode
        epsilon *= epsilon_decay
        epsilon = max(epsilon, epsilon_end)

        # Appending the smoothened reward
        moving_window.append(curr_reward)
        reward_curve.append(np.mean(moving_window))

        if episode % 1000 == 0:
            print(f"Tabular Q: Episode {episode}: epsilon = {epsilon}, avg reward = {np.mean(moving_window)}")
        # end if

        # if flag:
        #     break
    # end for num_episode

    return agent.q_table, agent.get_optimal_policy(), reward_curve

### Running the Tabular Q learning

In [8]:
# env = make_env(env_name=env_name, env_dim=env_dim, seed = seed, stochastic=stochastic)

# Setting seeds
np.random.seed(seed)
random.seed(seed)

if check_env_details:
    # Extract the environment description (grid layout)
    lake_grid = env.unwrapped.desc  # Gets the grid representation

    # Print state-to-symbol mapping
    print("Frozen Lake Grid Layout:")
    for row in lake_grid:
        print(" ".join(row.astype(str)))

    goal_state = None
    rows, cols = lake_grid.shape
    for i in range(rows):
        for j in range(cols):
            if lake_grid[i, j] == b'G':  # 'G' is stored as a byte-string
                goal_state = i * cols + j  # Convert (row, col) to state number
                break
        # end for j
    # end for i
    print(f"Goal State: {goal_state}")
# end if check_env

state_dim = env.observation_space.n
action_dim = env.action_space.n
print("State space: ", env.observation_space.n)
print("Action space: ", env.action_space.n)

learner = QLearningAgent(num_states=state_dim, num_actions=action_dim, gamma=gamma
                            , epsilon=epsilon_start, alpha=alpha) # Creating the learning Agent

final_q_table, final_policy, reward_curve = run_tabular_q_frozen(
                env, learner, num_episodes=num_episodes_q_table, max_eps_len=max_eps_len, convergence_threshold=convergence_threshold,
                epsilon_start = epsilon_start, epsilon_decay = epsilon_decay_q_table, epsilon_end = epsilon_end, seed=seed)

Val_f = np.max(final_q_table, axis=1)

state = 0
# Define action map
action_map = {
0: "Left",
1: "Down",
2: "Right",
3: "Up"
}
print("State: Type -    V(s),    action taken")
lake_grid = env.unwrapped.desc  # Gets the grid representation
for row in lake_grid:
    for cell in row:
        print(f"     {state}:   {cell.decode('utf-8')} - {Val_f[state]:.2f}, {final_policy[state]}-->{action_map[final_policy[state]]}")  # Convert byte to string
        state += 1
# assert False, "c1"

# Print the final table and policy
print("Final Q function: ", final_q_table)
# print("Final Policy: ", final_policy)
# print("Final Value function: ", Val_f)

# # Plot heatmap of the Value function
# plt.figure(figsize=(5,5))
# plt.imshow(Val_f.reshape(4,4), cmap="coolwarm", interpolation="nearest")
# for i in range(4):
#     for j in range(4):
#         plt.text(j, i, f"{Val_f[i*4+j]:.2f}", ha='center', va='center', color='black')


# Plot the reward curve



# Save the current Q-function
save_model = f"{runs_folder_path}/Q_table_{filename}.npy"
np.save(save_model, final_q_table)

Frozen Lake Grid Layout:
S F H F
F H F F
F F F H
F H F G
Goal State: 15
State space:  16
Action space:  4


  2%|▏         | 1697/100000 [00:00<00:05, 16968.84it/s]

Tabular Q: Episode 0: epsilon = 0.99995, avg reward = 0.0
Tabular Q: Episode 1000: epsilon = 0.9511806740132733, avg reward = 0.0
Tabular Q: Episode 2000: epsilon = 0.904789914112052, avg reward = 0.03
Tabular Q: Episode 3000: epsilon = 0.8606617134311852, avg reward = 0.02


  5%|▌         | 5071/100000 [00:00<00:05, 16529.47it/s]

Tabular Q: Episode 4000: epsilon = 0.8186857229650423, avg reward = 0.04
Tabular Q: Episode 5000: epsilon = 0.7787569756237134, avg reward = 0.04


  7%|▋         | 6725/100000 [00:00<00:05, 16393.38it/s]

Tabular Q: Episode 6000: epsilon = 0.7407756237474893, avg reward = 0.08
Tabular Q: Episode 7000: epsilon = 0.7046466894232127, avg reward = 0.1


  8%|▊         | 8365/100000 [00:00<00:05, 16258.91it/s]

Tabular Q: Episode 8000: epsilon = 0.6702798269781429, avg reward = 0.14
Tabular Q: Episode 9000: epsilon = 0.6375890970574211, avg reward = 0.2


 10%|▉         | 9992/100000 [00:00<00:05, 15913.92it/s]

Tabular Q: Episode 10000: epsilon = 0.6064927517201779, avg reward = 0.23
Tabular Q: Episode 11000: epsilon = 0.5769130300168653, avg reward = 0.19


 13%|█▎        | 13186/100000 [00:00<00:05, 15920.79it/s]

Tabular Q: Episode 12000: epsilon = 0.5487759635366593, avg reward = 0.33
Tabular Q: Episode 13000: epsilon = 0.5220111914386566, avg reward = 0.36


 15%|█▍        | 14844/100000 [00:00<00:05, 16120.74it/s]

Tabular Q: Episode 14000: epsilon = 0.49655178450431714, avg reward = 0.27
Tabular Q: Episode 15000: epsilon = 0.47233407777119996, avg reward = 0.36


 16%|█▋        | 16468/100000 [00:01<00:05, 16154.31it/s]

Tabular Q: Episode 16000: epsilon = 0.44929751132941503, avg reward = 0.32
Tabular Q: Episode 17000: epsilon = 0.4273844788827431, avg reward = 0.4


 18%|█▊        | 18084/100000 [00:01<00:05, 16142.79it/s]

Tabular Q: Episode 18000: epsilon = 0.40654018369568434, avg reward = 0.39
Tabular Q: Episode 19000: epsilon = 0.3867125015662202, avg reward = 0.48


 20%|█▉        | 19707/100000 [00:01<00:04, 16166.37it/s]

Tabular Q: Episode 20000: epsilon = 0.367851850481642, avg reward = 0.54
Tabular Q: Episode 21000: epsilon = 0.34991106663148985, avg reward = 0.6


 23%|██▎       | 22976/100000 [00:01<00:04, 16260.45it/s]

Tabular Q: Episode 22000: epsilon = 0.332845286467567, avg reward = 0.51
Tabular Q: Episode 23000: epsilon = 0.31661183451608793, avg reward = 0.64


 25%|██▍       | 24603/100000 [00:01<00:04, 16199.79it/s]

Tabular Q: Episode 24000: epsilon = 0.30117011666142574, avg reward = 0.64
Tabular Q: Episode 25000: epsilon = 0.286481518634604, avg reward = 0.61


 26%|██▋       | 26258/100000 [00:01<00:04, 16303.08it/s]

Tabular Q: Episode 26000: epsilon = 0.2725093094526817, avg reward = 0.6
Tabular Q: Episode 27000: epsilon = 0.259218549567572, avg reward = 0.64


 28%|██▊       | 27915/100000 [00:01<00:04, 16382.41it/s]

Tabular Q: Episode 28000: epsilon = 0.246576003494601, avg reward = 0.61
Tabular Q: Episode 29000: epsilon = 0.23455005670232934, avg reward = 0.74


 31%|███▏      | 31288/100000 [00:01<00:04, 16626.90it/s]

Tabular Q: Episode 30000: epsilon = 0.22311063655580088, avg reward = 0.7
Tabular Q: Episode 31000: epsilon = 0.21222913711553318, avg reward = 0.75


 33%|███▎      | 32978/100000 [00:02<00:04, 16707.98it/s]

Tabular Q: Episode 32000: epsilon = 0.20187834760418902, avg reward = 0.64
Tabular Q: Episode 33000: epsilon = 0.19203238436205666, avg reward = 0.71


 35%|███▍      | 34673/100000 [00:02<00:03, 16777.97it/s]

Tabular Q: Episode 34000: epsilon = 0.18266662612118353, avg reward = 0.71
Tabular Q: Episode 35000: epsilon = 0.17375765243629993, avg reward = 0.81


 36%|███▋      | 36407/100000 [00:02<00:03, 16944.38it/s]

Tabular Q: Episode 36000: epsilon = 0.16528318511857973, avg reward = 0.79
Tabular Q: Episode 37000: epsilon = 0.15722203252577774, avg reward = 0.78


 38%|███▊      | 38116/100000 [00:02<00:03, 16987.80it/s]

Tabular Q: Episode 38000: epsilon = 0.14955403656943475, avg reward = 0.78
Tabular Q: Episode 39000: epsilon = 0.14226002230663626, avg reward = 0.87


 40%|███▉      | 39834/100000 [00:02<00:03, 17044.62it/s]

Tabular Q: Episode 40000: epsilon = 0.1353217499902697, avg reward = 0.87
Tabular Q: Episode 41000: epsilon = 0.12872186945787317, avg reward = 0.84


 43%|████▎     | 43336/100000 [00:02<00:03, 17280.41it/s]

Tabular Q: Episode 42000: epsilon = 0.12244387674502544, avg reward = 0.82
Tabular Q: Episode 43000: epsilon = 0.11647207281477254, avg reward = 0.83


 45%|████▌     | 45065/100000 [00:02<00:03, 17178.32it/s]

Tabular Q: Episode 44000: epsilon = 0.11079152429989349, avg reward = 0.86
Tabular Q: Episode 45000: epsilon = 0.10538802615983875, avg reward = 0.85


 47%|████▋     | 46803/100000 [00:02<00:03, 17237.81it/s]

Tabular Q: Episode 46000: epsilon = 0.10024806615895202, avg reward = 0.8
Tabular Q: Episode 47000: epsilon = 0.09535879107715316, avg reward = 0.89


 49%|████▊     | 48550/100000 [00:02<00:02, 17304.45it/s]

Tabular Q: Episode 48000: epsilon = 0.09070797456858609, avg reward = 0.89
Tabular Q: Episode 49000: epsilon = 0.08628398658785626, avg reward = 0.88


 50%|█████     | 50288/100000 [00:03<00:02, 17324.25it/s]

Tabular Q: Episode 50000: epsilon = 0.08207576430740496, avg reward = 0.89
Tabular Q: Episode 51000: epsilon = 0.07807278445329505, avg reward = 0.89


 52%|█████▏    | 52051/100000 [00:03<00:02, 17413.21it/s]

Tabular Q: Episode 52000: epsilon = 0.07426503699022786, avg reward = 0.89
Tabular Q: Episode 53000: epsilon = 0.07064300008999028, avg reward = 0.9


 54%|█████▍    | 53832/100000 [00:03<00:02, 17530.57it/s]

Tabular Q: Episode 54000: epsilon = 0.06719761632073298, avg reward = 0.91
Tabular Q: Episode 55000: epsilon = 0.06392026999754027, avg reward = 0.95


 57%|█████▋    | 57371/100000 [00:03<00:02, 17605.23it/s]

Tabular Q: Episode 56000: epsilon = 0.06080276563765279, avg reward = 0.88
Tabular Q: Episode 57000: epsilon = 0.05783730746646704, avg reward = 0.9


 59%|█████▉    | 59132/100000 [00:03<00:02, 17566.66it/s]

Tabular Q: Episode 58000: epsilon = 0.05501647992306336, avg reward = 0.93
Tabular Q: Episode 59000: epsilon = 0.05233322911651293, avg reward = 0.88


 61%|██████    | 60895/100000 [00:03<00:02, 17583.33it/s]

Tabular Q: Episode 60000: epsilon = 0.04978084518659529, avg reward = 0.91
Tabular Q: Episode 61000: epsilon = 0.04735294552481254, avg reward = 0.94


 63%|██████▎   | 62654/100000 [00:03<00:02, 17448.58it/s]

Tabular Q: Episode 62000: epsilon = 0.0450434588137458, avg reward = 0.92
Tabular Q: Episode 63000: epsilon = 0.04284660984484019, avg reward = 0.93


 64%|██████▍   | 64400/100000 [00:03<00:02, 17386.78it/s]

Tabular Q: Episode 64000: epsilon = 0.04075690507665269, avg reward = 0.97
Tabular Q: Episode 65000: epsilon = 0.03876911889745041, avg reward = 0.96


 66%|██████▌   | 66182/100000 [00:03<00:01, 17514.24it/s]

Tabular Q: Episode 66000: epsilon = 0.03687828055780553, avg reward = 0.98
Tabular Q: Episode 67000: epsilon = 0.03507966174051114, avg reward = 0.96


 68%|██████▊   | 67972/100000 [00:04<00:01, 17626.64it/s]

Tabular Q: Episode 68000: epsilon = 0.03336876473673393, avg reward = 0.98
Tabular Q: Episode 69000: epsilon = 0.031741311198836865, avg reward = 0.97


 72%|███████▏  | 71529/100000 [00:04<00:01, 17718.17it/s]

Tabular Q: Episode 70000: epsilon = 0.03019323144174688, avg reward = 0.93
Tabular Q: Episode 71000: epsilon = 0.0287206542661129, avg reward = 0.98


 73%|███████▎  | 73301/100000 [00:04<00:01, 17718.11it/s]

Tabular Q: Episode 72000: epsilon = 0.027319897277807384, avg reward = 0.96
Tabular Q: Episode 73000: epsilon = 0.025987457679562245, avg reward = 0.99


 75%|███████▌  | 75073/100000 [00:04<00:01, 17689.97it/s]

Tabular Q: Episode 74000: epsilon = 0.02472000351171302, avg reward = 0.95
Tabular Q: Episode 75000: epsilon = 0.0235143653201477, avg reward = 0.99


 77%|███████▋  | 76856/100000 [00:04<00:01, 17729.35it/s]

Tabular Q: Episode 76000: epsilon = 0.02236752823062398, avg reward = 0.95
Tabular Q: Episode 77000: epsilon = 0.021276624409636378, avg reward = 0.99


 79%|███████▊  | 78629/100000 [00:04<00:01, 17628.58it/s]

Tabular Q: Episode 78000: epsilon = 0.0202389258929799, avg reward = 0.97
Tabular Q: Episode 79000: epsilon = 0.019251837764077552, avg reward = 0.99


 80%|████████  | 80408/100000 [00:04<00:01, 17674.71it/s]

Tabular Q: Episode 80000: epsilon = 0.018312891665012748, avg reward = 0.99
Tabular Q: Episode 81000: epsilon = 0.017419739624040163, avg reward = 0.97


 82%|████████▏ | 82181/100000 [00:04<00:01, 17689.93it/s]

Tabular Q: Episode 82000: epsilon = 0.016570148184139464, avg reward = 0.99
Tabular Q: Episode 83000: epsilon = 0.015761992817930556, avg reward = 0.98


 84%|████████▍ | 83977/100000 [00:04<00:00, 17767.90it/s]

Tabular Q: Episode 84000: epsilon = 0.014993252614982331, avg reward = 1.0
Tabular Q: Episode 85000: epsilon = 0.014262005228231584, avg reward = 1.0


 88%|████████▊ | 87550/100000 [00:05<00:00, 17819.81it/s]

Tabular Q: Episode 86000: epsilon = 0.01356642206687351, avg reward = 0.98
Tabular Q: Episode 87000: epsilon = 0.012904763723703517, avg reward = 0.99


 89%|████████▉ | 89339/100000 [00:05<00:00, 17838.13it/s]

Tabular Q: Episode 88000: epsilon = 0.012275375625475645, avg reward = 0.98
Tabular Q: Episode 89000: epsilon = 0.011676683895400804, avg reward = 0.99


 91%|█████████ | 91123/100000 [00:05<00:00, 17813.18it/s]

Tabular Q: Episode 90000: epsilon = 0.011107191417438142, avg reward = 0.99
Tabular Q: Episode 91000: epsilon = 0.010565474092537885, avg reward = 0.96


 93%|█████████▎| 92905/100000 [00:05<00:00, 17803.69it/s]

Tabular Q: Episode 92000: epsilon = 0.010050177277473855, avg reward = 0.99
Tabular Q: Episode 93000: epsilon = 0.01, avg reward = 0.97


 95%|█████████▍| 94686/100000 [00:05<00:00, 17594.75it/s]

Tabular Q: Episode 94000: epsilon = 0.01, avg reward = 0.98
Tabular Q: Episode 95000: epsilon = 0.01, avg reward = 0.99


 96%|█████████▋| 96447/100000 [00:05<00:00, 17533.59it/s]

Tabular Q: Episode 96000: epsilon = 0.01, avg reward = 0.98
Tabular Q: Episode 97000: epsilon = 0.01, avg reward = 0.99


 98%|█████████▊| 98265/100000 [00:05<00:00, 17723.00it/s]

Tabular Q: Episode 98000: epsilon = 0.01, avg reward = 0.97
Tabular Q: Episode 99000: epsilon = 0.01, avg reward = 0.98


100%|██████████| 100000/100000 [00:05<00:00, 17153.29it/s]

State: Type -    V(s),    action taken
     0:   S - 0.95, 1-->Down
     1:   F - 0.94, 0-->Left
     2:   H - 0.00, 0-->Left
     3:   F - 0.93, 1-->Down
     4:   F - 0.96, 1-->Down
     5:   H - 0.00, 0-->Left
     6:   F - 0.98, 1-->Down
     7:   F - 0.97, 0-->Left
     8:   F - 0.97, 2-->Right
     9:   F - 0.98, 2-->Right
     10:   F - 0.99, 1-->Down
     11:   H - 0.00, 0-->Left
     12:   F - 0.96, 3-->Up
     13:   H - 0.00, 0-->Left
     14:   F - 1.00, 2-->Right
     15:   G - 0.00, 0-->Left
Final Q function:  [[0.94148015 0.95099005 0.93206535 0.94148015]
 [0.94148015 0.         0.         0.93206535]
 [0.         0.         0.         0.        ]
 [0.         0.92984196 0.49090644 0.23199305]
 [0.95099005 0.96059601 0.         0.94148015]
 [0.         0.         0.         0.        ]
 [0.         0.9801     0.96059601 0.        ]
 [0.970299   0.         0.94446586 0.82700786]
 [0.96059601 0.95099005 0.970299   0.95099005]
 [0.96059601 0.         0.9801     0.        ]
 




# Collecting Trajectories
Here we may need to ensure that the trajectories sufficiently explore each state.\
Thus, we may want to start from specific states when reseting the environment during trajectory collection.

In [9]:
def choose_action(q_table, state, epsilon):
    """Epsilon-greedy action selection."""
    if random.uniform(0, 1) < epsilon:
        return random.randint(0, num_actions - 1)  # Explore
    else:
        return np.argmax(q_table[state, :])  # Exploit

In [10]:
q_table = np.load(f"{runs_folder_path}/Q_table_{filename}.npy")
# env = make_env(env_name=env_name, env_dim=env_dim, seed = seed, stochastic=stochastic)

traj = []
tuple_traj = False
for e in tqdm(range(num_episodes_trajectories)):
    # print(e)
    if modified=="perfect":
        _, _ = env.reset()
        epsilon = 0.01
        state = int(e%16)
    # end if perfect

    elif modified=="random":
        epsilon = 1
        if random.uniform(0, 1) < 0.1:
            if random.uniform(0, 1) < 0.5:
                state = 3
            elif random.uniform(0, 1) < 0.65:
                state = 6
            else:
                state = 7
            # end if choosing state
        else:
            state, _ = env.reset(seed=seed)
    # end if random trajectories
    
    else:
        # continue    
        if e<0.1*num_episodes_trajectories:
          epsilon = 1
        elif e<0.2*num_episodes_trajectories:
          epsilon = 0.8
        elif e<0.3*num_episodes_trajectories:
          epsilon = 0.6
        elif e<0.4*num_episodes_trajectories:
          epsilon = 0.4
        elif e<0.5*num_episodes_trajectories:
          epsilon = 0.2
        elif e<0.6*num_episodes_trajectories:
          epsilon = 0.1
        elif e<0.7*num_episodes_trajectories:
          epsilon = 0.05
        else:
          epsilon = 0.01
        # end epsilon if
        if random.uniform(0, 1) < 0.1:
            if random.uniform(0, 1) < 0.5:
                state = 3
            elif random.uniform(0, 1) < 0.65:
                state = 6
            else:
                state = 7
            # end if choosing state
        else:
            state, _ = env.reset(seed=seed)
    
    if not tuple_traj:
       traj.append('s_'+str(state))
    '''
    Action Stochasticity (is_slippery=True): The seed affects how the agent slips 
    (randomly moves instead of following the chosen action).
    Random Hole Placement (if map is generated dynamically): If the map has random 
    elements, different seeds can affect the placement of H (holes).
    '''
    curr_reward = 0
    rep_count = 0
    for t in range(max_eps_len):
      action = choose_action(q_table, state, epsilon) #To be implemented
      n_state,reward,done,_,_ = env.step(action)
      
      # We store the current tuple
      if tuple_traj:
        traj.append((state, action, reward, n_state, done))
      else:
        temp = 's_'+str(n_state)
        # traj.append('s_'+str(n_state))
        traj.append(temp)

      state = n_state
      curr_reward += reward
      if done:
        if rep_count>=5: # This forces repetitions to occur when done becomes True thereby repeating ending states
            if not tuple_traj:
                if temp=='s_15':
                    traj.append('s_'+str(16))
                else:
                    traj.append('s_'+str(17))
            break
        else:
            if not tuple_traj:
                traj.append(temp)
        rep_count+=1
    # end for
# end for
if tuple_traj:
    save_file_name = f"{runs_folder_path}/modified_tuple_trajectories_{filename}.npy"
    np.save(save_file_name, traj)
else:
    save_file_name = f"{runs_folder_path}/modified_{modified}_trajectories_{filename}.npy"
    np.save(save_file_name, traj)
print(f"Trajectories Saved in {save_file_name}!")

# assert False, "No w2v business here"

100%|██████████| 10000/10000 [00:00<00:00, 18772.43it/s]

Trajectories Saved in mdp/runs_frozen/modified_perfect_trajectories_FrozenLake-v1_map_size_4_stochastic_False_seed_42.npy!





# Performing w2v

In [11]:
traj_file = f"{runs_folder_path}/modified_{modified}_trajectories_{filename}.npy"
print("The trajectories being used are: ", traj_file)

text = np.load(traj_file)
np.random.seed(seed=seed)
torch.manual_seed(seed)

The trajectories being used are:  mdp/runs_frozen/modified_perfect_trajectories_FrozenLake-v1_map_size_4_stochastic_False_seed_42.npy


<torch._C.Generator at 0x7e621dccf010>

### Now we define the functions for word2vec algorithm

In [12]:
def build_vocab(text): # Again we already have a vocabulary hence don't need to use this function directly
    word_counts = Counter(text)
    vocab = {word: i for i, word in enumerate(word_counts.keys())}
    reverse_vocab = {i: word for word, i in vocab.items()}
    return vocab, reverse_vocab, word_counts

def generate_skipgram_pairs(text, window_size=2): # This function gives the word and context pairs.
    # words = tokenize_text(text)
    words = text
    pairs = []
    for i, target_word in enumerate(words):
        window_start = max(i - window_size, 0)
        window_end = min(i + window_size + 1, len(words))
        for j in range(window_start, window_end):
            if i != j:
                pairs.append((words[i], words[j]))
    return pairs

# vocab,_,wcounts = build_vocab(text)
# print("vocab: ", vocab)
# print("word counts: ", wcounts)
# assert False, "Checking the word counts!"

### Word2vec class definition

In [13]:
# ============================================================================================
# Classes: Word2vec Dataset creator and SkipGram model
# ============================================================================================

class Word2VecDataset(Dataset):
    def __init__(self, text, vocab, window_size=2):
        self.vocab = vocab
        self.data = generate_skipgram_pairs(text, window_size)
        self.vocab_size = len(vocab) # why is this needed here?

    def __len__(self): # what does this function do?
        return len(self.data)

    def __getitem__(self, idx):
        target, context = self.data[idx]
        ''' Doesn't the above idx reflect the idx numbered pair instead in pairs instead of the pairs
        corresponding to the word at idx? '''
        target_idx = torch.tensor(self.vocab[target], dtype=torch.long)
        context_idx = torch.tensor(self.vocab[context], dtype=torch.long)
        # target_idx = self.vocab[target]
        # context_idx = self.vocab[context]

        return target_idx, context_idx
    

    # ============================================================================================
# Class: SkipGram using softmax over entire vocabulary
# ============================================================================================

class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.embedding_dim = embedding_dim

        # Input word embedding
        self.in_embedding = nn.Embedding(vocab_size, embedding_dim)
        # Output word embedding (used for context words)
        self.out_embedding = nn.Embedding(vocab_size, embedding_dim)

        # Initialize weights (better stability)
        init_range = 0.5 / embedding_dim
        self.in_embedding.weight.data.uniform_(-init_range, init_range)
        self.out_embedding.weight.data.uniform_(-init_range, init_range)

    def forward(self, center_word_idx):
        """Compute word embeddings and softmax probabilities for the context words."""
        center_embed = F.relu(self.in_embedding(center_word_idx))  # Shape: (batch_size, embedding_dim)
        scores = torch.matmul(center_embed, self.out_embedding.weight.T)  # Compute dot product
        y_pred = torch.softmax(scores, dim=1)  # Apply softmax over output vocab
        return y_pred

    def get_word_vector(self, word_idx):
        """Return the learned embedding vector for a given word index."""
        return self.in_embedding(word_idx).detach().cpu().numpy()

### Training function

In [14]:
# ============================================================================================
# Function: to train the w2v skipgram model
# ============================================================================================

def train_skipgram(model, data_loader, epochs=6, lr=0.01, device = device):
    """Train the SkipGram model using Adam optimizer."""
    criterion = nn.CrossEntropyLoss()  # Cross-entropy for multi-class classification
    optimizer = optim.Adam(model.parameters(), lr)

    for epoch in tqdm(range(epochs)):
        total_loss = 0
        for center_word_idx, context_word_idx in data_loader:
            center_word_idx = center_word_idx.to(device)
            context_word_idx = context_word_idx.to(device)

            optimizer.zero_grad()
            y_pred = model(center_word_idx)  # Forward pass
            loss = criterion(y_pred, context_word_idx)  # Compute loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update weights

            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

    return model

### Main function for w2v

In [15]:
# ============================================================================================
#  Main function for calling the w2v agent
# ============================================================================================
# First we create the dataset and dataloader
vocab, _, _ = build_vocab(text)
dataset = Word2VecDataset(text, vocab, window_size)
dataloader = DataLoader(dataset, batch_size, shuffle=True)
model = SkipGram(vocab_size=len(vocab), embedding_dim=embed_dim).to(device)
model = train_skipgram(model, dataloader, epochs = w2v_epochs, lr = w2v_lr)

# ============================================================================================
#  Saving the w2v generated vector embeddings as a dictionary
# ============================================================================================
word_embeddings = {}
for word in vocab:
    word_idx = torch.tensor([vocab[word]], dtype=torch.long).to(device)
    updated_embedding = model.get_word_vector(word_idx)
    # print(f"Updated embedding for '{word}': {updated_embedding}")
    # Store the embedding in the dictionary
    word_embeddings[word] = updated_embedding.flatten()  # Flatten to 1D array

# Saving the w2v model
save_w2v_file = f"{runs_folder_path}/modified_{modified}_w2v_embed_dim_{embed_dim}_{filename}_epochs_{w2v_epochs}.npy"
np.save(save_w2v_file, word_embeddings)
print("W2v vetors stored in: ", save_w2v_file)

  2%|▏         | 1/50 [00:15<12:35, 15.42s/it]

Epoch 1, Loss: 110887.2898


  4%|▍         | 2/50 [00:32<12:58, 16.22s/it]

Epoch 2, Loss: 110893.1169


  6%|▌         | 3/50 [00:49<12:55, 16.49s/it]

Epoch 3, Loss: 110992.9873


  8%|▊         | 4/50 [01:03<12:09, 15.86s/it]

Epoch 4, Loss: 111317.7144


 10%|█         | 5/50 [01:19<11:49, 15.77s/it]

Epoch 5, Loss: 111958.4771


 12%|█▏        | 6/50 [01:34<11:20, 15.46s/it]

Epoch 6, Loss: 111973.5527


 14%|█▍        | 7/50 [01:49<10:57, 15.28s/it]

Epoch 7, Loss: 112233.7298


 16%|█▌        | 8/50 [02:04<10:37, 15.17s/it]

Epoch 8, Loss: 112483.0632


 18%|█▊        | 9/50 [02:19<10:20, 15.14s/it]

Epoch 9, Loss: 112483.0729


 20%|██        | 10/50 [02:34<10:06, 15.16s/it]

Epoch 10, Loss: 112481.9490


 22%|██▏       | 11/50 [02:49<09:48, 15.08s/it]

Epoch 11, Loss: 112482.9863


 24%|██▍       | 12/50 [03:04<09:30, 15.03s/it]

Epoch 12, Loss: 112483.3948


 26%|██▌       | 13/50 [03:19<09:15, 15.02s/it]

Epoch 13, Loss: 112483.3878


 28%|██▊       | 14/50 [03:34<08:58, 14.96s/it]

Epoch 14, Loss: 112482.7575


 30%|███       | 15/50 [03:49<08:46, 15.04s/it]

Epoch 15, Loss: 112481.4751


 32%|███▏      | 16/50 [04:06<08:51, 15.63s/it]

Epoch 16, Loss: 112483.2553


 34%|███▍      | 17/50 [04:21<08:31, 15.50s/it]

Epoch 17, Loss: 112482.7682


 36%|███▌      | 18/50 [04:36<08:09, 15.30s/it]

Epoch 18, Loss: 112482.8302


 38%|███▊      | 19/50 [04:51<07:53, 15.28s/it]

Epoch 19, Loss: 112482.7034


 40%|████      | 20/50 [05:06<07:38, 15.29s/it]

Epoch 20, Loss: 112482.3457


 42%|████▏     | 21/50 [05:23<07:38, 15.82s/it]

Epoch 21, Loss: 112481.9342


 44%|████▍     | 22/50 [05:40<07:32, 16.14s/it]

Epoch 22, Loss: 112482.3104


 46%|████▌     | 23/50 [05:56<07:08, 15.88s/it]

Epoch 23, Loss: 112797.7654


 48%|████▊     | 24/50 [06:11<06:46, 15.64s/it]

Epoch 24, Loss: 112939.6964


 50%|█████     | 25/50 [06:27<06:38, 15.95s/it]

Epoch 25, Loss: 112940.4721


 52%|█████▏    | 26/50 [06:44<06:24, 16.02s/it]

Epoch 26, Loss: 112940.2037


 54%|█████▍    | 27/50 [07:00<06:13, 16.25s/it]

Epoch 27, Loss: 112940.0061


 56%|█████▌    | 28/50 [07:17<06:01, 16.43s/it]

Epoch 28, Loss: 112939.3943


 58%|█████▊    | 29/50 [07:34<05:44, 16.40s/it]

Epoch 29, Loss: 113239.2079


 60%|██████    | 30/50 [07:48<05:19, 15.95s/it]

Epoch 30, Loss: 113417.2709


 62%|██████▏   | 31/50 [08:03<04:57, 15.64s/it]

Epoch 31, Loss: 113417.2871


 64%|██████▍   | 32/50 [08:20<04:44, 15.80s/it]

Epoch 32, Loss: 113417.2807


 66%|██████▌   | 33/50 [08:37<04:36, 16.25s/it]

Epoch 33, Loss: 113417.2742


 68%|██████▊   | 34/50 [08:54<04:24, 16.53s/it]

Epoch 34, Loss: 113417.2474


 70%|███████   | 35/50 [09:11<04:08, 16.58s/it]

Epoch 35, Loss: 113417.2636


 72%|███████▏  | 36/50 [09:27<03:51, 16.55s/it]

Epoch 36, Loss: 113417.2547


 74%|███████▍  | 37/50 [09:43<03:33, 16.44s/it]

Epoch 37, Loss: 113417.2798


 76%|███████▌  | 38/50 [09:59<03:14, 16.21s/it]

Epoch 38, Loss: 113417.2628


 78%|███████▊  | 39/50 [10:14<02:54, 15.84s/it]

Epoch 39, Loss: 113417.2969


 80%|████████  | 40/50 [10:29<02:36, 15.61s/it]

Epoch 40, Loss: 113417.2790


 82%|████████▏ | 41/50 [10:45<02:21, 15.70s/it]

Epoch 41, Loss: 113417.2806


 84%|████████▍ | 42/50 [11:00<02:03, 15.44s/it]

Epoch 42, Loss: 113417.2644


 86%|████████▌ | 43/50 [11:15<01:46, 15.28s/it]

Epoch 43, Loss: 113417.2782


 88%|████████▊ | 44/50 [11:30<01:32, 15.38s/it]

Epoch 44, Loss: 113417.2969


 90%|█████████ | 45/50 [11:45<01:16, 15.21s/it]

Epoch 45, Loss: 113417.2945


 92%|█████████▏| 46/50 [12:00<01:00, 15.14s/it]

Epoch 46, Loss: 113417.2807


 94%|█████████▍| 47/50 [12:15<00:45, 15.07s/it]

Epoch 47, Loss: 113417.2457


 96%|█████████▌| 48/50 [12:30<00:30, 15.02s/it]

Epoch 48, Loss: 113417.2645


 98%|█████████▊| 49/50 [12:45<00:14, 14.95s/it]

Epoch 49, Loss: 113417.2709


100%|██████████| 50/50 [13:00<00:00, 15.61s/it]

Epoch 50, Loss: 113417.2790
W2v vetors stored in:  mdp/runs_frozen/modified_perfect_w2v_embed_dim_32_FrozenLake-v1_map_size_4_stochastic_False_seed_42_epochs_50.npy





### Similarity check

In [16]:
# ============================================================================================
# Similarity checking and visualizing 
# ============================================================================================
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity

# Load saved embeddings
word_embeddings = np.load(f"{runs_folder_path}/modified_{modified}_w2v_embed_dim_{embed_dim}_{filename}_epochs_{w2v_epochs}.npy", 
                          allow_pickle=True).item()

# Convert to a NumPy array for fast computation
words = list(word_embeddings.keys())
vectors = np.array(list(word_embeddings.values()))

# Compute the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(vectors)

# Function to find top-N similar words
def find_similar_words(target_word, top_n=5):
    if target_word not in word_embeddings:
        print(f"Word '{target_word}' not found in vocabulary.")
        return []

    # Get index of target word
    target_idx = words.index(target_word)

    # Get similarity scores for the target word
    similarity_scores = cosine_sim_matrix[target_idx]

    # Get top-N most similar words (excluding itself)
    similar_indices = np.argsort(similarity_scores)[::-1][1:top_n+1]  # Sort in descending order

    # Return words with their similarity scores
    return [(words[i], similarity_scores[i]) for i in similar_indices]

# Example usage
for target_word in word_embeddings:
    top_similar_words = find_similar_words(target_word, top_n=5)

    print(f"Top 5 words similar to '{target_word}':")
    for word, score in top_similar_words:
        print(f"{word}: {score:.4f}")

Top 5 words similar to 's_0':
s_7: 0.8252
s_9: 0.5571
s_6: 0.5460
s_8: 0.5050
s_12: 0.4732
Top 5 words similar to 's_4':
s_10: 0.5551
s_1: 0.4990
s_0: 0.4210
s_7: 0.4177
s_5: 0.4088
Top 5 words similar to 's_8':
s_11: 0.6781
s_10: 0.5752
s_5: 0.5568
s_3: 0.5273
s_0: 0.5050
Top 5 words similar to 's_9':
s_0: 0.5571
s_5: 0.5321
s_7: 0.4793
s_6: 0.4463
s_1: 0.4430
Top 5 words similar to 's_10':
s_8: 0.5752
s_5: 0.5638
s_4: 0.5551
s_2: 0.5457
s_3: 0.5308
Top 5 words similar to 's_14':
s_16: 0.3096
s_15: 0.0496
s_13: -0.0069
s_17: -0.0252
s_9: -0.0977
Top 5 words similar to 's_15':
s_16: 0.3565
s_14: 0.0496
s_13: -0.0071
s_17: -0.0441
s_8: -0.0796
Top 5 words similar to 's_16':
s_15: 0.3565
s_14: 0.3096
s_17: 0.2524
s_13: -0.0040
s_9: -0.1480
Top 5 words similar to 's_1':
s_5: 0.5043
s_4: 0.4990
s_2: 0.4935
s_9: 0.4430
s_12: 0.4274
Top 5 words similar to 's_11':
s_8: 0.6781
s_2: 0.6654
s_3: 0.6033
s_5: 0.5503
s_7: 0.5113
Top 5 words similar to 's_17':
s_16: 0.2524
s_6: 0.0410
s_8: 0.0119
s_

# Now we need to ensure that the same states are sampled to compare the performance of w2v imbued DQN and Vanilla DQN. 
Thus we reset np, random and torch.manual_seed before starting both the following cells

# DQN common set up

### Defining all classes
We define the following classes:
1. Experience Replay
2. DQNAgent 

### Training using w2v

### Vanilla DQN (without w2v)