Build Environment

# 2048 Game project
* Matteo Zhang
* Arvin Firouzi

## Importing libraries
We use numpy instead of python general list since it is faster than python general list.

In [12]:
import numpy as np
from tabulate import tabulate  # for rendering board
from enum import Enum
from random import randint, choice
from copy import copy, deepcopy

## Defining the game environment

For the puprpose of this project we just implemented 2x2 borad with target 16.


In [13]:

# environment possible actions: swipe to left, right, up, down
class Action(Enum):
    def __str__(self):
        return self.name
    Left = 1
    Right = 2
    Up = 3
    Down = 4

class GameEnvironment:
    def __init__(self, board_size=3, target=64, initial_state=None, calculate_possible_states=True):
        # dynamic board size
        self.board_size = board_size
        self.won = False
        if initial_state == None:
            # start with empty board
            self.__initial_state = np.zeros([board_size, board_size] ,int)
            # generate 2 random tiles on board
            for i in range(2):
                self.__initial_state = self.__generate_new_tile(self.__initial_state)
            
        else:
            # copy to prevent aliassing
            self.__initial_state = copy(initial_state)

        self.target = target
        self.__state = self.__initial_state
        self.__possible_states = []
        # during performance assessment, it's not required
        if calculate_possible_states:
            self.__calculate_possible_states_from_different_initial_states(self.__initial_state)
        # print(len(self.__possible_states))
        
    def __calculate_possible_states_from_different_initial_states(self, state:np.ndarray = None):
        initial_state_seeds = []
        for n in range(30):
            # generate new initial state
            initial_state = np.zeros([self.board_size, self.board_size] ,int)
            # generate unique initial states with 2 random tiles on board
            i = 0
            while i < 20:
                i+=1
                
                for _ in range(2):
                    initial_state = self.__generate_new_tile(initial_state)
                if not self.contains(initial_state_seeds, initial_state):
                    initial_state_seeds.append(initial_state)
                    break
            
            # print(f"state seed {n}: {initial_state}")
            self.__calculate_possible_states(initial_state, depth=2, do_possible_states=True)
            
        
    
    def __calculate_possible_states(self, state:np.ndarray=None, action=None, depth=5, do_singleton=False, do_possible_states=False):
        tile_2_depth = copy(depth)
        tile_4_depth = copy(depth)
        
        if state is None:
            state = self.__initial_state
        
        if action == None:
            possible_actions = self.get_possible_actions(all_actions=True)
        else:
            possible_actions = [action]
            
        # possible states returned to get_transition_prob
        singleton_possible_states = []
            
        # get all possible actions 
        for action in possible_actions:
        # calculate the outcome state
            outcome_state = self.__calculate_transition(action, state, new_tile=False)
        # append to the self.__possible_states (to be used in the utility function)
            empty_tiles = self.get_empty_tiles(outcome_state)
            if len(empty_tiles) > 0:
                # generate new tile at random empty cell
                for tile in empty_tiles:
                    new_state = deepcopy(outcome_state)   
                    #if random generated tile is 2
                    new_state[tile[0]][tile[1]] = 2
                    temp_state = deepcopy(new_state)
                    if do_possible_states and not self.contains(self.__possible_states, temp_state):
                        self.__possible_states.append(temp_state)
                    
                    if do_singleton:
                        singleton_possible_states.append(temp_state)
                    if not self.is_done(temp_state) and tile_2_depth > 0:
                        for calculated_step in self.__calculate_possible_states(deepcopy(temp_state),depth = tile_2_depth - 1, do_singleton=do_singleton, do_possible_states=do_possible_states):
                            singleton_possible_states.append(calculated_step)
                    
                    #if random generated tile is 4
                    new_state[tile[0]][tile[1]] = 4
                    temp_state = deepcopy(new_state)
                    if singleton_possible_states and not self.contains(self.__possible_states, temp_state):
                        self.__possible_states.append(temp_state)
                    
                    if do_singleton:
                        singleton_possible_states.append(temp_state)
                    if not self.is_done(temp_state) and tile_4_depth > 0:
                        for calculated_step in self.__calculate_possible_states(deepcopy(temp_state),depth = tile_4_depth - 1, do_singleton=do_singleton, do_possible_states=do_possible_states):
                            singleton_possible_states.append(calculated_step)
        

        # print(f"possible singleton states: {len(singleton_possible_states)}")
        return singleton_possible_states
        
    def reset(self):
        self.won = False
        self.__state = self.__initial_state
        return self.__state

    # perform action on environment
    def __calculate_transition(self, action:Action, state:np.ndarray=None, new_tile:bool = True):        
        if state is None:
            new_state = self.__state
        else:
            new_state = deepcopy(state)
            
        if self.is_done(state):
            return new_state

        # 1. change the state to reflect the move by the agent,
        # 2. merge same value tiles

        # swipe to left
        if action == Action.Left:
            new_state = self.swipeToLeft(new_state)
            new_state= self.mergeToLeft(new_state)
        # swipe to right
        elif action == Action.Right:
            new_state = self.swipeToRight(new_state)
            new_state = self.mergeToRight(new_state)
        elif action == Action.Up:
            # take transpose, swipe, then re-take transpose
            temp_state = self.transpose(new_state)
            temp_state = self.swipeToLeft(temp_state)
            temp_state = self.mergeToLeft(temp_state)
            new_state = self.transpose(temp_state)
        elif action == Action.Down:
            # take transpose
            temp_state = self.transpose(new_state)
            temp_state = self.swipeToRight(temp_state)
            temp_state = self.mergeToRight(temp_state)
            new_state = self.transpose(temp_state)
        
        # 3. generate a new tile on empty cells
        if new_tile:
            new_state = self.__generate_new_tile(new_state)
            
        return new_state

    def __generate_new_tile(self, state):
        new_state = deepcopy(state)
        empty_state = self.get_empty_tiles(new_state)

        if len(empty_state) > 0:
            # possible generated tile values
            possible_gen_tiles = [2, 4]
            # generate new tile at random empty cell
            row, col = choice(empty_state)
            new_state[row][col] = possible_gen_tiles[randint(0, 1)]
        return new_state
        
    def swipeToLeft(self, state):
        for i in range(self.board_size):
            for j in range(self.board_size - 1):
                # [0,2,2]: if current cell is empty, swap with the right one
                if state[i][j] == 0:

                    # k is the offset of the first found tile
                    for k in range(1, self.board_size - j):
                        if state[i][j + k] != 0:
                            self.swap(state, i, j, i, j + k)
                            break
        return state

    def mergeToLeft(self, state):
        # merge same tiles together
        for i in range(self.board_size):
            for j in range(self.board_size - 1):
                current_tile = state[i][j]
                if current_tile != 0:
                    right_tile = state[i][j + 1]
                    if right_tile == current_tile:
                        # merge same tiles together
                        state[i][j] = current_tile * 2
                        state[i][j + 1] = 0
                        # shift to the left other tiles
                        for k in range(j + 1, self.board_size - 1):
                            # current tile equal right tile
                            state[i][j + k] = state[i][j + k + 1]

                        # last cell is empty
                        state[i][self.board_size - 1] = 0

        return state

    def swipeToRight(self, state):
        for i in range(self.board_size):
            for j in reversed(range(1, self.board_size)):
                # [2,2,0]: if current cell is empty, swap with the left one
                if state[i][j] == 0:

                    # k is the offset of the first found tile
                    for k in range(1, j + 1):
                        if state[i][j - k] != 0:

                            self.swap(state, i, j, i, j - k)
                            break
        return state

    def mergeToRight(self, state):
        # merge same tiles together
        for i in range(self.board_size):
            for j in reversed(range(1, self.board_size)):
                current_tile = state[i][j]
                if current_tile != 0:
                    left_tile = state[i][j - 1]
                    if left_tile == current_tile:
                        # merge same tiles together
                        state[i][j] = current_tile * 2
                        state[i][j - 1] = 0
                        # shift to the right other tiles
                        for k in reversed(range(1, j - 1)):
                            # current tile equal right tile
                            state[i][j - k] = state[i][j - k - 1]
                        # first cell is empty
                        state[i][0] = 0

        return state

    def transpose(self, array):
        transposed_array = np.transpose(array)
        return transposed_array

    def swap(self, state, x1, y1, x2, y2):
        # x and y are the position of the board matrix
        z = state[x1][y1]
        state[x1][y1] = state[x2][y2]
        state[x2][y2] = z

    # unit step on environment
    def step(self, action):
        old_state = self.__state
        # state after agent action
        self.__state = self.__calculate_transition(action)
        observation = self.__state  # environment is fully observable
        done = self.is_done()
        reward = self.get_reward(self.__state)
        info = {}  # optional    debug info
        return observation, done, reward, info

    # render environment (board) on CLI
    def render(self,state:np.ndarray = None):
        if state is None:
            state = deepcopy(self.__state)
        print_state = []
        for item in state:
            print_state.append(['' if x==0 else x for x in item])
        print(tabulate(print_state, tablefmt="grid"))

    def get_state(self):
        return self.__state

    def get_possible_states(self):
        return self.__possible_states

    # get index of empty cells
    def get_empty_tiles(self, state=None):
        if state is None:
            state = self.__state
        empty_cells = []
        for i in range(self.board_size):
            for j in range(self.board_size):
                if state[i][j] == 0:
                    empty_cells.append([i, j])

        return empty_cells

    def get_possible_actions(self, old_state:np.ndarray = None, all_actions=False):
        if old_state is None:
            old_state = copy(self.__initial_state)

        if self.is_done(old_state):
            return []        
        
        if all_actions:
            return [Action.Left, Action.Right, Action.Up, Action.Down]
        
        possible_actions = []
        
        # Check whether 'swipe left' is possible or not
        state = deepcopy(old_state)
        state = self.swipeToLeft(state)
        break_out_flag = False
        for i in range(self.board_size):
            for j in range(self.board_size - 1):
                current_tile = state[i][j]
                if current_tile != 0:
                    right_tile = state[i][j + 1]
                    if right_tile == current_tile:
                        # left swipe merge is possible
                        possible_actions.append(Action.Left)
                        
                        # exit from nested loop
                        break_out_flag = True
                        break
                        
            if break_out_flag:
                break
                
        # Check whether 'swipe right' is possible or not
        state = deepcopy(old_state)
        state = self.swipeToRight(state)
        break_out_flag = False
        for i in range(self.board_size):
            for j in reversed(range(1, self.board_size)):
                current_tile = state[i][j]
                if current_tile != 0:
                    left_tile = state[i][j - 1]
                    if left_tile == current_tile:
                        # right swipe merge is possible
                        possible_actions.append(Action.Right)
                        
                        # exit from nested loop
                        break_out_flag = True
                        break
                        
            if break_out_flag:
                break
 
        # Check whether 'swipe up' is possible or not
        state = deepcopy(old_state)
        state = self.transpose(state)
        state = self.swipeToLeft(state)
        break_out_flag = False
        for i in range(self.board_size):
            for j in range(self.board_size - 1):
                current_tile = state[i][j]
                if current_tile != 0:
                    right_tile = state[i][j + 1]
                    if right_tile == current_tile:
                        # left swipe merge is possible
                        possible_actions.append(Action.Up)
                        
                        # exit from nested loop
                        break_out_flag = True
                        break
                        
            if break_out_flag:
                break
        state = self.transpose(state)
 
        # Check whether 'swipe down' is possible or not
        state = deepcopy(old_state)
        state = self.transpose(state)
        state = self.swipeToRight(state)
        break_out_flag = False
        for i in range(self.board_size):
            for j in reversed(range(1, self.board_size)):
                current_tile = state[i][j]
                if current_tile != 0:
                    left_tile = state[i][j - 1]
                    if left_tile == current_tile:
                        # right swipe merge is possible
                        possible_actions.append(Action.Down)
                        
                        # exit from nested loop
                        break_out_flag = True
                        break
                        
            if break_out_flag:
                break
        state = self.transpose(state)
        
        return possible_actions        
        
    # determine wheter the game is over
    # either: when all cells are occupied and no more merging is possible,
    # or 2048 tile is generated
    def is_done(self, state:np.ndarray = None):
        if state is None:
            state = self.__state

        # detect if a tile has target value (e.g. 2048)
        for i in range(self.board_size):
            for j in range(self.board_size):
                if self.__state[i][j] == self.target:
                    return True

        for i in range(self.board_size):
            for j in range(self.board_size):
                if self.__state[i][j] == 0:
                    return False

        # check if all cells are occupied and no more merging is possible
        if 0 not in state:
            # no more merging is possible
            for i in range(self.board_size - 1):
                for j in range(self.board_size - 1):
                    if (state[i][j] == state[i + 1][j]) or (
                        state[i][j] == state[i][j + 1]
                    ):
                        return False
            # check bottom row
            for j in range(self.board_size - 1):
                if state[self.board_size - 1][j] == state[self.board_size - 1][j + 1]:
                    return False

            # check rightmost column
            for i in range(self.board_size - 1):
                if state[i][self.board_size - 1] == state[i + 1][self.board_size - 1]:
                    return False

            return True

        return False

    def has_won(self, state=None):
        if state is None:
            state = self.stat
    
        # detect if a tile has target value (e.g. 2048)
        for i in range(self.board_size):
            for j in range(self.board_size):
                if state[i][j] == self.target:
                    return True
                    
        return False

    # Reward R(s) for every possible state [-1,1]
    def get_reward(self, state):
        # detect tile with target value (e.g. 2048 tile)
        for i in range(self.board_size):
            for j in range(self.board_size):
                if state[i][j] == self.target:
                    return 1
        
        score_reward = 0.0
        
        score_reward = np.array(state).sum()/(self.target + (self.target/4)*(self.board_size**2))
        
        # check if all cells are occupied and no more merging is possible
        if not self.is_done():
            return score_reward
            
        # game is done
        return -1

    def get_transition_prob(self, action, new_state, old_state=None):
        if old_state is None:
            old_state = self.__state

        # if the game is over, no transition can take place
        if self.is_done(old_state):
            return 0.0
        
        # calculate possible states
        possible_states_after_action = self.__calculate_possible_states(deepcopy(old_state), action, depth=0, do_singleton=True)
        
        # transition probabilities
        prob = 0
        if possible_states_after_action is not None:
            if not self.contains(possible_states_after_action, new_state):
                return 0.0
            prob = self.count(possible_states_after_action, new_state) / (len(possible_states_after_action))
        
        return prob
        
    def contains(self, list, value):
        for x in list:
            if np.array_equal(x, value):
                return True
        return False
        
    def count(self, list, value):
        count = 0
        for x in list:
            if np.array_equal(x, value):
                count += 1
        return count    

## Create a new environment
In the upcoming cell we create a new environment and count all possible states.

In [14]:
mdp = GameEnvironment(2, 16)
print(len(mdp.get_possible_states()))

209


## Calculating transition property

We create a new instance of game with a given initial state. Then we will calculcate the probability that we can reach a desired state after one action.

In [15]:
# mdp = GameEnvironment(3, 64, [[0, 2, 2], [0, 0, 0], [0, 0, 2]])
mdp = GameEnvironment(2, 16, [[0, 2], [0, 2]], calculate_possible_states=False)
new_state = [[0, 2], [0, 4]]
prob = []
for action in mdp.get_possible_actions(all_actions=True):
    Iprob = mdp.get_transition_prob(action, new_state)*100
    print(f"{Iprob:3.2f}%, action: {action}")
    prob.append(Iprob)
    
# mdp.render()
# mdp.render(new_state)

0.00%, action: Left
0.00%, action: Right
0.00%, action: Up
16.67%, action: Down


## Playing with random agent

In this block a random agent will play the game. We print every state.

In [16]:
# example of creation of an environment in the default state
from IPython.display import clear_output
import time

def iterativeOutput(flag):
    if flag:
        time.sleep(1)
        clear_output(wait=True)

# uncomment different board configurations
# mdp = GameEnvironment(3, 64, [[0, 2, 2], [0, 0, 0], [0, 0, 0]], calculate_possible_states=False)
mdp = GameEnvironment(3, 64, calculate_possible_states=False)
# mdp = GameEnvironment(2, 16, calculate_possible_states=False)

update_console = True # update console while playing
i = 1
while not mdp.is_done():
    action = randint(1,4) # random choice
    state, done, reward, info = mdp.step(Action(action))
    print(f"step {i}) Action taken: {Action(action)}, is done: {done}")
    mdp.render()
    i=i+1
    iterativeOutput(update_console)

print('state =', state, ', reward =', reward, ', done =', done)


state = [[ 4  2  4]
 [ 8  4 32]
 [ 4  2  8]] , reward = -1 , done = True


## Progress Bar
The progress bar will help us to reach a better understanding of the utility calculation procedure.

In [17]:
# Print iterations progress
def printProgressBar (iteration, total, prefix = 'Progress', suffix = 'Complete', decimals = 1, length = 50, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()


## Calculating Utility and Q-Value

In the upcoming block we define the utility function to calculate the utility value for each possible state. 
Then we use __Value Iteration__  to solve the Bellman equation.

In [18]:
class Utility:
    def __init__(self, state, value):
        self.__state = state
        self.value = value
        
    def get_state(self):
        return self.__state
    
    @staticmethod
    def findUtilityValue(list, search_state):
        for x in list:
            if np.array_equal(x.__state, search_state):
                # print(x, search_state)
                return x.value
        return None
    @staticmethod
    def findUtilityIndex(list, search_state):
        index = 0
        for x in list:
            if np.array_equal(x.__state, search_state):
                # print(x.__state, search_state)
            
                return index
            index += 1 
        return None

def get_initial_U(mdp):
    U = []
    
    for s in possible_states:
        U.append(Utility(s, mdp.get_reward(s))) 
    return U
    
def Q_Value(mdp, s, a, U):
    Q = 0.0
    
    for s_p in possible_states:
        P = mdp.get_transition_prob(a, s_p, s)
        if P == 0:
            continue
        R = mdp.get_reward(s_p)
        utilityValue = Utility.findUtilityValue(U, s_p)
        if utilityValue is None:
            utilityValue = 0
        Q += P * (R + utilityValue)
        # print(f"Q={Q} , step={i}")
        # i+=1
    return Q

def ValueIteration(mdp, error=0.00001):
    # from AIMA 4th edition without discount gamma 
    U_p = get_initial_U(mdp) # U_p = U'
    print("Created first verison of Utility function")
    delta = float('inf')
    
    iter = 1
    
    while delta > error:
        print(f"Iteration for Utility function: {iter}")
        
        U = deepcopy(U_p)
        
        # print_U(U)  # to illustrate the iteration process
        delta = 0
        for step, s in enumerate(possible_states, 1):
            max_a = float('-inf')
            possible_actions = mdp.get_possible_actions(s)
            for a in possible_actions:
                q = Q_Value(mdp, s, a, U)
                if q > max_a:
                    max_a = q
        
            U_p_index = step - 1
        
            U_p[U_p_index].value = max_a
            
            U_p_value = U_p[U_p_index].value
            U_value = U[Utility.findUtilityIndex(U, s)].value
            
            if abs(U_p_value - U_value) > delta:
                delta = abs(U_p_value - U_value)
                print("delta ", delta)
            printProgressBar(step, len(possible_states))
            
        iter += 1
        # test
        break
    return U_p
    
# show iterative process
def print_U(U):
    for n, utility in enumerate(U, 1):
        print(f"Utility nr. {n:2} = State: {utility.get_state()}; Value: {utility.value:4.2f}")
        
    
def print_policy(pi):
    print('Optimal Policy:')
    for y in pi:
        print(f"State: {y.get_state()}; Action: {y.value}")

print("Calcualte possible states")
mdp = GameEnvironment(2, 16)
print("start Value iteration for Utility")
possible_states = deepcopy(mdp.get_possible_states())
print(f"calculating utility against {len(possible_states)} possible states")

# U(s) = max_a(Q(s,a))
U = ValueIteration(mdp)
print("value iteration done\n")
# print(U)

# pi_star(s) = argmax_a(Q(s,a))
pi_star = []
for step, s in enumerate(possible_states, 1):
    if mdp.is_done(s):
        continue # policy is not needed in stop states
    max_a = float('-inf')
    argmax_a = None
    for action in Action:
        q = Q_Value(mdp, s, action, U) 
        if q > max_a:
            max_a = q
            argmax_a = action
    pi_star.append(Utility(s, argmax_a))
    printProgressBar(step, len(possible_states))
    
    
print("Optimal policy done")
print_policy(pi_star)

Calcualte possible states
start Value iteration for Utility
calculating utility against 212 possible states
Created first verison of Utility function
Iteration for Utility function: 1
delta  0.28125
delta  inf--------------------------------------------------| 0.9% Complete
Progress |██████████████████████████████████████████████████| 100.0% Complete
value iteration done

Progress |██████████████████████████████████████████████████| 100.0% Complete
Optimal policy done
Optimal Policy:
State: [[4 2]
 [4 0]]; Action: Left
State: [[4 2]
 [4 2]]; Action: Left
State: [[8 4]
 [2 0]]; Action: Left
State: [[8 4]
 [0 2]]; Action: Left
State: [[2 0]
 [8 4]]; Action: Left
State: [[0 2]
 [8 4]]; Action: Left
State: [[4 2]
 [8 2]]; Action: Up
State: [[4 2]
 [2 8]]; Action: Left
State: [[8 2]
 [2 4]]; Action: Left
State: [[2 2]
 [8 4]]; Action: Right
State: [[4 2]
 [2 4]]; Action: Left
State: [[8 2]
 [2 0]]; Action: Right
State: [[8 2]
 [2 2]]; Action: Left
State: [[8 2]
 [4 2]]; Action: Down
State: 

## Winning rate and random action
After defining the game environment and calculating the utility values, we will run sample games to calculate the winning rate.

The only issue that remained in our model is that sometimes it can't find the right policy to run. Hence we have to take a random action.

In [19]:
from statistics import mean, stdev

def optimal_policy(state):
    return Utility.findUtilityValue(pi_star, state)

def run_one_episode(policy, episode, max_steps=-1):
    # create game board with 2 random tiles  
    mdp = GameEnvironment(2, 16, calculate_possible_states=False)
    state = mdp.reset()
    # mdp.render()

    total_reward = 0.0
    done = False
    step = 1
    random_actions = 0
    while not done:
        next_action = policy(state)
        if next_action == None:
            next_action = Action(randint(1,4))
            random_actions += 1
        mdp.render()
        state, done, reward, info = mdp.step(next_action)
        # print(state, next_action, done, reward, info)
        total_reward += reward
        step += 1
        if max_steps != -1:
            if step > max_steps:
                break
    
    return total_reward, mdp.has_won(mdp.get_state()), random_actions/step

def measure_performance(policy, nr_episodes=10):
    N = nr_episodes
    print('statistics over', N, 'episodes')
    all_rewards = []
    for n in range(1, N+1):
        episode_reward, has_won, random_actions = run_one_episode(policy, n, max_steps=500)
        # print('episode:', n, 'reward:', episode_reward)
        all_rewards.append([episode_reward, has_won, random_actions])
        print('ep: {:3d}, has won: {:1}, total reward: {:5.2f}, random actions took: {:3.2f}%'.format(n, has_won, episode_reward, random_actions*100))
    
    win_ratio = 0
    for x in all_rewards:
        if x[1]:
            win_ratio += 1
    win_ratio = (win_ratio / len(all_rewards)) * 100
    
    random_actions_ratio = 0
    for x in all_rewards:
        random_actions_ratio += x[2]
    random_actions_ratio = (random_actions_ratio / len(all_rewards)) * 100
    
    print('win ratio: {:3.2f}%; mean: {:6.2f}, sigma: {:6.2f}, random actions ratio: {:3.2f}%'.format(win_ratio, mean(all_rewards[0]), stdev(all_rewards[0]), random_actions_ratio))
    print()
    # for n, el in enumerate(all_rewards, 1):
    #     print('ep: {:3d}, has won: {:1}, total reward: {:5.2f}, random actions took: {:3.2f}'.format(n, el[1], el[0], el[2]))

measure_performance(optimal_policy, nr_episodes = 50)

statistics over 50 episodes
+--+---+
|  | 4 |
+--+---+
|  | 2 |
+--+---+
+---+---+
|   | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+---+
| 2 | 2 |
+---+---+
+---+---+
| 4 | 4 |
+---+--

## References
* [About the reward function design](https://www.youtube.com/watch?v=0R3PnJEisqk&ab_channel=Bonsai )
* [2048 with MDP](https://jdlm.info/articles/2018/03/18/markov-decision-process-2048.html) (we used this article to grasp the general idea about how MDP can be used to model 2048 game)
* [Counting states in 2048](https://jdlm.info/articles/2017/09/17/counting-states-combinatorics-2048.html)
