# In [None]:
import numpy as np
import random
import matplotlib.pyplot as plt

# When True, the helpers in this file print step-by-step diagnostics.
use_debug_logging = False

# Grid dimensions: number of columns (x) and number of rows (y).
grid_width = 6
grid_height = 6   
# Priority values from which each incoming container is drawn at random.
container_priorties=[1,2]
# Number of independent random placement runs performed by the driver loop.
num_runs = 500

def action_to_coords( action ):
    """Convert a flat action id into ``[x, y]`` grid coordinates.

    The column is the remainder of ``action`` divided by ``grid_width``;
    the row is the quotient, truncated with ``int``.
    """
    column = action % grid_width
    row = int( action / grid_width )
    return [ column, row ]

class Action:
    """A candidate placement of one container into one grid cell.

    Attributes:
        x: column of the target cell.
        y: row of the target cell.
        action_id: flat index of the cell; the default -1 marks "no action".
        container_priority: priority of the container being placed.
    """

    def __init__(self, x=0, y=0, action_id=-1, container_priority=0):
        # All arguments are optional so that a bare ``Action()`` still yields
        # the "no action" sentinel with the same field values as before.
        self.x = x
        self.y = y
        self.action_id = action_id
        self.container_priority = container_priority

    def __repr__(self):
        # Readable form for debug logging instead of the default object address.
        return "Action(x={}, y={}, action_id={}, container_priority={})".format(
            self.x, self.y, self.action_id, self.container_priority
        )

def create_action( container_priority, x, y ):
    """Build an ``Action`` that places a container at column ``x``, row ``y``.

    Args:
        container_priority: priority of the container to place.
        x: target column.
        y: target row.

    Returns:
        An ``Action`` whose ``action_id`` is the row-major index
        ``y * grid_width + x``.
    """
    action = Action()
    action.x = x
    action.y = y
    action.action_id = y * grid_width + x
    # The priority argument used to be dropped, leaving the default 0 on
    # every action; record it so the action describes the whole move.
    action.container_priority = container_priority
    return action

def is_empty_contraint( state, action ):
    """Return whether the cell targeted by ``action`` holds the value 0."""
    target_row = state[action.y]
    occupant = target_row[action.x]
    return occupant == 0
    
def is_not_floating_constraint( state, action ):
    """Return whether the target cell is supported from below.

    A cell in row 0 is always supported; any other cell is supported when the
    cell directly beneath it (same column, previous row) is non-zero.
    """
    on_bottom_row = action.y == 0
    if on_bottom_row:
        return on_bottom_row
    cell_below = state[action.y-1][action.x]
    return cell_below != 0
    
def get_possible_actions( container_priority, state ):
    """List every placement that is currently legal.

    Cells are visited row by row (row 0 first), left to right within a row.
    A cell qualifies when it is empty and is not floating.
    """
    # Lazy generator: each candidate is built right before it is checked.
    candidates = (
        create_action( container_priority, col, row )
        for row in range(grid_height)
        for col in range(grid_width)
    )
    return [
        candidate
        for candidate in candidates
        if is_empty_contraint( state, candidate )
        and is_not_floating_constraint( state, candidate )
    ]

def choose_action( container_priority, state ):
    """Pick one legal placement uniformly at random.

    Returns a default ``Action()`` (whose ``action_id`` is -1) when no legal
    placement exists.
    """
    candidates = get_possible_actions( container_priority, state )
    if not candidates:
        return Action()

    verbose = use_debug_logging == True
    if verbose:
        print("possible actions: ")
        # every entry is followed by a single space, then the line is ended
        print("".join( str(candidate) + " " for candidate in candidates ))

    picked = random.choice( candidates )

    if verbose:
        message = "".join([
            "choosing action: ", str(picked),
            " coords: ", str(picked.x), ",", str(picked.y),
        ])
        print(message)

    return picked

def process_action( state, container_prio, action):
    """Place a container into the grid and score the move.

    Args:
        state: grid indexed as ``state[y][x]``.
        container_prio: priority value written into the target cell.
        action: object whose ``x`` and ``y`` attributes name the target cell.

    Returns:
        ``(new_state, reward)``. ``new_state`` is a copy of ``state`` with the
        container placed; the input grid is left untouched. ``reward`` is 5,
        reduced by 16 when the container is stacked (``y > 0``) on a cell
        whose value is greater than ``container_prio``.
    """
    # Copy first: assigning through a plain alias would also modify the
    # caller's grid.
    new_state = np.array( state, copy=True )
    new_state[ action.y ][ action.x ] = container_prio

    reward = 5
    if action.y > 0 and container_prio < state[ action.y - 1 ][ action.x ]:
        # NOTE(review): this 8-point penalty used to be subtracted in two
        # identical checks; the combined value of 16 is kept so scores do not
        # change. Confirm whether a single 8 was intended.
        reward -= 16

    return new_state, reward

def initial_state():
    """Return an empty grid: a zero-filled array indexed as ``state[y][x]``.

    The first axis is the row (``grid_height`` entries) and the second the
    column (``grid_width`` entries), matching how the rest of the file indexes
    the grid. The previous ``(grid_width, grid_height)`` shape was only
    correct for square grids.
    """
    return np.zeros( ( grid_height, grid_width ) )

def print_state( state ):
    """Print the grid to stdout, highest row first.

    Each cell is converted to ``int`` and followed by one space; every row
    ends with a newline.
    """
    for row in reversed( range(grid_height) ):
        cells = [ str(int(state[row][col])) for col in range(grid_width) ]
        print("".join( cell + " " for cell in cells ))
          
def render( state, action, reward ):
    """Print the grid after a step when debug logging is enabled.

    Args:
        state: grid to display.
        action: unused; kept so existing callers keep working.
        reward: unused; kept so existing callers keep working.
    """
    if use_debug_logging == True:
        print("new state:")
        # Reuse the shared grid printer rather than repeating its loops here.
        print_state( state )
    
def place_containers():
    """Fill a fresh grid by placing randomly prioritised containers at random.

    A priority is drawn and a random legal cell is chosen on every step, until
    no legal cell remains.

    Returns:
        ``(input_containers, total_reward, state)``: the priorities in the
        order they were placed, the sum of all step rewards, and the final
        grid.
    """
    input_containers = []
    total_reward = 0
    state = initial_state()
    while True:
        container_priority = random.choice(container_priorties)

        # choose a random action; action_id == -1 means no cell is available
        action = choose_action( container_priority, state )
        if action.action_id == -1:
            break

        # take the action and get the information from the environment
        state, reward = process_action( state, container_priority, action )
        total_reward += reward

        # show the new state once per step (it used to be rendered twice,
        # which duplicated the debug output)
        render( state=state, action=action, reward=reward )

        input_containers.append( container_priority )
    return input_containers, total_reward, state

# Driver: perform num_runs random placement runs, remember the best one,
# and plot the score of every run.
scores = []
states = []

# Start below any possible score so the first run always becomes the initial
# best, even when every run scores zero or less.
best_score = float("-inf")
best_state = initial_state()
best_input = []

for i in range( num_runs ):
    if i % 10 == 0:
        print("run: " + str(i))
    input_containers, score, state = place_containers()
    if use_debug_logging == True:
        print("score: " + str(score) )
    scores.append( score )
    states.append( state )
    # strict comparison: on ties the earliest run stays the best
    if score > best_score:
        best_score = score
        best_state = state
        best_input = input_containers

print("best run. score:" + str( best_score ) )
print(*best_input, sep = ", ")
print_state( best_state )

x_axis = list( range( num_runs ) )
plt.plot(x_axis,scores)
plt.show()
    
