core/pusher/pusher.go

// Package pusher provides protocol-orchestrating functionality
// over the pushsync protocol. It makes sure that chunks meant
// to be distributed over the network are sent using the
// pushsync protocol.
package pusher

import (
	"context"
	"errors"
	"fmt"
	"strconv"
	"sync"
	"time"

	"github.com/redesblock/mop/core/cluster"
	"github.com/redesblock/mop/core/crypto"
	"github.com/redesblock/mop/core/incentives/voucher"
	"github.com/redesblock/mop/core/log"
	"github.com/redesblock/mop/core/p2p/topology"
	"github.com/redesblock/mop/core/protocol/pushsync"
	"github.com/redesblock/mop/core/storer/storage"
	"github.com/redesblock/mop/core/tags"
	"github.com/redesblock/mop/core/tracer"
)

// loggerName is the tree path name of the logger for this package.
// New passes it to logger.WithName when it builds the service logger.
const loggerName = "pusher"

// Op describes a single push operation handed to the chunks worker.
//
// Chunk is the chunk to push. Err, when non-nil, receives the outcome of the
// push: the error on failure or nil on success. Direct is forwarded to
// pushChunk as its directUpload argument; ops created by the worker for
// chunks coming from the storer subscription always set it to false.
type Op struct {
	Chunk  cluster.Chunk
	Err    chan error
	Direct bool
}

// OpChan is a receive-only channel of push operations. Feeds registered
// through AddFeed are delivered to the chunks worker as values of this type.
type OpChan <-chan *Op

// Service pushes chunks to the network through the pushsync protocol and
// marks them as synced in the local store once a push is accepted.
//
// Notable fields:
//   - networkID is used when deriving the overlay address of the peer that
//     signed a receipt.
//   - storer provides the push subscription and is used to mark chunks synced.
//   - pushSyncer performs the actual push to the closest peer.
//   - validStamp validates a chunk's stamp before the chunk is sent out.
//   - depther supplies the neighborhood depth that receipts are checked against.
//   - tag holds the upload tags whose counters are incremented after a push.
//   - quit is closed by Close to signal every goroutine to stop.
//   - chunksWorkerQuitC is closed by chunksWorker when it returns.
//   - inflight tracks chunks handed out by the push subscription; entries are
//     removed by pushChunk.
//   - attempts is consulted by checkReceipt when a receipt is too shallow.
//   - sem bounds the number of concurrent pushes (capacity concurrentPushes).
//   - smugler carries feeds registered with AddFeed to the chunks worker.
type Service struct {
	networkID         uint64
	storer            storage.Storer
	pushSyncer        pushsync.PushSyncer
	validStamp        voucher.ValidStampFn
	depther           topology.NeighborhoodDepther
	logger            log.Logger
	tag               *tags.Tags
	metrics           metrics
	quit              chan struct{}
	chunksWorkerQuitC chan struct{}
	inflight          *inflight
	attempts          *attempts
	sem               chan struct{}
	smugler           chan OpChan
}

// Tunables of the pusher. They are declared as variables rather than
// constants, so they can be reassigned at runtime.
//
// NOTE(review): retryInterval and retryCount are not referenced in this file;
// retryCount is presumably the retry limit enforced by attempts.try — confirm
// against its definition.
var (
	retryInterval    = 5 * time.Second  // time interval between retries
	traceDuration    = 30 * time.Second // duration for every root tracer span
	concurrentPushes = 200              // how many chunks to push simultaneously
	retryCount       = 6
)

// Sentinel errors exported by the pusher. Compare against them with
// errors.Is rather than by matching the message text.
var (
	// ErrInvalidAddress is the sentinel error for an invalid address.
	ErrInvalidAddress = errors.New("invalid address")
	// ErrShallowReceipt is the sentinel error for a shallow receipt.
	ErrShallowReceipt = errors.New("shallow receipt")
)

// chunkStoreTimeout bounds the storer.Set call the chunks worker makes when
// it marks a chunk with an invalid stamp as synced.
const chunkStoreTimeout = 2 * time.Second

// New builds a pusher Service from the given dependencies and starts its
// chunks worker in a background goroutine. The worker waits for warmupTime
// (or for Close) before it begins pushing. Call Close to stop the worker.
func New(networkID uint64, storer storage.Storer, depther topology.NeighborhoodDepther, pushSyncer pushsync.PushSyncer, validStamp voucher.ValidStampFn, tagger *tags.Tags, logger log.Logger, tracer *tracer.Tracer, warmupTime time.Duration) *Service {
	svc := new(Service)

	// Injected dependencies.
	svc.networkID = networkID
	svc.storer = storer
	svc.pushSyncer = pushSyncer
	svc.validStamp = validStamp
	svc.depther = depther
	svc.tag = tagger
	svc.logger = logger.WithName(loggerName).Register()

	// Internal state.
	svc.metrics = newMetrics()
	svc.quit = make(chan struct{})
	svc.chunksWorkerQuitC = make(chan struct{})
	svc.inflight = newInflight()
	svc.attempts = &attempts{attempts: make(map[string]int)}
	svc.sem = make(chan struct{}, concurrentPushes)
	svc.smugler = make(chan OpChan)

	go svc.chunksWorker(warmupTime, tracer)
	return svc
}

// chunksWorker is a loop that keeps looking for chunks that are locally uploaded ( by monitoring pushIndex )
// and pushes them to the closest peer and get a receipt.
//
// After the warmup period it subscribes to the storer's push index, fans the
// subscribed chunks and any feeds registered through AddFeed into a single
// channel, and pushes every operation in its own goroutine, bounded by s.sem.
// The root tracing span is replaced every traceDuration. The worker returns
// when s.quit is closed; it first waits for in-flight pushes and then closes
// s.chunksWorkerQuitC.
func (s *Service) chunksWorker(warmupTime time.Duration, tracer *tracer.Tracer) {
	defer close(s.chunksWorkerQuitC)
	select {
	case <-time.After(warmupTime):
		s.logger.Info("pusher: warmup period complete, worker starting.")
	case <-s.quit:
		return
	}

	var (
		cctx, cancel      = context.WithCancel(context.Background())
		mtx               sync.Mutex // guards span, logger and ctx
		wg                sync.WaitGroup
		span, logger, ctx = tracer.StartSpanFromContext(cctx, "pusher-chainsync-batch", s.logger)
		timer             = time.NewTimer(traceDuration)
	)

	// inflight.set handles the backpressure for the maximum amount of inflight chunks
	// and duplicate handling.
	chunks, repeat, unsubscribe := s.storer.SubscribePush(ctx, s.inflight.set)
	go func() {
		<-s.quit
		unsubscribe()
		cancel()
	}()

	// ctxLogger returns a consistent snapshot of the current span context and
	// logger. Every goroutine other than the span rotator must go through it,
	// because the rotator replaces both under mtx.
	ctxLogger := func() (context.Context, log.Logger) {
		mtx.Lock()
		defer mtx.Unlock()
		return ctx, logger
	}

	push := func(op *Op) {
		s.metrics.TotalToPush.Inc()
		ctx, logger := ctxLogger()
		startTime := time.Now()
		wg.Add(1)
		go func() {
			defer func() {
				wg.Done()
				<-s.sem
			}()
			if err := s.pushChunk(ctx, op.Chunk, logger, op.Direct); err != nil {
				// warning: ugly flow control
				// if errc is set it means we are in a direct push,
				// we therefore communicate the error into the channel
				// otherwise we assume this is a buffered upload and
				// therefore we repeat().
				if op.Err != nil {
					op.Err <- err
				}
				repeat()
				s.metrics.TotalErrors.Inc()
				s.metrics.ErrorTime.Observe(time.Since(startTime).Seconds())
				// Derive the verbose logger from the snapshot instead of a
				// shared variable, so this read cannot race with span rotation.
				logger.V(1).Build().Debug("cannot push chunk", "chunk_address", op.Chunk.Address(), "error", err)
				return
			}
			if op.Err != nil {
				op.Err <- nil
			}
			s.metrics.TotalSynced.Inc()
		}()
	}

	// Span rotator: owns the timer and is the only writer of span, logger and
	// ctx after initialisation.
	go func() {
		defer timer.Stop()
		for {
			select {
			case <-s.quit:
				// finish the last span so it is not left open on shutdown
				mtx.Lock()
				span.Finish()
				mtx.Unlock()
				return
			case <-timer.C:
				// reset the span
				mtx.Lock()
				span.Finish()
				span, logger, ctx = tracer.StartSpanFromContext(cctx, "pusher-chainsync-batch", s.logger)
				mtx.Unlock()
				// Re-arm the timer so the span is rotated every traceDuration,
				// not just once. The channel was just drained, so Reset is safe.
				timer.Reset(traceDuration)
			}
		}
	}()

	// fan-in channel; it is never closed, receivers stop on s.quit
	cc := make(chan *Op)

	go func() {
		for ch := range chunks {
			// If the stamp is invalid, the chunk is not synced with the network
			// since other nodes would reject the chunk, so the chunk is marked as
			// synced which makes it available to the node but not to the network
			if err := s.valid(ch); err != nil {
				ctx, logger := ctxLogger()
				logger.Warning("stamp with is no longer valid, skipping syncing for chunk", "batch_id", fmt.Sprintf("%x", ch.Stamp().BatchID()), "chunk_address", ch.Address(), "error", err)

				setCtx, setCancel := context.WithTimeout(ctx, chunkStoreTimeout)

				if err = s.storer.Set(setCtx, storage.ModeSetSync, ch.Address()); err != nil {
					s.logger.Error(err, "set chainsync failed")
				}
				setCancel()

				// The chunk is not pushed, so pushChunk will never release its
				// inflight entry; release it here and move on to the next chunk.
				s.inflight.delete(ch)
				continue
			}

			// Do not block forever on the fan-in channel once the worker has quit.
			select {
			case cc <- &Op{Chunk: ch, Direct: false}:
			case <-s.quit:
				return
			}
		}
	}()

	defer wg.Wait()

	for {
		select {
		case apiC := <-s.smugler:
			// Forward every op of the registered feed into the fan-in channel
			// until the feed is closed or the service quits.
			go func() {
				for {
					select {
					case op, ok := <-apiC:
						if !ok {
							return
						}
						select {
						case cc <- op:
						case <-s.quit:
							return
						}
					case <-s.quit:
						return
					}
				}
			}()
		case op := <-cc:
			// acquire a push slot; it is released by the push goroutine
			select {
			case s.sem <- struct{}{}:
			case <-s.quit:
				return
			}

			push(op)
		case <-s.quit:
			return
		}
	}
}

// pushChunk pushes ch to its closest peer, validates the returned receipt and
// marks the chunk as synced in the local store.
//
// A topology.ErrWantSelf result is accepted for non-direct uploads, in which
// case the chunk simply stays on this node. For direct uploads, and for any
// other push error, the error is returned unchanged. The inflight entry of
// the chunk is removed on every return path. Tag counter failures are logged
// and never reported to the caller.
func (s *Service) pushChunk(ctx context.Context, ch cluster.Chunk, logger log.Logger, directUpload bool) error {
	loggerV1 := logger.V(1).Build()

	defer s.inflight.delete(ch)

	// selfIsClosest records that no peer took the chunk because this node is
	// the closest one; the "sent" tag counter is then bumped here so that it
	// does not diverge from the "synced" counter.
	selfIsClosest := false

	// Later when we process receipt, get the receipt and process it
	// for now ignoring the receipt and checking only for error
	receipt, pushErr := s.pushSyncer.PushChunkToClosest(ctx, ch)
	switch {
	case pushErr == nil:
		if err := s.checkReceipt(receipt); err != nil {
			return err
		}
	case directUpload || !errors.Is(pushErr, topology.ErrWantSelf):
		// Any error other than ErrWantSelf is fatal for this push. ErrWantSelf
		// on a direct upload is only hit on a full node: a light node never
		// includes self in the kademlia iterator.
		return pushErr
	default:
		// we are the closest ones - this is fine
		selfIsClosest = true
		loggerV1.Debug("chunk stays here, i'm the closest node", "chunk_address", ch.Address())
	}

	setCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
	defer cancel()
	if err := s.storer.Set(setCtx, storage.ModeSetSync, ch.Address()); err != nil {
		return fmt.Errorf("pusher: set chainsync: %w", err)
	}

	// for individual chunks uploaded using the /chunks api endpoint the tag
	// will be missing by default, unless the api consumer specifies one
	tagID := ch.TagID()
	if tagID > 0 {
		t, err := s.tag.Get(tagID)
		if err != nil || t == nil {
			return nil
		}
		if err := t.Inc(tags.StateSynced); err != nil {
			logger.Debug("increment synced failed", "error", err)
			return nil // tag error is non-fatal
		}
		if !selfIsClosest {
			return nil
		}
		if err := t.Inc(tags.StateSent); err != nil {
			logger.Debug("increment sent failed", "error", err)
		}
	}
	return nil
}

// checkReceipt validates a pushsync receipt. It recovers the signer's public
// key from the receipt signature, derives the signer's overlay address for
// this network and compares its proximity to the chunk address with the
// current neighborhood depth.
//
// It returns an error when the signature cannot be recovered, when the overlay
// address cannot be derived, or when the receipt is too shallow. A shallow
// receipt error wraps ErrShallowReceipt, so callers can detect it with
// errors.Is. On success the retry bookkeeping for the chunk is cleared.
func (s *Service) checkReceipt(receipt *pushsync.Receipt) error {
	loggerV1 := s.logger.V(1).Register()

	addr := receipt.Address
	publicKey, err := crypto.Recover(receipt.Signature, addr.Bytes())
	if err != nil {
		return fmt.Errorf("pusher: receipt recover: %w", err)
	}

	peer, err := crypto.NewOverlayAddress(*publicKey, s.networkID, receipt.Nonce)
	if err != nil {
		return fmt.Errorf("pusher: receipt storer address: %w", err)
	}

	po := cluster.Proximity(addr.Bytes(), peer.Bytes())
	d := s.depther.NeighborhoodDepth()

	// if the receipt po is out of depth AND the receipt has not yet hit the maximum retry limit, reject the receipt.
	if po < d && s.attempts.try(addr) {
		s.metrics.ShallowReceiptDepth.WithLabelValues(strconv.Itoa(int(po))).Inc()
		return fmt.Errorf("pusher: shallow receipt depth %d, want at least %d: %w", po, d, ErrShallowReceipt)
	}
	loggerV1.Debug("chunk pushed", "chunk_address", addr, "peer_address", peer, "proximity_order", po)
	s.metrics.ReceiptDepth.WithLabelValues(strconv.Itoa(int(po))).Inc()
	s.attempts.delete(addr)
	return nil
}

// valid reports, as an error, whether the chunk's stamp can no longer be
// used: the stamp is serialized and handed to the configured stamp validator.
// A nil result means the chunk may be sent out on the network.
func (s *Service) valid(ch cluster.Chunk) error {
	raw, marshalErr := ch.Stamp().MarshalBinary()
	if marshalErr != nil {
		return fmt.Errorf("pusher: valid stamp marshal: %w", marshalErr)
	}

	// Only the verdict matters here; the validator's first result is unused.
	if _, checkErr := s.validStamp(ch, raw); checkErr != nil {
		return fmt.Errorf("pusher: valid stamp: %w", checkErr)
	}
	return nil
}

// AddFeed registers c as an additional source of push operations. The
// hand-off to the chunks worker happens in a background goroutine, so AddFeed
// never blocks the caller. If the service is closed before the worker picks
// the feed up, the feed is dropped.
func (s *Service) AddFeed(c <-chan *Op) {
	handOff := func() {
		select {
		case <-s.quit:
			// if we're quitting: don't do anything
		case s.smugler <- c:
		}
	}
	go handOff()
}

func (s *Service) Close() error {
	s.logger.Info("pusher shutting down")
	close(s.quit)

	// Wait for chunks worker to finish
	select {
	case <-s.chunksWorkerQuitC:
	case <-time.After(6 * time.Second):
	}
	return nil
}