This notebook implements a heuristic to choose the best candidates where wildcards should be used. For an intelligent way to apply the wildcards after selecting the best, visit [this notebook](https://www.kaggle.com/miguelgonzalez2/santa-2021-wildcard-insertion-lkh). For a complete pipeline where best pairs of wildcards are selected and applied to the string to reduce its length, please visit [this notebook](https://www.kaggle.com/atmguille/kaggle-2021-global-optimizer).

Set the input string, the position of the wildcard inside a permutation, the minimum distance from the previous permutation for a mandatory permutation to be considered a candidate, and the maximum number of top-ranked candidates used to build the pairs of wildcards.

In [None]:
str1="12635472163547261354726315472635147263541726354123576412357461235476123546721354672315467235146723541672354617235461273546127364512763452176345271634527613452763145276341527634125376421537642513764253176425371642537614253761245376123457621345762314576234157623451762345716234571246357124657321465732416573246157324651732465713246571234675213467523146752341675234617523467152346712376451236745125367412567341256347215634725163472561347256314725634172563412576341276534217653427165342761534276513427653142765312465371267534126735421673542617354267135426731542673514267351246753126573421657342615734265173426571342657314265731247563125743612547632154763251476325417632547163254761325476124536721453672415367245136724531672453617245361274635127536421753642715364275136427531642753614275361245736125473621547362514736254173625471362547316254731256473125674312546731264375124367521436752413675243167524361752436715243671254367215436725143672541367254316725436172543612754361234567123567412734562173456271345627314562734156273451627345126734512765432176543271654327615432765143276541327654123675412365472136547231654723615472365147236541723654127346512563741237456123754612375642137564231756423715642375164237561423756124736521473652417365247136524731652473615247361243576124376512347651264735126547321654732615473265147326541732654713265471236574213657423165742361574236517423657142365712436572143657241365724316572436157243651724365127436512643572164357261435726413572643157264351726435127643521764352716435276143527641352764315276431257463125734612564371253647126745321674532617453267145326741532674513267451236475126357412637541267543124576312456372145637241563724516372456137245631724563127365412376542137654231765423716542376154237651423765123746512476531243756124567312645731246735126475321647532614753264175326471532647513264751263475126743512654372165437261543726514372654137265431726543126537412735641253746125347612435671234756123465721346572314657234165723461572346517234651257364215736425
17364257136425731642573614257361254637125437612653471264537216453726145372641537264513726453172645312763542176354271635427613542763154276351427635124763521476352417635247163524761352476315247631275643127563421756342715634275163427561342756314275631275346127546321754632715463275146327541632754613275461263745127645312534671235647124735612463751236457126345712657431257643127435612746531274563127453612475362147536241753624715362475136247531624753"
# Index inside a 7-symbol permutation at which the wildcard symbol (8) is written.
WILDCARD_POS = 0
# A mandatory permutation is kept as a wildcard candidate only when its distance
# from the previous permutation in the string is at least this value.
MIN_COSTLY_DISTANCE = 4
# Number of top-ranked candidates from which all wildcard pairs are built.
MAX_ELEMENTS_BRUTE_FORCE = 10

In [None]:
import itertools
# Every permutation of the symbols 1..7.
all_perms = set(itertools.permutations(range(1, 8)))
# Permutations that start with 1, 2.
mandatory_perms = {(1, 2) + tail for tail in itertools.permutations(range(3, 8))}
# Everything that is not mandatory.
non_mandatory_perms = all_perms - mandatory_perms
# Three copies of the mandatory permutations followed by the non-mandatory ones.
ctsp_perms = 3 * list(mandatory_perms) + list(non_mandatory_perms)

# Aux functions

In [None]:
def perm_dist_no_wildcards(p, q, wildcard=False):
    """
    Computes overlapping distance between two lists of integer
    between 1 and 7. 8 is wildcard
    """
    p = list(p)
    q = list(q)
        
    if p==q:
        return 0
    
    # Nope
    if 8 in q and 8 in p:
        # Dist between 1238567 and 1278543 or reverse
        return 7
    
    if 8 in q:
        min_dist = 8
        for i in range(1,8):
            q2 = list(q)
            q2[q2.index(8)] = i
            for j in range(1,8):
                if p[j:]==q2[:-j]:
                    if min_dist > j:
                        min_dist = j
                        break
        return min_dist
                
    if 8 in p:
        min_dist = 8
        for i in range(1,8):
            p2 = list(p)
            p2[p2.index(8)] = i
            for j in range(1,8):
                if p2[j:]==q[:-j]:
                    if min_dist > j:
                        min_dist = j
                        break
        return min_dist
            
    i = p.index(q[0])
    return i if p[i:] == q[:7-i] else 7


def perm_dist(p, q, string_number, use_wildcards=True):
    """
    Computes overlapping distance between two lists of integer
    between 1 and 7. 8 is wildcard
    """
    p = list(p)
    q = list(q)
    
    
    if p==q:
        return 0
    
    if use_wildcards and wildcards[string_number]:
        # Apply wildcards
        for j in range(2):
            if p == list(wildcards[string_number][j]):
                p[WILDCARD_POS] = 8
            if q == list(wildcards[string_number][j]):
                q[WILDCARD_POS] = 8    
    
    # Nope
    if 8 in q and 8 in p:
        return 7
    
    if 8 in q:
        min_dist = 8
        for i in range(1,8):
            q2 = list(q)
            q2[q2.index(8)] = i
            for j in range(1,8):
                if p[j:]==q2[:-j]:
                    if min_dist > j:
                        min_dist = j
                        break
        return min_dist
                
    if 8 in p:
        min_dist = 8
        for i in range(1,8):
            p2 = list(p)
            p2[p2.index(8)] = i
            for j in range(1,8):
                if p2[j:]==q[:-j]:
                    if min_dist > j:
                        min_dist = j
                        break
        return min_dist
            
    i = p.index(q[0])
    return i if p[i:] == q[:7-i] else 7

def perms_to_string(perms, string_number, use_wildcards=True):
    """
    Merge an ordered collection of permutations into one symbol list.

    Consecutive entries are overlapped as far as `perm_dist` allows, so only
    the non-overlapping tail of each entry is appended. When `use_wildcards`
    is true and the entry just appended equals one of the first two
    permutations in `wildcards[string_number]`, the symbol `7 - WILDCARD_POS`
    places from the end of the output is overwritten with 8.
    """
    sequence = list(perms)
    out = list(sequence[0])
    for idx in range(1, len(sequence)):
        prev, cur = sequence[idx - 1], sequence[idx]
        cost = perm_dist(prev[-7:], cur[:7], string_number, use_wildcards)
        out += cur[7 - cost:]
        if use_wildcards:
            pair = wildcards[string_number]
            if cur == pair[0] or cur == pair[1]:
                out[-(7-WILDCARD_POS)] = 8
    return out

def distances_matrix(perms, string_number, depot=False, use_wildcards=True):
    """
    Build the int8 cost matrix between all pairs of entries in `perms`.

    The cost of going from entry `p` to entry `q` is the overlap distance
    between the last 7 symbols of `p` and the first 7 of `q`, plus the
    number of symbols of `q` beyond 7. With `depot` set, an extra node is
    placed at index 0 whose row and column are all zeros.
    """
    shift = 1 if depot else 0
    size = len(perms) + shift
    m = np.zeros((size, size), dtype='int8')
    for row, p in enumerate(perms, start=shift):
        tail = p[-7:]
        for col, q in enumerate(perms, start=shift):
            m[row, col] = perm_dist(tail, q[:7], string_number, use_wildcards) + len(q) - 7
    # With a depot, row 0 and column 0 are never written and stay at zero.
    return m

def distances_matrix_ctsp(perms, depot=False):
    """
    Build the int8 cost matrix for the combined three-string instance.

    Costs are computed as in the plain distance matrix, but the string
    number handed to `perm_dist` depends on the indices involved: indices
    0..119 select string 0 and 120..239 select string 1. The row index
    decides first; if it falls outside both ranges the column index
    decides; if neither does, string 2 is used. With `depot` set, an extra
    all-zero node is placed at index 0.
    """
    def string_of(index):
        # String selected by an index, or None if the index selects nothing.
        if 0 <= index < 120:
            return 0
        if 120 <= index < 240:
            return 1
        return None

    offset = 1 if depot else 0
    size = len(perms) + offset
    m = np.zeros((size, size), dtype='int8')
    for i, p in enumerate(perms):
        row_string = string_of(i)
        for j, q in enumerate(perms):
            string_number = row_string
            if string_number is None:
                string_number = string_of(j)
            if string_number is None:
                string_number = 2
            m[i + offset, j + offset] = perm_dist(p[-7:], q[:7], string_number) + len(q) - 7
    # With a depot, row 0 and column 0 are never written and stay at zero.
    return m

def sym_distances_matrix(matrix, constant=True):
    """
    Given an N x N distance matrix for ATSP, obtains a (2N+1) x (2N+1)
    integer matrix for SymTSP: a depot node at index 0 (zero cost to and
    from everything) followed by N nodes and N "virtual" nodes, the latter
    to be omitted from the resulting circuit.
    paper: http://home.eng.iastate.edu/~rkumar/PUBS/atsp.pdf

    Parameters:
        matrix: square array-like of distances. It is not modified.
        constant: when true, a constant is added to every off-diagonal
            entry if max/min >= 4/3 (the diagonal is treated as 0, and the
            minimum is taken with the diagonal replaced by 100).

    Returns:
        A 2-D integer array; entries are rounded to the nearest integer.
    """
    # Work on a float copy: the caller's matrix keeps its diagonal, and the
    # constant shift cannot overflow a narrow integer dtype such as int8.
    base = np.array(matrix, dtype=float)
    n = base.shape[0]

    if constant:
        # Obtain D prime
        d_max = np.max(base)
        np.fill_diagonal(base, 100)  # keep the diagonal out of the minimum
        d_min = np.min(base)
        np.fill_diagonal(base, 0)
        if d_max / d_min < 4/3:
            Dprime = base
        else:
            Dprime = base + 3*d_max - 4*d_min + 1
            np.fill_diagonal(Dprime, 0)
    else:
        Dprime = base

    # Obtain D bar: 100 inside each half, D prime / its transpose across them.
    Dbar = np.empty((2*n, 2*n))
    Dbar[:n, :n] = 100
    Dbar[n:, :n] = Dprime
    Dbar[:n, n:] = np.transpose(Dprime)
    Dbar[n:, n:] = 100

    # Add depot
    Dbar2 = np.zeros((2*n + 1, 2*n + 1))
    Dbar2[1:, 1:] = Dbar

    # np.round returns a new array, so its result has to be used.
    return np.round(Dbar2, 0).astype(int)

def find_remaining(l):
    """
    Return the one symbol of 1..7 that does not occur in `l`,
    which is expected to hold 6 different numbers from 1 to 7.
    """
    absent = set(range(1, 8)).difference(l)
    return tuple(absent)[0]

def reduce_perms(perms, factor=2, exclude_mandatory=True, seed=397):
    """
    Given a collection of permutations and a reduction factor, returns a
    set of "glued permutations" with smaller size.

    The permutations are shuffled deterministically with `seed` and visited
    in that order. Each one not covered yet is extended up to `factor - 1`
    times with the single symbol missing from its last 6 symbols, so every
    new 7-symbol window is the previous window rotated by one. Extension
    stops as soon as the new window was already covered.

    With `exclude_mandatory`, permutations starting with 1, 2 are returned
    on their own and are never created as a window of a glued entry.

    Parameters:
        perms: iterable of 7-tuples over the symbols 1..7.
        factor: maximum number of permutations glued into one entry.
        exclude_mandatory: keep permutations starting with 1, 2 unglued.
        seed: seed of the shuffle.

    Returns:
        A set of tuples, each 7 to `7 + factor - 1` symbols long.
    """
    # Local import so the function does not depend on another cell having
    # imported the module.
    import random

    symbols = frozenset(range(1, 8))

    def starts_mandatory(window):
        # Mandatory permutations are the ones that begin with 1, 2.
        return window[0] == 1 and window[1] == 2

    seen = set()
    result = set()
    perms = list(perms)
    random.Random(seed).shuffle(perms)
    for perm in perms:
        if exclude_mandatory and starts_mandatory(perm):
            seen.add(perm)
            result.add(perm)
        if perm in seen:
            continue
        seen.add(perm)
        glued = list(perm)
        for _ in range(factor - 1):
            # Exactly one symbol is missing from the last 6 of a valid
            # permutation window; min() just picks it out of the set.
            glued.append(min(symbols.difference(glued[-6:])))
            window = tuple(glued[-7:])
            if window in seen or (exclude_mandatory and starts_mandatory(window)):
                glued.pop()
                break
            # Record the window that was just created, whatever the
            # current length of the glued entry is.
            seen.add(window)
        result.add(tuple(glued))
    return result

def write_params_file(name="mtsp"):
    """
    Write the solver parameter file `<name>.par`.

    The file points the solver at `<name>.mtsp` as problem and `<name>.txt`
    as both tour output and initial tour, followed by fixed search settings
    and the module-level `TIME_LIMIT` (seconds).
    """
    with open(f'{name}.par', 'w') as f:
        def emit(line):
            print(line, file=f)

        emit(f'PROBLEM_FILE = {name}.mtsp')
        emit(f'TOUR_FILE = {name}.txt')
        emit(f'INITIAL_TOUR_FILE = {name}.txt')
        # The 'INITIAL_TOUR_ALGORITHM = MTSP' setting is left disabled.
        emit('PATCHING_C = 4')
        emit('PATCHING_A = 3')
        emit('GAIN23 = YES')
        emit('SEED = 69')
        emit('MAX_TRIALS = 100000')
        emit(f'TIME_LIMIT = {TIME_LIMIT}')  # seconds
        emit('TRACE_LEVEL = 1')


def write_problem_file(distances,name="mtsp"):
    """
    Write the explicit full-matrix ATSP problem file `<name>.mtsp`.

    `distances` is a sequence of rows; the dimension written is its length
    and each row becomes one line of space-separated values.
    """
    with open(f'{name}.mtsp', 'w') as f:
        def emit(line):
            print(line, file=f)

        emit('TYPE: ATSP')
        emit(f'DIMENSION: {len(distances)}')
        emit('EDGE_WEIGHT_TYPE: EXPLICIT')
        # The embedded newline leaves a blank line before the weights header.
        emit('EDGE_WEIGHT_FORMAT: FULL_MATRIX\n')
        emit('EDGE_WEIGHT_SECTION')
        for row in distances:
            emit(' '.join(map(str, row)))
            
def write_initial_tour_file(perms):
    """
    Write `initial_tour.txt` in the working directory: a TOUR_SECTION header
    followed by one line listing the node numbers 1..len(perms) in
    increasing order, terminated by -1.
    """
    with open('initial_tour.txt', 'w') as f:
        print('TOUR_SECTION', file=f)
        node_numbers = map(str, range(1, len(perms) + 1))
        tour_line = ' '.join(node_numbers)
        print(tour_line, -1, file=f)

def read_output_tour(perms,name="best_tour"):
    """
    Read the tour stored in `<name>.txt` and return the visited entries.

    The first node after the TOUR_SECTION line and the last two lines of
    the file are skipped. Every remaining node number `t` is mapped to
    `perms[t - 2]`.
    """
    lookup = list(perms)
    with open(f'{name}.txt') as f:
        content = f.readlines()
    first = content.index('TOUR_SECTION\n') + 2
    return [lookup[int(entry) - 2] for entry in content[first:-2]]

def read_output_tour_mtsp(perms, filename):
    """
    Read a three-route tour file and return a 3-tuple of lists of entries.

    Node numbers 1, `len(perms) + 2` and `len(perms) + 3` mark the start of
    the next route. Any other node number `t` appends `perms[t - 2]` to the
    current route. The last two lines of the file are ignored.
    """
    lookup = list(perms)
    routes = ([], [], [])
    dimension = 1 + len(lookup)
    route_starts = (1, dimension + 1, dimension + 2)
    with open(filename) as f:
        content = f.readlines()
    begin = content.index('TOUR_SECTION\n') + 1
    current = -1
    for raw in content[begin:-2]:
        tag = int(raw)
        if tag in route_starts:
            current += 1
            continue
        routes[current].append(lookup[tag - 2])  # -2 skips the depot numbering
    return routes

def read_output_tour_ctsp(perms, filename, flip=False):
    """
    Read a three-route tour over a doubled node set and return a 3-tuple
    of lists of entries.

    With `n = len(perms)`, node numbers 1, `2n + 2` and `2n + 3` mark the
    start of the next route. Any other node number `t >= 2` stands for
    `perms[(t - 2) % n]`, so `t` and `t + n` name the same entry; an entry
    is appended to the current route only the first time it shows up there.
    Nodes listed before the first route marker end up in the last route,
    because the route index starts at -1. The last two lines of the file
    are ignored.

    Parameters:
        perms: the entries the node numbers refer to.
        filename: path of the tour file.
        flip: accepted for compatibility; not used.

    Returns:
        A tuple of three lists of entries.
    """
    perms = list(perms)
    n = len(perms)
    tour = [[], [], []]
    dimension = 1 + 2 * n
    route_starts = (1, dimension + 1, dimension + 2)
    with open(filename) as f:
        lines = f.readlines()
    i = -1
    for node_tag in lines[lines.index('TOUR_SECTION\n') + 1:-2]:
        tag = int(node_tag)
        if tag in route_starts:
            i += 1
        elif tag >= 2:
            perm = perms[(tag - 2) % n]  # -2 skips the depot numbering
            if perm not in tour[i]:
                tour[i].append(perm)
    # The former real/virtual adjacency check never produced any output (its
    # message sat behind a `continue`) and could only fail with a TypeError
    # when a virtual node came before any real one, so it is gone.
    return tuple(tour)
    
# Runs the LKH executable found in the working directory on the parameter
# file `name`, appending its output to lkh.log. The body uses IPython shell
# escapes (`!`), so it only works inside a notebook/IPython session.
# `perms` is not used by the active code and nothing is returned; reading
# the resulting tour back is left to the caller.
def solve_atsp(perms, name="santa.par"):
    
    # Run LKH-3 to solve ATSP instance
    # Make sure the log file exists before appending to it.
    !touch lkh.log
    # `$name` is expanded by IPython to the value of the `name` argument.
    !./LKH $name >> lkh.log
    # Disabled: read the tour back and turn it into a string.
    #tour = read_output_tour(perms)
    #return perms_to_string(tour)
def check_validity(str1, str2, str3):
    """
    Check that three strings form a valid solution.

    Each string must contain every permutation of 1..7 that starts with
    1, 2, and together the strings must contain every permutation of 1..7.
    The first violation found is printed and False is returned; True is
    returned when there is none.
    """
    every_perm = set(itertools.permutations(range(1, 8), 7))
    required = {(1, 2) + rest for rest in itertools.permutations(range(3, 8), 5)}

    found_per_string = [perms_in_string(s) for s in (str1, str2, str3)]
    for i, found in enumerate(found_per_string):
        absent = required - found
        if absent:
            print(f'String #{i} is missing {absent}.')
            return False

    uncovered = every_perm - set.union(*found_per_string)
    if uncovered:
        print(f"missing:{len(uncovered)}")
        print(f'Strings are missing {uncovered}.')
        return False
    return True

def perms_in_string_list(string_as_list):
    """
    List, in order of appearance and with repetitions, the permutations
    found in every 7-symbol window of `string_as_list`.

    A window counts when its 7 symbols are all different. If one of them
    is the wildcard 8, the window contributes every sequence obtained by
    replacing the 8 with a symbol of 1..7 that keeps all 7 symbols
    different.
    """
    found = []
    for start in range(len(string_as_list)):
        window = tuple(string_as_list[start:start + 7])
        if len(set(window)) != 7:
            continue
        if 8 not in window:
            found.append(window)
            continue
        # Seven distinct symbols, so the wildcard occurs exactly once here.
        hole = window.index(8)
        for digit in range(1, 8):
            candidate = window[:hole] + (digit,) + window[hole + 1:]
            if len(set(candidate)) == 7:
                found.append(candidate)
    return found

def perms_in_string(string_as_list):
    """
    Return the set of permutations found in the 7-symbol windows of
    `string_as_list`.

    A window counts when its 7 symbols are all different. If one of them
    is the wildcard 8, the window contributes every sequence obtained by
    replacing the 8 with a symbol of 1..7 that keeps all 7 symbols
    different.
    """
    unique = set()
    total = len(string_as_list)
    for offset in range(total):
        chunk = tuple(string_as_list[offset:offset + 7])
        if len(set(chunk)) != 7:
            continue
        if 8 in chunk:
            # Seven distinct symbols, so the wildcard occurs exactly once.
            gap = chunk.index(8)
            for digit in range(1, 8):
                patched = chunk[:gap] + (digit,) + chunk[gap + 1:]
                if len(set(patched)) == 7:
                    unique.add(patched)
        else:
            unique.add(chunk)
    return unique
def concat_perms(perms):
    """Flatten an iterable of sequences into one list, keeping the order."""
    return [symbol for perm in perms for symbol in perm]

# Compute wildcards

I designed a heuristic that ordered candidates based on the following principles:

1. *distance_from_prev*: current distance from the previous permutation in the current string.
2. *candidates_min_distance*: minimum, over all the permutations in the string, of the distance to the mandatory permutation once the wildcard has been applied to it.
3. *n_candidates_not_already_before_mandatory*: number of candidates that could go before the mandatory permutation with distance=*candidates_min_distance* that are not already before another mandatory permutation.
4. *candidates_not_already_avg_distances*: average distance to the next permutation in their current place at the string of the candidates of the previous point.
5. *n_candidates*: number of candidates that could go before the mandatory permutation with distance=*candidates_min_distance*.
6. *candidates_avg_distances*: average distance to the next permutation in their current place at the string of the candidates of the previous point.

The reasoning behind these points, which are ordered by importance, is the following. Maximizing "1" is the natural greedy approach, hoping that the distance from the previous node can be changed to 1 with a wildcard. Apart from trying to apply the wildcard to a permutation that is adding a lot of distance, we have to take the candidates that could help reduce this distance into account. Therefore, "2" should be ideally 1, so there is at least a candidate in the string with distance 1 to the mandatory permutation. However, we do not want to solve a problem to create another one somewhere else in the string. Thus, the rest of the points in the heuristic take into account how many candidates are available and what is their current distance to their next permutation. Special attention is paid to those candidates that are not being used before another mandatory permutation.

In [None]:
def get_distances_from_prev_node(perms, str_id):
    """
    For every permutation in `perms`, return its distance (wildcards
    ignored) from the permutation right before it. The first entry has no
    predecessor and gets -inf.
    """
    distances = [-float('inf')]
    for previous, current in zip(perms, perms[1:]):
        distances.append(perm_dist(previous, current, str_id, use_wildcards=False))
    return distances

class CostlyMandatory:
    """
    A mandatory permutation that is expensive to reach, together with the
    figures used to rank it as a wildcard candidate.

    Instances order themselves with `<` so that sorting puts the most
    promising one first: larger distance from the previous node, then
    smaller minimum candidate distance, then more candidates not already
    before a mandatory permutation, then larger average distance of those,
    then more candidates overall, then larger average distance overall.
    """

    def __init__(self, perm, distance_from_prev, candidates_min_distance, n_candidates, n_candidates_not_already_before_mandatory, candidates_avg_distances, candidates_not_already_avg_distances):
        self.perm = perm
        # Distance from the previous node in the string.
        self.distance_from_prev = distance_from_prev
        # Smallest distance of any candidate to this permutation.
        self.candidates_min_distance = candidates_min_distance
        self.n_candidates = n_candidates
        self.n_candidates_not_already_before_mandatory = n_candidates_not_already_before_mandatory
        self.candidates_avg_distances = candidates_avg_distances
        self.candidates_not_already_avg_distances = candidates_not_already_avg_distances

    def __lt__(self, other):
        def priority(item):
            # Negated fields are "bigger is better"; tuples compare
            # field by field, so the order below is the order of importance.
            return (
                -item.distance_from_prev,
                item.candidates_min_distance,
                -item.n_candidates_not_already_before_mandatory,
                -item.candidates_not_already_avg_distances,
                -item.n_candidates,
                -item.candidates_avg_distances,
            )

        return priority(self) < priority(other)

def choose_wildcards(string, str_id, verbose=False):
    """
    Rank the costly mandatory permutations of a string and return every
    pair of the best ones as wildcard candidates.

    A mandatory permutation is "costly" when its distance from the previous
    permutation in the string is at least `MIN_COSTLY_DISTANCE`. For each
    of them the wildcard is written at `WILDCARD_POS`, and the permutations
    of the string closest to that wildcarded version become its candidates.
    The costly permutations are then sorted with the `CostlyMandatory`
    ordering, the first `MAX_ELEMENTS_BRUTE_FORCE` are kept, and all pairs
    of those are returned.

    Parameters:
        string: the string as a list of integer symbols.
        str_id: string number passed on to `perm_dist`.
        verbose: print the intermediate figures.

    Returns:
        A list of 2-tuples of permutations.
    """
    perms = perms_in_string_list(string)
    distances_from_prev = get_distances_from_prev_node(perms, str_id)
    last_index = len(perms) - 1

    def followed_by_mandatory(index):
        # The last permutation has no successor, so it is never "before"
        # a mandatory one.
        return index < last_index and perms[index + 1] in mandatory_perms

    def avg_prev_next(indices):
        # Average of the distances to the previous and to the next
        # permutation over the given positions (0 for no positions).
        # The last permutation has no next one and contributes 0 for it.
        # NOTE: position 0 carries -inf as its previous distance, which
        # makes the whole average -inf, as it always did.
        if not indices:
            return 0
        total = sum(
            distances_from_prev[k]
            + (distances_from_prev[k + 1] if k < last_index else 0)
            for k in indices
        )
        return total / (2 * len(indices))

    costly_mandatory_perms = [
        (perm, dist)
        for perm, dist in zip(perms, distances_from_prev)
        if perm in mandatory_perms and dist >= MIN_COSTLY_DISTANCE
    ]

    if verbose:
        print(f'Total costly mandatory perms: {len(costly_mandatory_perms)}')
        print(f'Costly mandatory perms: {costly_mandatory_perms}')

    costly_mandatory_ordered = []
    for perm, distance_from_prev in costly_mandatory_perms:
        # Add wildcard to desired position
        perm_with_wildcard = list(perm)
        perm_with_wildcard[WILDCARD_POS] = 8
        distances = [
            perm_dist(other, perm_with_wildcard, str_id, use_wildcards=False)
            for other in perms
        ]
        candidates_min_distance = min(distances)
        # Positions whose permutation could precede `perm` at the minimum distance
        candidates = [k for k, d in enumerate(distances) if d == candidates_min_distance]
        candidates_not_already_before_mandatory = [
            k for k in candidates if not followed_by_mandatory(k)
        ]
        candidates_avg_distances = avg_prev_next(candidates)
        candidates_not_already_avg_distances = avg_prev_next(candidates_not_already_before_mandatory)
        if verbose:
            print(f'\t{perm} has {len(candidates)} candidates with dist={candidates_min_distance} (avg_dist_prevnext={candidates_avg_distances}), {len(candidates_not_already_before_mandatory)} not already before other mandatory (avg_dist_prevnext={candidates_not_already_avg_distances})')

        costly_mandatory_ordered.append(CostlyMandatory(
            perm,
            distance_from_prev,
            candidates_min_distance,
            len(candidates),
            len(candidates_not_already_before_mandatory),
            candidates_avg_distances,
            candidates_not_already_avg_distances,
        ))

    costly_mandatory_ordered.sort()
    best = [costly.perm for costly in costly_mandatory_ordered[:MAX_ELEMENTS_BRUTE_FORCE]]
    wildcards = list(itertools.combinations(best, 2))
    if verbose:
        print(f'Chosen wildcards pairs: {wildcards}')

    return wildcards

In [None]:
# Turn the digit string into a list of integer symbols.
str1 = list(map(int, str1))

# Rank the costly mandatory permutations of the first string and build the pairs.
wildcards_pairs = choose_wildcards(str1, str_id=0, verbose=True)

Add submission file to output

In [None]:
import pandas as pd
# Copy the existing submission from the input dataset into the notebook output
# unchanged; nothing computed above is written into it.
submission = pd.read_csv('../input/santa-2021-best-submission/submission.csv')
submission.to_csv('submission.csv', index=False)
# Last expression of the cell: shows the first rows in the notebook.
submission.head()