In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import copy
import random

import time

from multiprocessing import Pool

In [None]:
# Load the city table. Downstream code reads column 0 as the city id and
# columns 1 and 2 as that city's coordinates.
cities_df = pd.read_csv('../input/cities.csv')

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
cities_df.info()
cities_df.head()

In [None]:
def is_prime(n):
    """Return True when ``n`` is a prime number, False otherwise.

    Uses trial division by every integer from 2 up to and including the
    integer part of sqrt(n). Anything below 2 is reported as not prime.
    """
    if n < 2:
        return False
    largest_candidate = int(np.sqrt(n))
    # n is prime exactly when no candidate divides it evenly.
    return all(n % divisor != 0 for divisor in range(2, largest_candidate + 1))
    
# Ascending list of every prime below 197769 (presumably the number of rows in
# cities.csv -- TODO confirm). Kept as a list because calc_prime_loss hands it
# to np.isin.
prime_numbers = list(filter(is_prime, range(197769)))

In [None]:
def vectorised_distance(x1s, y1s, x2s, y2s):
    """Element-wise Euclidean distance between points (x1s, y1s) and (x2s, y2s).

    The arguments are combined with numpy arithmetic, so arrays of matching
    (or broadcastable) shape and plain scalars both work.
    """
    delta_x = x2s - x1s
    delta_y = y2s - y1s
    squared_length = np.square(delta_x) + np.square(delta_y)
    return np.abs(np.sqrt(squared_length))

# def calc_prime_loss(route, distance_losses):
#     prime_loss = 0.0
#     current_step = 10
#     for city in route[::10]:
#         if int(city) in prime_numbers:
#             try:
#                 prime_loss += distance_losses[current_step] * 0.1
#             except:
#                 # simple way to avoid index error
#                 pass
#         current_step += 10
#     return prime_loss

def calc_prime_loss(route, distance_losses, primes=None):
    """Extra length caused by the "every 10th step" prime rule.

    Steps are counted from 1, so step ``k`` is ``distance_losses[k - 1]`` and
    departs from ``route[k - 1]``. Every 10th step costs an additional 10% of
    its length unless the city it departs from has a prime id.

    The previous implementation overwrote every 10th entry with the constant
    0.1, charged a penalty on all the *other* steps based on the destination
    city, and raised IndexError whenever ``len(route) - 1`` was a multiple
    of 10.

    Parameters
    ----------
    route : sequence of int
        City ids in visiting order; expected to have one more entry than
        ``distance_losses``.
    distance_losses : array-like of float
        Length of each step, in route order. Not modified.
    primes : array-like of int, optional
        City ids that are exempt from the penalty. Defaults to the
        module-level ``prime_numbers``.

    Returns
    -------
    float
        Total penalty to add to the plain route length (0.0 when the route has
        fewer than 10 steps).
    """
    if primes is None:
        primes = prime_numbers

    step_lengths = np.asarray(distance_losses, dtype=float)
    route = np.asarray(route)

    # 0-based positions of steps 10, 20, 30, ...; the same position in `route`
    # is the city that step departs from.
    tenth_steps = np.arange(9, len(step_lengths), 10)
    if tenth_steps.size == 0:
        return 0.0

    departs_from_non_prime = ~np.isin(route[tenth_steps], primes)
    penalised_lengths = step_lengths[tenth_steps][departs_from_non_prime]
    return float(0.1 * np.sum(penalised_lengths))

def calc_loss(route, x1s, y1s, x2s, y2s):
    """Total score of a route: summed step lengths plus the prime penalty.

    ``x1s``/``y1s`` are the coordinates each step starts from and
    ``x2s``/``y2s`` the coordinates it ends at, in route order. The penalty is
    whatever ``calc_prime_loss`` returns for the route and those step lengths.
    """
    step_lengths = vectorised_distance(x1s, y1s, x2s, y2s)
    # Replace this with 0 to score by plain distance only while experimenting.
    penalty = calc_prime_loss(route, step_lengths)
    plain_length = np.sum(step_lengths)
    return plain_length + penalty

In [None]:
def score_route(route, cities):
    """Score ``route`` by looking up coordinates and delegating to ``calc_loss``.

    Parameters
    ----------
    route : sequence
        City ids in visiting order.
    cities : mapping
        Maps a city id to its ``(x, y)`` pair.

    Returns
    -------
    The value returned by ``calc_loss`` for the route's consecutive steps.
    """
    origin = route[0]
    steps = []  # one (x1, y1, x2, y2) tuple per consecutive pair of cities
    for target in route[1:]:
        from_x, from_y = cities[origin]
        to_x, to_y = cities[target]
        steps.append((from_x, from_y, to_x, to_y))
        origin = target

    # Split the per-step tuples into the four coordinate columns calc_loss expects.
    x1s, y1s, x2s, y2s = (
        np.array([step[column] for step in steps]) for column in range(4)
    )
    return calc_loss(route, x1s, y1s, x2s, y2s)

In [None]:
def create_route(start_id, cities, end_id):
    """Build a random route from ``start_id`` through the other cities to ``end_id``.

    Parameters
    ----------
    start_id, end_id :
        Ids placed at the first and last position of the route.
    cities : array-like
        Rows whose first column is the city id. The first row is skipped
        (assumed to be the start city -- TODO confirm for a non-default
        ``start_id``).

    Returns
    -------
    list
        ``[start_id, <ids of rows 1.. in random order>, end_id]``.
    """
    # `np.int` was deprecated in NumPy 1.20 and removed in 1.24, where it
    # raises AttributeError; the builtin `int` is the documented replacement.
    city_rows = np.array(cities, dtype=int)[1:]
    city_ids = [row[0] for row in city_rows]
    return [start_id, *np.random.permutation(city_ids), end_id]

In [None]:
def swap_mutate(route, n_swaps=1):
    """Return a copy of ``route`` with random pairs of interior cities swapped.

    The first and last entries (start and end of the tour) never move and the
    input route is left unmodified.

    The previous version drew random city *ids* from the route and then used
    them as list *indices*. That only worked by coincidence when ids were the
    contiguous range 1..N-1, copied a slice of the route twice per swap, and
    raised on routes with fewer than four entries.

    Parameters
    ----------
    route : sequence
        City ids in visiting order.
    n_swaps : int
        Number of swaps to perform (a swap may pick the same position twice,
        in which case it is a no-op).

    Returns
    -------
    A shallow copy of ``route`` with the swaps applied.
    """
    new_route = copy.copy(route)
    last_interior = len(new_route) - 2
    if last_interior < 1:
        # No interior city to move (route is just start/end or shorter).
        return new_route

    for _ in range(n_swaps):
        # Pick positions, not city ids, restricted to the interior of the route.
        loc_1 = random.randint(1, last_interior)
        loc_2 = random.randint(1, last_interior)
        new_route[loc_1], new_route[loc_2] = new_route[loc_2], new_route[loc_1]
    return new_route

In [None]:
class Route(object):
    """A candidate tour together with its lazily computed, cached score.

    ``path`` is the sequence of city ids. ``distance`` holds the cached score;
    a negative value marks it as not yet computed (or invalidated).
    """

    def __init__(self, path):
        self.distance = -1
        self.path = path

    def score(self, cities):
        """Return the score of this route, calling ``score_route`` only when no
        cached value is available."""
        needs_scoring = self.distance < 0
        if needs_scoring:
            self.distance = score_route(self.path, cities)
        return self.distance

    def swap_path(self, n_swaps):
        """Replace ``path`` with a swap-mutated version, invalidate the cached
        score and return ``self``."""
        mutated_path = swap_mutate(self.path, n_swaps)
        self.path, self.distance = mutated_path, -1
        return self

In [None]:
class GeneticReindeers(object):
    """Mutation-only genetic search over routes.

    Each epoch scores every route in the population, keeps the ``top_k``
    lowest-scoring routes unchanged, and refills the population with
    swap-mutated copies of randomly chosen kept routes.

    Parameters
    ----------
    pop_size : int
        Number of routes in the population.
    max_iter : int
        Number of epochs to run.
    top_k : int
        Number of best routes carried over unchanged each epoch.
    n_swaps : int
        Number of swaps passed to ``Route.swap_path`` for each mutated route.
    n_jobs : int
        Number of worker processes used for scoring; values of 1 or less
        score in the current process.

    After ``fit`` the attribute ``results`` holds the routes scored in the
    last epoch, best (lowest score) first.
    """

    def __init__(self, pop_size=10, max_iter=15, top_k=2, n_swaps=5, n_jobs=4):
        self.pop_size = pop_size
        self.max_iter = max_iter
        self.top_k = top_k
        self.n_swaps = n_swaps
        self.n_jobs = n_jobs

    def _score_population(self, population, city_lookup, pool=None):
        """Make sure every route in ``population`` carries its score; return them as a list."""
        if pool is None:
            for route in population:
                route.score(city_lookup)
        else:
            pending = [pool.apply_async(route.score, (city_lookup,)) for route in population]
            # A worker scores its own copy of the route and only sends the
            # number back, so it has to be stored on the parent's Route here.
            # (Previously the bare numbers were collected as "results" and the
            # later sort on `.distance` failed.)
            for route, job in zip(population, pending):
                route.distance = job.get()
        return list(population)

    def fit(self, cities, start_id=0, end_id=0):
        """Run the search and store the final ranking in ``self.results``.

        Parameters
        ----------
        cities : array-like
            Rows of ``(city_id, x, y)``.
        start_id, end_id :
            Ids placed at the first and last position of every route.

        Returns
        -------
        None. With ``max_iter == 0`` no scoring happens and ``self.results``
        is the unscored initial population.
        """
        fit_start = time.time()
        # cities is list of (city_id, x, y)
        print('creating initial population...')
        population = [Route(create_route(start_id, cities, end_id)) for _ in range(self.pop_size)]

        city_lookup = {int(x[0]): (x[1], x[2]) for x in np.array(cities)}
        prev_best = None
        # Defined up front so that max_iter == 0 no longer raises UnboundLocalError.
        results_sorted = list(population)

        pool = Pool(self.n_jobs) if self.n_jobs > 1 else None
        try:
            # loop through each epoch
            for epoch in range(self.max_iter):
                epoch_start = time.time()
                print('currently processing iteration {}...'.format(epoch+1))

                # scoring each route
                results = self._score_population(population, city_lookup, pool)

                print('  sorting results...')
                # sorting routes by score (smaller is better!)
                results_sorted = sorted(results, key=lambda x: x.distance)

                # keep top_k
                print('  keeping top {} routes...'.format(self.top_k))
                elites = results_sorted[0:self.top_k]
                new_pop = copy.copy(elites)

                # create swapped versions for remaining space in pop
                print('  creating mutated population ({} indivs)...'.format(self.pop_size - self.top_k))
                for _ in range(self.top_k, self.pop_size):
                    # Shallow copy is enough: swap_path rebinds `path` to a new
                    # sequence instead of editing the elite's one in place.
                    new_route = copy.copy(random.choice(elites))
                    new_route.swap_path(self.n_swaps)
                    new_pop.append(new_route)

                # print summary of epoch
                print('  finished epoch!')
                best_score = results_sorted[0].distance
                print('    best score:', best_score)
                if prev_best is not None:
                    print('    improvement since last epoch {}!'.format(prev_best - best_score))

                epoch_end = time.time()
                print('  epoch completed in {:.2f} seconds!'.format(epoch_end - epoch_start))

                prev_best = best_score
                population = new_pop
        finally:
            # Release the worker processes even when an epoch fails.
            if pool is not None:
                pool.close()
                pool.join()

        self.results = results_sorted
        fit_end = time.time()
        print('fitting completed in {:.2f} seconds!'.format(fit_end - fit_start))

In [None]:
# The search draws from both `random` and `np.random`; seed them so that
# Restart & Run All reproduces the same routes and scores.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# n_jobs=1 scores every route in this process (no worker pool).
model = GeneticReindeers(n_jobs=1)
model.fit(cities_df)

In [None]:
# model.results is sorted best-first, so the first entry is the winning route.
route = model.results[0]

print('genetic reindeer parameters:')
for label, setting in (
    ('  pop size:', model.pop_size),
    ('  max_iter:', model.max_iter),
    ('  top kept:', model.top_k),
):
    print(label, setting)
print('')
print('best score {} was acheived using route:'.format(route.distance))
print(route.path)

In [None]:
# Write one row per visited city. The column has to be filled from the list of
# city ids (`route.path`); assigning the Route object itself to an empty
# DataFrame produced a submission file with no rows.
# NOTE(review): the competition's sample submission may expect a different
# column header (e.g. 'Path') -- confirm before submitting.
sub_df = pd.DataFrame({'path': route.path})

sub_df.to_csv('./my_submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame as a sanity check.
sub_df.head()