This notebook shows how to build and run a clustering-based Traveling Santa solver: the cities are split into clusters, and Concorde (linkern) and LKH are run on each cluster to find the best path within that cluster.

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sympy

# Build Concorde

In [None]:
%%bash -e
# Build Concorde's `linkern` binary once; the whole step is skipped when
# ./linkern already exists in the working directory.
if ! [[ -f ./linkern ]]; then
  # Fetch the Concorde source tarball.
  wget http://www.math.uwaterloo.ca/tsp/concorde/downloads/codes/src/co031219.tgz
  # Check the download against a pinned SHA-256 before unpacking it.
  echo 'c3650a59c8d57e0a00e81c1288b994a99c5aa03e5d96a314834c2d8f9505c724  co031219.tgz' | sha256sum -c
  tar xf co031219.tgz
  # Configure the source tree with aggressive, host-specific optimisation flags.
  (cd concorde && CFLAGS='-Ofast -march=native -mtune=native -fPIC' ./configure)
  # Run make only inside LINKERN and copy the resulting binary next to the notebook.
  (cd concorde/LINKERN && make -j && cp linkern ../../)
  # Remove the source tree and the tarball; only ./linkern is kept.
  rm -rf concorde co031219.tgz
fi

# Build LKH

In [None]:
# %%bash -e
# wget http://akira.ruc.dk/~keld/research/LKH/LKH-2.0.9.tgz
# tar xvfz LKH-2.0.9.tgz
# cd LKH-2.0.9
# make

# Write TSP for Concorde

In [None]:
def write_tsp(cities, filename, dim, name='traveling-santa-2018-prime-paths'):
    """Write `cities` to `filename` as a TSPLIB-style EUC_2D problem file.

    Parameters
    ----------
    cities : DataFrame
        Must expose `idx`, `X` and `Y` columns; one node line
        (`idx X Y`, coordinates with 11 decimals) is written per row.
    filename : str
        Path of the file to create (opened in write mode, so it is overwritten).
    dim : int
        Value written in the DIMENSION header line.
    name : str
        Text used for both the NAME and the COMMENT header lines.
    """
    with open(filename, 'w') as out:
        # Header section.
        out.writelines([
            'NAME : %s\n' % name,
            'COMMENT : %s\n' % name,
            'TYPE : TSP\n',
            'DIMENSION : %d\n' % dim,
            'EDGE_WEIGHT_TYPE : EUC_2D\n',
            'NODE_COORD_SECTION\n',
        ])
        # One line per city, in frame order.
        out.writelines(
            '%d %.11f %.11f\n' % (node.idx, node.X, node.Y)
            for node in cities.itertuples()
        )
        out.write('EOF\n')

# Write TSP for LKH

In [None]:
# def write_tsp1(cities, filename,dim, name='traveling-santa-2018-prime-paths'):
#     with open("../working/LKH-2.0.9/{0}".format(filename), 'w') as f:
#         f.write('NAME : %s\n' % name)
#         f.write('COMMENT : %s\n' % name)
#         f.write('TYPE : TSP\n')
#         f.write('DIMENSION : %d\n' % dim)
#         f.write('EDGE_WEIGHT_TYPE : EUC_2D\n')
#         f.write('NODE_COORD_SECTION\n')
#         for row in cities.itertuples():
#             f.write('%d %.11f %.11f\n' % (row.idx, row.X, row.Y))
#         f.write('EOF\n')

In [None]:
# def write_parameters(filename):
#     parameters = [
#     ("PROBLEM_FILE", "{0}.tsp\n".format(filename)),
#     ("OUTPUT_TOUR_FILE", "{0}_sol.csv\n".format(filename)),
#     ("SEED", 2018),
#     ('CANDIDATE_SET_TYPE', 'POPMUSIC'), #'NEAREST-NEIGHBOR', 'ALPHA'),
#     ('INITIAL_PERIOD', 1000),
#     ('MAX_TRIALS', 1000),
#     ]
#     with open("../working/LKH-2.0.9/{0}.par".format(filename), 'w') as f:
#         for param, value in parameters:
#             f.write("{} = {}\n".format(param, value))
#     #print("Parameters saved as", filename)

In [None]:
# Load the city table and add a 1-based `idx` column (row index + 1);
# `idx` is the node number that `write_tsp` emits.
cities = pd.read_csv('../input/cities.csv').assign(idx=lambda frame: frame.index + 1)
cities.head()

In [None]:
def plot_tour(tour, tg, cmap=mpl.cm.gist_rainbow):
    """Draw `tour` as one polyline on a new 25x25-inch figure.

    tour : sequence of keys used to index `tg.X` and `tg.Y`, in visiting order.
    tg   : DataFrame with `X` and `Y` columns.
    cmap : accepted for interface compatibility; the body does not use it.
    """
    _, axes = plt.subplots(figsize=(25, 25))
    xs, ys = tg.X[tour], tg.Y[tour]
    axes.plot(xs, ys, linewidth=1)

# Scale input

In [None]:
# NOTE: `cities1k` is a second name for the same DataFrame object as `cities`
# (no copy is made), so the scaling below also changes `cities`, and
# re-running this cell multiplies the coordinates by 1000 again.
cities1k = cities
for axis in ('X', 'Y'):
    cities1k[axis] = cities[axis] * 1000

# Clustering cities with K-means into 36 clusters

In [None]:
# Kmeans
from sklearn.cluster import MiniBatchKMeans,Birch

# Fixed seed: without it the shuffle and MiniBatchKMeans both draw from
# unseeded RNGs, so the cluster labels -- and with them the citieskm*.tsp
# files, the linkern tours and the reported scores -- change on every run.
KMEANS_SEED = 66

# (n_cities, 2) array of the scaled coordinates.
coords = np.vstack((cities1k.X.values,cities1k.Y.values)).T
sample_ind = np.random.RandomState(KMEANS_SEED).permutation(len(coords))
kmeans = MiniBatchKMeans(n_clusters = 36, batch_size = 50, random_state=KMEANS_SEED).fit(coords[sample_ind])

# Predict on the same plain ndarray that was used for fitting. Passing the
# DataFrame here makes scikit-learn warn that the estimator was fitted without
# feature names.
cities1k.loc[:, 'kmeans']   = kmeans.predict(coords)

# Clustering cities with a Gaussian mixture model (GMM) into 36 clusters

In [None]:
# GMM
from sklearn.mixture import GaussianMixture

# Fit a 36-component Gaussian mixture on the (X, Y) coordinates and store each
# city's component label in the `gmm` column.
mclusterer = GaussianMixture(n_components=36, tol=0.01, random_state=66, verbose=1)
xy = cities[['X', 'Y']].values
cities['gmm'] = mclusterer.fit_predict(xy)

# Labels start at 0, so the highest label plus one is the cluster count.
nmax = cities.gmm.max()
print("{} clusters".format(nmax+1))

# Plot Clustering Results on Santa cities

In [None]:
plt.style.use('seaborn')
# One row, two columns: GMM labels on the left, K-means labels on the right.
# The figure is created with the same 1x2 layout that is drawn into. The
# previous version created a 2x1 grid and then asked for 1x2 subplots, which
# conflicts with the axes made by `plt.subplots`.
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))
for panel, column in zip(ax, ('gmm', 'kmeans')):
    panel.scatter(cities1k.X.values, cities1k.Y.values, c=cities1k[column].values,
                  s=0.3, cmap='nipy_spectral', alpha=0.9)
    panel.set_title(column)
plt.show()

# Prepare .tsp files for Concorde & LKH and plot each cluster separately - K-means

In [None]:
# Concorde on Kmeans
# For every K-means cluster: renumber its cities 1..n, write them to
# citieskm<i>.tsp, and scatter-plot the cluster in its own panel.
plt.style.use('seaborn')
fig, ax = plt.subplots(ncols=6, nrows=6,figsize=(15, 15))
cmap=mpl.cm.nipy_spectral
n_km = cities1k.kmeans.max()+1  # labels start at 0

for i in range(n_km):
    citiesk = cities1k[cities1k.kmeans == i]
    citiesk = citiesk.reset_index(drop=True)
    # Node ids are local to the cluster and 1-based.
    citiesk['idx'] = citiesk.index + 1
    dim = len(citiesk)
    #citiesk.to_csv('citieskm{0}.csv'.format(i),index=False)
    write_tsp(citiesk, 'citieskm{0}.tsp'.format(i),dim)
#     write_tsp1(citiesk, 'citieskm{0}.tsp'.format(i),dim)
#     write_parameters('citieskm{0}'.format(i))
    panel = ax.flat[i]
    # A colormap called with an int uses it as an index into its lookup table,
    # so cmap(0)..cmap(35) are all nearly the same dark colour. A float in
    # [0, 1) spreads the clusters over the whole colormap instead.
    panel.scatter(citiesk.X.values, citiesk.Y.values,s=0.5,color=cmap(i / n_km), alpha=0.99)
    panel.set_title(i)
    panel.set_xticks([])
    panel.set_yticks([])

# Prepare .tsp files for Concorde & LKH and plot each cluster separately - GMM

In [None]:
# Concorde on GMM
# For every GMM cluster: renumber its cities 1..n, write them to
# citiesgmm<i>.tsp, and scatter-plot the cluster in its own panel.
plt.style.use('seaborn')
fig, ax = plt.subplots(ncols=6, nrows=6,figsize=(15, 15))
cmap=mpl.cm.nipy_spectral
# The loop bound comes from the GMM labels. It previously used
# `cities1k.kmeans.max()`, which is only right when both methods happen to
# produce the same highest label.
n_gmm = cities1k.gmm.max()+1  # labels start at 0

for i in range(n_gmm):
    citiesk = cities1k[cities1k.gmm == i].reset_index(drop=True)
    # Node ids are local to the cluster and 1-based.
    citiesk['idx'] = citiesk.index + 1
    dim = len(citiesk)
    #citiesk.to_csv('citiesgmm{0}.csv'.format(i),index=False)
    write_tsp(citiesk, 'citiesgmm{0}.tsp'.format(i),dim)
#     write_tsp1(citiesk, 'citiesgmm{0}.tsp'.format(i),dim)
#     write_parameters('citiesgmm{0}'.format(i))
    panel = ax.flat[i]
    # A colormap called with an int uses it as an index into its lookup table,
    # so cmap(0)..cmap(35) are all nearly the same dark colour. A float in
    # [0, 1) spreads the clusters over the whole colormap instead.
    panel.scatter(citiesk.X.values, citiesk.Y.values,s=0.5,color=cmap(i / n_gmm), alpha=0.99)
    panel.set_title(i)
    panel.set_xticks([])
    panel.set_yticks([])

In [None]:
#!cd LKH-2.0.9 && ls

In [None]:
# Highest cluster label found by each method (labels start at 0).
for method in ('kmeans', 'gmm'):
    print(cities1k[method].max())

# Run Concorde (linkern) on each K-means cluster

In [None]:
%%bash
# Run linkern on every K-means cluster file. The clustering uses 36 clusters,
# labelled 0..35, so only citieskm0.tsp .. citieskm35.tsp exist; the old upper
# bound of 36 made linkern run once more on a file that was never written.
# Each tour goes to linkernkm<i>.tour and linkern's stdout to linkernkm<i>.log.
for i in {0..35}
    do
    echo $i
    time ./linkern -K 1 -s 42 -S linkernkm$i.tour -R 999999999 -t 3 ./citieskm$i.tsp >linkernkm$i.log
    done

# Run Concorde (linkern) on each GMM cluster

In [None]:
%%bash
# Run linkern on every GMM cluster file. The mixture uses 36 components,
# labelled 0..35, so only citiesgmm0.tsp .. citiesgmm35.tsp exist; the old upper
# bound of 36 made linkern run once more on a file that was never written.
# Each tour goes to linkerngmm<i>.tour and linkern's stdout to linkerngmm<i>.log.
for i in {0..35}
    do
    echo $i
    time ./linkern -K 1 -s 42 -S linkerngmm$i.tour -R 999999999 -t 3 ./citiesgmm$i.tsp >linkerngmm$i.log
    done

In [None]:
#!cat ./LKH-2.0.9/citiesgmm4_sol.csv

# Run LKH on each GMM cluster

In [None]:
# %%bash
# cd ./LKH-2.0.9
# for i in {0..64}
#     do
#     echo $i
#     timeout 20s ./LKH citieskm$i.par
#     done

In [None]:
def from_file(filename):  # from linkern's output or csv
    """Read a tour file and return it as a list of ints that ends at node 0.

    The file is split on whitespace. The first token is skipped (presumably the
    node-count line of a linkern tour file or a CSV header -- confirm against
    the files actually produced) and every remaining token is parsed with
    `int`. If the resulting sequence does not already end with 0, a 0 is
    appended so the tour returns to node 0.

    Parameters
    ----------
    filename : str
        Path of the tour file to read.

    Returns
    -------
    list of int
        The tour; an empty list when the file holds nothing after its first token.

    Raises
    ------
    ValueError
        If a token after the first one is not an integer.
    """
    # The context manager closes the handle even if parsing fails.
    with open(filename) as fh:
        tokens = fh.read().split()
    seq = [int(tok) for tok in tokens[1:]]
    # Guard against an empty tour before looking at the last element.
    if seq and seq[-1] != 0:
        seq.append(0)
    return seq

In [None]:
# def read_tour(filename):
#     tour = []
#     for line in open(filename).readlines():
#         line = line.replace('\n', '')
#         try:
#             tour.append(int(line) - 1)
#         except ValueError as e:
#             pass  # skip if not a city id (int)
#     return tour[:-1]

# Plot the Concorde solution for every K-means cluster

In [None]:
plt.style.use('seaborn')
fig, ax = plt.subplots(ncols=1, nrows=1,figsize=(13, 9))
cmap=mpl.cm.nipy_spectral

# Overlay the linkern tour of every K-means cluster on one shared set of axes.
for i in range(cities1k.kmeans.max()+1):
    in_cluster = cities1k.kmeans == i
    # After reset_index the rows are labelled 0..n-1; the tour entries are
    # used as those row labels.
    citiesk = cities1k[in_cluster].reset_index()
    citiesk['idx'] = citiesk.index + 1
    tour = from_file('linkernkm{0}.tour'.format(i))
    xs, ys = citiesk.X[tour], citiesk.Y[tour]
    ax.plot(xs, ys, linewidth=1)
plt.show()


# Plot the Concorde solution for every GMM cluster

In [None]:
plt.style.use('seaborn')
fig, ax = plt.subplots(ncols=1, nrows=1,figsize=(13, 9))
cmap=mpl.cm.nipy_spectral

# Overlay the linkern tour of every GMM cluster on one shared set of axes.
for i in range(cities1k.gmm.max()+1):
    in_cluster = cities1k.gmm == i
    # After reset_index the rows are labelled 0..n-1; the tour entries are
    # used as those row labels.
    citiesk = cities1k[in_cluster].reset_index()
    citiesk['idx'] = citiesk.index + 1
    tour = from_file('linkerngmm{0}.tour'.format(i))
    xs, ys = citiesk.X[tour], citiesk.Y[tour]
    ax.plot(xs, ys, linewidth=1)
plt.show()


# Plot the LKH solution for every GMM cluster

In [None]:
# plt.style.use('seaborn')
# fig, ax = plt.subplots(ncols=1, nrows=1,figsize=(13, 9))
# cmap=mpl.cm.nipy_spectral

# for i in range(cities1k.gmm.max()+1):
#     citiesk = cities1k[cities1k.gmm == i].reset_index()
#     citiesk['idx'] = citiesk.index + 1    
#     tour = read_tour('../working/LKH-2.0.9/citiesgmm{0}_sol.csv'.format(i))
#  #   plt.subplot(10,10,i+1)
#     plt.plot(citiesk.X[tour], citiesk.Y[tour], linewidth=1)  
#  #   plt.title(i)
#  #   plt.xticks([])
#  #   plt.yticks([])
# plt.show()


In [None]:
#@staticmethod
def score(cities, tour):
    """Return the penalised length of `tour` over `cities`.

    The result is the sum of the Euclidean step lengths along `tour`, plus an
    extra 10% of the length of every 10th step (positions 9, 19, ... of the
    tour) whose origin city has a `CityId` that is not prime.

    Parameters
    ----------
    cities : DataFrame
        Needs `CityId`, `X` and `Y` columns; `tour` entries are labels of its index.
    tour : list of int
        Index labels of `cities` in visiting order.

    Returns
    -------
    float
        Total distance plus penalties, in the same units as `X` / `Y`.
    """
    # The prime set must cover every CityId that can occur. Bounding it by
    # len(cities) is only right when CityId runs 0..len-1. For a cluster subset
    # the ids are much larger than the row count, and prime cities with a big
    # id would be penalised by mistake.
    id_limit = 0 if cities.empty else int(cities.CityId.max()) + 1
    primes = list(sympy.primerange(0, id_limit))
    penalized = ~cities.CityId.isin(primes)

    df = cities.reindex(tour)
    # Length of the step from each tour position to the next (NaN for the last).
    dist = np.hypot(df.X.diff(-1), df.Y.diff(-1))
    # Every 10th step is selected by position; the matching penalty flags are
    # selected by label, and the two are aligned on the index when multiplied.
    penalty = 0.1 * dist.iloc[9::10] * penalized.loc[tour[9::10]]
    return dist.sum() + penalty.sum()

# Show the sum of the Concorde solution scores over all K-means clusters

In [None]:
# Add up the per-cluster tour scores for the K-means clustering. Each score is
# divided by 1000 to undo the coordinate scaling applied before clustering.
scoretotal = 0
for i in range(cities1k.kmeans.max()+1):
    in_cluster = cities1k.kmeans == i
    citiesk = cities1k[in_cluster].reset_index()
    citiesk['idx'] = citiesk.index + 1
    tour = from_file('linkernkm{0}.tour'.format(i))
    scorei = score(citiesk,tour) / 1000
    scoretotal += scorei
print(scoretotal)

# Show the sum of the Concorde solution scores over all GMM clusters

In [None]:
# Add up the per-cluster tour scores for the GMM clustering. Each score is
# divided by 1000 to undo the coordinate scaling applied before clustering.
scoretotal = 0
for i in range(cities1k.gmm.max()+1):
    # Rows must be selected by the GMM label: linkerngmm<i>.tour was computed
    # from citiesgmm<i>.tsp. Selecting by the K-means label (as before) scored
    # each GMM tour against the cities of an unrelated K-means cluster.
    citiesk = cities1k[cities1k.gmm == i].reset_index()
    citiesk['idx'] = citiesk.index + 1
    tour = from_file('linkerngmm{0}.tour'.format(i))
    scorei = score(citiesk,tour)
    scorei = scorei/1000
    scoretotal = scorei + scoretotal
print(scoretotal)

# Show the sum of the LKH solution scores over all GMM clusters

In [None]:
# scoretotal = 0
# for i in range(cities1k.gmm.max()+1):
#     citiesk = cities1k[cities1k.kmeans == i].reset_index()
#     citiesk['idx'] = citiesk.index + 1
#     tour = read_tour('../working/LKH-2.0.9/citiesgmm{0}_sol.csv'.format(i))
#     scorei = score(citiesk,tour)
#     scorei = scorei/1000
#     scoretotal = scorei + scoretotal
# print(scoretotal)

In [None]:
# tour.to_csv('submission.csv')
# tour.score()