# Ground Truth

Import necessary libraries

In [None]:
import ast
import os
import subprocess

import pandas as pd

#### Load in the datasets.

In [None]:
# Directory holding the per-graph edge CSVs, and the CSV file names found in it
# (sorted alphabetically).
DATA_DIR = '../4_edge_dataset/'
datafiles = sorted(
    entry for entry in os.listdir(DATA_DIR)
    if entry.endswith('.csv')
)
datafiles

In [None]:
# One DataFrame per edge CSV, in the same order as `datafiles`.
df_edges_list = [
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in datafiles
]
df_edges_list[0]

#### Generate Ground Truth

Define the directories of Concorde and Graphs.

In [None]:
# Directory containing the Concorde `concorde` executable. The original
# machine-specific path is kept as the default; set the CONCORDE_HOME environment
# variable to point somewhere else on another machine.
# os.path.join(..., '') guarantees a trailing separator, so code that appends a
# file name directly to this value keeps working.
CONCORDE_HOME = os.path.join(
    os.environ.get('CONCORDE_HOME', '/home/sbakir/Documents/Concorde/Concorde_build/TSP/'),
    '',
)
GRAPH_DIR = '../2_graphgen/Graphs/'

# Check these directories exist *before* listing GRAPH_DIR: os.listdir would
# otherwise raise FileNotFoundError first and the assertion could never report
# the missing directory.
assert os.path.isdir(CONCORDE_HOME), 'Concorde directory not found: ' + CONCORDE_HOME
assert os.path.isdir(GRAPH_DIR), 'Graph directory not found: ' + GRAPH_DIR

# Names of the .tsp problem files in GRAPH_DIR, sorted alphabetically.
tsp_files = sorted(i for i in os.listdir(GRAPH_DIR) if i.endswith('.tsp'))

Run Concorde on each `.tsp` graph. The loop only executes when `skip` is set to `False`.

In [None]:
# Guard for the solver run: leave True to skip it, set to False to run Concorde
# on every file in `tsp_files`.
skip = True

if not skip:
    concorde_bin = os.path.join(CONCORDE_HOME, 'concorde')
    for tsp in tsp_files:
        # Pass an argument list (no shell) so paths containing spaces or shell
        # metacharacters are handled correctly. check=True raises
        # CalledProcessError if Concorde exits with a non-zero status instead of
        # silently continuing. stdout is discarded, as it was before.
        # NOTE(review): Concorde presumably writes its output files into the
        # current working directory - confirm against the Concorde usage text.
        subprocess.run(
            [concorde_bin, '-f', os.path.join(GRAPH_DIR, tsp)],
            check=True,
            stdout=subprocess.DEVNULL,
        )

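Clean up the directory. **Warning:** this deletes every file in the current working directory whose extension is not `sol`, `ipynb`, or `ipynb_checkpoints`.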

In [None]:
# Extensions (the text after the last '.') of entries that must survive the cleanup.
files_types_to_keep = ['sol', 'ipynb', 'ipynb_checkpoints']

# WARNING: destructive - this removes every non-kept file in the current working
# directory. Directories are skipped: os.remove() cannot delete them and would
# raise, e.g. on an existing output folder without a kept extension.
files_to_delete = [
    i for i in os.listdir()
    if not os.path.isdir(i) and i.split('.')[-1] not in files_types_to_keep
]

for file in files_to_delete:
    os.remove(file)

# Show what is left in the working directory.
os.listdir()

Create Dataframes for the solutions.

In [None]:
# Solution files in the current working directory, sorted alphabetically.
# (Previously this line also rebound `tsp_files` to the same list, which made a
# later re-run of the Concorde cell iterate over .sol files instead of .tsp files.)
sol_files = sorted(i for i in os.listdir() if i.endswith('.sol'))

# Each .sol file is read as space-separated rows of (node, node, distance).
# The first line of the file is skipped.
# NOTE(review): the first line is presumably a header written by Concorde - confirm.
df_sols_list = [
    pd.read_csv(
        sol,
        delimiter=' ',
        skiprows=[0],
        names=['Node1_ID', 'Node2_ID', 'DISTANCE_KM'],
    )
    for sol in sol_files
]

df_sols_list[0].head()

Put Nodes into tuples.

In [None]:
# Replace the two node-ID columns of every solution frame with a single column
# of (Node1_ID, Node2_ID) tuples. The frames are modified in place.
for df_sol in df_sols_list:
    first_nodes = df_sol.pop('Node1_ID')
    second_nodes = df_sol.pop('Node2_ID')
    df_sol['EDGE(Node1_ID, Node2_ID)'] = list(zip(first_nodes, second_nodes))

df_sols_list[0].head()

Create a Boolean attribute indicating whether each edge is in the solution.

In [None]:
# Edge frames and solution frames are paired by position, so the two lists must
# have the same length. Fail loudly here rather than leaving some edge frames
# without an EDGE_IN_SOL column (or hitting a bare IndexError).
if len(df_sols_list) != len(df_edges_list):
    raise ValueError(
        'Expected one solution per edge dataset, got {} solutions for {} datasets'.format(
            len(df_sols_list), len(df_edges_list)
        )
    )

for df_edges, df_sol in zip(df_edges_list, df_sols_list):
    tour_edges = df_sol['EDGE(Node1_ID, Node2_ID)'].tolist()
    # Include both orientations so that (a, b) and (b, a) both match.
    edges_sol = tour_edges + [(b, a) for a, b in tour_edges]

    # The edge column was read from CSV, so each value is the text of a tuple.
    # ast.literal_eval parses that text without executing arbitrary code, which
    # eval() would do on whatever the CSV contains.
    df_edges['EDGE_IN_SOL'] = (
        df_edges['EDGE(Node1_ID, Node2_ID)']
        .apply(ast.literal_eval)
        .isin(edges_sol)
    )

df_edges_list[0].head()

Sanity check: there should be 1000 `True` values.

In [None]:
# Tally the True/False values of EDGE_IN_SOL for the first edge dataset.
in_sol_counts = df_edges_list[0]['EDGE_IN_SOL'].value_counts()
print(in_sol_counts)

#### Save the final Dataframes

In [None]:
# Make sure the output folder exists; to_csv does not create missing directories.
os.makedirs('./Final_Dataset', exist_ok=True)

# Write each labeled edge frame to TSP_Data<i>.csv, where i is its position in
# df_edges_list. Floats are written with six decimal places.
for i, df_edges in enumerate(df_edges_list):
    df_edges.to_csv('./Final_Dataset/TSP_Data{}.csv'.format(i), index=False, float_format='%.6f')