# Feature Engineering

In [None]:
import ast
import os

import pandas as pd

import feature_engineering_utils as F

Import the datasets (every CSV file in the dataset directory).

In [None]:
# Directory that holds the input CSV files.
DATASET_DIR = '../5_ground_truth/Final_Dataset/'

# os.listdir returns entries in arbitrary, platform-dependent order. The position
# of each file in this list is later used to number the output files, so sort the
# names to make that numbering reproducible across runs and machines.
data_files = sorted(i for i in os.listdir(DATASET_DIR) if i.endswith('.csv'))
data_files

In [None]:
# Read every listed CSV into its own DataFrame, preserving the order of data_files.
# os.path.join keeps the paths valid even if DATASET_DIR is edited to drop its
# trailing slash (plain string concatenation would silently produce a wrong path).
df_TSP_list = [pd.read_csv(os.path.join(DATASET_DIR, file)) for file in data_files]

# Fail with a clear message instead of an IndexError on the preview below.
assert df_TSP_list, 'No .csv files found in {}'.format(DATASET_DIR)

df_TSP_list[0].head()

Some of the dtypes of the data may not be set correctly.

In [None]:
def _parse_literal(value):
    """Convert a string cell into the Python literal it spells out.

    Non-string values are returned unchanged. Strings are parsed with
    ast.literal_eval, which only accepts literal syntax (numbers, strings,
    tuples, lists, dicts, sets, booleans, None). Unlike eval(), it can never
    execute code embedded in the data files.

    Raises:
        ValueError / SyntaxError: if a string cell is not a valid Python literal.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


for i in range(len(df_TSP_list)):
    # Apply the parser element by element, one column at a time. Series.map is
    # available in every pandas version, whereas DataFrame.applymap is deprecated
    # since pandas 2.1.
    df_TSP_list[i] = df_TSP_list[i].apply(lambda column: column.map(_parse_literal))

df_TSP_list[0].info()

### Local Ranking

This is the rank of an edge by its weight among the edges incident to each of the two nodes it connects.

In [None]:
# Preview only the first rows of the first dataset (consistent with the other
# preview cells) instead of dumping the whole edge table.
df_TSP_list[0].head()

Find all the edges that are incident to each node.

In [None]:
# One node table per dataset: each row lists a node id and the edges incident to it.
df_nodes_list = []

for df_tsp in df_TSP_list:
    edges = df_tsp['EDGE(Node1_ID, Node2_ID)']

    # Largest node id over BOTH endpoints of EVERY edge. Series.max() on tuples
    # compares lexicographically, so taking max() of that single tuple can miss
    # a larger id that only appears in some other edge. `default=-1` yields zero
    # nodes for an empty edge table instead of raising.
    # NOTE(review): assumes node ids are consecutive integers starting at 0 — confirm.
    num_nodes = max((max(edge) for edge in edges), default=-1) + 1

    df_nodes = pd.DataFrame({'Node': range(num_nodes),
                             'Incident Edges': [F.edges_incident_to_node(df_tsp, j) for j in range(num_nodes)]
                            })

    df_nodes_list.append(df_nodes)

df_nodes_list[0]

In [None]:
# Add two local-rank columns per dataset, one for each endpoint of the edge.
# NOTE(review): F.local_edge_rank_incident_to_node presumably returns the rank of
# `edge` among the edges incident to the given node — confirm in
# feature_engineering_utils.
edge_column = 'EDGE(Node1_ID, Node2_ID)'
rank_targets = (
    ('Local Rank Incident to Node1', 0),  # rank relative to the edge's first node
    ('Local Rank Incident to Node2', 1),  # rank relative to the edge's second node
)

for idx, df_tsp in enumerate(df_TSP_list):
    df_nodes = df_nodes_list[idx]
    for rank_column, endpoint in rank_targets:
        # Bind the node table and endpoint position as defaults so each lambda
        # uses the values of the current iteration.
        df_tsp[rank_column] = df_tsp[edge_column].apply(
            lambda edge, nodes=df_nodes, k=endpoint: F.local_edge_rank_incident_to_node(nodes, edge[k], edge))

df_TSP_list[0]

### Global Ranking

Similar to local edge ranking, this ranks each edge by its weight across the entire dataset.

In [None]:
# Rank every edge by distance within its dataset (1 = shortest edge).
for df_tsp in df_TSP_list:
    # method='first' gives each row a distinct rank and breaks ties by row order,
    # so the result is deterministic (the default sort used by sort_values is not
    # stable). na_option='bottom' ranks missing distances last, matching where
    # sort_values would have placed them. Ranking in place also avoids re-sorting
    # the frame twice and leaves the row order untouched.
    df_tsp['GLOBAL_RANK'] = (
        df_tsp['DISTANCE_KM']
        .rank(method='first', na_option='bottom')
        .astype('int64')
    )

df_TSP_list[0].head()

### Edge Statistics

These features indicate whether an edge's weight falls below certain quartiles of the distance distribution.

In [None]:
# Show the three quartile cut points of the edge distances in the first dataset.
first_distances = df_TSP_list[0]['DISTANCE_KM']
first_distances.quantile([0.25, 0.5, 0.75])

In [None]:
# Boolean feature columns: True when the edge distance is strictly below the
# given quantile of all distances in the same dataset.
# NOTE(review): the flags are cumulative ("below the q-th quantile"), not
# membership in a single quartile band — confirm this matches the intent of the
# column names.
quartile_flags = {
    'IS_IN_1ST_QUARTILE': 0.25,
    'IS_IN_2ND_QUARTILE': 0.5,
    'IS_IN_3RD_QUARTILE': 0.75,
}

for df_tsp in df_TSP_list:
    distances = df_tsp['DISTANCE_KM']
    for flag_column, level in quartile_flags.items():
        df_tsp[flag_column] = distances < distances.quantile(level)

df_TSP_list[0].head()

Save dataset.

In [None]:
# Columns written to the feature files: the raw distance, the engineered
# features, and the EDGE_IN_SOL column.
cols = ['DISTANCE_KM', 'Local Rank Incident to Node1', 'Local Rank Incident to Node2', 'GLOBAL_RANK', 'IS_IN_1ST_QUARTILE', 'IS_IN_2ND_QUARTILE', 'IS_IN_3RD_QUARTILE', 'EDGE_IN_SOL']

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there).
output_dir = 'Feature_Dataset'
os.makedirs(output_dir, exist_ok=True)

# One CSV per dataset, numbered by its position in df_TSP_list.
for i, df_tsp in enumerate(df_TSP_list):
    output_path = os.path.join(output_dir, 'TSP_Data{}.csv'.format(i))
    df_tsp[cols].to_csv(output_path, index=False, float_format='%.6f')