# Feature Engineering

In [1]:
import ast
import os

import pandas as pd

import feature_engineering_utils as F

Load the datasets: list every CSV file in the dataset directory and read each one into its own DataFrame.

In [2]:
# Directory the dataset CSV files are read from.
DATASET_DIR = '../5_ground_truth/Final_Dataset/'
# os.listdir() returns entries in arbitrary order, and later cells rely on the
# position of each file in this list, so sort the names to make that position
# reproducible. Sorting is lexicographic: 'TSP_Data10.csv' would come before
# 'TSP_Data2.csv'.
data_files = sorted(i for i in os.listdir(DATASET_DIR) if i.endswith('.csv'))
data_files

['TSP_Data0.csv',
 'TSP_Data1.csv',
 'TSP_Data2.csv',
 'TSP_Data3.csv',
 'TSP_Data4.csv',
 'TSP_Data5.csv',
 'TSP_Data6.csv',
 'TSP_Data7.csv',
 'TSP_Data8.csv',
 'TSP_Data9.csv']

In [3]:
# Read every listed CSV into its own DataFrame, in the same order as data_files.
df_TSP_list = list(map(lambda csv_name: pd.read_csv(DATASET_DIR + csv_name), data_files))
# Preview the first rows of the first dataset.
df_TSP_list[0].head()

Unnamed: 0,DISTANCE_KM,"EDGE(Node1_ID, Node2_ID)",NODE1_COORDS,NODE2_COORDS,GEODESIC_DISTANCE_KM,GEODESIC_ROUNDED_DISTANCE_KM,EDGE_IN_SOL
0,108,"(0, 1)","(53.891303, -1.746483)","(53.383309, -2.6226)",80.961034,81.0,False
1,384,"(0, 2)","(53.891303, -1.746483)","(50.848774, 0.211594)",363.840033,364.0,False
2,11,"(0, 3)","(53.891303, -1.746483)","(53.926187, -1.824431)",6.427843,6.0,False
3,168,"(0, 4)","(53.891303, -1.746483)","(52.586267, -2.126056)",147.432268,147.0,False
4,402,"(0, 5)","(53.891303, -1.746483)","(50.8511, 0.542925)",372.453433,372.0,False


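Some columns are not read with the right dtype: the tuple-valued columns (edge IDs and node coordinates) are loaded from the CSV as strings such as `"(0, 1)"`, so every string cell is converted back into a Python object.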

In [5]:
def _parse_literal(cell):
    """Return the Python literal spelled by a string cell; pass other values through."""
    # ast.literal_eval replaces the previous eval(): it accepts only literals
    # (tuples, numbers, strings, ...), so a CSV cell cannot execute arbitrary
    # code. A string that is not a valid literal raises instead of being run.
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


# Cells such as "(0, 1)" come back from read_csv as strings; convert every
# string cell into the Python object it represents. Series.map is applied
# column by column instead of DataFrame.applymap, which is deprecated in
# recent pandas releases.
for i in range(len(df_TSP_list)):
    df_TSP_list[i] = df_TSP_list[i].apply(lambda column: column.map(_parse_literal))

df_TSP_list[0].info()

(0, 1)

### Local Ranking

This is the rank of an edge by its weight among all the edges that are incident to either of the two nodes that form this edge.

<img src="figs/Local_Edge_Ranking.png" style="width: 300px;"/>

The smaller the rank, the lower the weight is in this local system. With this definition of local edge ranking, ranks range from $1$ to $2n-3$, where $n$ is the number of nodes in the graph: each of the 2 nodes has $n-1$ incident edges and they have 1 edge in common (the edge being ranked), so each local system contains $2(n-1)-1 = 2n-3$ edges in total.

In [None]:
#F.local_edge_rank(df_TSP, edge, metric='DISTANCE_KM')

In [None]:
#from dask.distributed import Client, progress
#client = Client(n_workers=4, threads_per_worker=2, memory_limit='3GB')
#client

In [None]:
#import dask.dataframe as dd

#dd_series = dd.from_pandas(df_TSP['EDGE(Node1_ID, Node2_ID)'], npartitions=8)
#dd_series

In [None]:
#res = dd_series.map_partitions(lambda series: series.apply(lambda x: F.local_edge_rank(df_TSP, x, metric='DISTANCE_KM')), meta=(None, 'i8'))
#res.dtype

In [None]:
#local_ranks = res.compute()
#local_ranks

### Global Ranking

Similarly to local edge ranking, this ranks each edge by `DISTANCE_KM` among all the edges in its dataset, with rank $1$ assigned to the shortest edge.

In [9]:
# Rank every edge of a dataset by DISTANCE_KM; rank 1 is the shortest edge.
# A stable sort (mergesort) is requested explicitly: the default sort kind
# (quicksort) is not stable, so edges with equal distances would otherwise be
# ranked in an arbitrary order. With a stable sort, ties keep their original
# row order.
for i in range(len(df_TSP_list)):
    ranked = df_TSP_list[i].sort_values('DISTANCE_KM', kind='mergesort')
    ranked['GLOBAL_RANK'] = range(1, ranked.shape[0] + 1)
    # Put the rows back in their original index order.
    df_TSP_list[i] = ranked.sort_index()

df_TSP_list[0].head()

Unnamed: 0,DISTANCE_KM,"EDGE(Node1_ID, Node2_ID)",NODE1_COORDS,NODE2_COORDS,GEODESIC_DISTANCE_KM,GEODESIC_ROUNDED_DISTANCE_KM,EDGE_IN_SOL,GLOBAL_RANK
0,108,"(0, 1)","(53.891303, -1.746483)","(53.383309, -2.6226)",80.961034,81.0,False,83596
1,384,"(0, 2)","(53.891303, -1.746483)","(50.848774, 0.211594)",363.840033,364.0,False,396911
2,11,"(0, 3)","(53.891303, -1.746483)","(53.926187, -1.824431)",6.427843,6.0,False,2752
3,168,"(0, 4)","(53.891303, -1.746483)","(52.586267, -2.126056)",147.432268,147.0,False,156459
4,402,"(0, 5)","(53.891303, -1.746483)","(50.8511, 0.542925)",372.453433,372.0,False,406968


### Edge Statistics

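These features show whether an edge's distance falls below the 1st, 2nd, and 3rd quartiles of the edge distances in its dataset. The flags are cumulative: an edge below the 1st quartile is also flagged as below the 2nd and 3rd.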

In [8]:
# Quartile cut points of the edge distances in the first dataset.
quartile_levels = [0.25, 0.5, 0.75]
df_TSP_list[0]['DISTANCE_KM'].quantile(quartile_levels)

0.25    143.0
0.50    241.0
0.75    352.0
Name: DISTANCE_KM, dtype: float64

In [10]:
# Flag column -> quantile level of DISTANCE_KM it is compared against.
quartile_flags = (
    ('IS_IN_1ST_QUARTILE', 0.25),
    ('IS_IN_2ND_QUARTILE', 0.5),
    ('IS_IN_3RD_QUARTILE', 0.75),
)

# Each flag is True when the edge's distance is strictly below that quantile
# of the distances within the same dataset.
for frame in df_TSP_list:
    for flag_name, level in quartile_flags:
        frame[flag_name] = frame['DISTANCE_KM'] < frame['DISTANCE_KM'].quantile(level)

df_TSP_list[0].head()

Unnamed: 0,DISTANCE_KM,"EDGE(Node1_ID, Node2_ID)",NODE1_COORDS,NODE2_COORDS,GEODESIC_DISTANCE_KM,GEODESIC_ROUNDED_DISTANCE_KM,EDGE_IN_SOL,GLOBAL_RANK,IS_IN_1ST_QUARTILE,IS_IN_2ND_QUARTILE,IS_IN_3RD_QUARTILE
0,108,"(0, 1)","(53.891303, -1.746483)","(53.383309, -2.6226)",80.961034,81.0,False,83596,True,True,True
1,384,"(0, 2)","(53.891303, -1.746483)","(50.848774, 0.211594)",363.840033,364.0,False,396911,False,False,False
2,11,"(0, 3)","(53.891303, -1.746483)","(53.926187, -1.824431)",6.427843,6.0,False,2752,True,True,True
3,168,"(0, 4)","(53.891303, -1.746483)","(52.586267, -2.126056)",147.432268,147.0,False,156459,False,True,True
4,402,"(0, 5)","(53.891303, -1.746483)","(50.8511, 0.542925)",372.453433,372.0,False,406968,False,False,False


Save dataset.

In [11]:
# Columns written to the exported feature datasets.
cols = ['DISTANCE_KM', 'GLOBAL_RANK', 'IS_IN_1ST_QUARTILE', 'IS_IN_2ND_QUARTILE', 'IS_IN_3RD_QUARTILE', 'EDGE_IN_SOL']

OUTPUT_DIR = 'Feature_Dataset'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Write each frame under the name of the CSV it was loaded from, rather than
# its position in the list, so an output file always matches its source file
# even if the directory listing order changes.
for source_name, frame in zip(data_files, df_TSP_list):
    frame[cols].to_csv(os.path.join(OUTPUT_DIR, source_name), index=False, float_format='%.6f')