# Precision vs. Prune Ratio

This notebook looks at how the threshold value affects the prune ratio and precision of the tour predicted by a model.

In [None]:
import pandas as pd
import numpy as np
import pickle
import os

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import joblib
import model_utils as M

import seaborn as sns
import matplotlib.pyplot as plt

Load in the models.

In [None]:
# Load the three saved models (Naïve Bayes, logistic regression, random forest)
# in one pass. joblib.load unpickles its input, which can execute arbitrary
# code, so these files must come from a trusted source.
TSP_NB, TSP_LR, TSP_RF = map(joblib.load, (
    'Models/TSP_NB_model.pkl',
    'Models/TSP_LR_model.pkl',
    'Models/TSP_RF_model.pkl',
))

Load the test data.

In [None]:
DATA_DIR = '../6_feature_engineering/Feature_Dataset/'
data_files = sorted(name for name in os.listdir(DATA_DIR) if name.endswith('.csv'))

# Columns cast to int64 right after loading (booleans become 0/1).
int_casts = dict.fromkeys(
    ['IS_IN_1ST_QUARTILE', 'IS_IN_2ND_QUARTILE', 'IS_IN_3RD_QUARTILE', 'EDGE_IN_SOL'],
    'int64',
)

# Read every CSV in sorted filename order, apply the casts and renumber the
# rows from zero.
df_TSP_list = [
    pd.read_csv(DATA_DIR + name).astype(int_casts).reset_index(drop=True)
    for name in data_files
]

# The first five frames form the training set; the remaining ones the test set.
train_set, test_set = df_TSP_list[:5], df_TSP_list[5:]

test_set[0].shape

Load the coordinates for the test data.

In [None]:
COORD_DATA_DIR = '../5_ground_truth/Final_Dataset/'
coord_datafiles = sorted(
    name for name in os.listdir(COORD_DATA_DIR) if name.endswith('.csv')
)

df_coord_list = [pd.read_csv(COORD_DATA_DIR + name) for name in coord_datafiles]

# The coordinate columns come back from the CSV as text; evaluate each cell to
# turn it back into a Python object.
# NOTE(review): eval runs arbitrary expressions. If these cells are plain
# literals (e.g. tuples of numbers), ast.literal_eval would be a safer
# drop-in -- confirm the stored format before switching.
for coord_frame in df_coord_list:
    for coord_col in ('NODE1_COORDS', 'NODE2_COORDS'):
        coord_frame[coord_col] = coord_frame[coord_col].apply(eval)

# Same 5 / rest split as the feature frames.
train_coords, test_coords = df_coord_list[:5], df_coord_list[5:]

test_coords[0].shape

Constant Variables.

In [None]:
# Index into test_set / test_coords that selects which test graph the cells
# below evaluate and plot.
set_num = 0 # For changing between different test graphs
# Columns passed to the models as input features.
# NOTE(review): presumably the dropped last column is the target label -- confirm.
cols = test_set[set_num].columns[:-1]  # Every column except the last one

## Naïve Bayes: Prune Ratio vs. Precision

In [None]:
# Spacing between consecutive threshold values tried in the sweeps below
# (the sweeps start at threshold 0).
step = 0.05 # size between threshold values

In [None]:
# Sweep the decision threshold for the Naïve Bayes model and record, for each
# threshold, the precision and the prune ratio of the resulting tour.
X = test_set[set_num][cols].values
n_rows = test_coords[set_num].shape[0]  # loop-invariant; second argument of M.prune_ratio

# np.linspace pins both endpoints at exactly 0 and 1. A float-stepped
# np.arange(0, 1+step, step) depends on rounding to decide whether the last
# value is included and can overshoot 1 when step does not divide 1 evenly.
# For step = 0.05 this yields the same 21 thresholds.
thresholds = np.linspace(0.0, 1.0, int(round(1.0 / step)) + 1)

# Collect results in plain lists and convert once, instead of reallocating
# the arrays with np.append on every iteration.
nb_precision_values, nb_prune_values = [], []
for t in thresholds:
    tour = M.threshold_tour(X, test_coords[set_num], TSP_NB, threshold=t)
    nb_precision_values.append(M.precision(tour))
    nb_prune_values.append(M.prune_ratio(tour, n_rows))

NB_precisions = np.array(nb_precision_values, dtype=float)
NB_prune_ratio = np.array(nb_prune_values, dtype=float)

print("Naïve Bayes")
print("Precision: {}".format(NB_precisions))
print("Prune ratio: {}".format(NB_prune_ratio))

In [None]:
# Plot the Naïve Bayes threshold sweep and save it as a PDF.
# NOTE(review): the x values are 1 - NB_precisions although the column name and
# axis label say "Precision", and the y values come from M.prune_ratio although
# the label says "Non-Pruning Ratio" -- confirm the intended quantities.
df = pd.DataFrame({"NB precisions": 1 - NB_precisions,
                   "NB prune ratio": NB_prune_ratio})

ax = sns.lineplot(x="NB precisions", y="NB prune ratio", data=df)
ax.set_xlabel('Precision')
ax.set_ylabel('Non-Pruning Ratio')

plt.title("Naïve Bayes: Non-Pruning Ratio vs. Precision (Graph {})".format(set_num))

# savefig does not create missing directories; make sure the output folder
# exists so the notebook runs on a fresh checkout.
os.makedirs('exp2', exist_ok=True)
plt.savefig('exp2/NB non-prune vs precision {}.pdf'.format(set_num))
plt.show()

## Logistic Regression: Prune Ratio vs. Precision

In [None]:
# Sweep the decision threshold for the logistic regression model and record,
# for each threshold, the precision and the prune ratio of the resulting tour.
X = test_set[set_num][cols].values
n_rows = test_coords[set_num].shape[0]  # loop-invariant; second argument of M.prune_ratio

# np.linspace pins both endpoints at exactly 0 and 1. A float-stepped
# np.arange(0, 1+step, step) depends on rounding to decide whether the last
# value is included and can overshoot 1 when step does not divide 1 evenly.
# For step = 0.05 this yields the same 21 thresholds.
thresholds = np.linspace(0.0, 1.0, int(round(1.0 / step)) + 1)

# Collect results in plain lists and convert once, instead of reallocating
# the arrays with np.append on every iteration.
lr_precision_values, lr_prune_values = [], []
for t in thresholds:
    tour = M.threshold_tour(X, test_coords[set_num], TSP_LR, threshold=t)
    lr_precision_values.append(M.precision(tour))
    lr_prune_values.append(M.prune_ratio(tour, n_rows))

LR_precisions = np.array(lr_precision_values, dtype=float)
LR_prune_ratio = np.array(lr_prune_values, dtype=float)

print("Logistic Regression")
print("Precision: {}".format(LR_precisions))
print("Prune ratio: {}".format(LR_prune_ratio))

In [None]:
# Plot the logistic regression threshold sweep and save it as a PDF.
# NOTE(review): the x values are 1 - LR_precisions although the column name and
# axis label say "Precision", and the y values come from M.prune_ratio although
# the label says "Non-Pruning Ratio" -- confirm the intended quantities.
df = pd.DataFrame({"LR precisions": 1 - LR_precisions,
                   "LR prune ratio": LR_prune_ratio})

ax = sns.lineplot(x="LR precisions", y="LR prune ratio", data=df)
ax.set_xlabel('Precision')
ax.set_ylabel('Non-Pruning Ratio')

plt.title("Logistic Regression: Non-Pruning Ratio vs. Precision (Graph {})".format(set_num))

# savefig does not create missing directories; make sure the output folder
# exists so the notebook runs on a fresh checkout.
os.makedirs('exp2', exist_ok=True)
plt.savefig('exp2/LR non-prune vs precision {}.pdf'.format(set_num))
plt.show()

## Random Forest: Prune Ratio vs. Precision

In [None]:
# Sweep the decision threshold for the random forest model and record, for
# each threshold, the precision and the prune ratio of the resulting tour.
X = test_set[set_num][cols].values
n_rows = test_coords[set_num].shape[0]  # loop-invariant; second argument of M.prune_ratio

# np.linspace pins both endpoints at exactly 0 and 1. A float-stepped
# np.arange(0, 1+step, step) depends on rounding to decide whether the last
# value is included and can overshoot 1 when step does not divide 1 evenly.
# For step = 0.05 this yields the same 21 thresholds.
thresholds = np.linspace(0.0, 1.0, int(round(1.0 / step)) + 1)

# Collect results in plain lists and convert once, instead of reallocating
# the arrays with np.append on every iteration.
rf_precision_values, rf_prune_values = [], []
for t in thresholds:
    tour = M.threshold_tour(X, test_coords[set_num], TSP_RF, threshold=t)
    rf_precision_values.append(M.precision(tour))
    rf_prune_values.append(M.prune_ratio(tour, n_rows))

RF_precisions = np.array(rf_precision_values, dtype=float)
RF_prune_ratio = np.array(rf_prune_values, dtype=float)

print("Random Forest")
print("Precision: {}".format(RF_precisions))
print("Prune ratio: {}".format(RF_prune_ratio))

In [None]:
# Plot the random forest threshold sweep and save it as a PDF.
# NOTE(review): the x values are 1 - RF_precisions although the column name and
# axis label say "Precision", and the y values come from M.prune_ratio although
# the label says "Non-Pruning Ratio" -- confirm the intended quantities.
df = pd.DataFrame({"RF precisions": 1 - RF_precisions,
                   "RF prune ratio": RF_prune_ratio})

ax = sns.lineplot(x="RF precisions", y="RF prune ratio", data=df)
ax.set_xlabel('Precision')
ax.set_ylabel('Non-Pruning Ratio')

plt.title("Random Forest: Non-Pruning Ratio vs. Precision (Graph {})".format(set_num))

# savefig does not create missing directories; make sure the output folder
# exists so the notebook runs on a fresh checkout.
os.makedirs('exp2', exist_ok=True)
plt.savefig('exp2/RF non-prune vs precision {}.pdf'.format(set_num))
plt.show()