# Precision and Ground Truth Edges vs. Threshold

This notebook looks at how the threshold value affects the prune ratio and the number of ground truth edges in the tour predicted by a model.

In [None]:
import pandas as pd
import numpy as np
import pickle
import os

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import joblib
import model_utils as M

import seaborn as sns
import matplotlib.pyplot as plt

Load in the models.

In [None]:
# Restore the saved Naive Bayes, Logistic Regression and Random Forest models
# from disk, in that order.
# NOTE(review): joblib.load unpickles arbitrary objects, so these files must
# come from a trusted source.
TSP_NB, TSP_LR, TSP_RF = map(joblib.load, (
    'Models/TSP_NB_model.pkl',
    'Models/TSP_LR_model.pkl',
    'Models/TSP_RF_model.pkl',
))

Load the test data.

In [None]:
DATA_DIR = '../6_feature_engineering/Feature_Dataset/'

# Every CSV in the feature directory, in sorted filename order so the
# train/test split below is deterministic.
data_files = sorted(name for name in os.listdir(DATA_DIR) if name.endswith('.csv'))

# Read each feature table, cast the boolean flag columns and the label column
# to int64, and give every frame a fresh 0..n-1 index.
df_TSP_list = [
    pd.read_csv(DATA_DIR + name)
      .astype({'IS_IN_1ST_QUARTILE': 'int64',
               'IS_IN_2ND_QUARTILE': 'int64',
               'IS_IN_3RD_QUARTILE': 'int64',
               'EDGE_IN_SOL': 'int64'})
      .reset_index(drop=True)
    for name in data_files
]

# The first five files form the training set; the remainder is the test set.
train_set, test_set = df_TSP_list[:5], df_TSP_list[5:]

test_set[0].shape

Load the coordinates for the test data.

In [None]:
COORD_DATA_DIR = '../5_ground_truth/Final_Dataset/'

# Same discovery scheme as the feature files: all CSVs, sorted by filename.
coord_datafiles = sorted(
    name for name in os.listdir(COORD_DATA_DIR) if name.endswith('.csv')
)

df_coord_list = [pd.read_csv(COORD_DATA_DIR + name) for name in coord_datafiles]

# Evaluate every cell of the two coordinate columns as a Python expression
# (presumably they are coordinate tuples stored as text in the CSV).
# NOTE(review): eval executes arbitrary code found in the file, so only use
# trusted data here; ast.literal_eval would be a safer drop-in if the cells
# are plain literals -- confirm against the dataset before switching.
for coord_frame in df_coord_list:
    for coord_col in ('NODE1_COORDS', 'NODE2_COORDS'):
        coord_frame[coord_col] = coord_frame[coord_col].apply(eval)

# Same positional split as the feature frames: first five train, rest test.
train_coords, test_coords = df_coord_list[:5], df_coord_list[5:]

test_coords[0].shape

Settings shared by every experiment below: which test graph to evaluate and which columns are passed to the models.

In [None]:
# Index into test_set / test_coords selecting the graph every cell below evaluates.
set_num = 0 # For changing between different test graphs
# Columns fed to the models as input: everything but the final column.
# NOTE(review): assumes the last column is the label (presumably EDGE_IN_SOL)
# and that the remaining column order matches what the models were trained on -- confirm.
cols = test_set[set_num].columns[:-1]  # Every column except the last one

## Naïve Bayes Threshold Testing

In [None]:
# Sweep the decision threshold from 0 to 1 and, for every value, build the
# Naïve Bayes tour for the selected test graph and score it.
X = test_set[set_num][cols].values

step = 0.05
# Same grid the plotting cell rebuilds from `step`.
thresholds = np.arange(0, 1+step, step)

# Collect into plain lists and convert once at the end; np.append copies the
# whole array on every call.
nb_precision_values = []
nb_ground_truth_values = []
for t in thresholds:
    tour = M.threshold_tour(X, test_coords[set_num], TSP_NB, threshold=t)
    nb_precision_values.append(M.precision(tour))
    nb_ground_truth_values.append(M.ground_truth_count(tour))

# dtype=float keeps the float64 arrays that appending to an empty array produced.
NB_precisions = np.array(nb_precision_values, dtype=float)
NB_ground_truth_count = np.array(nb_ground_truth_values, dtype=float)

# These are per-threshold values, not averages, so they are labelled the same
# way as in the Logistic Regression and Random Forest cells.
print("Naïve Bayes")
print("Precision: {}".format(NB_precisions))
print("Ground Truth Count: {}".format(NB_ground_truth_count))

In [None]:
# One row per threshold; the grid is rebuilt from `step` and must match the
# one used by the Naïve Bayes sweep cell.
df = pd.DataFrame({"Threshold": np.arange(0, 1+step, step),
                   "NB precisions": NB_precisions,
                   "NB ground truth count": NB_ground_truth_count})

# Precision goes on the left y-axis; the ground truth count gets its own
# y scale on a twin right axis.
ax = df.plot(x="Threshold", y="NB precisions", legend=False, figsize=(10,5))
ax2 = ax.twinx()
df.plot(x="Threshold", y="NB ground truth count", ax=ax2, legend=False, color="r")
# A single figure-level legend picks up the lines from both axes.
ax.figure.legend(loc=(0.77,0.91))

plt.title("Naïve Bayes: Threshold vs. Precision and Ground Truth Count (Graph {})".format(set_num))
ax.set_ylabel('Precision')
ax2.set_ylabel('Ground Truth Count')

# savefig does not create missing directories; make sure the output folder
# exists so the cell survives a run on a fresh checkout.
os.makedirs('exp1', exist_ok=True)
plt.savefig('exp1/NB thresh vs. precision and gtruth {}.pdf'.format(set_num))
plt.show()

## Logistic Regression Threshold Testing

In [None]:
# Threshold sweep for the Logistic Regression model on the selected test graph.
X = test_set[set_num][cols].values

step = 0.05

# Score the tour produced at each threshold, one threshold at a time.
lr_precision_values = []
lr_ground_truth_values = []
for t in np.arange(0, 1+step, step):
    tour = M.threshold_tour(X, test_coords[set_num], TSP_LR, threshold=t)
    lr_precision_values.append(M.precision(tour))
    lr_ground_truth_values.append(M.ground_truth_count(tour))

# Build the float64 result arrays in one go.
LR_precisions = np.array(lr_precision_values, dtype=float)
LR_ground_truth_count = np.array(lr_ground_truth_values, dtype=float)

print("Logistic Regression")
print("Precision: {}".format(LR_precisions))
print("Ground Truth Count: {}".format(LR_ground_truth_count))

In [None]:
# One row per threshold; the grid is rebuilt from `step` and must match the
# one used by the Logistic Regression sweep cell.
df = pd.DataFrame({"Threshold": np.arange(0, 1+step, step),
                   "LR precisions": LR_precisions,
                   "LR ground truth count": LR_ground_truth_count})

# Precision goes on the left y-axis; the ground truth count gets its own
# y scale on a twin right axis.
ax = df.plot(x="Threshold", y="LR precisions", legend=False, figsize=(10,5))
ax2 = ax.twinx()
df.plot(x="Threshold", y="LR ground truth count", ax=ax2, legend=False, color="r")
# A single figure-level legend picks up the lines from both axes.
ax.figure.legend(loc=(0.77,0.91))

plt.title("Logistic Regression: Threshold vs. Precision and Ground Truth Count (Graph {})".format(set_num))
ax.set_ylabel('Precision')
ax2.set_ylabel('Ground Truth Count')

# savefig does not create missing directories; make sure the output folder
# exists so the cell survives a run on a fresh checkout.
os.makedirs('exp1', exist_ok=True)
plt.savefig('exp1/LR thresh vs. precision and gtruth {}.pdf'.format(set_num))
plt.show()

## Random Forest Threshold Testing

In [None]:
# Threshold sweep for the Random Forest model on the selected test graph.
X = test_set[set_num][cols].values

step = 0.05

# Score the tour produced at each threshold, one threshold at a time.
rf_precision_values = []
rf_ground_truth_values = []
for t in np.arange(0, 1+step, step):
    tour = M.threshold_tour(X, test_coords[set_num], TSP_RF, threshold=t)
    rf_precision_values.append(M.precision(tour))
    rf_ground_truth_values.append(M.ground_truth_count(tour))

# Build the float64 result arrays in one go.
RF_precisions = np.array(rf_precision_values, dtype=float)
RF_ground_truth_count = np.array(rf_ground_truth_values, dtype=float)

print("Random Forest")
print("Precision: {}".format(RF_precisions))
print("Ground Truth Count: {}".format(RF_ground_truth_count))

In [None]:
# One row per threshold; the grid is rebuilt from `step` and must match the
# one used by the Random Forest sweep cell.
df = pd.DataFrame({"Threshold": np.arange(0, 1+step, step),
                   "RF precisions": RF_precisions,
                   "RF ground truth count": RF_ground_truth_count})

# Precision goes on the left y-axis; the ground truth count gets its own
# y scale on a twin right axis.
ax = df.plot(x="Threshold", y="RF precisions", legend=False, figsize=(10,5))
ax2 = ax.twinx()
df.plot(x="Threshold", y="RF ground truth count", ax=ax2, legend=False, color="r")
# A single figure-level legend picks up the lines from both axes.
ax.figure.legend(loc=(0.77,0.91))

plt.title("Random Forest: Threshold vs. Precision and Ground Truth Count (Graph {})".format(set_num))
ax.set_ylabel('Precision')
ax2.set_ylabel('Ground Truth Count')

# savefig does not create missing directories; make sure the output folder
# exists so the cell survives a run on a fresh checkout.
os.makedirs('exp1', exist_ok=True)
plt.savefig('exp1/RF thresh vs. precision and gtruth {}.pdf'.format(set_num))
plt.show()