# Prediction from Visual Features

### Install/Imports

In [1]:
import LSBoost
import helper_functions as hf

import pandas as pd

import numpy as np

import sklearn as sk
from sklearn.model_selection import train_test_split
from helper_functions import MSCE as MSCE
from sklearn.metrics import mean_squared_error as MSE

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingRegressor

import gc
import time
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Seed NumPy's global RNG so that repeated runs of the notebook match.
random_state = 0
np.random.seed(random_state)

# X: wide table of visual features plus labels.
# h: study-2 responses; its 'Img' column is used later to pick the test images.
X = pd.read_csv(filepath_or_buffer='./visual-prediction-data/features_labels_wide.csv')
h = pd.read_csv(filepath_or_buffer='./visual-prediction-data/study_2/responses.csv')

FileNotFoundError: [Errno 2] No such file or directory: './data/features_labels_wide.csv'

In [None]:
# Feature table: everything except the label and the two location columns.
non_feature_columns = ['escaped', 'location', 'room']
data = X.drop(columns=non_feature_columns)

# Binary target: 'Y' -> 1, 'N' -> 0 (any other value maps to NaN).
label_encoding = {'Y': 1, 'N': 0}
escaped = X['escaped'].map(label_encoding)

In [None]:
# Preview the first five rows of the feature table.
data.head(n=5)

In [None]:
# Train/test split by image: rows whose file_id appears in the study
# responses (h['Img']) form the test set, all remaining rows the training set.
in_study = data['file_id'].isin(h['Img'])

train_rows, test_rows = data[~in_study], data[in_study]
y_train, y_test = escaped[~in_study], escaped[in_study]

# Keep the identifiers aside, then remove them from the feature matrices.
x_train_id = train_rows['file_id']
x_test_id = test_rows['file_id']
x_train = train_rows.drop(columns='file_id')
x_test = test_rows.drop(columns='file_id')

# Print out label information for both partitions.
hf.print_data_report(x_train, x_test, y_train, y_test)

### Hyperparameters

In [None]:
# Number of boosting rounds (int).
T = 100

# Number of level sets (int).
num_bins = 2

# Minimum number of points a level set must contain for an update to occur (int).
min_group_size = 5

# Multiplicative amount a level set must improve by for an update to be
# accepted when validation data is used; otherwise a leniency factor on the
# update requirements (float).
global_gamma = 0.001

# Weak learner class for UDT (any ML class exposing a .predict method).
weak_learner = DecisionTreeClassifier(max_depth=5)

# Width of the level sets: 'default' for equal width, 'distribution' for an
# equal number of expected points (str).
bin_type = 'default'

# Multiplicative weight applied to each new update (float).
learning_rate = 1

# Starting model to boost; None means the weak learner is used as the initial
# model (fitted ML class with a .predict method, or None).
initial_model = None

# Round final predictions to the level-set center (True) or use the weak
# learner's output (False) (bool).
final_round = True

# Center level sets using the historical mean (bool).
center_mean = True

### Train LSBoost Regressor

In [None]:
# Gather the hyperparameters chosen above and hand them to the regressor
# as keyword arguments.
lsboost_params = {
    'T': T,
    'num_bins': num_bins,
    'min_group_size': min_group_size,
    'global_gamma': global_gamma,
    'weak_learner': weak_learner,
    'bin_type': bin_type,
    'learning_rate': learning_rate,
    'initial_model': initial_model,
    'final_round': final_round,
    'center_mean': center_mean,
}
LSBoostReg = LSBoost.LSBoostingRegressor(**lsboost_params)

# Fit on plain NumPy arrays rather than pandas objects.
LSBoostReg.fit(x_train.values, y_train.values)

In [None]:
# The model was fit on NumPy arrays (x_train.values), so predict on arrays too.
# Passing DataFrames here would hand the model a different input type than it
# was trained on (no feature names were seen at fit time).
training_predictions = LSBoostReg.predict(x_train.values)
test_predictions = LSBoostReg.predict(x_test.values)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC curve of the test-set predictions against the true labels.
fpr, tpr, thresholds = roc_curve(y_test, test_predictions)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line: the ROC of a random classifier.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()




In [None]:
# Calculate 0-1 accuracy: threshold the predictions at 0.5 (values <= 0.5
# become class 0, everything else class 1) and report the fraction of test
# labels that match. np.where vectorises the thresholding instead of mapping a
# Python lambda over every element.
zero_one_preds = np.where(np.asarray(test_predictions) <= 0.5, 0, 1)
print(np.mean(y_test == zero_one_preds))


In [None]:
# Convert each prediction to the zero-based index of its level set.
# ceil(p * num_bins) - 1 evaluates to -1 when p is exactly 0 (and to an index
# >= num_bins when p > 1), so clip the result into [0, num_bins - 1].
# NOTE(review): assumes predictions are meant to lie in [0, 1] - confirm.
raw_bins = np.ceil(test_predictions * num_bins).astype(int) - 1
bins = np.clip(raw_bins, 0, num_bins - 1)

# One row per test image: identifier, model prediction, and level-set index.
df = pd.DataFrame({'file_id': x_test_id, 'mc_pred': test_predictions, 'bin': bins})
df.head()


In [None]:
# Save the per-image predictions next to the notebook, without the index column.
df.to_csv(path_or_buf='mc_predictions.csv', index=False)
