# Prediction gap test

Głównym celem tego notatnika jest przetestowanie naszego algorytmu liczącego prediction gap.

Robimy to dla zadania regresji, ponieważ w przypadku binarnej klasyfikacji (regresji logistycznej) do wyników drzew przykładana jest na koniec funkcja sigmoid, co uniemożliwia szybkie analityczne obliczenie wyniku. W zadaniu regresji ten problem nie występuje.


## Konfiguracja

In [1]:
import os
# The `src` package and the models/ and data/ directories are addressed relative to
# the working directory, so climb upwards until "notebooks" no longer occurs in its path.
while True:
    if "notebooks" not in os.getcwd():
        break
    os.chdir("../")


In [2]:
from pathlib import Path
import pandas as pd

from src.decision_tree.tree import load_trees
from src.decision_tree.prediction_gap import (
    NormalPredictionGap,
    prediction_gap_on_single_feature_perturbation,
    prediction_gap_by_random_sampling,
    prediction_gap_by_exact_calc
)


In [3]:
# Root directories, relative to the working directory set in the first cell.
models_path, data_path = Path("models"), Path("data")

# Red-wine quality task: model name and scaled test set.
wine_model_name = "winequality_red"
wine_test_data_path = data_path.joinpath("wine_quality/test_winequality_red_scaled.csv")

# Housing task: model name and scaled test set.
housing_model_name = "housing"
housing_test_data_path = data_path.joinpath("housing_data/test_housing_scaled.csv")


In [4]:
# Standard deviation of the perturbation: passed to NormalPredictionGap and used as
# the scale of the normal noise in the random-sampling check at the end of the notebook.
stddev = 0.3


## Wczytanie danych i modeli

In [5]:
# Load the trees of the red-wine model (looked up by name under models_path).
wine_trees = load_trees(models_path, wine_model_name)


In [6]:
# Scaled red-wine test set; per the displayed frame the last column is `quality`.
wine_data = pd.read_csv(wine_test_data_path)
wine_data


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,-0.413454,0.123905,-0.313113,-0.240375,-0.349975,-0.848716,-0.561586,-0.183745,-0.201591,-0.638220,-0.678644,5
1,1.310138,-0.937525,1.638205,-0.240375,1.371576,-0.944346,-0.865676,0.982285,-1.756618,2.312434,-0.960246,5
2,-1.160343,2.526090,-1.340122,-0.382271,-0.647527,-0.083669,-0.409542,-0.989366,1.871778,-1.169337,0.729364,6
3,-0.068735,0.598756,-0.877968,-0.311323,-0.307468,0.872638,0.411500,-0.194345,-0.136798,0.542042,0.447763,6
4,-1.217796,-0.490607,0.611196,-0.027532,-0.222453,-0.944346,-0.987312,-0.634256,1.288643,0.187963,0.541630,6
...,...,...,...,...,...,...,...,...,...,...,...,...
315,0.103624,-0.714066,0.662546,2.668484,-0.796303,-1.231239,-1.108948,-0.575955,-0.201591,-0.579207,1.480302,4
316,1.195232,0.626688,-0.159061,0.185312,0.372651,1.255161,0.198638,1.618302,-0.460762,0.069937,-0.490910,5
317,0.103624,-0.323013,-0.005010,-0.453218,-0.626274,0.203223,-0.257497,-0.830361,-0.979104,1.132173,0.635497,6
318,-0.585813,0.347364,-0.056360,-0.382271,-0.158692,0.107592,1.749495,-0.480552,-0.201591,-0.815259,-0.490910,5


In [7]:
# Load the trees of the housing model (looked up by name under models_path).
housing_trees = load_trees(models_path, housing_model_name)


In [8]:
# Scaled housing test set; per the displayed frame the last column is `median_house_value`.
housing_data = pd.read_csv(housing_test_data_path)
housing_data


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,1.307575,-0.862335,-0.686477,-0.108529,0.019293,0.265378,0.111059,-1.045477,77700.0
1,-1.452618,0.987002,1.856182,-0.215333,-0.272609,-0.312139,-0.338825,0.137031,314300.0
2,-0.993418,1.581599,-0.527561,-0.341390,-0.469583,-0.355408,-0.532380,-0.600266,99100.0
3,-0.878618,1.370915,-0.924851,0.445661,0.342046,0.237120,0.435395,-0.294598,109400.0
4,-1.302879,0.982320,1.141059,-0.570583,-0.581123,-0.503761,-0.561151,-0.711385,76400.0
...,...,...,...,...,...,...,...,...,...
4123,-1.123192,0.809091,0.425936,-0.367518,-0.381775,0.021655,-0.398984,-0.079942,161500.0
4124,0.259400,-0.141327,-0.845393,4.616525,5.095534,4.930548,5.368429,-0.487884,87200.0
4125,0.638740,-0.768697,1.141059,-0.336348,-0.331939,-0.240611,-0.357134,-1.017579,112900.0
4126,0.598809,-0.675060,-0.765935,0.116539,0.671918,0.686594,0.537403,-0.632217,185100.0


In [9]:
# Fixed-seed 200-row subsample of the housing test set; this (not the full frame)
# is what the housing prediction-gap run further down is computed on.
sample_housing_data = housing_data.sample(n=200, random_state=42)
sample_housing_data


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
949,-0.065035,-0.567377,1.141059,-0.382186,0.045398,0.056977,0.100597,-0.629375,247900.0
3168,-0.883609,1.108731,0.267020,0.568967,0.353912,0.191202,0.385698,0.091447,129200.0
2080,-1.477574,1.075958,0.664310,-0.418857,-0.441105,-0.686553,-0.425140,0.133873,310300.0
1210,0.688653,-0.792107,1.299975,-0.350558,-0.396015,-0.097557,-0.307438,-0.265595,160800.0
2553,0.379192,-0.632923,-0.686477,0.477748,0.346792,0.403134,0.508632,0.152665,196800.0
...,...,...,...,...,...,...,...,...,...
3816,-0.119939,0.570316,-0.924851,-0.135574,-0.398388,-0.222067,-0.270819,0.189353,94400.0
1659,1.342514,-0.796789,0.664310,-0.522911,-0.258370,-0.494931,-0.412062,-1.395886,55000.0
3230,-1.168113,0.462633,1.856182,-0.648967,-0.642826,-0.827842,-0.613464,-0.065940,243800.0
4099,0.279366,0.205131,1.220517,-0.085151,0.088115,-0.084311,0.022129,-1.189809,50900.0


## Prediction gap dla każdej cechy osobno

In [10]:
# Prediction-gap calculator constructed with the perturbation standard deviation.
predgap = NormalPredictionGap(stddev)


In [11]:
# %%time
# single_abs_predgaps = prediction_gap_on_single_feature_perturbation(
#     predgap, wine_trees, wine_data, squared=False)


In [12]:
# single_abs_predgaps


In [13]:
%%time
# Per-feature prediction gap for the wine model with squared=True. Timed because the
# run is slow (about 1.5 minutes in the recorded output). %%time must stay on the first line.
single_sqr_predgaps = prediction_gap_on_single_feature_perturbation(
    predgap, wine_trees, wine_data, squared=True)


Starting predgap calculation for fixed_acidity.
Starting predgap calculation for residual_sugar.
Starting predgap calculation for sulphates.
Starting predgap calculation for alcohol.
Starting predgap calculation for chlorides.
Starting predgap calculation for citric_acid.
Starting predgap calculation for volatile_acidity.
Starting predgap calculation for pH.
Starting predgap calculation for density.
Starting predgap calculation for total_sulfur_dioxide.
Starting predgap calculation for free_sulfur_dioxide.
CPU times: user 1min 27s, sys: 4.65 s, total: 1min 32s
Wall time: 1min 26s


In [14]:
# Feature / PredGap table for the wine model.
single_sqr_predgaps


Unnamed: 0,Feature,PredGap
3,alcohol,0.071638
2,sulphates,0.063286
6,volatile_acidity,0.062282
4,chlorides,0.061304
1,residual_sugar,0.061165
9,total_sulfur_dioxide,0.052848
10,free_sulfur_dioxide,0.04194
0,fixed_acidity,0.040056
5,citric_acid,0.04
8,density,0.036612


In [15]:
# %%time
# single_abs_predgaps = prediction_gap_on_single_feature_perturbation(
#     predgap, housing_trees, sample_housing_data, squared=False)


In [16]:
# single_abs_predgaps


In [17]:
%%time
# Same per-feature computation for the housing model, on the 200-row subsample.
# NOTE: this rebinds single_sqr_predgaps, so the wine result computed earlier is
# no longer reachable under that name after this cell runs.
single_sqr_predgaps = prediction_gap_on_single_feature_perturbation(
    predgap, housing_trees, sample_housing_data, squared=True)


Starting predgap calculation for population.
Starting predgap calculation for longitude.
Starting predgap calculation for median_income.
Starting predgap calculation for housing_median_age.
Starting predgap calculation for total_bedrooms.
Starting predgap calculation for total_rooms.
Starting predgap calculation for households.
Starting predgap calculation for latitude.
CPU times: user 1min 17s, sys: 1.95 s, total: 1min 19s
Wall time: 1min 17s


In [18]:
# Feature / PredGap table for the housing model.
# NOTE(review): in the recorded output the total_bedrooms row has no PredGap value —
# check that column (and the computation) for missing values before relying on the ranking.
single_sqr_predgaps


Unnamed: 0,Feature,PredGap
7,latitude,5857819000.0
1,longitude,4390758000.0
0,population,753886200.0
5,total_rooms,565972500.0
2,median_income,527007300.0
6,households,207404100.0
3,housing_median_age,121078100.0
4,total_bedrooms,


### Tu patrzymy na wyniki liczone metodą prediction_gap_fixed

In [52]:
# First 50 rows of the wine test set: a small slice for the row-by-row checks below.
small_test_data = wine_data.iloc[:50]
small_test_data


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,-0.413454,0.123905,-0.313113,-0.240375,-0.349975,-0.848716,-0.561586,-0.183745,-0.201591,-0.63822,-0.678644,5
1,1.310138,-0.937525,1.638205,-0.240375,1.371576,-0.944346,-0.865676,0.982285,-1.756618,2.312434,-0.960246,5
2,-1.160343,2.52609,-1.340122,-0.382271,-0.647527,-0.083669,-0.409542,-0.989366,1.871778,-1.169337,0.729364,6
3,-0.068735,0.598756,-0.877968,-0.311323,-0.307468,0.872638,0.4115,-0.194345,-0.136798,0.542042,0.447763,6
4,-1.217796,-0.490607,0.611196,-0.027532,-0.222453,-0.944346,-0.987312,-0.634256,1.288643,0.187963,0.54163,6
5,1.597403,-0.434742,2.357111,0.469103,-0.456244,-0.944346,-0.74404,0.982285,-0.914312,0.010924,0.729364,6
6,1.310138,-0.937525,1.535504,-0.169427,-0.009916,-0.944346,-0.804858,0.00706,-1.10869,0.365003,0.635497,7
7,0.161077,-0.434742,0.200392,0.043416,-0.031169,0.490115,0.107411,0.77028,0.381544,1.486251,-0.49091,6
8,0.390889,-0.714066,0.713897,-0.382271,-0.626274,-0.083669,-0.196679,-1.381576,-0.590348,-0.343154,1.668037,6
9,-0.643266,1.101539,-1.13472,1.497846,-0.031169,-1.135608,-1.078539,-0.289747,0.640715,-1.582429,0.447763,5


In [53]:
# Model predictions for the unperturbed rows; used as the reference value in both
# the prediction_gap_fixed loop and the random-sampling comparison.
# NOTE(review): the frame passed here still contains the `quality` column —
# presumably eval_on_multiple_rows ignores it; confirm.
baseline_preds = wine_trees.eval_on_multiple_rows(small_test_data)
baseline_preds


array([5.0759096, 5.015626 , 4.4223237, 5.9840064, 5.9561796, 6.2821665,
       6.762588 , 5.60282  , 6.0557013, 4.9935913, 5.084101 , 5.484541 ,
       5.998147 , 6.020846 , 5.799851 , 5.846906 , 5.3933973, 5.068446 ,
       5.222242 , 5.6952453, 4.983913 , 4.9407907, 5.9650016, 5.015912 ,
       5.9211564, 5.1665096, 5.0072565, 5.995089 , 5.219608 , 6.1953797,
       5.573    , 5.55623  , 4.7922826, 4.967173 , 5.2016373, 6.0257177,
       5.578702 , 6.63684  , 5.026106 , 5.6405716, 5.967401 , 6.7764277,
       6.613883 , 5.18775  , 7.476025 , 4.8777204, 5.0063667, 5.057142 ,
       6.9373713, 6.173322 ], dtype=float32)

In [54]:
# The single feature whose perturbation is examined in the remaining cells.
feature = "sulphates"


In [55]:
# Bound method under test; the next cell calls it as
# func(trees, row, perturbed_feature_set, baseline_prediction).
func = predgap.prediction_gap_fixed


In [56]:
        # Evaluate prediction_gap_fixed for `feature` on every row of small_test_data,
        # print each per-row value and leave their mean in curr_feature_total.
        n_rows = len(small_test_data)
        curr_feature_total = 0.0
        for row_idx in range(n_rows):
            # All columns except the last one (`quality`) form the model input.
            row_features = small_test_data.iloc[row_idx, :-1]
            curr_datapoint_predgap = func(
                wine_trees, row_features, {feature}, baseline_preds[row_idx]
            )
            print(curr_datapoint_predgap)
            curr_feature_total += curr_datapoint_predgap
        curr_feature_total /= n_rows
        

0.026529146496500774
0.0030613110630768044
0.033752266760499916
0.013618064800284503
0.06989194540941945
0.24506756210200378
0.029140271206203053
0.0034407393888035045
0.019547645833481318
0.005710543735125647
0.010706451690609744
0.02425385099443804
8.643304375747711e-05
0.10520736397474476
0.0024549551552652434
0.013760260700920025
0.32019196112079606
0.0028902856021431073
0.11677134646549941
0.0010248332009339066
0.00048102596431245546
0.026789432562591437
0.0014770824272020435
0.007647409866339507
0.20069676442244974
0.023213321170473863
0.05210633133472825
0.2587448373137733
0.07348691243812588
2.128043334816864e-05
0.02093490392662912
0.01044569781921558
0.07880128656750651
0.005678626268919466
0.002332019860099715
0.020453411799426086
0.041084416667771226
0.050346164066965564
0.015884583289230637
0.2234755508549669
0.011067171593710779
0.07325067055764875
0.1435789545223446
0.005467362985456903
0.019114618051542902
0.062496643200143955
0.021522937946251385
0.0049148655566271255


In [57]:
# Mean prediction_gap_fixed value over small_test_data for `feature`.
print(curr_feature_total)


0.0574421700977847


### Tu patrzymy, ile mniej więcej wychodzi przy liczeniu tego samego przez losowe próbkowanie

In [58]:
import numpy as np


In [59]:
# Set of column names that receive noise in the sampling check (here: just `feature`).
perturbed_features = {feature}


In [60]:
# Seeded generator: without a seed every kernel restart draws different noise, so the
# sampled estimate compared against prediction_gap_fixed could not be reproduced.
# 42 matches the random_state already used for the housing subsample.
rng = np.random.default_rng(42)


In [61]:
# Work on a copy so small_test_data (the input of baseline_preds) stays untouched,
# then add one independent zero-mean normal draw to each row of every perturbed column.
perturbed_df = small_test_data.copy()
perturbed_columns = list(perturbed_features)
noise = rng.normal(
    loc=0.0,
    scale=stddev,
    size=(len(small_test_data), len(perturbed_columns)),
)
perturbed_df[perturbed_columns] += noise


In [63]:
# Predictions for the noisy rows — a single random draw per row, so the resulting
# estimate is only a rough comparison point for the prediction_gap_fixed mean.
perturbed_preds = wine_trees.eval_on_multiple_rows(perturbed_df)
perturbed_preds


array([5.3274355, 5.024579 , 4.4223237, 5.72366  , 5.986934 , 6.3353586,
       6.759215 , 5.60282  , 6.097934 , 4.9935913, 4.9752274, 5.328226 ,
       5.998147 , 6.006169 , 5.7166777, 5.7840886, 5.6622906, 5.068446 ,
       5.807396 , 5.680813 , 4.983913 , 4.7912073, 5.9650016, 4.943754 ,
       5.7557216, 5.1775584, 5.5147357, 5.229317 , 5.274926 , 6.1953797,
       5.6350775, 5.5406666, 4.7922826, 4.9548826, 5.2273073, 6.0318913,
       5.564811 , 6.63684  , 5.026106 , 5.6136823, 5.9676757, 6.7545075,
       6.8587313, 5.1634903, 7.479767 , 4.895701 , 5.15318  , 5.3127055,
       7.0412045, 6.1674757], dtype=float32)

In [64]:
# Per-row squared difference between baseline and perturbed predictions.
np.square(baseline_preds - perturbed_preds)


array([6.3265264e-02, 8.0157901e-05, 0.0000000e+00, 6.7780256e-02,
       9.4584335e-04, 2.8294036e-03, 1.1378114e-05, 0.0000000e+00,
       1.7835852e-03, 0.0000000e+00, 1.1853514e-02, 2.4434332e-02,
       0.0000000e+00, 2.1541573e-04, 6.9177938e-03, 3.9460477e-03,
       7.2303578e-02, 0.0000000e+00, 3.4240526e-01, 2.0829504e-04,
       0.0000000e+00, 2.2375176e-02, 0.0000000e+00, 5.2067568e-03,
       2.7368685e-02, 1.2207584e-04, 2.5753513e-01, 5.8640653e-01,
       3.0601204e-03, 0.0000000e+00, 3.8536189e-03, 2.4222216e-04,
       0.0000000e+00, 1.5105585e-04, 6.5895153e-04, 3.8113471e-05,
       1.9295275e-04, 0.0000000e+00, 0.0000000e+00, 7.2303577e-04,
       7.5437129e-08, 4.8049536e-04, 5.9950665e-02, 5.8852660e-04,
       1.4000627e-05, 3.2330109e-04, 2.1554172e-02, 6.5312825e-02,
       1.0781333e-02, 3.4181568e-05], dtype=float32)

In [65]:
# Sampled counterpart of the prediction_gap_fixed mean printed earlier: average squared difference.
np.square(baseline_preds - perturbed_preds).mean()


0.033319082