# Blog Feedback Dataset Prediction Intervals using `pitci`

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import requests
import zipfile
from pathlib import Path
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

In [2]:
import pitci
# Display the installed pitci version so the outputs in this notebook can be
# tied to the release that produced them.
pitci.__version__

'0.1.2.dev1'

# Build example lightgbm model

## Download data

In [3]:
# Remote location of the BlogFeedback zip archive (UCI ML repository).
zip_address = "https://archive.ics.uci.edu/ml/machine-learning-databases/00304/BlogFeedback.zip"

In [4]:
# Local path the downloaded zip archive is written to (relative to the
# notebook's working directory).
dataloaded_data_location = "data/BlogFeedback.zip"

In [5]:
# Download and unpack the archive only when the zip is not already on disk,
# so re-running the notebook does not hit the network again.
if not Path(dataloaded_data_location).is_file():

    # open(..., "wb") below raises FileNotFoundError on a fresh checkout unless
    # the target directory already exists, so create it first.
    Path(dataloaded_data_location).parent.mkdir(parents=True, exist_ok=True)

    r = requests.get(zip_address)

    # Fail loudly on an HTTP error rather than saving an error page as the
    # "zip", which would only surface later as a BadZipFile.
    r.raise_for_status()

    with open(dataloaded_data_location, "wb") as f:

        f.write(r.content)

    with zipfile.ZipFile(dataloaded_data_location, "r") as zip_ref:

        zip_ref.extractall("data/blogfeedback")

## Import data

In [6]:
# Load the training CSV (the file has no header row) and give column 280,
# used as the response later on, a readable name.
train = pd.read_csv("data/blogfeedback/blogData_train.csv", header=None).rename(
    columns={280: "number_comments"}
)

In [7]:
# Stack every extracted test CSV into one frame. Path.glob yields files in an
# arbitrary, filesystem-dependent order, so the paths are sorted to keep the
# row order of `test` reproducible between runs and machines.
test = pd.concat(
    [
        pd.read_csv(x, header=None)
        for x in sorted(Path("data/blogfeedback/").glob("*test*"))
    ],
    axis=0,
)
# Same renaming as the training frame: column 280 is the response.
test.rename(columns={280: "number_comments"}, inplace=True)

## Add sample column
Create a `sample` column with 3 values that will be used in the following way: <br>
`train`: (65%) model training <br>
`validate`: (17.5%) validation (early stopping) <br>
`interval`: (17.5%) calibrating the prediction intervals <br>
The test sample is provided in a different dataframe.

In [8]:
# Seeded uniform draw per row decides the split:
#   draw <= 0.65          -> "train"
#   0.65 < draw <= 0.825  -> "validate"
#   draw > 0.825          -> "interval"
np.random.seed(1)
random_col = np.random.random(train.shape[0])
# Conditions are checked in order, so the stricter threshold must come first.
train["sample"] = np.select(
    [random_col > 0.825, random_col > 0.65],
    ["interval", "validate"],
    default="train",
)

## Append train and test datasets

In [9]:
# Tag every row of the held-out frame so it can be told apart after it is
# combined with the training frame.
test = test.assign(sample="test")

In [10]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. Rows already tagged "test" are dropped first so
# that re-running this cell does not stack the test rows a second time.
train = pd.concat([train.loc[train["sample"] != "test"], test], axis=0)

## Create lightgbm Datasets

In [11]:
# Name of the target column; it is dropped from the features and used as the
# label / response in the cells below.
response = "number_comments"

In [12]:
# LightGBM Dataset for fitting: rows tagged "train", with the response and the
# sample marker removed from the feature matrix.
lgb_data_train = lgb.Dataset(
    label=train.loc[train["sample"].eq("train"), response],
    data=train.loc[train["sample"].eq("train")].drop([response, "sample"], axis=1),
)

In [13]:
# LightGBM Dataset built from the rows tagged "validate"; it is passed to
# lgb.train as the validation set.
lgb_data_valid = lgb.Dataset(
    label=train.loc[train["sample"].eq("validate"), response],
    data=train.loc[train["sample"].eq("validate")].drop([response, "sample"], axis=1),
)

In [14]:
# LightGBM Dataset built from the rows tagged "interval".
# NOTE(review): no later cell visible here reads this object; the conformal
# predictor is given the pandas data directly. Confirm whether it is needed.
lgb_data_interval = lgb.Dataset(
    label=train.loc[train["sample"].eq("interval"), response],
    data=train.loc[train["sample"].eq("interval")].drop([response, "sample"], axis=1),
)

In [15]:
# LightGBM Dataset built from the rows tagged "test".
# NOTE(review): no later cell visible here reads this object; predictions on
# the test sample use the pandas data directly. Confirm whether it is needed.
lgb_data_test = lgb.Dataset(
    label=train.loc[train["sample"].eq("test"), response],
    data=train.loc[train["sample"].eq("test")].drop([response, "sample"], axis=1),
)

## Build model

In [16]:
# Fit the booster on the train sample, monitoring l2 on the validate sample.
# Early stopping is requested through the lgb.early_stopping callback: the
# `early_stopping_rounds` keyword of lgb.train was removed in LightGBM 4.0,
# whereas the callback is accepted by both older and newer releases.
model = lgb.train(
    params={"num_leaves": 10, "learning_rate": 0.05, "metric": "mean_squared_error"},
    train_set=lgb_data_train,
    num_boost_round=500,
    valid_sets=[lgb_data_valid],
    valid_names=["validate"],
    # Stop once the validation metric has not improved for 5 rounds.
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8293
[LightGBM] [Info] Number of data points in the train set: 34044, number of used features: 219
[LightGBM] [Info] Start training from score 6.681207
[1]	validate's l2: 1266.53
Training until validation scores don't improve for 5 rounds
[2]	validate's l2: 1202.16
[3]	validate's l2: 1147
[4]	validate's l2: 1096.99
[5]	validate's l2: 1053.63
[6]	validate's l2: 1009.72
[7]	validate's l2: 974.423
[8]	validate's l2: 941.595
[9]	validate's l2: 912.571
[10]	validate's l2: 886.644
[11]	validate's l2: 862.804
[12]	validate's l2: 842.814
[13]	validate's l2: 821.709
[14]	validate's l2: 804.55
[15]	validate's l2: 787.156
[16]	validate's l2: 770.827
[17]	validate's l2: 759.053
[18]	validate's l2: 747.499
[19]	validate's l2: 738.739
[20]	validate's l2: 730.007
[21]	validate's l2: 721.24
[22]	validate's l2: 713.376
[23]	validate's l2: 705.618
[24]	va

# Generate prediction intervals

## LeafNodeScaledConformalPredictor
### Calibrate conformal predictor

In [17]:
# Build a pitci leaf-node-scaled conformal predictor around the trained
# LightGBM booster; it is calibrated in the next cell.
confo_model2 = pitci.get_leaf_node_scaled_conformal_predictor(model)

In [18]:
# Calibrate on the rows tagged "interval", which were not used to fit the
# model. alpha=0.8: the coverage check further down reports roughly 80% of
# interval-sample responses inside their interval.
confo_model2.calibrate(
    data=train.loc[train["sample"].eq("interval")].drop([response, "sample"], axis=1),
    response=train.loc[train["sample"].eq("interval"), response],
    alpha=0.8,
)

In [19]:
# Inspect the baseline interval value stored on the predictor by calibrate().
# NOTE(review): presumably this is on the scale of the leaf-node scaling
# factors rather than the response, hence the large value - confirm.
confo_model2.baseline_interval

3559767.661347609

### Interval sample predictions

In [20]:
# Predictions plus interval bounds for the same "interval" rows that were used
# for calibration.
pred_intervals = confo_model2.predict_with_interval(
    train.loc[train["sample"].eq("interval")].drop([response, "sample"], axis=1)
)

In [21]:
# Share of interval-sample responses that fall inside (True) or outside
# (False) their predicted interval.
pitci.helpers.check_response_within_interval(
    response=train.loc[train["sample"].eq("interval"), response],
    intervals_with_predictions=pred_intervals,
)

True     0.801376
False    0.198624
Name: number_comments, dtype: float64

In [22]:
# Summarise the distribution of interval widths on the interval sample
# (quantiles, mean, std and iqr, as shown in the output).
pitci.helpers.check_interval_width(intervals_with_predictions=pred_intervals)

0.0        5.130368
0.05       5.130368
0.1        5.130368
0.2        5.130368
0.3        5.130368
0.4        5.130368
0.5        5.304721
0.6        5.475715
0.7        5.701914
0.8        6.449043
0.9        9.193793
0.95      14.948104
1.0     3320.678789
mean      13.460537
std      104.895087
iqr        0.813048
dtype: float64

In [23]:
# Combine the interval predictions with the observed response; the later
# group-by reads the "response" and "prediction" columns of the result.
pred_intervals = pitci.helpers.prepare_prediction_interval_df(
    pred_intervals,
    train.loc[train["sample"].eq("interval"), response],
)

In [24]:
# Add the "interval_width_bucket" column used by the group-by below. q=5 is
# requested, but duplicates="drop" leaves only 3 buckets in the output.
pred_intervals = pitci.helpers.create_interval_buckets(pred_intervals, q=5, duplicates="drop")

In [25]:
# Mean absolute error of the point prediction within each interval-width
# bucket, to check whether wider intervals go with larger errors.
pred_intervals.groupby("interval_width_bucket").apply(
    lambda bucket: mean_absolute_error(
        y_true=bucket["response"], y_pred=bucket["prediction"]
    )
)

interval_width_bucket
(5.129, 5.476]        1.688534
(5.476, 6.449]        3.529742
(6.449, 3320.679]    21.098750
dtype: float64

### Test sample predictions

In [26]:
# Predictions plus interval bounds for the held-out "test" rows.
pred_test = confo_model2.predict_with_interval(
    train.loc[train["sample"].eq("test")].drop([response, "sample"], axis=1)
)

In [27]:
# Summarise the distribution of interval widths on the test sample
# (quantiles, mean, std and iqr, as shown in the output).
pitci.helpers.check_interval_width(intervals_with_predictions=pred_test)

0.0        5.130368
0.05       5.130368
0.1        5.130368
0.2        5.130368
0.3        5.130368
0.4        5.130368
0.5        5.130368
0.6        5.470573
0.7        5.674786
0.8        6.095007
0.9        7.854436
0.95      11.784660
1.0     3399.969113
mean       9.280083
std       66.067575
iqr        0.571546
dtype: float64

In [28]:
# Combine the test-sample interval predictions with the observed response; the
# later group-by reads the "response" and "prediction" columns of the result.
pred_test = pitci.helpers.prepare_prediction_interval_df(
    pred_test,
    train.loc[train["sample"].eq("test"), response],
)

In [29]:
# Add the "interval_width_bucket" column used by the group-by below. q=5 is
# requested, but duplicates="drop" leaves only 3 buckets in the output.
pred_test = pitci.helpers.create_interval_buckets(
    pred_test,
    q=5,
    duplicates="drop",
)

In [30]:
# Mean absolute error of the point prediction within each interval-width
# bucket on the test sample. Arguments follow sklearn's (y_true, y_pred)
# order, matching the equivalent interval-sample cell; MAE is symmetric, so
# the numbers are unchanged.
pred_test.groupby("interval_width_bucket").apply(
    lambda x: mean_absolute_error(x["response"], x["prediction"])
)

interval_width_bucket
(5.129, 5.471]        1.644240
(5.471, 6.095]        3.273206
(6.095, 3399.969]    19.316058
dtype: float64