In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# import sys
# sys.path.append("../population_shifts/ensemble_models")
# ! ls 

import numpy as np
import pandas as pd
%config Completer.use_jedi = False

In [3]:
from py_exp.helpers import gen_samples, setup_logging, shift_features, psi

setup_logging()

ModuleNotFoundError: No module named 'py_exp'

In [None]:
def make_copy(x):
    """Return ``x.copy()`` so later pipeline steps never touch the raw data."""
    return x.copy()


def _make_shifted_dataset(n_samples, scaler_range=(1, 100)):
    """Generate one synthetic dataset plus its shifted counterpart.

    Calls ``gen_samples`` with ``n_exp=3, n_unif=3, n_normal=3``, renames the
    resulting columns to ``f_0, f_1, ...`` and pipes a copy of the result
    through ``shift_features`` over all of those columns.

    Parameters
    ----------
    n_samples : int
        Forwarded to ``gen_samples`` as ``n_samples``.
    scaler_range : tuple, default (1, 100)
        Forwarded to ``shift_features`` as ``min_max_scaler_range``.

    Returns
    -------
    tuple
        ``(raw, shifted)`` - the renamed output of ``gen_samples`` and the
        output of ``shift_features`` applied to a copy of it.
    """
    raw = gen_samples(n_samples=n_samples, n_exp=3, n_unif=3, n_normal=3)
    raw.columns = [f"f_{ix}" for ix in range(raw.shape[1])]
    shifted = (
        raw
        .pipe(make_copy)
        .pipe(shift_features, cols_to_shift=raw.columns, min_max_scaler_range=scaler_range)
    )
    return raw, shifted


# NOTE(review): no random seed is set in this cell, so the three datasets
# presumably differ between runs unless ``gen_samples`` seeds internally - confirm.
# The three datasets are built in the same order as before (dev, samp, valid).
data_dev_raw, data_dev = _make_shifted_dataset(n_samples=100000)
data_samp_raw, data_samp = _make_shifted_dataset(n_samples=100000)
data_valid_raw, data_valid = _make_shifted_dataset(n_samples=10000)




In [None]:

# Model inputs: every column of the raw development data whose name starts with "f_".
features = list(filter(lambda name: name.startswith("f_"), data_dev_raw.columns))

### Generate the targets

In [None]:
from targen.data import target


def get_samples(data_in, feature_list, n_samples = 100, random_state = None):
    """Derive two synthetic targets from ``data_in`` and tag rows with random groups.

    Two sets of expressions (linear, non-linear, interaction and noise terms
    written over the ``f_*`` columns) are passed to
    ``target.get_target_and_contributions``: the first with ``imbalance=0.4``,
    the second with ``imbalance=0.3``. The ``'y'`` column of each result is
    returned as ``y1`` / ``y2``.

    Parameters
    ----------
    data_in : pandas.DataFrame
        Input table; the expressions reference columns such as ``f_0``,
        ``f_4`` and ``f_8``, so those must be present.
    feature_list : list of str
        Columns selected from the first result and returned as ``X``.
    n_samples : int, default 100
        Number of random *groups* (not rows). Every row receives an integer
        group id drawn uniformly from ``[0, n_samples)``.
    random_state : int or None, default None
        Seed for the group assignment only. ``None`` keeps the previous
        behaviour of drawing from numpy's global random state. It does not
        influence any randomness inside ``target.get_target_and_contributions``.

    Returns
    -------
    tuple
        ``(X, y1, y2, sample_ix)`` where ``sample_ix`` is a ``pandas.Series``
        of group ids sharing the index of ``X``.
    """
    interaction_terms_1 = "".join([
        "0.5*f_0*f_2",
        "-0.2*f_3*f_4",
        "+0.27*f_7*f_1",
        "+0.8*f_4/f_6",
        "-0.8*f_1/f_0",
        "+0.5*f_4/f_1",
    ])

    interaction_terms_2 = "".join([
        "-1.5*f_2*(f_2-f_1)",
        "+0.09*f_2*f_4",
        "+0.6*f_7*f_1",
        "+0.8*f_4/(f_2+f_0)",
        "+0.8*f_1/(f_2)", # the shift should reduce the effect here
        "+0.6*f_4/(f_1+0.2*f_7)",
    ])

    expressions_1 = {
        'linear': '-12.5*f_0 + 2*f_4 -3.2*f_8',
        'non_linear': '0.7*f_2**1.5 - 0.2*sin(f_7)- 0.9*log(f_8)',
        'interaction': interaction_terms_1,
        'uniform_noise': {
            'weight':9.7
        },
        'gaussian_noise': {
            'weight':1.4,
            'mu_gaus': -1.5
        }
    }

    expressions_2 = {
        'linear': '-12.7*f_0 + 2.1*f_4 -3.2*f_6',
        'non_linear': '0.2*f_2**1.4 - 0.2*sin(f_7)- 0.9*log(f_6)',
        'interaction': interaction_terms_2,
        'uniform_noise': {
            'weight':1.8
        },
        'gaussian_noise': {
            'weight':5.4,
            'mu_gaus': 5.5
        }
    }

    data_y_1 = target.get_target_and_contributions(data_in, expressions=expressions_1, imbalance = 0.4,
                                                 drop_features=False)
    data_y_2 = target.get_target_and_contributions(data_in, expressions=expressions_2, imbalance = 0.3,
                                                 drop_features=False)

    X = data_y_1[feature_list]
    y1 = data_y_1['y']
    y2 = data_y_2['y']

    # Seeded generator when requested; otherwise numpy's global state, exactly
    # as before, so an outer ``np.random.seed`` call keeps working.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    sample_ix = rng.randint(0, high = n_samples, size = y1.shape[0])

    print(f"Total samples: {n_samples}, expected data per sample {y1.shape[0]/n_samples}")

    return X, y1, y2, pd.Series(sample_ix, index = X.index )

In [None]:
# Only the "samp" split keeps its group ids; for dev/valid they land in the
# throwaway name ``dummy``.
X_dev, y1_dev, y2_dev, dummy = get_samples(
    data_in=data_dev, feature_list=features, n_samples=1000
)
X_samp, y1_samp, y2_samp, samples = get_samples(
    data_in=data_samp, feature_list=features, n_samples=100
)
X_valid, y1_valid, y2_valid, dummy = get_samples(
    data_in=data_valid, feature_list=features, n_samples=1000
)

In [None]:
# Mean of the second target within each random sample group, plotted by group id.
(
    y2_samp
    .groupby(samples)
    .mean()
    .plot()
)

# Build a model

In [None]:
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split

In [None]:
# Split on the first target; the second target is then aligned to the very
# same rows through the index of each feature partition.
X_train, X_test, y_train_1, y_test_1 = train_test_split(
    X_dev,
    y1_dev,
    test_size=0.2,
    random_state=42,
)

y_train_2, y_test_2 = (y2_dev.loc[part.index] for part in (X_train, X_test))

# Train model 1

In [None]:
%%time

# 10% of the training rows. Printed for reference and left defined for any
# later cell that still reads ``min_frac``.
min_frac = int(0.1 * X_train.shape[0])
print(min_frac)

# The former ``num_leaves`` / ``min_child_samples`` arguments were removed:
# they are LightGBM parameter names that XGBoost does not use, so the fitted
# model is unchanged. With ``max_depth=2`` a tree has at most 4 leaves anyway;
# if a minimum leaf size is wanted, XGBoost's closest knob is
# ``min_child_weight`` (a hessian-sum threshold, not a row count).
xgb_model_1 = xgb.XGBClassifier(
    max_depth=2,
    reg_lambda=0,
    n_estimators=105,
)

# First entry is reported as 'validation_0' (train), second as 'validation_1' (test).
eval_set_1 = [(X_train, y_train_1), (X_test, y_test_1)]

# NOTE(review): recent XGBoost releases expect ``eval_metric`` and
# ``early_stopping_rounds`` on the constructor instead of ``fit`` - confirm
# against the version pinned for this project before upgrading.
xgb_model_1.fit(
    X_train,
    y_train_1,
    eval_metric=["auc", "logloss"],
    eval_set=eval_set_1,
    verbose=40,  # print the evaluation metrics every 40 boosting rounds
    early_stopping_rounds=10,
)

fig, ax = plt.subplots(1, 2, figsize=(18, 8))

# Left panel: logloss per boosting round.
ax[0].plot(xgb_model_1.evals_result_['validation_0']['logloss'], label="training target")
ax[0].plot(xgb_model_1.evals_result_['validation_1']['logloss'], label="test target")
ax[0].set_title("Model 1 - logloss")
ax[0].set_xlabel("boosting round")
ax[0].legend()

# Right panel: AUC per boosting round.
ax[1].plot(xgb_model_1.evals_result_['validation_0']['auc'], label="training target")
ax[1].plot(xgb_model_1.evals_result_['validation_1']['auc'], label="test target")
ax[1].set_title("Model 1 - AUC")
ax[1].set_xlabel("boosting round")
ax[1].legend()


# Train model 2

In [None]:
%%time

# 10% of the training rows. Printed for reference and left defined for any
# later cell that still reads ``min_frac``.
min_frac = int(0.1 * X_train.shape[0])
print(min_frac)

# The former ``num_leaves`` / ``min_child_samples`` arguments were removed:
# they are LightGBM parameter names that XGBoost does not use, so the fitted
# model is unchanged. With ``max_depth=2`` a tree has at most 4 leaves anyway;
# if a minimum leaf size is wanted, XGBoost's closest knob is
# ``min_child_weight`` (a hessian-sum threshold, not a row count).
xgb_model_2 = xgb.XGBClassifier(
    max_depth=2,
    reg_lambda=0,
    n_estimators=105,
)

# First entry is reported as 'validation_0' (train), second as 'validation_1' (test).
eval_set_2 = [(X_train, y_train_2), (X_test, y_test_2)]

# NOTE(review): recent XGBoost releases expect ``eval_metric`` and
# ``early_stopping_rounds`` on the constructor instead of ``fit`` - confirm
# against the version pinned for this project before upgrading.
xgb_model_2.fit(
    X_train,
    y_train_2,
    eval_metric=["auc", "logloss"],
    eval_set=eval_set_2,
    verbose=40,  # print the evaluation metrics every 40 boosting rounds
    early_stopping_rounds=10,
)

fig, ax = plt.subplots(1, 2, figsize=(18, 8))

# Left panel: logloss per boosting round.
ax[0].plot(xgb_model_2.evals_result_['validation_0']['logloss'], label="training target")
ax[0].plot(xgb_model_2.evals_result_['validation_1']['logloss'], label="test target")
ax[0].set_title("Model 2 - logloss")
ax[0].set_xlabel("boosting round")
ax[0].legend()

# Right panel: AUC per boosting round.
ax[1].plot(xgb_model_2.evals_result_['validation_0']['auc'], label="training target")
ax[1].plot(xgb_model_2.evals_result_['validation_1']['auc'], label="test target")
ax[1].set_title("Model 2 - AUC")
ax[1].set_xlabel("boosting round")
ax[1].legend()

In [None]:
# Hand-picked tree counts for the two models.
# NOTE(review): these are not derived from the early-stopped fits - confirm
# they are consistent with the number of trees actually trained.
n_tree_1 = n_tree_2 = 100

# Now let's work on the samples

In [None]:
# X_samp, y1_samp, y2_samp, samples

In [None]:
# Distinct sample-group ids in ascending order (np.unique returns sorted values).
sample_ixs = np.unique(samples.to_numpy())

In [None]:
from transferboost.models import XGBTransferLearner

In [None]:
# Transfer learner wrapped around the model trained on the first target.
# NOTE(review): the meaning of base_score=0.1 for XGBTransferLearner is not
# visible here - confirm against the transferboost documentation.
xgb_tboost = XGBTransferLearner(
    xgb_model_1,
    verbosity=0,
    base_score=0.1,
)

In [None]:
# X_valid

In [None]:
# Install a blanket "ignore" filter: from here on every Python warning raised
# in this kernel is suppressed.
# NOTE(review): this also hides deprecation and parameter warnings from the
# modelling libraries - consider narrowing the filter by category or module.
import warnings
warnings.filterwarnings('ignore')

In [None]:
import tqdm

# One entry per sample group: predicted probabilities on X_valid from
# (a) a classifier fitted from scratch on that group and
# (b) the transfer learner re-fitted on that group.
probas_model = []
probas_tboost = []

for i in tqdm.tqdm(sample_ixs):
    # Build the row mask once and reuse it for features and target.
    in_group = samples == i
    X_train_i = X_samp[in_group]
    y2_samp_i = y2_samp[in_group]

    # ``num_leaves`` / ``min_child_samples`` were dropped here: they are
    # LightGBM parameter names that XGBoost does not use. Removing them leaves
    # the fitted model unchanged and removes this cell's dependency on
    # ``min_frac`` being defined by an earlier cell.
    xgb_i = xgb.XGBClassifier(
        max_depth=2,
        reg_lambda=0,
        n_estimators=n_tree_1,
        verbosity=0,
    )
    xgb_i.fit(X_train_i, y2_samp_i, verbose=False)
    probas_i = xgb_i.predict_proba(X_valid)[:, 1]

    # The same transfer-learner object is re-fitted on every group.
    # NOTE(review): assumes ``fit`` starts from the base model each time rather
    # than accumulating state across iterations - confirm in transferboost.
    xgb_tboost.fit(X_train_i, y2_samp_i)
    t_opti_probas_i = xgb_tboost.predict_proba(X_valid, tree_index=n_tree_1)[:, 1]

    probas_model.append(probas_i)
    probas_tboost.append(t_opti_probas_i)

In [None]:
# Display the tree count used for both the refits and the transfer-learner predictions.
n_tree_1

In [None]:
def _as_sample_frame(per_sample_probas):
    """Stack per-sample probability arrays as columns named ``sample_<position>``."""
    frame = pd.DataFrame(per_sample_probas).T
    frame.columns = [f'sample_{ix}' for ix in frame.columns]
    return frame


# Rows follow the order of X_valid, one column per sample group.
probas_model_df = _as_sample_frame(probas_model)
probas_tboost_df = _as_sample_frame(probas_tboost)

# Reference scores: the model trained on the full development data for target 2.
real_probas = pd.Series(xgb_model_2.predict_proba(X_valid)[:, 1])


In [None]:
step = 20  # only every 20th row is drawn on the left panel

fig, ax = plt.subplots(1, 2, figsize=(18, 8))
curve_ax, hist_ax = ax

# Order the validation rows by the reference model's score.
real_probas_sorted = real_probas.sort_values()
sorted_rows = real_probas_sorted.index

# Per-row mean and spread across the sample-specific models.
tboost_sorted = probas_tboost_df.loc[sorted_rows]
tboost_mean = tboost_sorted.mean(axis=1)
tboost_std = tboost_sorted.std(axis=1)

model_sorted = probas_model_df.loc[sorted_rows]
model_mean = model_sorted.mean(axis=1)
model_std = model_sorted.std(axis=1)

# Left: mean predictions against the reference curve.
curve_ax.plot(model_mean.values[::step], color='blue', label="refitted models", alpha=0.8)
curve_ax.plot(tboost_mean.values[::step], color='red', label="transfer boosted models", alpha=0.8)
curve_ax.plot(real_probas_sorted.values[::step], color='black', label="ideal model", linewidth=3)
curve_ax.legend()

# Right: distribution of the per-row standard deviations.
hist_ax.hist(model_std.values, color='blue', label="refitted models", histtype='step', linewidth=3)
hist_ax.hist(tboost_std.values, color='red', label="transfer boosted models", histtype='step', linewidth=3)
hist_ax.set_title("std dev distribution")
hist_ax.legend()

In [None]:
fig, ax = plt.subplots(figsize=(15, 8))

# Same two histograms as the right-hand panel above, on a dedicated figure.
for std_values, colour, name in (
    (model_std.values, 'blue', "refitted models"),
    (tboost_std.values, 'red', "transfer boosted models"),
):
    ax.hist(std_values, color=colour, label=name, histtype='step')

ax.set_title("std dev distribution")
ax.legend()

# Compute metrics

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# AUC of the reference model (trained on the full development data) on the validation set.
ideal_roc_auc = roc_auc_score(y_true=y2_valid, y_score=real_probas)
ideal_roc_auc

In [None]:
# Peek at the first five validation rows of the transfer-boosted predictions.
probas_tboost_df.head(n=5)

In [None]:
def _auc_against_valid(scores):
    """ROC AUC of one column of scores against the validation target ``y2_valid``."""
    return roc_auc_score(y2_valid, scores)


# One AUC per sample-group column.
aucs_tboost = probas_tboost_df.apply(_auc_against_valid)
aucs_model = probas_model_df.apply(_auc_against_valid)

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# Distribution of validation AUCs across the sample groups, one outline per approach.
for auc_values, colour, name in (
    (aucs_model.values, 'blue', "refitted models"),
    (aucs_tboost.values, 'red', "transfer boosted models"),
):
    ax.hist(auc_values, color=colour, label=name, histtype='step', linewidth=3)

ax.set_title("AUCs distribution")
ax.legend()


In [None]:
# (mean, std) of the validation AUCs of the transfer-boosted models across sample groups.
aucs_tboost.mean(), aucs_tboost.std()

In [None]:
# (mean, std) of the validation AUCs of the from-scratch refitted models across sample groups.
aucs_model.mean(), aucs_model.std()