In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from ConfSelect import weighted_BH, weighted_CS, eval_sel
q=0.5
quantile = 0.9

# Initialize lists to store results for each metric
results_fdp = []
results_power = []
results_nsel = []

#seed = 8

# Loop over 100 different seeds
for seed in range(1, 101):
    
# Step 1: Generate samples
np.random.seed(seed)  # For reproducibility
X1 = np.random.normal(0.5, 0.5, 16000)
X2 = np.random.normal(0.5, 0.5, 1000)

# Step 2: Combine X1 and X2, generate Y
X = np.concatenate((X1, X2))
e = np.random.normal(0, 0.3, X.shape[0])
Y = - X + X**3 + e

# Preparing the data for linear regression (adding X^2 and X^3)
X_poly = np.vstack([X, X**2, X**3]).T
#X_poly = np.vstack([X]).T

# Model for X1
X1_poly = np.vstack([X1, X1**2, X1**3]).T
#X1_poly = np.vstack([X1]).T

# Model for X2
X2_poly = np.vstack([X2, X2**2, X2**3]).T
#X2_poly = np.vstack([X2]).T

#model for training dataset
ttrain = X1_poly[:8000]
model = LinearRegression().fit(ttrain, Y[:8000])

print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)
# Calculating the R-squared value
r_squared = model.score(ttrain, Y[:8000])

# Printing the R-squared value
print("R-squared:", r_squared)

y_train = Y[:8000]

Coefficients: [ 1.01081541 -0.04899497 -1.0154869 ]
Intercept: 0.009063600572148424
R-squared: 0.36966406726639056


In [2]:
dcalib = X1_poly[8000:16000]
dtest = X2_poly

dother = np.concatenate((dcalib,dtest))
all_pred = model.predict(dother)
train_pred = model.predict(ttrain)


hat_mu_calib = np.array(model.predict(dcalib))
hat_mu_test = np.array(model.predict(dtest))
y_calib = Y[8000:16000]
w_calib = np.exp(-(dcalib[:, 0]+0.28)**2/(2*0.375*0.375))
y_test = Y[16000:]
w_test = np.exp(-(dtest[:, 0]+0.28)**2/(2*0.375*0.375))

In [3]:
print(w_calib)

[0.96904436 0.07149548 0.77392968 ... 0.86773692 0.43122453 0.75282245]


In [4]:
# Calculate in sample R^2 Score
y_trmean = np.mean(y_train)
ss_tot = np.sum((y_train - y_trmean) ** 2)
ss_res = np.sum((y_train - train_pred) ** 2)
ss_reg = np.sum((y_trmean - train_pred) **2)
r2 = 1 - (ss_res / ss_tot)
print(f"in sample R^2 Score: {r2}")
r22 = (ss_reg / ss_tot)
print(f"in of sample R^2 Score: {r22}")

# Calculate MSE
mse = np.mean((y_test - hat_mu_test) ** 2)
print(f"Mean Squared Error: {mse}")

# Calculate out of sample R^2 Score
y_mean = np.mean(y_test)
ss_tot = np.sum((y_test - y_mean) ** 2)
ss_res = np.sum((y_test - hat_mu_test) ** 2)
ss_reg = np.sum((hat_mu_test - y_mean) ** 2)
r2 = 1 - (ss_res / ss_tot)
print(f"out of sample R^2 Score: {r2}")
r22 = (ss_reg / ss_tot)
print(f"out of sample R^2 Score: {r22}")
print(ss_tot)

in sample R^2 Score: 0.36966406726639056
in of sample R^2 Score: 0.3696640672663905
Mean Squared Error: 0.09034283895724604
out of sample R^2 Score: 0.8187940373549232
out of sample R^2 Score: 0.9217490241997026
498.564382973413


In [5]:



#c = 0
c = np.quantile(y_train, quantile) 

 
calib_scores_res = y_calib - hat_mu_calib
calib_scores_sub = - hat_mu_calib 
calib_scores_clip = 100 * (y_calib > c) + c * (y_calib <= c) - hat_mu_calib
 
test_scores = c - hat_mu_test

 
# ========================= 
# ## weighted BH procedure
# ========================= 

# use scores res, sub, and clip
#BH_res = weighted_BH(calib_scores_res, w_calib, test_scores, w_test, q)  
#BH_sub = weighted_BH(calib_scores_sub[y_calib <= c], w_calib[y_calib<=c], test_scores, w_test, q) 
BH_clip = weighted_BH(calib_scores_clip, w_calib, test_scores, w_test, q)


In [6]:
# =============================================================================
# # summarize FDP, power and selection sizes
# =============================================================================


#BH_res_fdp, BH_res_power = eval_sel(BH_res, y_test, np.array([c]*len(y_test)))
#BH_sub_fdp, BH_sub_power = eval_sel(BH_sub, y_test, np.array([c]*len(y_test)))
#BH_clip_fdp, BH_clip_power = eval_sel(BH_clip, y_test, np.array([c]*len(y_test))) 

# Assuming BH_res[0] contains the integer indices you want to use for evaluation
#BH_res_indices = BH_res[0]
# Now pass these indices to the eval_sel function
#BH_res_fdp, BH_res_power = eval_sel(BH_res_indices, y_test, np.array([c]*len(y_test)))

# Assuming BH_sub[0] contains the integer indices for the "sub" case
#BH_sub_indices = BH_sub[0]
# Now pass these indices to the eval_sel function
#BH_sub_fdp, BH_sub_power = eval_sel(BH_sub_indices, y_test, np.array([c]*len(y_test)))

# Assuming BH_clip[0] contains the integer indices for the "clip" case
BH_clip_indices = BH_clip[0]
# Now pass these indices to the eval_sel function
BH_clip_fdp, BH_clip_power = eval_sel(BH_clip_indices, y_test, np.array([c]*len(y_test)))


# Organize BH results for DataFrame
#fdp = [BH_res_fdp, BH_sub_fdp, BH_clip_fdp]
#power = [BH_res_power, BH_sub_power, BH_clip_power]
#nsel = [len(BH_res_indices), len(BH_sub_indices), len(BH_clip_indices)]
#ndiff = [0] * 3  # Assuming no difference for BH-only results
#nsame = [len(BH_res_indices), len(BH_sub_indices), len(BH_clip_indices)]  # Assuming all selections are the same for BH-only results





# Create DataFrame for BH-only results
res_BH_only = pd.DataFrame({
    "FDP": fdp,
    "power": power,
    "nsel": nsel,
    "ndiff": ndiff,
    "nsame": nsame,
    "score": ["res", "sub", "clip"],
    "method": ["WBH"] * 3,
    "q": q,
    "seed": seed
})

# If you want to print or use res_BH_only DataFrame
print(res_BH_only)

        FDP     power  nsel  ndiff  nsame score method    q  seed
0  0.000000  0.000000     0      0      0   res    WBH  0.5     8
1  0.000000  0.000000     0      0      0   sub    WBH  0.5     8
2  0.766252  0.994118   723      0    723  clip    WBH  0.5     8
