In [1]:
import numpy as np  
import random
from sklearn.linear_model import LinearRegression  
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import pandas as pd
from numpy.linalg import inv

import warnings
warnings.filterwarnings("ignore")

# Optional: suppress secure RNG warning from Opacus
import logging
logging.getLogger("opacus").setLevel(logging.ERROR)

### Dataset 1

In [2]:
# Load the intrusion-detection dataset (comma-separated, header in the first row).
data = pd.read_csv('input/intrusion detection.csv', header=0, delimiter=",")

# Only the columns listed in `selected_features` are used as regressors;
# `target_column` is the column being predicted.
selected_features = ['Area', 'Sensing Range', 'Transmission Range', 'Number of Sensor nodes']
target_column = 'Number of Barriers'

X = data[selected_features].values
y = data[target_column].values

# Number of samples (rows) and features (columns), computed once.
n_samples, n_features = X.shape
print(f"n = {n_samples}, d = {n_features}")

# Per-feature (min, max) bounds and the target's lower/upper bound, all taken
# from the values observed in the loaded data.
# NOTE(review): these bounds are data-dependent; if they feed a privacy
# mechanism downstream, confirm that deriving them from the data is acceptable.
x_bounds = [(data[feature].min(), data[feature].max()) for feature in selected_features]
y_lb, y_ub = data[target_column].min(), data[target_column].max()


n = 182, d = 4


### Dataset 2

In [3]:
# Read the auction dataset (default comma separator, header in the first row).
data = pd.read_csv('input/auction.csv')

# Cast 'verification.result' to int directly on the loaded frame, then take an
# independent copy to work on. No column is dropped at this stage.
data['verification.result'] = data['verification.result'].astype(int)
data_clean = data.copy()

# 'property.price' is the target; every other column is a feature.
feature_frame = data_clean.drop(columns=['property.price'])
y = data_clean['property.price'].values
X = feature_frame.values

# Rows are samples, columns are features.
n_samples, n_features = X.shape
print(f"n = {n_samples}, d = {n_features}")

# Observed (min, max) range of each feature column, in column order,
# followed by the observed range of the target.
x_bounds = [(feature_frame[col].min(), feature_frame[col].max()) for col in feature_frame.columns]
y_lb = data_clean['property.price'].min()
y_ub = data_clean['property.price'].max()

n = 2043, d = 8


### Dataset 3

In [4]:
# Read 'parkinsons_updrs.data' with pandas' default CSV settings.
data = pd.read_csv('input/parkinsons_updrs.data')

# Work on a copy of the loaded frame; nothing is dropped or converted here.
# NOTE(review): because every non-target column is kept, any identifier-like or
# target-related columns present in the file end up as features -- confirm
# that this is intended.
data_clean = data.copy()

# 'total_UPDRS' is the target; all remaining columns form the feature matrix.
target_name = 'total_UPDRS'
feature_names = [col for col in data_clean.columns if col != target_name]
y = data_clean[target_name].values
X = data_clean.drop(columns=[target_name]).values

# Rows are samples, columns are features.
n_samples, n_features = X.shape

print(f"n = {n_samples}, d = {n_features}")

# Observed (min, max) range per feature column, then the target's range.
x_bounds = [(data_clean[col].min(), data_clean[col].max()) for col in feature_names]
y_lb = data_clean[target_name].min()
y_ub = data_clean[target_name].max()

n = 5875, d = 21


### Dataset 4

In [5]:
# Read 'liver-bupa.data'; the file has no header row, so columns are labelled
# with integers starting at 0.
data = pd.read_csv('input/liver-bupa.data', header=None)

# Work on a copy of the loaded frame; nothing is dropped or converted here.
data_clean = data.copy()

# Column 6 is used as the target, every other column as a feature.
# NOTE(review): the UCI description of the Liver Disorders data reportedly says
# its last field is a train/test selector rather than an outcome variable --
# confirm that column 6 is really the intended regression target.
target_idx = 6
feature_idx = [col for col in data_clean.columns if col != target_idx]
y = data_clean[target_idx].values
X = data_clean.drop(columns=[target_idx]).values

# Rows are samples, columns are features.
n_samples, n_features = X.shape

print(f"n = {n_samples}, d = {n_features}")

# Observed (min, max) range per feature column, then the target's range.
x_bounds = [(data_clean[col].min(), data_clean[col].max()) for col in feature_idx]
y_lb = data_clean[target_idx].min()
y_ub = data_clean[target_idx].max()

n = 345, d = 6


### Dataset 5

In [6]:
from sklearn.preprocessing import OneHotEncoder

In [7]:
# Read 'abalone.data'; the file has no header row, so columns are labelled
# with integers starting at 0.
data = pd.read_csv('input/abalone.data', header=None)

# Column 0 is categorical: turn it into one dense indicator column per category.
# NOTE(review): no `drop=` is passed, so every category gets its own column and
# the indicators sum to 1 in each row; that is collinear with an intercept
# term -- confirm the downstream solver copes with it.
encoder = OneHotEncoder(sparse_output=False)
encoded_gender = encoder.fit_transform(data[[0]])
gender_frame = pd.DataFrame(encoded_gender, columns=encoder.categories_[0])

# Indicator columns first, followed by the original columns without column 0.
data_clean = pd.concat([gender_frame, data.drop(columns=[0])], axis=1)

# Column 8 is the target; every other column is a feature.
target_idx = 8
feature_cols = [col for col in data_clean.columns if col != target_idx]
y = data_clean[target_idx].values
X = data_clean.drop(columns=[target_idx]).values

# Rows are samples, columns are features.
n_samples, n_features = X.shape

print(f"n = {n_samples}, d = {n_features}")

# Observed (min, max) range per feature column, then the target's range.
x_bounds = [(data_clean[col].min(), data_clean[col].max()) for col in feature_cols]
y_lb = data_clean[target_idx].min()
y_ub = data_clean[target_idx].max()

n = 4177, d = 10


### Dataset 6

In [8]:
# Load and clean the semicolon-separated air-quality file in a single pipeline.
# NOTE(review): the UCI description of this dataset reportedly marks missing
# readings with the value -200; such rows are numeric and therefore survive
# dropna(), which would also drag the min-based bounds below -- confirm whether
# they should be filtered out.
data = (
    pd.read_csv('input/AirQualityUCI.csv', sep=';')
    # these four columns are not used
    .drop(columns=['Date', 'Time', 'Unnamed: 15', 'Unnamed: 16'])
    # decimal commas -> decimal points so the text can be parsed as floats
    .replace(',', '.', regex=True)
    # whatever still fails to parse as a number becomes NaN ...
    .apply(pd.to_numeric, errors='coerce')
    # ... and every row containing a NaN is discarded
    .dropna()
)

# 'CO(GT)' is the target; every other remaining column is a feature.
target_name = 'CO(GT)'
feature_names = [col for col in data.columns if col != target_name]
y = data[target_name].values
X = data.drop(columns=[target_name]).values

# Rows are samples, columns are features.
n_samples, n_features = X.shape

print(f"n = {n_samples}, d = {n_features}")

# Observed (min, max) range per feature column, then the target's range.
x_bounds = [(data[col].min(), data[col].max()) for col in feature_names]
y_lb = data[target_name].min()
y_ub = data[target_name].max()


n = 9357, d = 12


### Dataset 7

In [9]:
# Read the red and white wine files (both semicolon-separated) and tag each
# with a "type" column: 0 for red, 1 for white.
red_wine = pd.read_csv('input/winequality-red.csv', sep=';').assign(type=0)
white_wine = pd.read_csv('input/winequality-white.csv', sep=';').assign(type=1)

# Stack red on top of white and renumber the rows from 0.
data = pd.concat((red_wine, white_wine), axis=0, ignore_index=True)

# Quick summary of what was loaded.
print(f"Combined Dataset Shape: {data.shape}")
print(f"Red Wine Samples: {len(red_wine)}")
print(f"White Wine Samples: {len(white_wine)}")

# 'quality' is the target; every other column (including "type") is a feature.
target_name = 'quality'
feature_names = [col for col in data.columns if col != target_name]
y = data[target_name].values
X = data.drop(columns=[target_name]).values

# Rows are samples, columns are features.
n_samples, n_features = X.shape

print(f"n = {n_samples}, d = {n_features}")

# Observed (min, max) range per feature column, then the target's range.
x_bounds = [(data[col].min(), data[col].max()) for col in feature_names]
y_lb = data[target_name].min()
y_ub = data[target_name].max()


Combined Dataset Shape: (6497, 13)
Red Wine Samples: 1599
White Wine Samples: 4898
n = 6497, d = 12


### Dataset 8

In [10]:
# Read the energy dataset (replace the path with your own if needed).
data = pd.read_csv('input/energydata_complete.csv')

# Remove the 'date' column; all other columns are kept.
data_clean = data.drop(columns=['date'])

# 'Appliances' is the target; every other remaining column is a feature.
target_name = 'Appliances'
feature_names = [col for col in data_clean.columns if col != target_name]
y = data_clean[target_name].values
X = data_clean.drop(columns=[target_name]).values

# Rows are samples, columns are features.
n_samples, n_features = X.shape

print("n=",n_samples, "d=",n_features)

# Observed (min, max) range per feature column, then the target's range.
x_bounds = [(data_clean[col].min(), data_clean[col].max()) for col in feature_names]
y_lb = data_clean[target_name].min()
y_ub = data_clean[target_name].max()

n= 19735 d= 27
