In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [2]:
np.random.seed(42)  # Set the seed for NumPy

# Set the seed for PyTorch (CPU and GPU)
torch.manual_seed(42)

# If you're using CUDA (GPU), set the seed for CUDA as well
torch.cuda.manual_seed(42)
torch.cuda.manual_seed_all(42)  # For all GPUs (if you have more than one)

In [3]:
# 1. Download the dataset (OpenML: id 180)
dataset = openml.datasets.get_dataset(180)
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute, dataset_format='dataframe')

In [4]:
# 2. Preprocessing: Numeric features only, standardize, encode labels (classes are 1-7)
X_numeric = X.select_dtypes(include=[np.number])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric.values.astype(np.float32))

le = LabelEncoder()
y_encoded = le.fit_transform(y)  # Converts to 0...6
y_encoded = y_encoded.astype(np.int64)

In [None]:
# 3. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

In [6]:
# 4. PyTorch Dataset
class CovertypeDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = CovertypeDataset(X_train, y_train)
test_ds = CovertypeDataset(X_test, y_test)

train_loader = DataLoader(train_ds, batch_size=256, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=256)

In [7]:
from dpn_2.dpn import DPN
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DPN(X_train.shape[1], 100 + len(le.classes_), len(le.classes_), False).to(device)
model.compile()

In [8]:
print(model.weights[0])

Parameter containing:
tensor([[-0.1352, -0.1196,  0.2225,  ...,  1.5253,  0.4836, -0.0048],
        [ 0.0879,  0.0484, -0.1208,  ...,  0.0987, -0.2055,  0.2481],
        [ 0.1558,  0.4036,  0.0066,  ...,  0.5725, -0.2003, -0.1504],
        ...,
        [-0.0465, -0.0838,  0.0361,  ..., -0.1032,  0.0696,  0.1492],
        [-0.1528, -0.0048, -0.1326,  ..., -0.0756,  0.1466,  0.1071],
        [ 0.1221, -0.1458, -0.2132,  ..., -0.0110,  0.1236,  0.0584]],
       device='cuda:0', requires_grad=True)


In [9]:
weights = []
for i in range(107):
    weight = model.weights[0][i].clone()

    for j in range(i):
        weight = torch.cat((weight, model.weights[1 + j][i  - 1 - j]))

    weight = weight.unsqueeze(0)
    print(weight.shape)
    weights.append(weight)

biases = model.biases.clone()

torch.Size([1, 14])
torch.Size([1, 15])
torch.Size([1, 16])
torch.Size([1, 17])
torch.Size([1, 18])
torch.Size([1, 19])
torch.Size([1, 20])
torch.Size([1, 21])
torch.Size([1, 22])
torch.Size([1, 23])
torch.Size([1, 24])
torch.Size([1, 25])
torch.Size([1, 26])
torch.Size([1, 27])
torch.Size([1, 28])
torch.Size([1, 29])
torch.Size([1, 30])
torch.Size([1, 31])
torch.Size([1, 32])
torch.Size([1, 33])
torch.Size([1, 34])
torch.Size([1, 35])
torch.Size([1, 36])
torch.Size([1, 37])
torch.Size([1, 38])
torch.Size([1, 39])
torch.Size([1, 40])
torch.Size([1, 41])
torch.Size([1, 42])
torch.Size([1, 43])
torch.Size([1, 44])
torch.Size([1, 45])
torch.Size([1, 46])
torch.Size([1, 47])
torch.Size([1, 48])
torch.Size([1, 49])
torch.Size([1, 50])
torch.Size([1, 51])
torch.Size([1, 52])
torch.Size([1, 53])
torch.Size([1, 54])
torch.Size([1, 55])
torch.Size([1, 56])
torch.Size([1, 57])
torch.Size([1, 58])
torch.Size([1, 59])
torch.Size([1, 60])
torch.Size([1, 61])
torch.Size([1, 62])
torch.Size([1, 63])


In [10]:
print(len(weights))
print(weights[0])

107
tensor([[-0.1352, -0.1196,  0.2225, -0.3366,  0.1549, -0.5507, -0.0387, -0.2265,
          0.2333, -0.0640,  0.0882,  1.5253,  0.4836, -0.0048]],
       device='cuda:0', grad_fn=<UnsqueezeBackward0>)


In [11]:
for weight in weights:
    print(weight)

tensor([[-0.1352, -0.1196,  0.2225, -0.3366,  0.1549, -0.5507, -0.0387, -0.2265,
          0.2333, -0.0640,  0.0882,  1.5253,  0.4836, -0.0048]],
       device='cuda:0', grad_fn=<UnsqueezeBackward0>)
tensor([[ 0.0879,  0.0484, -0.1208,  0.1890,  0.1416, -0.2117, -0.0618,  0.7052,
          0.3695, -0.5245, -0.1797,  0.0987, -0.2055,  0.2481, -0.0448]],
       device='cuda:0', grad_fn=<UnsqueezeBackward0>)
tensor([[ 0.1558,  0.4036,  0.0066, -0.6384,  0.2894,  0.5234,  0.1220, -0.5035,
         -0.0411, -0.3439,  0.3389,  0.5725, -0.2003, -0.1504,  0.2621,  0.5799]],
       device='cuda:0', grad_fn=<UnsqueezeBackward0>)
tensor([[ 0.0900, -0.4936,  0.1788,  0.1196,  0.3319, -0.1597,  0.1707, -0.1460,
         -0.4586, -0.0663,  0.2239, -0.6519,  0.0784,  0.0085,  0.1214, -0.1657,
         -0.0106]], device='cuda:0', grad_fn=<UnsqueezeBackward0>)
tensor([[ 0.0649, -0.5512,  0.2271,  0.4870, -0.1033, -0.5341,  0.4510,  0.4296,
          0.3726, -0.4319,  0.0168, -0.1952, -0.1300,  0.0119, 

tensor([[-0.1531,  0.2557,  0.0506,  0.0318, -0.1372,  0.2249, -0.1767, -0.2431,
         -0.4079, -0.0502, -0.2731, -0.2312,  0.0819, -0.3963, -0.2665,  0.0505,
         -0.0313,  0.0891, -0.0083, -0.0585, -0.0152,  0.0683,  0.1800, -0.2236,
          0.1297, -0.1180, -0.1517, -0.0406, -0.1419, -0.0852, -0.0386,  0.1761,
         -0.1759, -0.1431, -0.1304,  0.3690,  0.2742,  0.1693,  0.0364,  0.0843,
          0.3175,  0.1925,  0.0756, -0.1206, -0.3308, -0.0108,  0.1288, -0.0887,
          0.0840,  0.1654,  0.1212, -0.2722,  0.1381,  0.1719, -0.0368, -0.1272,
          0.0872,  0.3650,  0.0170,  0.3101,  0.1726,  0.2981, -0.0543, -0.0366,
          0.0536, -0.3531,  0.0166]], device='cuda:0',
       grad_fn=<UnsqueezeBackward0>)
tensor([[ 0.1328, -0.0935, -0.3701, -0.2848, -0.3785, -0.0458,  0.0443,  0.1330,
         -0.0171, -0.0963, -0.1020,  0.2176,  0.0597,  0.1188,  0.1961, -0.1188,
         -0.1595,  0.1241, -0.0569,  0.3761, -0.2030, -0.1198,  0.2231,  0.0247,
         -0.1099,

In [12]:
from dpn_4.dpn import DPN
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_2 = DPN(X_train.shape[1], 100 + len(le.classes_), len(le.classes_), False).to(device)
model_2.weights = nn.ParameterList(weights)
model_2.biases = nn.Parameter(biases)

In [13]:
# 6. Training setup
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [14]:
criterion_2 = nn.CrossEntropyLoss()
optimizer_2 = optim.Adam(model_2.parameters(), lr=1e-3)

In [15]:
from utils import train

model = model
train_loader = train_loader
val_loader = test_loader
optimizer = optimizer
criterion = criterion
device = device

percent = 0.99
rounds = 10

In [16]:
original_weights = [param.clone().detach() for param in model.weights]
final_masks = [torch.ones_like(param) for param in original_weights]
current_masks = [torch.ones_like(param) for param in original_weights]
p_per_round = 1 - (1 - percent) ** (1 / rounds)

In [17]:
def prune_by_percent_once(percent, mask, final_weight):
    # Get the absolute values of weights where mask == 1
    masked_weights = final_weight[mask == 1].abs()

    # Sort the unmasked weights
    sorted_weights, _ = torch.sort(masked_weights)
    if sorted_weights.shape[0] != 0:
        # Determine the cutoff index for pruning
        cutoff_index = int(round(percent * sorted_weights.shape[0]))
        cutoff = sorted_weights[cutoff_index]

        # Prune all weights below or equal to the cutoff
        new_mask = torch.where(final_weight.abs() <= cutoff, torch.zeros_like(mask), mask)

    else:
        new_mask = mask

    return new_mask

In [18]:
def prune_by_percent(model, masks, percent):

    blocks = model.weights
    for i in range(len(blocks)):
        masks[i] = prune_by_percent_once(percent, masks[i], blocks[i])

    return masks

In [19]:
_, val_metrics = train(model, train_loader, val_loader, 1, optimizer, criterion, device=device)
val_accuracy = val_metrics[0][1]

tensor([[-3.3573e-04, -2.3120e-04,  4.2047e-05,  ...,  4.7239e-05,
         -3.1116e-05,  4.0263e-04],
        [-1.3080e-04, -1.9626e-06, -9.8197e-05,  ...,  1.9768e-05,
          6.8528e-05,  7.0514e-04],
        [ 3.2685e-04,  1.8752e-05, -2.6642e-04,  ...,  1.0904e-04,
         -5.1899e-05, -2.0692e-04],
        ...,
        [ 3.2016e-04,  2.4169e-04,  1.1961e-05,  ...,  1.1453e-05,
         -4.6185e-05,  3.8308e-04],
        [ 3.0859e-04, -6.8151e-05, -3.3541e-05,  ...,  1.6471e-05,
          2.6069e-06, -4.5626e-04],
        [-8.3605e-04, -4.9340e-04,  1.7439e-05,  ..., -2.9494e-04,
         -1.4523e-04,  2.1246e-04]], device='cuda:0')
tensor([[-4.0584e-04, -6.9772e-05,  1.7083e-04,  ...,  3.5664e-05,
         -1.5579e-04,  7.0578e-04],
        [-2.4780e-04,  3.2490e-04,  3.2570e-04,  ...,  1.8139e-04,
         -4.4532e-05,  9.7482e-04],
        [ 4.0247e-04, -1.9859e-04, -3.3122e-04,  ...,  9.4684e-05,
         -3.0105e-05, -3.4694e-04],
        ...,
        [ 4.1552e-04,  3.6944

In [20]:
_, val_metrics = train(model_2, train_loader, val_loader, 1, optimizer_2, criterion_2, device=device)
val_accuracy = val_metrics[0][1]

tensor([[-3.3573e-04, -2.3120e-04,  4.2047e-05,  3.0295e-05,  1.5394e-04,
         -2.8381e-04,  1.2456e-04, -2.2902e-04, -2.2642e-04, -1.2534e-04,
         -1.8910e-04,  4.7239e-05, -3.1116e-05,  4.0263e-04]], device='cuda:0')
tensor([[-4.0632e-04, -7.2270e-05,  1.6922e-04, -2.2216e-05,  7.5246e-05,
         -1.7899e-04, -1.3313e-04, -2.8272e-04, -5.9741e-05, -9.8307e-05,
         -2.0748e-04,  3.4582e-05, -1.5707e-04,  7.0522e-04]], device='cuda:0')
tensor([[-4.7663e-04, -2.2018e-05,  2.4865e-04, -1.3014e-04,  3.9656e-06,
         -2.9445e-04, -6.2033e-05, -3.6032e-04, -1.7465e-04, -2.1026e-04,
         -1.2847e-04,  7.4764e-05, -2.1967e-04,  6.3565e-04]], device='cuda:0')
tensor([[-5.7333e-04,  8.1024e-06,  3.4978e-05, -1.5293e-04, -3.8664e-05,
         -2.3701e-04, -1.0515e-04, -7.7200e-05,  4.6743e-05, -1.3388e-04,
         -2.3275e-04, -1.9801e-05, -1.4766e-04,  7.8619e-04]], device='cuda:0')
tensor([[-3.6253e-04, -1.4167e-05,  1.4337e-04, -1.6674e-04, -2.2630e-05,
         -1.91

KeyboardInterrupt: 

In [None]:
for round_idx in range(rounds):
    current_masks = prune_by_percent(model, current_masks, p_per_round)
    pruned_weights = [w * m for w, m in zip(original_weights, current_masks)]
    model.weights = nn.ParameterList([nn.Parameter(w) for w in pruned_weights])

    _, val_metrics = train(model, train_loader, val_loader, 1, optimizer, criterion, device=device)
    new_val_accuracy = val_metrics[0][1]
    
    if new_val_accuracy >= val_accuracy:
        print(new_val_accuracy, val_accuracy)
        val_accuracy = new_val_accuracy
        final_masks = current_masks.copy()  # This creates a shallow copy of the list

tensor([[ 0.8426,  1.9154,  0.0000,  ...,  2.1935, -1.0295,  1.0088],
        [ 0.5979,  2.8811,  0.0433,  ...,  2.0454, -2.7001,  2.1674],
        [ 0.8537,  1.1126,  1.0813,  ...,  3.0728, -1.8117,  1.5561],
        ...,
        [ 0.2523,  0.0000,  0.0000,  ...,  1.6676, -1.3021,  2.2465],
        [ 0.5421,  2.2454,  0.0000,  ...,  2.3842, -2.3717,  2.0300],
        [ 0.0000,  0.0000,  1.5564,  ...,  0.6504, -2.7111,  1.1979]],
       device='cuda:0')
tensor([[ 0.0000,  0.0000,  2.5176,  ...,  1.0945, -3.1209,  0.8474],
        [ 0.3564,  2.2680,  1.1379,  ...,  3.0934, -1.5538,  0.8476],
        [ 0.0000,  0.1845,  1.3675,  ...,  0.4371, -3.1992,  1.4745],
        ...,
        [ 1.1140,  2.5023,  0.1738,  ...,  3.2900, -1.2739,  0.2062],
        [ 2.2015,  2.4441,  1.1387,  ...,  3.8030, -1.8432,  0.0524],
        [ 0.5233,  2.2711,  0.9985,  ...,  3.3249, -1.3538,  0.6005]],
       device='cuda:0')
tensor([[ 0.0000,  0.0000,  2.4423,  ...,  0.8116, -3.1673,  1.0022],
        [ 0.00

KeyboardInterrupt: 

In [None]:
for mask in final_masks:
    print(mask.sum())

tensor(2688., device='cuda:0')
tensor(1442., device='cuda:0')


In [None]:
pruned_weights = [w * m for w, m in zip(original_weights, final_masks)]
for idx in range(len(pruned_weights)):
    print((pruned_weights[idx] == original_weights[idx]).sum())
model.weights = nn.ParameterList([nn.Parameter(w) for w in pruned_weights])

for idx in range(len(pruned_weights)):
    print((pruned_weights[idx] == model.weights[idx]).sum())

tensor(2688, device='cuda:0')
tensor(1442, device='cuda:0')
tensor(2688, device='cuda:0')
tensor(1442, device='cuda:0')


In [None]:
train_metrics, eval_metrics, test_metrics = train(model, train_loader, test_loader, 50, optimizer, criterion, device=device)


Epoch: 1 Total_Time: 0.6059 Average_Time_per_batch: 0.0018 Train_Accuracy: 0.0954 Train_Loss: 4.0034 Validation_Accuracy: 0.0975 Validation_Loss: 3.9739
Epoch: 2 Total_Time: 0.5811 Average_Time_per_batch: 0.0017 Train_Accuracy: 0.0954 Train_Loss: 4.0034 Validation_Accuracy: 0.0975 Validation_Loss: 3.9739
Epoch: 3 Total_Time: 0.5969 Average_Time_per_batch: 0.0017 Train_Accuracy: 0.0954 Train_Loss: 4.0034 Validation_Accuracy: 0.0975 Validation_Loss: 3.9739
Epoch: 4 Total_Time: 0.6052 Average_Time_per_batch: 0.0018 Train_Accuracy: 0.0954 Train_Loss: 4.0034 Validation_Accuracy: 0.0975 Validation_Loss: 3.9739
Epoch: 5 Total_Time: 0.6241 Average_Time_per_batch: 0.0018 Train_Accuracy: 0.0954 Train_Loss: 4.0034 Validation_Accuracy: 0.0975 Validation_Loss: 3.9739
Epoch: 6 Total_Time: 0.5777 Average_Time_per_batch: 0.0017 Train_Accuracy: 0.0954 Train_Loss: 4.0034 Validation_Accuracy: 0.0975 Validation_Loss: 3.9739
Epoch: 7 Total_Time: 0.6053 Average_Time_per_batch: 0.0018 Train_Accuracy: 0.0954

KeyboardInterrupt: 