In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [2]:
np.random.seed(42)  # Set the seed for NumPy

# Set the seed for PyTorch (CPU and GPU)
torch.manual_seed(42)

# If you're using CUDA (GPU), set the seed for CUDA as well
torch.cuda.manual_seed(42)
torch.cuda.manual_seed_all(42)  # For all GPUs (if you have more than one)

In [3]:
# 1. Download the dataset (OpenML: id 180)
dataset = openml.datasets.get_dataset(180)
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute, dataset_format='dataframe')

In [4]:
# 2. Preprocessing: Numeric features only, standardize, encode labels (classes are 1-7)
X_numeric = X.select_dtypes(include=[np.number])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric.values.astype(np.float32))

le = LabelEncoder()
y_encoded = le.fit_transform(y)  # Converts to 0...6
y_encoded = y_encoded.astype(np.int64)

In [5]:
# 3. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

In [6]:
# 4. PyTorch Dataset
class CovertypeDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = CovertypeDataset(X_train, y_train)
test_ds = CovertypeDataset(X_test, y_test)

train_loader = DataLoader(train_ds, batch_size=256, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=256)

In [7]:
from dpn_4.dpn import DPN
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DPN(X_train.shape[1], 192 + len(le.classes_), len(le.classes_), False).to(device)
model.compile()

In [8]:
# 6. Training setup
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [9]:
from utils import train

model = model
train_loader = train_loader
val_loader = test_loader
optimizer = optimizer
criterion = criterion
device = device

percent = 0.99
rounds = 100

In [10]:
original_weights = [param.clone().detach() for param in model.weights]
final_masks = [torch.ones_like(param) for param in original_weights]
current_masks = [torch.ones_like(param) for param in original_weights]
p_per_round = 1 - (1 - percent) ** (1 / rounds)

In [11]:
def prune_by_percent_once(percent, mask, final_weight):
    # Get the absolute values of weights where mask == 1
    masked_weights = final_weight[mask == 1].abs()

    # Sort the unmasked weights
    sorted_weights, _ = torch.sort(masked_weights)
    if sorted_weights.shape[0] != 0:
        # Determine the cutoff index for pruning
        cutoff_index = int(round(percent * sorted_weights.shape[0]))
        cutoff = sorted_weights[cutoff_index]

        # Prune all weights below or equal to the cutoff
        if torch.rand(1).item() < 0.3:
            new_mask = torch.where(final_weight.abs() <= cutoff, torch.zeros_like(mask), mask)
        else:
            for i in range(final_weight.shape[0]):
                tensor = final_weight[i]
                last_nonzero_index = torch.nonzero(tensor)
                if len(last_nonzero_index) > 0:
                    last_nonzero_index = last_nonzero_index[-1].item()
                    if final_weight[i, last_nonzero_index].abs() <= cutoff:
                        mask[i, last_nonzero_index] = 0
                    
            new_mask = mask
    else:
        new_mask = mask

    return new_mask

In [12]:
def prune_by_percent(model, masks, percent):

    blocks = model.weights
    for i in range(len(blocks)):
        masks[i] = prune_by_percent_once(percent, masks[i], blocks[i])

    return masks

In [13]:
_, val_metrics = train(model, train_loader, val_loader, 5, optimizer, criterion, device=device)
val_accuracy = val_metrics[-1][1]


Epoch: 1 Total_Time: 19.6748 Average_Time_per_batch: 0.0570 Train_Accuracy: 0.6576 Train_Loss: 0.9413 Validation_Accuracy: 0.6929 Validation_Loss: 0.8268
Epoch: 2 Total_Time: 19.3147 Average_Time_per_batch: 0.0560 Train_Accuracy: 0.6980 Train_Loss: 0.8217 Validation_Accuracy: 0.7055 Validation_Loss: 0.7944
Epoch: 3 Total_Time: 19.5558 Average_Time_per_batch: 0.0567 Train_Accuracy: 0.7119 Train_Loss: 0.7927 Validation_Accuracy: 0.7196 Validation_Loss: 0.7718
Epoch: 4 Total_Time: 19.4949 Average_Time_per_batch: 0.0565 Train_Accuracy: 0.7235 Train_Loss: 0.7711 Validation_Accuracy: 0.7289 Validation_Loss: 0.7562
Epoch: 5 Total_Time: 19.5867 Average_Time_per_batch: 0.0568 Train_Accuracy: 0.7329 Train_Loss: 0.7529 Validation_Accuracy: 0.7368 Validation_Loss: 0.7401Peak GPU memory: 18.96 MB


In [14]:
for round_idx in range(rounds):
    current_masks = prune_by_percent(model, current_masks, p_per_round)
    pruned_weights = [w * m for w, m in zip(original_weights, current_masks)]
    model.weights = nn.ParameterList([nn.Parameter(w) for w in pruned_weights])
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    _, val_metrics = train(model, train_loader, val_loader, 5, optimizer, criterion, device=device)
    new_val_accuracy = val_metrics[-1][1]
    
    if new_val_accuracy >= val_accuracy:
        #val_accuracy = new_val_accuracy
        final_masks = current_masks.copy()  # This creates a shallow copy of the list
    else:
        break


Epoch: 1 Total_Time: 19.4535 Average_Time_per_batch: 0.0564 Train_Accuracy: 0.6597 Train_Loss: 0.9353 Validation_Accuracy: 0.6956 Validation_Loss: 0.8237
Epoch: 2 Total_Time: 19.3526 Average_Time_per_batch: 0.0561 Train_Accuracy: 0.6991 Train_Loss: 0.8177 Validation_Accuracy: 0.7098 Validation_Loss: 0.7892
Epoch: 3 Total_Time: 19.5142 Average_Time_per_batch: 0.0566 Train_Accuracy: 0.7143 Train_Loss: 0.7873 Validation_Accuracy: 0.7245 Validation_Loss: 0.7653
Epoch: 4 Total_Time: 19.6243 Average_Time_per_batch: 0.0569 Train_Accuracy: 0.7245 Train_Loss: 0.7656 Validation_Accuracy: 0.7312 Validation_Loss: 0.7478
Epoch: 5 Total_Time: 19.3278 Average_Time_per_batch: 0.0560 Train_Accuracy: 0.7349 Train_Loss: 0.7483 Validation_Accuracy: 0.7383 Validation_Loss: 0.7356Peak GPU memory: 18.96 MB

Epoch: 1 Total_Time: 19.5103 Average_Time_per_batch: 0.0566 Train_Accuracy: 0.6620 Train_Loss: 0.9292 Validation_Accuracy: 0.6963 Validation_Loss: 0.8222
Epoch: 2 Total_Time: 19.4945 Average_Time_per_bat

In [15]:
pruned_weights = [w * m for w, m in zip(original_weights, final_masks)]

In [16]:
for weight in pruned_weights:
    print(torch.nonzero(weight))

tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64)
tensor([], device='cuda:0', size=(0, 2),

In [17]:
merged_weights = []
size_limit = pruned_weights[0].shape[1]
start_idx = 0

In [18]:
zero_indices = [i + size_limit for i, t in enumerate(pruned_weights) if torch.all(t == 0)]
print(zero_indices)


while len(zero_indices) > 0:
    pruned_weights = [t for t in pruned_weights if not torch.all(t == 0)]

    for i in range(len(pruned_weights)):
        tensor = pruned_weights[i]
        columns_to_keep = [i for i in range(tensor.shape[1]) if i not in zero_indices]
        pruned_weights[i] = tensor[:, columns_to_keep]

    zero_indices = [i + size_limit for i, t in enumerate(pruned_weights) if torch.all(t == 0)]
    print(zero_indices)


[14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 52, 56, 57, 58, 59, 60, 61, 62, 64, 66, 67, 71, 75, 81, 90, 97]
[14, 18, 21, 26, 33]
[32]
[]


In [19]:
for weight in pruned_weights:
    print(torch.nonzero(weight))

tensor([[0, 2],
        [0, 3]], device='cuda:0')
tensor([[0, 5]], device='cuda:0')
tensor([[ 0,  0],
        [ 0, 11],
        [ 0, 13]], device='cuda:0')
tensor([[0, 0],
        [0, 5]], device='cuda:0')
tensor([[ 0,  2],
        [ 0, 12]], device='cuda:0')
tensor([[0, 8]], device='cuda:0')
tensor([[0, 9]], device='cuda:0')
tensor([[ 0,  7],
        [ 0, 17]], device='cuda:0')
tensor([[ 0,  0],
        [ 0, 21]], device='cuda:0')
tensor([[ 0,  3],
        [ 0, 10],
        [ 0, 15]], device='cuda:0')
tensor([[ 0,  9],
        [ 0, 11],
        [ 0, 12],
        [ 0, 13],
        [ 0, 23]], device='cuda:0')
tensor([[0, 9]], device='cuda:0')
tensor([[0, 9]], device='cuda:0')
tensor([[ 0,  0],
        [ 0, 25]], device='cuda:0')
tensor([[ 0,  2],
        [ 0, 11],
        [ 0, 15]], device='cuda:0')
tensor([[ 0,  3],
        [ 0, 12],
        [ 0, 20],
        [ 0, 26],
        [ 0, 27]], device='cuda:0')
tensor([[ 0,  5],
        [ 0,  8],
        [ 0, 11],
        [ 0, 15],
        [ 

tensor([[ 0,  4],
        [ 0, 11],
        [ 0, 22],
        [ 0, 25],
        [ 0, 26],
        [ 0, 32],
        [ 0, 45],
        [ 0, 58],
        [ 0, 60],
        [ 0, 64],
        [ 0, 68],
        [ 0, 80],
        [ 0, 85]], device='cuda:0')
tensor([[ 0,  6],
        [ 0,  7],
        [ 0,  9],
        [ 0, 16],
        [ 0, 19],
        [ 0, 27],
        [ 0, 30],
        [ 0, 32],
        [ 0, 38],
        [ 0, 55],
        [ 0, 61],
        [ 0, 67],
        [ 0, 68],
        [ 0, 76],
        [ 0, 78],
        [ 0, 88],
        [ 0, 94]], device='cuda:0')
tensor([[ 0,  8],
        [ 0, 15],
        [ 0, 18],
        [ 0, 21],
        [ 0, 30],
        [ 0, 31],
        [ 0, 33],
        [ 0, 36],
        [ 0, 44],
        [ 0, 52],
        [ 0, 54],
        [ 0, 67],
        [ 0, 78],
        [ 0, 81],
        [ 0, 83],
        [ 0, 85],
        [ 0, 86],
        [ 0, 93],
        [ 0, 97]], device='cuda:0')
tensor([[ 0,  0],
        [ 0,  1],
        [ 0,  7],
        [ 

In [20]:
for i in range(1, len(pruned_weights)):
    nonzero_idx = torch.nonzero(pruned_weights[i])[-1][1].item()

    if nonzero_idx >= size_limit:

        merged_weights.append(torch.cat([t[:, :size_limit] for t in pruned_weights[start_idx:i]], dim=0))
        
        size_limit = pruned_weights[i].shape[1]
        start_idx = i

merged_weights.append(torch.cat([t[:, :size_limit] for t in pruned_weights[start_idx:]], dim=0))

In [21]:
for weight in merged_weights:
    print(weight.shape)

torch.Size([7, 14])
torch.Size([1, 21])
torch.Size([2, 22])
torch.Size([3, 24])
torch.Size([2, 27])
torch.Size([2, 29])
torch.Size([3, 31])
torch.Size([5, 34])
torch.Size([6, 39])
torch.Size([2, 45])
torch.Size([2, 47])
torch.Size([5, 49])
torch.Size([5, 54])
torch.Size([6, 59])
torch.Size([4, 65])
torch.Size([3, 69])
torch.Size([2, 72])
torch.Size([3, 74])
torch.Size([1, 77])
torch.Size([2, 78])
torch.Size([2, 80])
torch.Size([8, 82])
torch.Size([3, 90])
torch.Size([1, 93])
torch.Size([3, 94])
torch.Size([1, 97])
torch.Size([3, 98])
torch.Size([3, 101])
torch.Size([5, 104])
torch.Size([2, 109])
torch.Size([1, 111])
torch.Size([3, 112])
torch.Size([5, 115])
torch.Size([1, 120])
torch.Size([5, 121])
torch.Size([6, 126])
torch.Size([2, 132])
torch.Size([1, 134])
torch.Size([4, 135])
torch.Size([3, 139])
torch.Size([3, 142])
torch.Size([3, 145])
torch.Size([5, 148])
torch.Size([2, 153])


In [22]:
sizes = [w.shape[0] for w in merged_weights]
print(sum(sizes))

141


In [23]:
model = DPN(X_train.shape[1], sum(sizes), len(le.classes_), False).to(device)
model.compile()
model.weights = nn.ParameterList([nn.Parameter(w) for w in merged_weights])
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [24]:
train_metrics, eval_metrics = train(model, train_loader, test_loader, 50, optimizer, criterion, device=device)


Epoch: 1 Total_Time: 9.3002 Average_Time_per_batch: 0.0270 Train_Accuracy: 0.6571 Train_Loss: 0.9161 Validation_Accuracy: 0.6924 Validation_Loss: 0.8207
Epoch: 2 Total_Time: 9.3210 Average_Time_per_batch: 0.0270 Train_Accuracy: 0.6954 Train_Loss: 0.8289 Validation_Accuracy: 0.7066 Validation_Loss: 0.7925
Epoch: 3 Total_Time: 9.2236 Average_Time_per_batch: 0.0267 Train_Accuracy: 0.7058 Train_Loss: 0.8040 Validation_Accuracy: 0.7180 Validation_Loss: 0.7712
Epoch: 4 Total_Time: 9.3175 Average_Time_per_batch: 0.0270 Train_Accuracy: 0.7165 Train_Loss: 0.7849 Validation_Accuracy: 0.7263 Validation_Loss: 0.7555
Epoch: 5 Total_Time: 9.3111 Average_Time_per_batch: 0.0270 Train_Accuracy: 0.7244 Train_Loss: 0.7696 Validation_Accuracy: 0.7335 Validation_Loss: 0.7442
Epoch: 6 Total_Time: 9.2096 Average_Time_per_batch: 0.0267 Train_Accuracy: 0.7293 Train_Loss: 0.7574 Validation_Accuracy: 0.7378 Validation_Loss: 0.7331
Epoch: 7 Total_Time: 9.3295 Average_Time_per_batch: 0.0270 Train_Accuracy: 0.7349

In [25]:
print(sizes)

[7, 1, 2, 3, 2, 2, 3, 5, 6, 2, 2, 5, 5, 6, 4, 3, 2, 3, 1, 2, 2, 8, 3, 1, 3, 1, 3, 3, 5, 2, 1, 3, 5, 1, 5, 6, 2, 1, 4, 3, 3, 3, 5, 2]
