# Notebook to achieve 10 and 20% label noise on the CIFAR-10 dataset

In [2]:
# Let's import the necessary libraries
import torch
import torch.nn.functional as F
import torchvision
import torch.nn as nn
from torch import optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import wandb
from torch.utils.data import TensorDataset, Subset, DataLoader, Dataset
from torchvision import datasets, transforms

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
use_adam_op = True
augmented = True
use_label_noise = True
pytorch_default_resnet = False
model_width = 64
num_classes = 10
label_noise_10 = 0.10
label_noise_15 = 0.15
label_noise_20 = 0.20
batch_size = 128
lr = 0.0001
epochs = 2000
model_seed = 42
data_seed = 42

In [4]:
# Let's import the CIFAR10 dataset from torchvision
transform = transforms.Compose([transforms.ToTensor()]) if not augmented else transforms.Compose([transforms.ToTensor(), transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

if data_seed is not None:
    torch.manual_seed(data_seed)

train_set = datasets.CIFAR10(root='./data',
                             train=True,
                             download=True,
                             transform=transform)
trainloader = DataLoader(train_set,
                         batch_size=batch_size,
                         shuffle=True
                         )

test_set = datasets.CIFAR10(root='./data',
                            train=False,
                            download=True,
                            transform=transform)
testloader = DataLoader(test_set,
                        shuffle=False,
                        batch_size=batch_size
                        )

Files already downloaded and verified
Files already downloaded and verified


In [153]:
num_noise_10 = int(label_noise_10 * len(trainloader.dataset.data))
num_noise_20 = int(label_noise_20 * len(trainloader.dataset.data))
true_targets = trainloader.dataset.targets

labels = torch.from_numpy(np.array(true_targets)).clone()
np.random.seed(data_seed)
corrupt_indices_10 = np.random.permutation(len(true_targets))[:num_noise_10]
np.random.seed(data_seed)
corrupt_indices_20 = np.random.permutation(len(true_targets))[:num_noise_20]
check_labels_10 = labels[corrupt_indices_10].tolist() # I will use this to cross-chec
check_labels_20 = labels[corrupt_indices_20].tolist() # I will use this to cross-chec

In [164]:
corrupt_indices_20

array([33553,  9427,   199, ..., 18707, 15200,  5857])

In [154]:
# Let's define the function to make the derangement
import random
def D(l):
 o = l[:]
 while any(x==y for x,y in zip(o,l)):
   random.shuffle(o)
 return o

In [155]:
len(check_labels_10), len(check_labels_20)

(5000, 10000)

In [156]:
deranged_10 = []
starts_10 = list(range(0, 5000, 100))
for i in starts_10:
    print(f'Starting from {i}...')
    deranged_10.extend(D(check_labels_10[i:i+100]))
    print('Done!')
print('Finished derangement for the specified noise level!')

Starting from 0...
Done!
Starting from 100...
Done!
Starting from 200...
Done!
Starting from 300...
Done!
Starting from 400...
Done!
Starting from 500...
Done!
Starting from 600...
Done!
Starting from 700...
Done!
Starting from 800...
Done!
Starting from 900...
Done!
Starting from 1000...
Done!
Starting from 1100...
Done!
Starting from 1200...
Done!
Starting from 1300...
Done!
Starting from 1400...
Done!
Starting from 1500...
Done!
Starting from 1600...
Done!
Starting from 1700...
Done!
Starting from 1800...
Done!
Starting from 1900...
Done!
Starting from 2000...
Done!
Starting from 2100...
Done!
Starting from 2200...
Done!
Starting from 2300...
Done!
Starting from 2400...
Done!
Starting from 2500...
Done!
Starting from 2600...
Done!
Starting from 2700...
Done!
Starting from 2800...
Done!
Starting from 2900...
Done!
Starting from 3000...
Done!
Starting from 3100...
Done!
Starting from 3200...
Done!
Starting from 3300...
Done!
Starting from 3400...
Done!
Starting from 3500...
Done!
Star

In [157]:
np.where(np.array(deranged_10) == np.array(check_labels_10))

(array([], dtype=int64),)

In [158]:
deranged_20 = []
starts_20 = list(range(0, 10000, 100))
for i in starts_20:
    print(f'Starting from {i}...')
    deranged_20.extend(D(check_labels_20[i:i+100]))
    print('Done!')
print('Finished derangement for the specified noise level!')

Starting from 0...
Done!
Starting from 100...
Done!
Starting from 200...
Done!
Starting from 300...
Done!
Starting from 400...
Done!
Starting from 500...
Done!
Starting from 600...
Done!
Starting from 700...
Done!
Starting from 800...
Done!
Starting from 900...
Done!
Starting from 1000...
Done!
Starting from 1100...
Done!
Starting from 1200...
Done!
Starting from 1300...
Done!
Starting from 1400...
Done!
Starting from 1500...
Done!
Starting from 1600...
Done!
Starting from 1700...
Done!
Starting from 1800...
Done!
Starting from 1900...
Done!
Starting from 2000...
Done!
Starting from 2100...
Done!
Starting from 2200...
Done!
Starting from 2300...
Done!
Starting from 2400...
Done!
Starting from 2500...
Done!
Starting from 2600...
Done!
Starting from 2700...
Done!
Starting from 2800...
Done!
Starting from 2900...
Done!
Starting from 3000...
Done!
Starting from 3100...
Done!
Starting from 3200...
Done!
Starting from 3300...
Done!
Starting from 3400...
Done!
Starting from 3500...
Done!
Star

In [159]:
print(np.where(np.array(deranged_20) == np.array(check_labels_20)))

(array([], dtype=int64),)


In [160]:
deranged_20

[5,
 5,
 5,
 4,
 0,
 0,
 0,
 1,
 4,
 3,
 6,
 8,
 9,
 3,
 2,
 7,
 0,
 8,
 7,
 4,
 6,
 9,
 4,
 9,
 0,
 8,
 8,
 4,
 8,
 3,
 5,
 8,
 8,
 5,
 1,
 3,
 2,
 0,
 6,
 7,
 8,
 5,
 6,
 4,
 5,
 7,
 2,
 1,
 0,
 0,
 3,
 8,
 1,
 3,
 6,
 6,
 5,
 6,
 3,
 8,
 5,
 7,
 1,
 9,
 7,
 2,
 0,
 2,
 9,
 4,
 1,
 4,
 8,
 0,
 0,
 5,
 1,
 7,
 2,
 9,
 1,
 9,
 6,
 4,
 7,
 2,
 4,
 9,
 8,
 7,
 6,
 0,
 4,
 5,
 8,
 0,
 0,
 0,
 1,
 5,
 0,
 3,
 7,
 8,
 2,
 1,
 1,
 1,
 5,
 4,
 7,
 1,
 7,
 1,
 6,
 4,
 2,
 1,
 1,
 5,
 0,
 7,
 5,
 9,
 4,
 3,
 4,
 1,
 1,
 8,
 5,
 9,
 8,
 6,
 2,
 9,
 3,
 6,
 2,
 3,
 8,
 8,
 6,
 3,
 9,
 0,
 7,
 4,
 7,
 2,
 1,
 8,
 4,
 3,
 5,
 2,
 6,
 2,
 6,
 6,
 6,
 9,
 6,
 8,
 4,
 1,
 5,
 8,
 8,
 7,
 1,
 4,
 4,
 2,
 8,
 7,
 3,
 1,
 7,
 5,
 3,
 5,
 9,
 3,
 3,
 5,
 5,
 0,
 6,
 3,
 3,
 4,
 5,
 2,
 2,
 8,
 1,
 1,
 4,
 4,
 9,
 4,
 4,
 4,
 4,
 3,
 2,
 2,
 2,
 4,
 1,
 0,
 9,
 2,
 0,
 5,
 9,
 8,
 3,
 7,
 4,
 9,
 7,
 8,
 6,
 5,
 6,
 1,
 3,
 1,
 7,
 1,
 3,
 7,
 9,
 9,
 1,
 1,
 7,
 1,
 6,
 5,
 5,
 2,
 6,
 9,
 9,
 2,
 9,
 4,


In [33]:
import pandas as pd

In [161]:
d_10 = deranged_10
d_20 = deranged_20
d_10 = pd.DataFrame(d_10)
d_20 = pd.DataFrame(d_20)
d_10 = d_10.to_csv('label_noise/ten_percent_label_noise.csv')
d_20 = d_20.to_csv('label_noise/twenty_percent_label_noise.csv')

In [147]:
pd.DataFrame(np.array(deranged_10))

Unnamed: 0,0
0,6
1,7
2,4
3,8
4,5
...,...
4995,6
4996,2
4997,0
4998,3


In [115]:
num_epochs = 2000
logspace_intervals = np.logspace(10, np.log(num_epochs), num=num_epochs, endpoint=True, base=10, dtype=int)
linear_interval = int(num_epochs / 10)

In [116]:
linear_interval

200

In [91]:
resnet_pytorch = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=False)

Using cache found in /Users/sienkadounia/.cache/torch/hub/pytorch_vision_v0.10.0


In [45]:
resnet_pytorch.fc = nn.Linear(512, num_classes)

In [46]:
criterion = nn.CrossEntropyLoss(reduction='mean')
model = resnet_pytorch
optimizer = optim.SGD(model.parameters(), lr, momentum=0.9, weight_decay=5e-10) if not use_adam_op else optim.Adam(model.parameters(), lr)

In [61]:
def save_checkpoint(model, optimizer, epoch, checkpoint_dir):
    if epoch in logspace_intervals or epoch % linear_interval == 0:
            # Save checkpoint
            checkpoint_path = checkpoint_dir + f'checkpoint_epoch_{epoch}.pt'
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, checkpoint_path)
            print(f"Checkpoint saved at epoch {epoch}")

In [101]:
np.logspace(10, 1000)

array([1.00000000e+010, 1.59985872e+030, 2.55954792e+050, 4.09491506e+070,
       6.55128557e+090, 1.04811313e+111, 1.67683294e+131, 2.68269580e+151,
       4.29193426e+171, 6.86648845e+191, 1.09854114e+212, 1.75751062e+232,
       2.81176870e+252, 4.49843267e+272, 7.19685673e+292,             inf,
                   inf,             inf,             inf,             inf,
                   inf,             inf,             inf,             inf,
                   inf,             inf,             inf,             inf,
                   inf,             inf,             inf,             inf,
                   inf,             inf,             inf,             inf,
                   inf,             inf,             inf,             inf,
                   inf,             inf,             inf,             inf,
                   inf,             inf,             inf,             inf,
                   inf,             inf])

In [117]:
num_epochs = 2000
for epoch in range(num_epochs):
    if epoch <= 10 or epoch == num_epochs or epoch % int(np.log10(num_epochs)) == 0:
        print(epoch)
    #save_checkpoint(model, optimizer, epoch, checkpoint_dir= 'label_noise/')

0
1
2
3
4
5
6
7
8
9
10
12
15
18
21
24
27
30
33
36
39
42
45
48
51
54
57
60
63
66
69
72
75
78
81
84
87
90
93
96
99
102
105
108
111
114
117
120
123
126
129
132
135
138
141
144
147
150
153
156
159
162
165
168
171
174
177
180
183
186
189
192
195
198
201
204
207
210
213
216
219
222
225
228
231
234
237
240
243
246
249
252
255
258
261
264
267
270
273
276
279
282
285
288
291
294
297
300
303
306
309
312
315
318
321
324
327
330
333
336
339
342
345
348
351
354
357
360
363
366
369
372
375
378
381
384
387
390
393
396
399
402
405
408
411
414
417
420
423
426
429
432
435
438
441
444
447
450
453
456
459
462
465
468
471
474
477
480
483
486
489
492
495
498
501
504
507
510
513
516
519
522
525
528
531
534
537
540
543
546
549
552
555
558
561
564
567
570
573
576
579
582
585
588
591
594
597
600
603
606
609
612
615
618
621
624
627
630
633
636
639
642
645
648
651
654
657
660
663
666
669
672
675
678
681
684
687
690
693
696
699
702
705
708
711
714
717
720
723
726
729
732
735
738
741
744
747
750
753
756
759
762
765

In [111]:
num_epochs = 2000
for epoch in range(num_epochs):
    if epoch in logspace_intervals or epoch % linear_interval == 0:
        print(epoch)

0
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281


In [162]:
to_save = list(range(0, 135)) + list(range(200, 2000, 100)) + [1999]
len(to_save)

154

In [163]:
for i in range(10):
    if i in to_save:
        print(i)

0
1
2
3
4
5
6
7
8
9


In [128]:
200*100

20000

In [137]:
a = np.array(list(range(0, 10)))
b = np.array(list(range(0, 10)))
a == b

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [138]:
np.where(a==b)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),)

In [140]:
list(range(10, 25, 5))

[10, 15, 20]