#  Noisy data
In this notebook, we will explore how data quality can affect predictions. We will simulate examples where noise is introduced in the test set, the training set, or both, and examine
the effects on model performance.   

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from timeit import default_timer as timer 

# Import modular
%cd ..
import modular.samples_setup as cs
from modular import engine
from modular import model_builder
from modular import extra_functions as ef

/u/ruizsuar/InformedML-CV


#### 1. Data simulation

Let's start by simulating some data. We will simulate a data set containing 5000 samples each of
squares and circles. We will save 20% of the data for test and we will use the rest for
train the models

In [2]:
### Simulate data 
seed = 262
n_samples = [5000]*2

output = cs.generate_sample(n = n_samples, noise_prop = 0,var=0,seed=seed)
images, labels= (output['images'], output['labels'])

# Split test and train
n_test = int(sum(n_samples)*0.2)
test_index = np.arange(n_test)
train_index = np.arange(n_test, sum(n_samples))

images_test = images[test_index]
images_train = images[train_index]

label_test = labels[test_index]
label_train = labels[train_index]

#### 2. Scenario with no noise
Let's start by training the model in a scenario where both the training and the test sets are noise-free, ensuring all samples are perfectly clear


In [4]:
# To reduce variability when re-running 
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

BATCH_SIZE = 50
EPOCHS = 12
loss_fn = nn.CrossEntropyLoss()

# Create tensor
X_test = torch.from_numpy(images_test).type(torch.float)
X_train = torch.from_numpy(images_train).type(torch.float)

y_train = torch.from_numpy(label_train).type(torch.long)
y_test = torch.from_numpy(label_test).type(torch.long)

## Add channel at dimension 1 (greyscale)
X_train = X_train.unsqueeze(1)  
X_test = X_test.unsqueeze(1)  
        
train_dataset = torch.utils.data.TensorDataset(X_train,y_train)
test_dataset = torch.utils.data.TensorDataset(X_test,y_test)
        
# Create data loader and turn datasets into iterables (batches)
train_dataloader = DataLoader(train_dataset, 
                                batch_size=BATCH_SIZE, 
                                shuffle=False) 
                           
test_dataloader = DataLoader(test_dataset,
                                batch_size=BATCH_SIZE,
                                shuffle=False)

# Initialize model and optimizer
model_1 =  model_builder.TVGG(input_shape = 1,  
                            hidden_units= 10, 
                            output_shape = 2)
optimizer = torch.optim.SGD(params=model_1.parameters(), lr=0.1)

output_1 = engine.train_test_loop(model_1,train_dataloader,
                                test_dataloader, optimizer, loss_fn,
                                epochs=EPOCHS,print_b=True)



Epoch: 1 | test_ce: 11.27871 | test_acc: 70.3000
Epoch: 2 | test_ce: 12.02957 | test_acc: 67.7500
Epoch: 3 | test_ce: 9.90794 | test_acc: 75.4500
Epoch: 4 | test_ce: 5.52176 | test_acc: 86.2000
Epoch: 5 | test_ce: 1.96328 | test_acc: 89.3500
Epoch: 6 | test_ce: 1.33277 | test_acc: 92.3500
Epoch: 7 | test_ce: 1.84044 | test_acc: 92.3500
Epoch: 8 | test_ce: 1.42633 | test_acc: 94.0500
Epoch: 9 | test_ce: 1.64422 | test_acc: 93.5500
Epoch: 10 | test_ce: 1.38013 | test_acc: 95.8000
Epoch: 11 | test_ce: 0.70289 | test_acc: 95.8000
Epoch: 12 | test_ce: 0.68674 | test_acc: 95.8000


Quite good!

#### 3. Scenario with noise in the test set
What happens if out test set is entirely noisy, while the training set remains noise-free?

In [5]:
# Adding noise only to the test set
var =0.15
BATCH_SIZE = 50
EPOCHS = 12
loss_fn = nn.CrossEntropyLoss()

# Create copy of image sets

images_test_e1 = images_test.copy()
images_train_e1 = images_train.copy()

label_test = labels[test_index]
label_train = labels[train_index]

# Add noise        
for i in range(len(images_test)):
        images_test_e1[i] = ef.add_gaussian_noise(images_test_e1[i],
                                               var=var)

# Create tensor
X_test = torch.from_numpy(images_test_e1).type(torch.float)
X_train = torch.from_numpy(images_train_e1).type(torch.float)

y_test = torch.from_numpy(label_test).type(torch.long)
y_train = torch.from_numpy(label_train).type(torch.long)

## Add channel at dimension 1 (greyscale)
X_train = X_train.unsqueeze(1)  
X_test = X_test.unsqueeze(1)  
        
train_dataset = torch.utils.data.TensorDataset(X_train,y_train)
test_dataset = torch.utils.data.TensorDataset(X_test,y_test)
        
# Create data loader and turn datasets into iterables (batches)
train_dataloader = DataLoader(train_dataset, 
                                batch_size=BATCH_SIZE, 
                                shuffle=False) 
                            

test_dataloader = DataLoader(test_dataset,
                                batch_size=BATCH_SIZE,
                                shuffle=False)

# Initialize model and optimizer
model_2 =  model_builder.TVGG(input_shape = 1,  
                            hidden_units= 10, 
                            output_shape = 2)

optimizer = torch.optim.SGD(params=model_2.parameters(), lr=0.1)

output_2 = engine.train_test_loop(model_2,train_dataloader,
                                test_dataloader, optimizer, loss_fn,
                                epochs=EPOCHS,print_b=True)

Epoch: 1 | test_ce: 11.11308 | test_acc: 68.2000
Epoch: 2 | test_ce: 11.13491 | test_acc: 67.8000
Epoch: 3 | test_ce: 10.25265 | test_acc: 69.0500
Epoch: 4 | test_ce: 13.90320 | test_acc: 58.3000
Epoch: 5 | test_ce: 17.90571 | test_acc: 49.5000
Epoch: 6 | test_ce: 17.25083 | test_acc: 50.2000
Epoch: 7 | test_ce: 17.68653 | test_acc: 48.0500
Epoch: 8 | test_ce: 17.75252 | test_acc: 48.1500
Epoch: 9 | test_ce: 17.67911 | test_acc: 48.7500
Epoch: 10 | test_ce: 17.44876 | test_acc: 49.6000
Epoch: 11 | test_ce: 17.55390 | test_acc: 49.3500
Epoch: 12 | test_ce: 17.17472 | test_acc: 50.0500


It really worsens!

#### 3. Scenario with noise in the train set
What happens if our training set is entirely noisy while the test set remains noise-free?


In [6]:
# Adding noise only to the train set
var =0.15
BATCH_SIZE = 50
EPOCHS = 12
loss_fn = nn.CrossEntropyLoss()

# Create copy of image sets

images_test_e2 = images_test.copy()
images_train_e2 = images_train.copy()

label_test = labels[test_index]
label_train = labels[train_index]

# add Gaussian Noise       
for i in range(len(images_test)):
        images_train_e2[i] = ef.add_gaussian_noise(images_train_e2[i],
                                               var=var)

# Create tensor
X_test = torch.from_numpy(images_test_e2).type(torch.float)
X_train = torch.from_numpy(images_train_e2).type(torch.float)

y_test = torch.from_numpy(label_test).type(torch.long)
y_train = torch.from_numpy(label_train).type(torch.long)

## Add channel at dimension 1 (greyscale)
X_train = X_train.unsqueeze(1)  
X_test = X_test.unsqueeze(1)  
        
train_dataset = torch.utils.data.TensorDataset(X_train,y_train)
test_dataset = torch.utils.data.TensorDataset(X_test,y_test)
        
# Create data loader and turn datasets into iterables (batches)
train_dataloader = DataLoader(train_dataset, 
                                batch_size=BATCH_SIZE, 
                                shuffle=False) 
                            

test_dataloader = DataLoader(test_dataset,
                                batch_size=BATCH_SIZE,
                                shuffle=False)

# Initialize model and optimizer
model_3 =  model_builder.TVGG(input_shape = 1,  
                            hidden_units= 10, 
                            output_shape = 2)

optimizer = torch.optim.SGD(params=model_3.parameters(), lr=0.1)

output_3 = engine.train_test_loop(model_3,train_dataloader,
                                test_dataloader, optimizer, loss_fn,
                                epochs=EPOCHS,print_b=True)

Epoch: 1 | test_ce: 14.47006 | test_acc: 70.8500
Epoch: 2 | test_ce: 12.29995 | test_acc: 69.9000
Epoch: 3 | test_ce: 12.18634 | test_acc: 67.5500
Epoch: 4 | test_ce: 10.55435 | test_acc: 72.3500
Epoch: 5 | test_ce: 8.13770 | test_acc: 80.0500
Epoch: 6 | test_ce: 4.95141 | test_acc: 86.6000
Epoch: 7 | test_ce: 3.59743 | test_acc: 89.9500
Epoch: 8 | test_ce: 1.05203 | test_acc: 97.1500
Epoch: 9 | test_ce: 1.92796 | test_acc: 96.5000
Epoch: 10 | test_ce: 1.66115 | test_acc: 94.9000
Epoch: 11 | test_ce: 1.62377 | test_acc: 95.3000
Epoch: 12 | test_ce: 1.61314 | test_acc: 95.3000


This is not so bad!

#### 4. Scenario with noise in both sets
Let's see an example where we have noisy data in both sets. We will add noise to half
of the training set and half of the test set


In [7]:
var =0.15
BATCH_SIZE = 50
EPOCHS = 12
loss_fn = nn.CrossEntropyLoss()

# Create copy of image sets
images_test_e3 = images_test.copy()
images_train_e3 = images_train.copy()

# add noise to half of them 
n_test_noisy = int(len(images_test)*0.5)
n_train_noisy = int(len(images_train)*0.5)

# add Gaussian Noise       
for i in range(n_test_noisy):
        images_test_e3[i] = ef.add_gaussian_noise(images_test_e3[i],
                                               var=var)
        
for i in range(n_train_noisy):
        images_train_e3[i] = ef.add_gaussian_noise(images_train_e3[i],
                                                var=var)

# Create tensor
X_test = torch.from_numpy(images_test_e3).type(torch.float)
X_train = torch.from_numpy(images_train_e3).type(torch.float)

y_test = torch.from_numpy(label_test).type(torch.long)
y_train = torch.from_numpy(label_train).type(torch.long)

## Add channel at dimension 1 (greyscale)
X_train = X_train.unsqueeze(1)  
X_test = X_test.unsqueeze(1)  
        
train_dataset = torch.utils.data.TensorDataset(X_train,y_train)
test_dataset = torch.utils.data.TensorDataset(X_test,y_test)
        
# Create data loader and turn datasets into iterables (batches)
train_dataloader = DataLoader(train_dataset, 
                                batch_size=BATCH_SIZE, 
                                shuffle=False) 
                            

test_dataloader = DataLoader(test_dataset,
                                batch_size=BATCH_SIZE,
                                shuffle=False)


# Initialize model and optimizer
model_4 =  model_builder.TVGG(input_shape = 1,  
                            hidden_units= 10, 
                            output_shape = 2)

optimizer = torch.optim.SGD(params=model_4.parameters(), lr=0.1)

output_4 = engine.train_test_loop(model_4,train_dataloader,
                                test_dataloader, optimizer, loss_fn,
                                epochs=EPOCHS,print_b=True)                        

Epoch: 1 | test_ce: 11.62045 | test_acc: 72.0500
Epoch: 2 | test_ce: 11.37824 | test_acc: 70.5000
Epoch: 3 | test_ce: 11.25965 | test_acc: 70.4000
Epoch: 4 | test_ce: 10.05169 | test_acc: 73.6500
Epoch: 5 | test_ce: 7.65696 | test_acc: 78.9000
Epoch: 6 | test_ce: 7.22368 | test_acc: 79.1500
Epoch: 7 | test_ce: 6.56874 | test_acc: 80.6000
Epoch: 8 | test_ce: 6.34997 | test_acc: 81.6000
Epoch: 9 | test_ce: 5.90878 | test_acc: 83.1500
Epoch: 10 | test_ce: 6.39517 | test_acc: 81.5500
Epoch: 11 | test_ce: 6.20328 | test_acc: 82.1000
Epoch: 12 | test_ce: 6.41006 | test_acc: 83.4000
