In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
import torch.utils.data as torch_data
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

from train import train

import random
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

import warnings
warnings.filterwarnings("ignore")

In [2]:
class DNA_DATA(torch_data.Dataset):
    
    def __init__(self, X, y):
        super(DNA_DATA, self).__init__()
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long) 
    
    def __len__(self):
        return list(self.X.size())[0]
    
    def __getitem__(self, idx):
        return (self.X[idx], self.y[idx])

In [3]:
class TwoLayerNet(torch.nn.Module):
    def __init__(self, input_dim, n_classes):
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(input_dim, 20)
        self.linear2 = torch.nn.Linear(20, n_classes)
    
    def forward(self, x, last=False):
        l1scores = torch.nn.functional.relu(self.linear1(x))
        scores = self.linear2(l1scores)
        if last:
            return scores, l1scores
        else:
            return scores

In [4]:
def random_comparison(epochs, fullset, testset, r, input_dim, n_classes):
  random_samp_loader = DataLoader(random.sample(list(fullset), r), batch_size=20, shuffle=True)
  test_loader = DataLoader(testset, batch_size=50, shuffle=False)
  net = TwoLayerNet(input_dim, n_classes=n_classes)
  criterion = torch.nn.CrossEntropyLoss()
  optimizer = torch.optim.SGD(net.parameters(), lr=0.05)
  train(200, net, criterion, optimizer, random_samp_loader, test_loader, verbose=True)

In [7]:
#----------EXPERIMENTAL SETTING-------------------------------------
#----------DNA DATA SET---------------------------------------------
#----------GLISTER_ONLINE: STOCHASTIC VERSION, NO REGULARIZATION----

data = pd.read_csv('dna.csv')
X_train, X_test, y_train, y_test = train_test_split(data.drop('class', axis = 1), data['class'], test_size = 0.3)
y_train = y_train - 1
y_test = y_test - 1

trainset = DNA_DATA(np.array(X_train), np.array(y_train))
valset = DNA_DATA(np.array(X_train), np.array(y_train))
testset = DNA_DATA(np.array(X_test), np.array(y_test))

print("DNA: stochastic_greedy, no regularization")
from Glister_stochastic_noreg import GlisterOnline

for r in [100, 300, 500]:
  print("\n{fac}% of data set".format(fac=r/10))

  glister = GlisterOnline(
      fullset = trainset,
      valset = valset,
      testset = testset,
      device = "cpu",
      validation_set_fraction = 0.1,
      trn_batch_size = 20,
      val_batch_size = 50,
      tst_batch_size = 50,
      dss_batch_size = 50,
      model = TwoLayerNet(input_dim=180, n_classes=3),
      num_epochs = 20,
      learning_rate = 0.05,
      num_classes = 3,
      n_channels = 1,
      bud = r,
      lam = 0.1,
      r=r)

  val_acc, tst_acc, subtrn_acc, full_trn_acc,\
  val_loss, test_loss, subtrn_loss, full_trn_loss,\
  val_losses, substrn_losses, fulltrn_losses,\
  idxs, time = glister.random_greedy_train_model_online_taylor(np.arange(20))

  print("\nRandom comparison:")

  random_comparison(200, trainset, testset, r, 180, 3)

print("\nFull train:")

full_loader = DataLoader(trainset, batch_size=20, shuffle=True)
test_loader = DataLoader(testset, batch_size=50, shuffle=False)

net = TwoLayerNet(input_dim=180, n_classes=3)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.05)
train(200, net, criterion, optimizer, full_loader, test_loader, verbose=True)

#----------GLISTER_ONLINE: STOCHASTIC VERSION, REGULARIZATION----

print("\nDNA: stochastic_greedy, regularization")
from Glister_stochastic import GlisterOnline

for r in [100, 300, 500]:
  print("\n{fac}% of data set".format(fac=r/10))

  glister = GlisterOnline(
      fullset = trainset,
      valset = valset,
      testset = testset,
      device = "cpu",
      validation_set_fraction = 0.1,
      trn_batch_size = 20,
      val_batch_size = 50,
      tst_batch_size = 50,
      dss_batch_size = 50,
      model = TwoLayerNet(input_dim=180, n_classes=3),
      num_epochs = 20,
      learning_rate = 0.05,
      num_classes = 3,
      n_channels = 1,
      bud = r,
      lam = 0.1,
      r=r)

  val_acc, tst_acc, subtrn_acc, full_trn_acc,\
  val_loss, test_loss, subtrn_loss, full_trn_loss,\
  val_losses, substrn_losses, fulltrn_losses,\
  idxs, time = glister.random_greedy_train_model_online_taylor(np.arange(20))

  print("\nRandom comparison:")

  random_comparison(200, trainset, testset, r, 180, 3)

print("\nFull train:")

full_loader = DataLoader(trainset, batch_size=20, shuffle=True)
test_loader = DataLoader(testset, batch_size=50, shuffle=False)

net = TwoLayerNet(input_dim=180, n_classes=3)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.05)
train(200, net, criterion, optimizer, full_loader, test_loader, verbose=True)

DNA: stochastic_greedy, no regularization

10.0% of data set
SelectionRun---------------------------------
Final SubsetTrn and FullTrn Loss: 2.8617196679115295 66.76037114858627
Validation Loss and Accuracy: 3.497330904006958 0.7802690582959642
Test Data Loss and Accuracy: 13.518501281738281 0.7740585774058577
-----------------------------------

Random comparison:
Test Data Loss 200 , Loss : 0.00517 , and Accuracy: 0.826

30.0% of data set
SelectionRun---------------------------------
Final SubsetTrn and FullTrn Loss: 3.4522235840559006 27.191657595336437
Validation Loss and Accuracy: 1.5143404304981232 0.905829596412556
Test Data Loss and Accuracy: 6.316932663321495 0.9037656903765691
-----------------------------------

Random comparison:
Test Data Loss 200 , Loss : 0.00258 , and Accuracy: 0.902

50.0% of data set
SelectionRun---------------------------------
Final SubsetTrn and FullTrn Loss: 4.206054195761681 19.265915852040052
Validation Loss and Accuracy: 1.1757498234510422 0.905

In [5]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split


digits, targets = load_digits(return_X_y=True)
digits = digits.astype(np.float32) / 255   # scaling

digits_train, digits_test, targets_train, targets_test = train_test_split(digits, targets, random_state=0)

train_digits = DNA_DATA(digits_train, targets_train)
val_digits = DNA_DATA(digits_train, targets_train)
test_digits = DNA_DATA(digits_test, targets_test)

In [6]:
#----------EXPERIMENTAL SETTING------------------------------------
#----------DIGITS DATA SET---------------------------------------------
#----------GLISTER_ONLINE: STOCHASTIC VERSION, NO REGULARIZATION----

print("SKLEARN DIGITS: stochastic_greedy, no regularization")
from Glister_stochastic_noreg import GlisterOnline
#from GlisterImage import GlisterOnlineImage

for r in [100, 300, 500]:
  print("\n{fac}% of data set".format(fac=r/10))

  glister = GlisterOnline(
      fullset = train_digits,
      valset = val_digits,
      testset = test_digits,
      device = "cpu",
      validation_set_fraction = 0.1,
      trn_batch_size = 20,
      val_batch_size = 50,
      tst_batch_size = 50,
      dss_batch_size = 50,
      model = TwoLayerNet(input_dim=64, n_classes=10),
      num_epochs = 200,
      learning_rate = 0.05,
      num_classes = 10,
      n_channels = 1,
      bud = r,
      lam = 0.1)

  val_acc, tst_acc, subtrn_acc, full_trn_acc,\
  val_loss, test_loss, subtrn_loss, full_trn_loss,\
  val_losses, substrn_losses, fulltrn_losses,\
  idxs, time = glister.random_greedy_train_model_online_taylor(np.arange(20))

  print("\nRandom comparison:")

  random_comparison(200, train_digits, test_digits, r, 64, 10)

print("\nFull train:")

train_loader = DataLoader(train_digits, batch_size=20, shuffle=True)
test_loader = DataLoader(test_digits, batch_size=50, shuffle=False)

net = TwoLayerNet(input_dim=64, n_classes=10)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.05)
train(200, net, criterion, optimizer, train_loader, test_loader, verbose=True)

#----------GLISTER_ONLINE: STOCHASTIC VERSION, REGULARIZATION----

print("\nSKLEARN DIGITS: stochastic_greedy, regularization")
from Glister_stochastic import GlisterOnline

for r in [100, 300, 500]:
  print("\n{fac}% of data set".format(fac=r/10))

  glister = GlisterOnline(
      fullset = train_digits,
      valset = val_digits,
      testset = test_digits,
      device = "cpu",
      validation_set_fraction = 0.1,
      trn_batch_size = 20,
      val_batch_size = 50,
      tst_batch_size = 50,
      dss_batch_size = 50,
      model = TwoLayerNet(input_dim=64, n_classes=10),
      num_epochs = 200,
      learning_rate = 0.05,
      num_classes = 10,
      n_channels = 1,
      bud = r,
      lam = 0.1)

  val_acc, tst_acc, subtrn_acc, full_trn_acc,\
  val_loss, test_loss, subtrn_loss, full_trn_loss,\
  val_losses, substrn_losses, fulltrn_losses,\
  idxs, time = glister.random_greedy_train_model_online_taylor(np.arange(20))

  print("\nRandom comparison:")

  random_comparison(200, train_digits, test_digits, r, 64, 10)

print("\nFull train:")

full_loader = DataLoader(train_digits, batch_size=20, shuffle=True)
test_loader = DataLoader(test_digits, batch_size=50, shuffle=False)

net = TwoLayerNet(input_dim=64, n_classes=10)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.05)
train(200, net, criterion, optimizer, full_loader, test_loader, verbose=True)

SKLEARN DIGITS: stochastic_greedy, no regularization

10.0% of data set
SelectionRun---------------------------------
Final SubsetTrn and FullTrn Loss: 11.325555801391602 140.52311849594116
Validation Loss and Accuracy: 6.799923896789551 0.1417910447761194
Test Data Loss and Accuracy: 20.777456045150757 0.10666666666666667
-----------------------------------

Random comparison:
Test Data Loss 200 , Loss : 2.26 , and Accuracy: 0.0956

30.0% of data set
SelectionRun---------------------------------
Final SubsetTrn and FullTrn Loss: 31.620559453964233 130.3787670135498
Validation Loss and Accuracy: 6.331751585006714 0.44029850746268656
Test Data Loss and Accuracy: 19.37464427947998 0.37777777777777777
-----------------------------------

Random comparison:
Test Data Loss 200 , Loss : 2.16 , and Accuracy: 0.18

50.0% of data set
SelectionRun---------------------------------
Final SubsetTrn and FullTrn Loss: 36.83873927593231 97.02149260044098
Validation Loss and Accuracy: 4.584716796875 0.