## All imports:

In [42]:
!pip install scikit_posthocs



In [62]:
import tensorflow as tf
# TF1-compat session configured with allow_growth, so GPU memory is allocated
# on demand instead of being reserved all at once.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)
import os
import random
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import glob
import tensorflow.keras as K
from tensorflow.keras import layers
import datetime
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.layers import Conv2DTranspose, Reshape, Lambda, Conv2D, MaxPool2D, Flatten, Dense, Dropout, BatchNormalization, Input, Activation, MaxPooling2D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.applications import VGG16
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow.keras as K
from sklearn.model_selection import StratifiedKFold
import time
from sklearn.metrics import roc_auc_score, average_precision_score
from tqdm import tqdm
from numpy import argmax
import pandas as pd
from scipy.stats import friedmanchisquare
import scikit_posthocs

## Constants:

Connect to Google Drive, which contains the datasets.

In [44]:
# Mount Google Drive in the Colab runtime; the dataset paths defined below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
# Folder holding the '<name>_X.npy' / '<name>_Y.npy' dataset files read by get_datasets().
datasets_path_drive = '/content/drive/My Drive/NoisyStudentProject/Datasets/'
# Project root on Drive; the training functions write their result CSVs under '<path_drive>Results/'.
path_drive = '/content/drive/My Drive/NoisyStudentProject/'

### Load all datasets: 


In [46]:
def get_datasets():
  """
  Load every dataset stored under ``datasets_path_drive``.

  Each dataset is stored as a pair of files named ``<name>_X.npy`` and
  ``<name>_Y.npy``; the dataset name is the file name without its extension
  and without the two-character ``_X`` / ``_Y`` suffix.

  :return: A list of ``((X, y), dataset_name)`` tuples, ordered by dataset name
  """
  datasets_name = set()
  for file in glob.glob(datasets_path_drive + '*'):
    # Drop the extension, then the trailing '_X' / '_Y' marker
    datasets_name.add(os.path.splitext(os.path.basename(file))[0][:-2])

  datasets = []
  # Iterate in sorted order: plain set iteration order changes between runs,
  # which would make the order of the returned datasets non-reproducible.
  for dataset_name in sorted(datasets_name):
    data_X = datasets_path_drive + dataset_name + '_X.npy'
    data_y = datasets_path_drive + dataset_name + '_Y.npy'
    datasets.append(((np.load(data_X), np.load(data_y)), dataset_name))
  return datasets

## Classes of models:
 - BaseModelVGG16
 - ArticleModel
 - improvedArticleModel

Each model contains four methods: 
  build_model - create the model
  
  train - compile the model and fit it
  
  eval_acc - calculate the accuracy of the model
  
  eval - calculate all metrics:

    A. Accuracy – Under the assumption that the classification is the Class with the highest
    probability.
    B. TPR
    C. FPR
    D. Precision
    E. AUC – Area Under the ROC Curve
    F. Area under the Precision-Recall
    G. Training time
    H. Inference time for 1000 instances.


### *BaseModelVGG16:*

In [47]:
class BaseModelVGG16:
  """
  Baseline classifier: a VGG16 backbone (without its top) followed by a softmax layer.

  Labels are used as sparse integer class ids: the model is compiled with
  sparse categorical cross-entropy and evaluated with sparse accuracy.
  """

  def __init__(self, data, classes, batch_size, learning_rate, pooling, weight):
    """
    :param data: ``((x_train, y_train), (x_test, y_test))``
    :param classes: number of output classes (units of the softmax layer)
    :param batch_size: batch size passed to ``fit``
    :param learning_rate: learning rate of the Adam optimizer
    :param pooling: ``pooling`` argument forwarded to ``VGG16``
    :param weight: ``weights`` argument forwarded to ``VGG16``
    """
    (self.x_train, self.y_train), (self.x_test, self.y_test) = data
    self.input_shape = self.x_train.shape[1:]
    self.weight = weight
    self.pooling = pooling
    self.bs = batch_size
    self.lr = learning_rate
    self.classes = classes
    self.model = self.build_model()

  def build_model(self):
    """Create the (uncompiled) VGG16 + softmax model."""
    base_model = VGG16(include_top=False, weights=self.weight, pooling=self.pooling, input_shape=self.input_shape)
    model = Sequential()
    model.add(base_model)
    model.add(Dense(self.classes, activation='softmax'))
    return model

  def train(self, number_of_epochs):
    """
    Compile the model and fit it on the training data.

    :param number_of_epochs: number of training epochs
    :return: the Keras history dictionary of the fit
    """
    self.model.compile(loss="sparse_categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=self.lr), metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
    hist = self.model.fit(self.x_train, self.y_train, batch_size=self.bs, epochs=number_of_epochs, verbose=1)
    return hist.history

  def _predict_proba(self):
    """Predict on the test set and return the scores as an (n_samples, classes) array."""
    prediction = self.model.predict(self.x_test)
    return np.reshape(prediction, (self.y_test.shape[0], self.classes))

  @staticmethod
  def _confusion_totals(y_true, pred_labels):
    """
    Sum one-vs-rest confusion counts over every label that occurs in ``y_true``.

    :return: ``(tp, fp, tn, fn)`` totals as Python ints
    """
    y_true = np.asarray(y_true).reshape(-1)
    pred_labels = np.asarray(pred_labels).reshape(-1)
    tp = fp = tn = fn = 0
    # Only labels present in y_true are iterated, as in the per-sample loop
    # this replaces; predictions of an unseen label only count as negatives.
    for label in np.unique(y_true):
      is_true = y_true == label
      is_pred = pred_labels == label
      tp += int(np.sum(is_true & is_pred))
      fp += int(np.sum(~is_true & is_pred))
      tn += int(np.sum(~is_true & ~is_pred))
      fn += int(np.sum(is_true & ~is_pred))
    return tp, fp, tn, fn

  @staticmethod
  def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0

  def eval_acc(self):
    """
    Calculate the accuracy of the model on the test set.

    :return: the sparse categorical accuracy
    """
    soft = self._predict_proba()

    # A. Accuracy
    acc_eval = tf.keras.metrics.SparseCategoricalAccuracy()
    acc_eval.update_state(self.y_test, soft)
    return acc_eval.result().numpy()

  def eval(self):
    '''
    Evaluate and calculate all metrics according to the assignment requirements
    :return: A dictionary contains all metrices
    '''
    results_dict = {}
    soft = self._predict_proba()
    y_true = self.y_test.reshape((self.y_test.shape[0],))

    # A. Accuracy
    acc_eval = tf.keras.metrics.SparseCategoricalAccuracy()
    acc_eval.update_state(self.y_test, soft)
    acc = acc_eval.result().numpy()
    print(acc)
    results_dict['acc'] = acc

    # B. TPR
    pred_labels = soft.argmax(axis=1)
    total_tp, total_fp, total_tn, total_fn = self._confusion_totals(y_true, pred_labels)
    results_dict['TPR'] = self._safe_ratio(total_tp, total_tp + total_fn)

    # C. FPR
    results_dict['FPR'] = self._safe_ratio(total_fp, total_tn + total_fp)

    # D. Precision
    # (the key spelling is kept as-is: it becomes a column name of the result CSVs)
    results_dict['Presicion'] = self._safe_ratio(total_tp, total_tp + total_fp)

    # E. AUC – Area Under the ROC Curve
    y_pred = soft
    results_dict['AUC'] = roc_auc_score(y_true, y_pred, average='macro', multi_class='ovr')
    y_oh = tf.keras.utils.to_categorical(y_true)

    # F. Area under the Precision-Recall
    results_dict['Area under PR'] = average_precision_score(y_oh, y_pred, average='macro')

    # H. Inference time for 1000 instances
    # (slicing yields the whole test set when it holds fewer than 1000 samples)
    inf_data = self.x_test[:1000]
    start = time.time()
    self.model.predict(inf_data)
    end = time.time()
    results_dict['Inferece time'] = end - start
    return results_dict


### *ArticleModel:*

In [48]:
class articleModel:
  """
  Teacher-student model: an EfficientNetB3-based teacher is trained on the
  labeled data, pseudo-labels the unlabeled data, and a student with the same
  architecture is then trained on the augmented, pseudo-labeled set.

  Training labels are one-hot (the models are compiled with categorical
  cross-entropy); the test labels may be sparse class ids or one-hot.
  """

  def __init__(self, data, X_unlabeled, classes, batch_size, learning_rate):
    """
    :param data: ``((x_train, y_train), (x_test, y_test))``
    :param X_unlabeled: unlabeled samples handed to ``pseudo_labelling``
    :param classes: number of output classes
    :param batch_size: batch size used for the teacher and the student
    :param learning_rate: learning rate of the Adam optimizers
    """
    (self.x_train, self.y_train), (self.x_test, self.y_test) = data
    self.X_unlabeled = X_unlabeled
    self.input_shape = self.x_train.shape[1:]
    self.bs = batch_size
    self.lr = learning_rate
    self.classes = classes
    self.teacher = self.build_model()
    self.student = self.build_model()
    # Set by train(); eval()/eval_acc() need a trained model
    self.model = None

  def build_model(self):
    """Create one (uncompiled) EfficientNetB3 network with a dense softmax head."""
    base_model = K.applications.EfficientNetB3(include_top=False, weights='imagenet', drop_connect_rate=0.4)
    # Resizes inputs to the height/width of the training images
    resize = K.Sequential([
      K.layers.experimental.preprocessing.Resizing(self.input_shape[0], self.input_shape[1])
    ])
    model = K.Sequential()
    model.add(resize)
    model.add(base_model)
    model.add(K.layers.Flatten())
    model.add(K.layers.Dense(512, activation=('relu')))
    model.add(K.layers.Dropout(0.2))
    model.add(K.layers.Dense(256, activation=('relu')))
    model.add(K.layers.Dropout(0.2))
    model.add(K.layers.Dense(self.classes, activation=('softmax')))
    return model

  def train(self, number_of_epochs):
    """
    Train the teacher, pseudo-label the unlabeled data with it and train the
    student on the result. The student becomes ``self.model``.

    :param number_of_epochs: number of epochs for the teacher and for the student
    """
    self.teacher.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.lr), loss='categorical_crossentropy', metrics=['accuracy'])
    self.student.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.lr), loss='categorical_crossentropy', metrics=['accuracy'])
    self.teacher.fit(x=self.x_train, y=self.y_train,
        batch_size=self.bs,
        epochs=number_of_epochs, shuffle=True,
        verbose=1
        )
    # 'threhold' (sic) is the keyword name used by pseudo_labelling's signature
    x_train_student, y_train_student = pseudo_labelling(self.teacher, self.x_train, self.y_train, self.X_unlabeled, self.classes, threhold=0.0001)
    batch_size = self.bs
    steps_per_epoch = x_train_student.shape[0] // batch_size
    # batch_size is not passed to fit(): the generator already yields batches,
    # and Keras documents that batch_size must not be given for generator input.
    self.student.fit(data_generator(x_train_student, y_train_student, batch_size, data_aug = True),
                     epochs=number_of_epochs,
                     steps_per_epoch=steps_per_epoch,
                     verbose=1
                     )
    self.model = self.student

  def _predict_proba(self):
    """Predict on the test set and return the scores as an (n_samples, classes) array."""
    prediction = self.model.predict(self.x_test)
    return np.reshape(prediction, (self.y_test.shape[0], self.classes))

  def _test_labels(self):
    """Return the test labels as a 1-D array of class ids (accepts sparse or one-hot labels)."""
    y = np.asarray(self.y_test)
    if y.ndim == 2 and y.shape[1] > 1:
      return y.argmax(axis=1)
    return y.reshape((y.shape[0],))

  @staticmethod
  def _confusion_totals(y_true, pred_labels):
    """
    Sum one-vs-rest confusion counts over every label that occurs in ``y_true``.

    :return: ``(tp, fp, tn, fn)`` totals as Python ints
    """
    y_true = np.asarray(y_true).reshape(-1)
    pred_labels = np.asarray(pred_labels).reshape(-1)
    tp = fp = tn = fn = 0
    for label in np.unique(y_true):
      is_true = y_true == label
      is_pred = pred_labels == label
      tp += int(np.sum(is_true & is_pred))
      fp += int(np.sum(~is_true & is_pred))
      tn += int(np.sum(~is_true & ~is_pred))
      fn += int(np.sum(is_true & ~is_pred))
    return tp, fp, tn, fn

  @staticmethod
  def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0

  def _accuracy(self, y_true, soft):
    """Accuracy of the arg-max prediction against 1-D class-id labels."""
    # Sparse accuracy is required here: CategoricalAccuracy takes the arg-max
    # of y_true, which is always 0 for (n, 1) class-id labels and therefore
    # only measures how often class 0 is predicted.
    acc_eval = tf.keras.metrics.SparseCategoricalAccuracy()
    acc_eval.update_state(y_true, soft)
    return acc_eval.result().numpy()

  def eval_acc(self):
    """
    Calculate the accuracy of the trained model on the test set.

    :return: the accuracy
    """
    # A. Accuracy
    return self._accuracy(self._test_labels(), self._predict_proba())

  def eval(self):
    '''
    Evaluate and calculate all metrics according to the assignment requirements
    :return: A dictionary contains all metrices
    '''
    results_dict = {}
    soft = self._predict_proba()
    y_true = self._test_labels()

    # A. Accuracy
    acc = self._accuracy(y_true, soft)
    print(acc)
    results_dict['acc'] = acc

    # B. TPR
    pred_labels = soft.argmax(axis=1)
    total_tp, total_fp, total_tn, total_fn = self._confusion_totals(y_true, pred_labels)
    results_dict['TPR'] = self._safe_ratio(total_tp, total_tp + total_fn)

    # C. FPR
    results_dict['FPR'] = self._safe_ratio(total_fp, total_tn + total_fp)

    # D. Precision
    # (the key spelling is kept as-is: it becomes a column name of the result CSVs)
    results_dict['Presicion'] = self._safe_ratio(total_tp, total_tp + total_fp)

    # E. AUC – Area Under the ROC Curve
    y_pred = soft
    results_dict['AUC'] = roc_auc_score(y_true, y_pred, average='macro', multi_class='ovr')
    y_oh = tf.keras.utils.to_categorical(y_true)

    # F. Area under the Precision-Recall
    results_dict['Area under PR'] = average_precision_score(y_oh, y_pred, average='macro')

    # H. Inference time for 1000 instances
    # (slicing yields the whole test set when it holds fewer than 1000 samples)
    inf_data = self.x_test[:1000]
    start = time.time()
    self.model.predict(inf_data)
    end = time.time()
    results_dict['Inferece time'] = end - start
    return results_dict

### *improvedArticleModel:*

In [49]:
class improvedArticleModel:
  """
  Two-stage teacher-student model built from three EfficientNetB3 networks.

  The teacher is trained on the labeled data and pseudo-labels the first half
  of the unlabeled data for the first student; that student then pseudo-labels
  the second half for the second student, which is the model that is evaluated.

  Training labels are one-hot (the models are compiled with categorical
  cross-entropy); the test labels may be sparse class ids or one-hot.
  """

  def __init__(self, data, X_unlabeled, classes, batch_size, learning_rate):
    """
    :param data: ``((x_train, y_train), (x_test, y_test))``
    :param X_unlabeled: unlabeled samples; split in two halves by ``train``
    :param classes: number of output classes
    :param batch_size: batch size used for every network
    :param learning_rate: learning rate of the Adam optimizers
    """
    (self.x_train, self.y_train), (self.x_test, self.y_test) = data
    self.X_unlabeled = X_unlabeled
    self.input_shape = self.x_train.shape[1:]
    self.bs = batch_size
    self.lr = learning_rate
    self.classes = classes
    self.teacher = self.build_model()
    self.student = self.build_model()
    self.student2 = self.build_model()
    # Set by train(); eval()/eval_acc() need a trained model
    self.model = None

  def build_model(self):
    """Create one (uncompiled) EfficientNetB3 network with a dense softmax head."""
    base_model = K.applications.EfficientNetB3(include_top=False, weights='imagenet', drop_connect_rate=0.4)
    # Resizes inputs to the height/width of the training images
    resize = K.Sequential([
      K.layers.experimental.preprocessing.Resizing(self.input_shape[0], self.input_shape[1])
    ])
    model = K.Sequential()
    model.add(resize)
    model.add(base_model)
    model.add(K.layers.Flatten())
    model.add(K.layers.Dense(512, activation=('relu')))
    model.add(K.layers.Dropout(0.2))
    model.add(K.layers.Dense(256, activation=('relu')))
    model.add(K.layers.Dropout(0.2))
    model.add(K.layers.Dense(self.classes, activation=('softmax')))
    return model

  def train(self, number_of_epochs):
    """
    Train teacher -> student -> second student; the second student becomes ``self.model``.

    :param number_of_epochs: number of epochs used for each of the three networks
    """
    self.teacher.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.lr), loss='categorical_crossentropy', metrics=['accuracy'])
    self.student.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.lr), loss='categorical_crossentropy', metrics=['accuracy'])
    self.student2.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.lr), loss='categorical_crossentropy', metrics=['accuracy'])
    batch_size = self.bs
    self.teacher.fit(x=self.x_train, y=self.y_train,
        batch_size=self.bs,
        epochs=number_of_epochs, shuffle=True,
        verbose=1
        )

    # Split point of the instance's own unlabeled pool (not of a global
    # variable that merely happens to be called X_unlabeled).
    half = len(self.X_unlabeled) // 2

    # 'threhold' (sic) is the keyword name used by pseudo_labelling's signature
    x_train_student, y_train_student = pseudo_labelling(self.teacher, self.x_train, self.y_train, self.X_unlabeled[:half], self.classes, threhold=0.0001)
    steps_per_epoch1 = x_train_student.shape[0] // batch_size
    # batch_size is not passed to fit(): the generator already yields batches,
    # and Keras documents that batch_size must not be given for generator input.
    self.student.fit(data_generator(x_train_student, y_train_student, batch_size, data_aug = True),
                     epochs=number_of_epochs,
                     steps_per_epoch=steps_per_epoch1,
                     verbose=1
                     )

    x_train_student2, y_train_student2 = pseudo_labelling(self.student, self.x_train, self.y_train, self.X_unlabeled[half:], self.classes, threhold=0.0001)
    steps_per_epoch2 = x_train_student2.shape[0] // batch_size
    self.student2.fit(data_generator(x_train_student2, y_train_student2, batch_size, data_aug = True),
                      epochs=number_of_epochs,
                      steps_per_epoch=steps_per_epoch2,
                      verbose=1
                      )
    self.model = self.student2

  def _predict_proba(self):
    """Predict on the test set and return the scores as an (n_samples, classes) array."""
    prediction = self.model.predict(self.x_test)
    return np.reshape(prediction, (self.y_test.shape[0], self.classes))

  def _test_labels(self):
    """Return the test labels as a 1-D array of class ids (accepts sparse or one-hot labels)."""
    y = np.asarray(self.y_test)
    if y.ndim == 2 and y.shape[1] > 1:
      return y.argmax(axis=1)
    return y.reshape((y.shape[0],))

  @staticmethod
  def _confusion_totals(y_true, pred_labels):
    """
    Sum one-vs-rest confusion counts over every label that occurs in ``y_true``.

    :return: ``(tp, fp, tn, fn)`` totals as Python ints
    """
    y_true = np.asarray(y_true).reshape(-1)
    pred_labels = np.asarray(pred_labels).reshape(-1)
    tp = fp = tn = fn = 0
    for label in np.unique(y_true):
      is_true = y_true == label
      is_pred = pred_labels == label
      tp += int(np.sum(is_true & is_pred))
      fp += int(np.sum(~is_true & is_pred))
      tn += int(np.sum(~is_true & ~is_pred))
      fn += int(np.sum(is_true & ~is_pred))
    return tp, fp, tn, fn

  @staticmethod
  def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0

  def _accuracy(self, y_true, soft):
    """Accuracy of the arg-max prediction against 1-D class-id labels."""
    # Sparse accuracy is required here: CategoricalAccuracy takes the arg-max
    # of y_true, which is always 0 for (n, 1) class-id labels and therefore
    # only measures how often class 0 is predicted.
    acc_eval = tf.keras.metrics.SparseCategoricalAccuracy()
    acc_eval.update_state(y_true, soft)
    return acc_eval.result().numpy()

  def eval_acc(self):
    """
    Calculate the accuracy of the trained model on the test set.

    :return: the accuracy
    """
    # A. Accuracy
    return self._accuracy(self._test_labels(), self._predict_proba())

  def eval(self):
    '''
    Evaluate and calculate all metrics according to the assignment requirements
    :return: A dictionary contains all metrices
    '''
    results_dict = {}
    soft = self._predict_proba()
    y_true = self._test_labels()

    # A. Accuracy
    acc = self._accuracy(y_true, soft)
    print(acc)
    results_dict['acc'] = acc

    # B. TPR
    pred_labels = soft.argmax(axis=1)
    total_tp, total_fp, total_tn, total_fn = self._confusion_totals(y_true, pred_labels)
    results_dict['TPR'] = self._safe_ratio(total_tp, total_tp + total_fn)

    # C. FPR
    results_dict['FPR'] = self._safe_ratio(total_fp, total_tn + total_fp)

    # D. Precision
    # (the key spelling is kept as-is: it becomes a column name of the result CSVs)
    results_dict['Presicion'] = self._safe_ratio(total_tp, total_tp + total_fp)

    # E. AUC – Area Under the ROC Curve
    y_pred = soft
    results_dict['AUC'] = roc_auc_score(y_true, y_pred, average='macro', multi_class='ovr')
    y_oh = tf.keras.utils.to_categorical(y_true)

    # F. Area under the Precision-Recall
    results_dict['Area under PR'] = average_precision_score(y_oh, y_pred, average='macro')

    # H. Inference time for 1000 instances
    # (slicing yields the whole test set when it holds fewer than 1000 samples)
    inf_data = self.x_test[:1000]
    start = time.time()
    self.model.predict(inf_data)
    end = time.time()
    results_dict['Inferece time'] = end - start
    return results_dict


## Train functions:
As required, we used external 10-fold cross validation, in addition to an internal 3-fold cross validation for hyperparameter optimization.

In [50]:
def train_baseline_model(dataset_name, data, outer_cv=10, inner_cv=3, random_search_trials=25, inner_epochs=1, outer_epochs=5):
  """
  Nested cross-validation of ``BaseModelVGG16`` on one dataset.

  For every outer fold, a random search over batch size, learning rate, VGG16
  pooling and VGG16 initial weights is scored by the mean accuracy of an inner
  stratified k-fold; the best combination is then trained on the outer training
  split and evaluated on the outer validation split. The accumulated results
  are written to ``<path_drive>Results/Baseline_<dataset_name>.csv`` after
  every outer fold.

  :param dataset_name: name used in the results and in the CSV file name
  :param data: ``(X, y)`` arrays of the dataset
  :param outer_cv: number of outer folds
  :param inner_cv: number of inner folds
  :param random_search_trials: number of random hyper-parameter draws per outer fold
  :param inner_epochs: epochs per inner-fold training
  :param outer_epochs: epochs for the final training of each outer fold
  :return: a DataFrame with one row of metrics per outer fold
  :raises ValueError: if ``random_search_trials`` is smaller than 1
  """
  if random_search_trials < 1:
    raise ValueError('random_search_trials must be at least 1')

  X, y = data
  # Derived from the full label set, so the output layer does not depend on
  # which labels happen to land in one particular inner fold.
  classes = len(np.unique(y))
  outer_skf = StratifiedKFold(n_splits=outer_cv, random_state=7, shuffle=True)
  inner_skf = StratifiedKFold(n_splits=inner_cv, random_state=7, shuffle=True)

  # Hyper-parameter search space
  hyper_param_batchsize = np.array([64, 128])
  hyper_param_lr = np.array([0.005, 0.001, 0.0001])
  hyper_param_pooling = np.array(['max', 'avg'])
  hyper_param_weights = np.array(['imagenet', None])

  os.makedirs(f'{path_drive}Results', exist_ok=True)
  list_of_res = []
  for fold_var, (train_index, val_index) in enumerate(outer_skf.split(X=X, y=y)):
    X_train = X[train_index]
    y_train = y[train_index]
    X_val = X[val_index]
    y_val = y[val_index]

    hyper_params_dict = {}
    max_trail = 0
    max_acc = 0
    for trail in range(random_search_trials):
      batch_size_h = np.random.choice(hyper_param_batchsize)
      learning_rate_h = np.random.choice(hyper_param_lr)
      pooling_h = np.random.choice(hyper_param_pooling)
      weights_h = np.random.choice(hyper_param_weights)
      print(f"hyper params {(batch_size_h, learning_rate_h, pooling_h, weights_h)}")
      acc_list = []
      for train_index_inner, val_index_inner in inner_skf.split(X=X_train, y=y_train):
        X_train_inner = X_train[train_index_inner]
        y_train_inner = y_train[train_index_inner]
        X_val_inner = X_train[val_index_inner]
        y_val_inner = y_train[val_index_inner]
        model = BaseModelVGG16(((X_train_inner, y_train_inner), (X_val_inner, y_val_inner)), classes, batch_size_h, learning_rate_h, pooling_h, weights_h)
        model.train(inner_epochs)
        acc_list.append(model.eval_acc())
        # Release the graph/weights of the finished model before building the next one
        tf.keras.backend.clear_session()
      mean_acc = np.array(acc_list).mean()
      if mean_acc > max_acc:
        max_trail = trail
        max_acc = mean_acc
      hyper_params_dict[trail] = (batch_size_h, learning_rate_h, pooling_h, weights_h, mean_acc)

    # Final model of this outer fold, trained with the best hyper-parameters
    best_params = hyper_params_dict[max_trail]
    model = BaseModelVGG16(((X_train, y_train), (X_val, y_val)), classes, best_params[0], best_params[1], best_params[2], best_params[3])
    start_timer = time.time()
    model.train(outer_epochs)
    end_timer = time.time()
    eval_res = model.eval()

    results_dict = {}
    results_dict['dataset_name'] = dataset_name
    results_dict['k-fold'] = fold_var
    results_dict['train_time'] = end_timer - start_timer
    results_dict['hyper-parameters'] = f'batch_size={best_params[0]}, learning_rate={best_params[1]}, best_pooling={best_params[2]}, best_init_weights={best_params[3]}'
    results_dict.update(eval_res)
    list_of_res.append(results_dict)
    tf.keras.backend.clear_session()

    # Checkpoint the results after every fold so a crashed run keeps its progress
    pd.DataFrame(list_of_res).to_csv(f'{path_drive}Results/Baseline_{dataset_name}.csv', index=False)
  return pd.DataFrame(list_of_res)


def train_article_model(dataset_name, data, X_unlabeled, outer_cv=10, inner_cv=3, random_search_trials=25, inner_epochs=1, outer_epochs=5):
  """
  Nested cross-validation of ``articleModel`` (teacher-student) on one dataset.

  For every outer fold, a random search over batch size and learning rate is
  scored by the mean accuracy of an inner stratified k-fold; the best
  combination is then trained on the outer training split and evaluated on the
  outer validation split. The accumulated results are written to
  ``<path_drive>Results/Article_<dataset_name>.csv`` after every outer fold.

  :param dataset_name: name used in the results and in the CSV file name
  :param data: ``(X, y)`` arrays of the dataset
  :param X_unlabeled: unlabeled samples passed to every model for pseudo-labelling
  :param outer_cv: number of outer folds
  :param inner_cv: number of inner folds
  :param random_search_trials: number of random hyper-parameter draws per outer fold
  :param inner_epochs: epochs per inner-fold training
  :param outer_epochs: epochs for the final training of each outer fold
  :return: a DataFrame with one row of metrics per outer fold
  :raises ValueError: if ``random_search_trials`` is smaller than 1
  """
  if random_search_trials < 1:
    raise ValueError('random_search_trials must be at least 1')

  X, y = data
  # Derived from the full label set, so the one-hot width and the output layer
  # do not depend on which labels happen to land in one particular inner fold.
  classes = len(np.unique(y))
  outer_skf = StratifiedKFold(n_splits=outer_cv, random_state=7, shuffle=True)
  inner_skf = StratifiedKFold(n_splits=inner_cv, random_state=7, shuffle=True)

  # Hyper-parameter search space
  hyper_param_batchsize = np.array([64, 128])
  hyper_param_lr = np.array([0.005, 0.001, 0.0001])

  os.makedirs(f'{path_drive}Results', exist_ok=True)
  list_of_res = []
  for fold_var, (train_index, val_index) in enumerate(outer_skf.split(X=X, y=y)):
    X_train = X[train_index]
    y_train = y[train_index]
    X_val = X[val_index]
    y_val = y[val_index]

    hyper_params_dict = {}
    max_trail = 0
    max_acc = 0
    for trail in range(random_search_trials):
      batch_size_h = np.random.choice(hyper_param_batchsize)
      learning_rate_h = np.random.choice(hyper_param_lr)
      print(f"hyper params {(batch_size_h, learning_rate_h)}")
      acc_list = []
      for train_index_inner, val_index_inner in inner_skf.split(X=X_train, y=y_train):
        X_train_inner = X_train[train_index_inner]
        y_train_inner = y_train[train_index_inner]
        X_val_inner = X_train[val_index_inner]
        y_val_inner = y_train[val_index_inner]
        # Training labels are one-hot encoded; validation labels stay as class ids
        model = articleModel(((X_train_inner, K.utils.to_categorical(y_train_inner, classes)), (X_val_inner, y_val_inner)), X_unlabeled, classes, batch_size_h, learning_rate_h)
        model.train(inner_epochs)
        acc_list.append(model.eval_acc())
        # Release the graph/weights of the finished model before building the next one
        tf.keras.backend.clear_session()
      mean_acc = np.array(acc_list).mean()
      if mean_acc > max_acc:
        max_trail = trail
        max_acc = mean_acc
      hyper_params_dict[trail] = (batch_size_h, learning_rate_h, mean_acc)

    # Final model of this outer fold: trained on the OUTER training split and
    # evaluated on the OUTER validation split (not on the last inner split).
    best_params = hyper_params_dict[max_trail]
    model = articleModel(((X_train, K.utils.to_categorical(y_train, classes)), (X_val, y_val)), X_unlabeled, classes, best_params[0], best_params[1])
    start_timer = time.time()
    model.train(outer_epochs)
    end_timer = time.time()
    eval_res = model.eval()

    results_dict = {}
    results_dict['dataset_name'] = dataset_name
    results_dict['k-fold'] = fold_var
    results_dict['train_time'] = end_timer - start_timer
    results_dict['hyper-parameters'] = f'batch_size={best_params[0]}, learning_rate={best_params[1]}'
    results_dict.update(eval_res)
    list_of_res.append(results_dict)
    tf.keras.backend.clear_session()

    # Checkpoint the results after every fold so a crashed run keeps its progress
    pd.DataFrame(list_of_res).to_csv(f'{path_drive}Results/Article_{dataset_name}.csv', index=False)
  return pd.DataFrame(list_of_res)


def train_improve_model(dataset_name, data, X_unlabeled, outer_cv=10, inner_cv=3, random_search_trials=25, inner_epochs=1, outer_epochs=5):
  """
  Nested cross-validation of ``improvedArticleModel`` on one dataset.

  For every outer fold, a random search over batch size and learning rate is
  scored by the mean accuracy of an inner stratified k-fold; the best
  combination is then trained on the outer training split and evaluated on the
  outer validation split. The accumulated results are written to
  ``<path_drive>Results/Improved_<dataset_name>.csv`` after every outer fold.

  :param dataset_name: name used in the results and in the CSV file name
  :param data: ``(X, y)`` arrays of the dataset
  :param X_unlabeled: unlabeled samples passed to every model for pseudo-labelling
  :param outer_cv: number of outer folds
  :param inner_cv: number of inner folds
  :param random_search_trials: number of random hyper-parameter draws per outer fold
  :param inner_epochs: epochs per inner-fold training
  :param outer_epochs: epochs for the final training of each outer fold
  :return: a DataFrame with one row of metrics per outer fold
  :raises ValueError: if ``random_search_trials`` is smaller than 1
  """
  if random_search_trials < 1:
    raise ValueError('random_search_trials must be at least 1')

  X, y = data
  # Derived from the full label set, so the one-hot width and the output layer
  # do not depend on which labels happen to land in one particular inner fold.
  classes = len(np.unique(y))
  outer_skf = StratifiedKFold(n_splits=outer_cv, random_state=7, shuffle=True)
  inner_skf = StratifiedKFold(n_splits=inner_cv, random_state=7, shuffle=True)

  # Hyper-parameter search space
  hyper_param_batchsize = np.array([64, 128])
  hyper_param_lr = np.array([0.005, 0.001, 0.0001])

  os.makedirs(f'{path_drive}Results', exist_ok=True)
  list_of_res = []
  for fold_var, (train_index, val_index) in enumerate(outer_skf.split(X=X, y=y)):
    X_train = X[train_index]
    y_train = y[train_index]
    X_val = X[val_index]
    y_val = y[val_index]

    hyper_params_dict = {}
    max_trail = 0
    max_acc = 0
    for trail in range(random_search_trials):
      batch_size_h = np.random.choice(hyper_param_batchsize)
      learning_rate_h = np.random.choice(hyper_param_lr)
      print(f"hyper params {(batch_size_h, learning_rate_h)}")
      acc_list = []
      for train_index_inner, val_index_inner in inner_skf.split(X=X_train, y=y_train):
        X_train_inner = X_train[train_index_inner]
        y_train_inner = y_train[train_index_inner]
        X_val_inner = X_train[val_index_inner]
        y_val_inner = y_train[val_index_inner]
        # Training labels are one-hot encoded; validation labels stay as class ids
        model = improvedArticleModel(((X_train_inner, K.utils.to_categorical(y_train_inner, classes)), (X_val_inner, y_val_inner)), X_unlabeled, classes, batch_size_h, learning_rate_h)
        model.train(inner_epochs)
        acc_list.append(model.eval_acc())
        # Release the graph/weights of the finished model before building the next one
        tf.keras.backend.clear_session()
      mean_acc = np.array(acc_list).mean()
      if mean_acc > max_acc:
        max_trail = trail
        max_acc = mean_acc
      hyper_params_dict[trail] = (batch_size_h, learning_rate_h, mean_acc)

    # Final model of this outer fold: trained on the OUTER training split and
    # evaluated on the OUTER validation split (not on the last inner split).
    best_params = hyper_params_dict[max_trail]
    model = improvedArticleModel(((X_train, K.utils.to_categorical(y_train, classes)), (X_val, y_val)), X_unlabeled, classes, best_params[0], best_params[1])
    start_timer = time.time()
    model.train(outer_epochs)
    end_timer = time.time()
    eval_res = model.eval()

    results_dict = {}
    results_dict['dataset_name'] = dataset_name
    results_dict['k-fold'] = fold_var
    results_dict['train_time'] = end_timer - start_timer
    results_dict['hyper-parameters'] = f'batch_size={best_params[0]}, learning_rate={best_params[1]}'
    results_dict.update(eval_res)
    list_of_res.append(results_dict)
    tf.keras.backend.clear_session()

    # Checkpoint the results after every fold so a crashed run keeps its progress
    pd.DataFrame(list_of_res).to_csv(f'{path_drive}Results/Improved_{dataset_name}.csv', index=False)
  return pd.DataFrame(list_of_res)
  

#### RandAugment, used in the teacher-student algorithm for robustness:

In [51]:
# Reference: code taken from: https://github.com/heartInsert/randaugment/blob/master/Rand_Augment.py
# Rand Augmentation
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image, ImageEnhance, ImageOps
import numpy as np
import random

class Rand_Augment():
    """
    RandAugment for PIL images: each call applies ``Numbers`` randomly chosen
    operations, each with a randomly chosen magnitude index.

    ``self.ranges`` maps an operation name to its table of 10 magnitudes and
    ``self.func`` maps it to the callable ``(img, magnitude) -> img``.
    """

    def __init__(self, Numbers=None, max_Magnitude=None):
        """
        :param Numbers: operations applied per image; defaults to half the number of transforms
        :param max_Magnitude: exclusive upper bound of the sampled magnitude index; defaults to 10.
                              The magnitude tables hold 10 entries, so values above 10 can index out of range.
        """
        self.transforms = ['autocontrast', 'equalize', 'rotate', 'solarize', 'color', 'posterize',
                           'contrast', 'brightness', 'sharpness', 'shearX', 'shearY', 'translateX', 'translateY']
        if Numbers is None:
            self.Numbers = len(self.transforms) // 2
        else:
            self.Numbers = Numbers
        if max_Magnitude is None:
            self.max_Magnitude = 10
        else:
            self.max_Magnitude = max_Magnitude
        fillcolor = 128
        self.ranges = {
            # These magnitude ranges were tuned by hand; they do not have to follow
            # the values used in autoaugment.py.
            "shearX": np.linspace(0, 0.3, 10),
            "shearY": np.linspace(0, 0.3, 10),
            "translateX": np.linspace(0, 0.2, 10),
            "translateY": np.linspace(0, 0.2, 10),
            "rotate": np.linspace(0, 360, 10),
            "color": np.linspace(0.0, 0.9, 10),
            # Built-in int: the np.int alias was removed in NumPy 1.24
            "posterize": np.round(np.linspace(8, 4, 10), 0).astype(int),
            "solarize": np.linspace(256, 231, 10),
            "contrast": np.linspace(0.0, 0.5, 10),
            "sharpness": np.linspace(0.0, 0.9, 10),
            "brightness": np.linspace(0.0, 0.3, 10),
            "autocontrast": [0] * 10,
            "equalize": [0] * 10,
            "invert": [0] * 10
        }
        # NOTE(review): the affine transforms pass the fill value through `fill=`;
        # Pillow's Image.transform also has a `fillcolor=` argument — confirm which one is intended.
        self.func = {
            "shearX": lambda img, magnitude: img.transform(
                img.size, Image.AFFINE, (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0),
                Image.BICUBIC, fill=fillcolor),
            "shearY": lambda img, magnitude: img.transform(
                img.size, Image.AFFINE, (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0),
                Image.BICUBIC, fill=fillcolor),
            "translateX": lambda img, magnitude: img.transform(
                img.size, Image.AFFINE, (1, 0, magnitude * img.size[0] * random.choice([-1, 1]), 0, 1, 0),
                fill=fillcolor),
            "translateY": lambda img, magnitude: img.transform(
                img.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude * img.size[1] * random.choice([-1, 1])),
                fill=fillcolor),
            "rotate": lambda img, magnitude: self.rotate_with_fill(img, magnitude),
            "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(1 + magnitude * random.choice([-1, 1])),
            "posterize": lambda img, magnitude: ImageOps.posterize(img, magnitude),
            "solarize": lambda img, magnitude: ImageOps.solarize(img, magnitude),
            "contrast": lambda img, magnitude: ImageEnhance.Contrast(img).enhance(
                1 + magnitude * random.choice([-1, 1])),
            "sharpness": lambda img, magnitude: ImageEnhance.Sharpness(img).enhance(
                1 + magnitude * random.choice([-1, 1])),
            "brightness": lambda img, magnitude: ImageEnhance.Brightness(img).enhance(
                1 + magnitude * random.choice([-1, 1])),
            "autocontrast": lambda img, magnitude: ImageOps.autocontrast(img),
            # 'equalize' returns the image unchanged (it is a no-op here)
            "equalize": lambda img, magnitude: img,
            # 'invert' is defined but not listed in self.transforms, so rand_augment never samples it
            "invert": lambda img, magnitude: ImageOps.invert(img)
        }

    def rand_augment(self):
        """Sample the operations for one image.

        :return: a list of ``self.Numbers`` pairs ``(operation name, magnitude index)``;
                 names are drawn from ``self.transforms`` with replacement and the
                 indices from ``[0, self.max_Magnitude)``
        """
        M = np.random.randint(0, self.max_Magnitude, self.Numbers)

        sampled_ops = np.random.choice(self.transforms, self.Numbers)
        return [(op, Magnitude) for (op, Magnitude) in zip(sampled_ops, M)]

    def __call__(self, image):
        """Apply a freshly sampled sequence of operations to ``image`` and return the result."""
        operations = self.rand_augment()
        for (op_name, M) in operations:
            operation = self.func[op_name]
            # M is an index into the operation's magnitude table
            mag = self.ranges[op_name][M]
            image = operation(image, mag)
        return image

    def rotate_with_fill(self, img, magnitude):
        """Rotate ``img`` by ``magnitude`` degrees and fill the uncovered corners with gray (128)."""
        # The image is rotated in RGBA so that the rotated alpha channel can serve
        # as the compositing mask (approach copied from the AutoAugment PyTorch code).
        rot = img.convert("RGBA").rotate(magnitude)
        return Image.composite(rot, Image.new("RGBA", rot.size, (128,) * 4), rot).convert(img.mode)

    def test_single_operation(self, image, op_name, M=-1):
        '''
        Apply one named operation, for inspecting its effect.

        :param image: image
        :param op_name: operation name, a key of self.func
        :param M: index into the operation's magnitude table; -1 selects the last (maximum) entry
        :return: the transformed image
        '''
        operation = self.func[op_name]
        mag = self.ranges[op_name][M]
        image = operation(image, mag)
        return image

### Predict labels with the trained model:

In [52]:
import os
import random
import numpy as np
import tensorflow as tf
import subprocess as sp
import os
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
# Module-level augmenter: 2 operations per image, magnitude indices sampled below 10.
# NOTE(review): presumably consumed by data_generator, which is not visible in this part of the notebook — confirm.
img_augment = Rand_Augment(Numbers=2, max_Magnitude=10)

def pseudo_labelling(model, xs, ys, xt, classes, threhold=0.0001):
  """
  Build the student training set: labeled data plus teacher pseudo-labels.

  Steps:
    1. Keep a random 80% of the labeled data (xs, ys); the other 20% produced
       by train_test_split is not used by this function.
    2. Run the teacher `model` on the unlabeled images to obtain one score
       vector per image.
    3. Keep only the images whose highest score is above `threhold`.
    4. Oversample every predicted class by cyclic repetition up to the size of
       the most frequent predicted class, so the pseudo-labeled set is balanced.
    5. Append the pseudo-labeled images, with the teacher's score vectors as
       (soft) labels, after the labeled subset.

  Args:
    model: teacher exposing `predict(x)` that returns a 2-D array-like with one
      row per image.
    xs, ys: labeled images and labels. `ys` must be concatenable with the
      teacher's score vectors (assumed one-hot - TODO confirm with callers).
    xt: unlabeled images as a numpy array of any rank.
      NOTE(review): the last unlabeled sample is not used (`xt[:-1]`), as in the
      original implementation - confirm whether that is intended.
    classes: number of class indices (0 .. classes-1) to collect pseudo-labels for.
    threhold: minimum top score for a pseudo-label to be kept
      (parameter name kept as-is for backward compatibility).

  Returns:
    (x_train_student, y_train_student). If there are no unlabeled images, or no
    prediction passes the threshold, only the labeled subset is returned.
  """
  x_labeled, _, y_labeled, _ = train_test_split(xs, ys, test_size=0.2)

  x_unlabeled = xt[:-1]
  if len(x_unlabeled) == 0:
    # Nothing to pseudo-label.
    return x_labeled, y_labeled

  # A single predict() call: the model batches internally, so there is no need
  # to derive a divisor-based batch size (which fell back to one call per
  # sample whenever the pool size was prime).
  soft_labels = np.asarray(model.predict(x_unlabeled))

  # ============ Keep only confident pseudo-labels ============
  confident = np.max(soft_labels, axis=1) > threhold
  if not np.any(confident):
    return x_labeled, y_labeled
  soft_labels = soft_labels[confident]
  x_confident = x_unlabeled[confident]

  # Hard class index of every kept prediction, and the size of the largest class.
  hard_labels = np.argmax(soft_labels, axis=1)
  _, counts = np.unique(hard_labels, return_counts=True)
  target_count = int(counts.max())

  # ============ Balance the classes by cyclic oversampling ============
  balanced_imgs = []
  balanced_labels = []
  for class_idx in range(classes):
    members = np.flatnonzero(hard_labels == class_idx)
    if members.size == 0:
      # The teacher never predicted this class: nothing to replicate.
      continue
    # np.resize repeats the member indices cyclically until target_count is
    # reached (whole copies first, then the leading remainder).
    repeated = np.resize(members, target_count)
    balanced_imgs.append(x_confident[repeated])
    balanced_labels.append(soft_labels[repeated])

  if not balanced_imgs:
    return x_labeled, y_labeled

  # Labeled subset first, then the pseudo-labeled blocks in class order.
  x_train_student = np.concatenate([x_labeled] + balanced_imgs, axis=0)
  y_train_student = np.concatenate([y_labeled] + balanced_labels, axis=0)

  return x_train_student, y_train_student


# Data generator definition
def get_random_data(x_train_i, y_train_i, data_aug):
  """Return one (image, label) training pair, optionally augmented.

  Args:
    x_train_i: a single image array.
    y_train_i: its label, returned unchanged.
    data_aug: when truthy, the image is converted to a PIL image, passed
      through the module-level `img_augment` and converted back to an array;
      otherwise the input array itself is returned.

  Returns:
    (image, label)

  NOTE(review): array_to_img is called with its defaults, which presumably
  rescale pixel values, so augmented and non-augmented samples may not share
  the same value range - confirm this is intended.
  """
  if not data_aug:
    # No augmentation requested: skip the array -> PIL conversion entirely.
    return x_train_i, y_train_i

  pil_image = tf.keras.preprocessing.image.array_to_img(x_train_i)
  augmented = tf.keras.preprocessing.image.img_to_array(img_augment(pil_image))
  return augmented, y_train_i

def data_generator(x_train, y_train, batch_size, data_aug):
  """Endlessly yield (images, labels) batches in a freshly shuffled order.

  Every pass over the data draws a new random permutation of the sample
  indices; batches may straddle the boundary between two passes. Only the
  index order is shuffled - the input arrays are never copied or modified.

  Args:
    x_train: indexable collection of images.
    y_train: indexable collection of labels aligned with `x_train`.
    batch_size: number of samples per yielded batch.
    data_aug: forwarded to get_random_data() for every sample.

  Yields:
    (image_data, label_data) numpy arrays holding `batch_size` samples each.

  Raises:
    ValueError: on the first `next()` if `x_train` is empty.
  """
  n = len(x_train)
  if n == 0:
    raise ValueError("data_generator needs at least one training sample")

  order = None
  i = 0
  while True:
    image_data = []
    label_data = []
    for _ in range(batch_size):
      if i == 0:
        # Start of a pass: reshuffle the indices instead of copying the arrays.
        order = np.random.permutation(n)
      sample_idx = order[i]
      image, label = get_random_data(x_train[sample_idx], y_train[sample_idx], data_aug)
      image_data.append(image)
      label_data.append(label)
      i = (i + 1) % n
    yield np.array(image_data), np.array(label_data)

### Train models delegation function:

In [53]:
def train(model_name, dataset_name, data, X_unlabeled):
  """Dispatch training to the routine that matches `model_name`.

  Args:
    model_name: one of 'baseModel', 'articleModel' or 'improveModel'.
    dataset_name: name of the dataset, forwarded to the training routine.
    data: labeled data, forwarded unchanged.
    X_unlabeled: unlabeled images; not passed to the baseline model.

  Returns:
    Whatever the selected training routine returns. The routines
    (train_baseline_model, train_article_model, train_improve_model) are
    expected to be defined elsewhere in the notebook.

  Raises:
    ValueError: if `model_name` is not a known model, instead of silently
      returning None.
  """
  if model_name == 'baseModel':
    return train_baseline_model(dataset_name, data)
  if model_name == 'articleModel':
    return train_article_model(dataset_name, data, X_unlabeled)
  if model_name == 'improveModel':
    return train_improve_model(dataset_name, data, X_unlabeled)
  raise ValueError(
      f"Unknown model_name {model_name!r}; expected 'baseModel', 'articleModel' or 'improveModel'")

## 1. Start the experiment:

Load the datasets:

In [54]:
# Load every dataset returned by get_datasets(); the loop below unpacks each entry as ((X, y), name).
datasets = get_datasets()

In [55]:
# List the dataset names only; the first element of each entry (the data itself) is ignored here.
print('names of the datasets:', [name for (i, name) in datasets])

names of the datasets: ['svhn_3', 'Cmater', 'svhn_2', 'Beans', 'svhn_1', 'Cifar100_5', 'Coloret', 'comp_2', 'stl_2', 'Oxford_2', 'comp_3', 'stl_3', 'Rps', 'Cifar100_2', 'Oxford_1', 'Cifar100_1', 'stl_1', 'comp_1', 'Cifar100_4', 'Cifar100_3']


In [None]:
# Train every model on every dataset. For each dataset, 20% of the samples are
# set aside with their labels discarded and used as the unlabeled pool.
# The split is seeded so that all three models are compared on the same
# labeled/unlabeled partition of a dataset and reruns are reproducible.
SPLIT_SEED = 42
results = []
models = ['baseModel', 'articleModel', 'improveModel']
for data in datasets:
  (X_data, y_data), dataset_name = data
  for model_name in models:
    # Re-split per model so each run gets its own fresh arrays; the fixed seed
    # makes the partition identical across models.
    X_train_data, X_unlabeled, y_train_data, _ = train_test_split(
        X_data, y_data, test_size=0.2, stratify=y_data, random_state=SPLIT_SEED)
    print('---------', 'Train model:', model_name, ', on dataset:', dataset_name, ', number of classes:', len(np.unique(y_train_data)), ', train data size:', len(X_train_data), '---------')
    model = train(model_name, dataset_name, (X_train_data, y_train_data), X_unlabeled)
    results.append(model)
results

### Statistical significance testing of the results:

In [69]:
def friedman_test(excel_name):
    """
    Run a Friedman test on the per-sheet mean AUC of the three models.

    Every sheet of the workbook contributes one measurement per model: the mean
    of the 'AUC' column over the rows whose 'Model' column equals 'Baseline',
    'Article' and 'Improved' respectively.

    :param excel_name: path of the results workbook (all sheets are read)
    :return: (baseline_auc, paper_auc, improve_auc, result) - the three lists of
             mean AUCs in sheet order, and the object returned by friedmanchisquare
    """
    sheets = pd.read_excel(excel_name, sheet_name=None)
    auc_by_model = {'Baseline': [], 'Article': [], 'Improved': []}
    for sheet in sheets.values():
        for label, bucket in auc_by_model.items():
            bucket.append(sheet.loc[sheet['Model'] == label, 'AUC'].mean())

    baseline_auc = auc_by_model['Baseline']
    paper_auc = auc_by_model['Article']
    improve_auc = auc_by_model['Improved']
    test_result = friedmanchisquare(baseline_auc, paper_auc, improve_auc)
    return baseline_auc, paper_auc, improve_auc, test_result


def post_hoc_test(baseline_auc, paper_auc, improve_auc):
    """
    Print the result of the Nemenyi post-hoc test for the three AUC samples.

    The three sequences are stacked and transposed so that each row holds the
    three models' values at the same position and each column is one model.
    """
    per_model = np.array([baseline_auc, paper_auc, improve_auc])
    per_dataset = per_model.T
    print(scikit_posthocs.posthoc_nemenyi_friedman(per_dataset))


def stat_test(excel_name):
    """
    Run the full significance analysis on a results workbook.

    Calls friedman_test() on `excel_name`, prints the Friedman p-value, then
    hands the three AUC samples to post_hoc_test(), which prints its own table.
    """
    outcome = friedman_test(excel_name)
    base_scores, article_scores, improved_scores, friedman = outcome
    print(f'Friedman test p-value = {friedman.pvalue}')
    post_hoc_test(base_scores, article_scores, improved_scores)

In [70]:
# Run the Friedman test and the Nemenyi post-hoc test on the results workbook stored on Drive.
stat_test(path_drive+'Results/Results.xlsx')

Friedman test p-value = 0.036787944117144245


### Load the datasets to google drive
#### No need to run this cell!


In [None]:
import tensorflow_datasets as tfds
import numpy as np
import tensorflow as tf

"""
Data importer - for each dataset there is a function called "import_{dataset_name}".
These functions use tensorflow_datasets (or tf.keras.datasets) to load the datasets and save them as numpy arrays under the Datasets folder.
"""
def data_load():
    """Run every dataset importer once, in a fixed order."""
    importers = (
        import_svhn,
        import_stl,
        import_beans,
        import_casava,
        import_cmater,
        import_coloret,
        import_oxford_f,
        import_rps,
        import_ct_birds,
        import_cifar_100,
    )
    for run_import in importers:
        run_import()


def import_cifar_100():
    """Save CIFAR-100 as five 20-class subsets (Cifar100_1 .. Cifar100_5).

    The train and test splits are merged. Subset k holds the original classes
    20*(k-1) .. 20*k - 1, with samples grouped by class in ascending class
    order and labels reduced modulo 20. Each subset is written as
    Cifar100_<k>_X.npy / Cifar100_<k>_Y.npy under datasets_path_drive.
    """
    (train_x, train_y), (test_x, test_y) = tf.keras.datasets.cifar100.load_data()
    all_x = np.concatenate((train_x, test_x), axis=0)
    all_y = np.concatenate((train_y, test_y), axis=0)

    # Build all five subsets first, then write them out.
    subsets = []
    for part in range(5):
        first_class = 20 * part
        rows = np.concatenate(
            [np.where(all_y == label)[0] for label in range(first_class, first_class + 20)])
        subsets.append((all_x[rows], np.array(all_y[rows] % 20)))

    for number, (part_x, part_y) in enumerate(subsets, start=1):
        np.save(f"{datasets_path_drive}Cifar100_{number}_X.npy", part_x)
        np.save(f"{datasets_path_drive}Cifar100_{number}_Y.npy", part_y)


def import_ct_birds():
    """Save Caltech Birds 2010 as three class subsets (Ctb_1 .. Ctb_3).

    The train and test splits are merged and every image is resized to 64x64.
    The subsets cover classes 0-69, 70-139 and 140-199, with samples grouped by
    class in ascending order and labels reduced modulo 70.
    """
    samples = list(tfds.as_numpy(tfds.load('caltech_birds2010', split='train+test')))
    images = np.array([tf.image.resize(sample['image'], (64, 64)).numpy() for sample in samples])
    labels = np.array([sample['label'] for sample in samples])
    labels = labels.reshape((labels.shape[0], 1))

    # Build all subsets first, then write them out.
    subsets = []
    for low, high in ((0, 70), (70, 140), (140, 200)):
        rows = np.concatenate([np.where(labels == label)[0] for label in range(low, high)])
        subsets.append((images[rows], np.array(labels[rows] % 70)))

    for number, (part_x, part_y) in enumerate(subsets, start=1):
        np.save(f"{datasets_path_drive}Ctb_{number}_X.npy", part_x)
        np.save(f"{datasets_path_drive}Ctb_{number}_Y.npy", part_y)


def import_rps():
    """Save Rock-Paper-Scissors (train + test) as Rps_X.npy / Rps_Y.npy.

    Every image is resized to 100x100; labels are stored as a column vector.
    """
    samples = list(tfds.as_numpy(tfds.load('rock_paper_scissors', split='train+test')))
    images = np.array([tf.image.resize(sample['image'], (100, 100)).numpy() for sample in samples])
    labels = np.array([sample['label'] for sample in samples])
    labels = labels.reshape((labels.shape[0], 1))
    np.save(f"{datasets_path_drive}Rps_X.npy", images)
    np.save(f"{datasets_path_drive}Rps_Y.npy", labels)


def import_oxford_f():
    """Save Oxford Flowers 102 as two 51-class subsets (Oxford_1, Oxford_2).

    The train, test and validation splits are merged and every image is resized
    to 64x64. The subsets cover classes 0-50 and 51-101, with samples grouped by
    class in ascending order and labels reduced modulo 51.
    """
    samples = list(tfds.as_numpy(tfds.load('oxford_flowers102', split='train+test+validation')))
    images = np.array([tf.image.resize(sample['image'], (64, 64)).numpy() for sample in samples])
    labels = np.array([sample['label'] for sample in samples])
    labels = labels.reshape((labels.shape[0], 1))

    # Build both subsets first, then write them out.
    subsets = []
    for low, high in ((0, 51), (51, 102)):
        rows = np.concatenate([np.where(labels == label)[0] for label in range(low, high)])
        subsets.append((images[rows], np.array(labels[rows] % 51)))

    for number, (part_x, part_y) in enumerate(subsets, start=1):
        np.save(f"{datasets_path_drive}Oxford_{number}_X.npy", part_x)
        np.save(f"{datasets_path_drive}Oxford_{number}_Y.npy", part_y)


def import_coloret():
    """Save the colorectal histology 'train' split as Coloret_X.npy / Coloret_Y.npy.

    Images are stored as loaded (no resizing); labels are stored as a column vector.
    """
    samples = list(tfds.as_numpy(tfds.load('colorectal_histology', split='train')))
    images = np.array([sample['image'] for sample in samples])
    labels = np.array([sample['label'] for sample in samples])
    labels = labels.reshape((labels.shape[0], 1))
    np.save(f"{datasets_path_drive}Coloret_X.npy", images)
    np.save(f"{datasets_path_drive}Coloret_Y.npy", labels)


def import_cmater():
    """Save CMATERdb (train + test) as Cmater_X.npy / Cmater_Y.npy.

    Images are stored as loaded (no resizing); labels are stored as a column vector.
    """
    samples = list(tfds.as_numpy(tfds.load('cmaterdb', split='train+test')))
    images = np.array([sample['image'] for sample in samples])
    labels = np.array([sample['label'] for sample in samples])
    labels = labels.reshape((labels.shape[0], 1))
    np.save(f"{datasets_path_drive}Cmater_X.npy", images)
    np.save(f"{datasets_path_drive}Cmater_Y.npy", labels)


def import_casava():
    """Save Cassava (train + test + validation) as Casava_X.npy / Casava_Y.npy.

    Every image is resized to 64x64; labels are stored as a column vector.
    """
    samples = list(tfds.as_numpy(tfds.load('cassava', split='train+test+validation')))
    images = np.array([tf.image.resize(sample['image'], (64, 64)).numpy() for sample in samples])
    labels = np.array([sample['label'] for sample in samples])
    labels = labels.reshape((labels.shape[0], 1))
    np.save(f"{datasets_path_drive}Casava_X.npy", images)
    np.save(f"{datasets_path_drive}Casava_Y.npy", labels)


def import_beans():
    """Save Beans (train + test + validation) as Beans_X.npy / Beans_Y.npy.

    The images are stacked into one array first and then resized to 32x32 in a
    single call; labels are stored as a column vector.
    """
    samples = list(tfds.as_numpy(tfds.load('beans', split='train+test+validation')))
    images = np.array([sample['image'] for sample in samples])
    labels = np.array([sample['label'] for sample in samples])
    images = tf.image.resize(images, (32, 32)).numpy()
    labels = labels.reshape((labels.shape[0], 1))
    np.save(f"{datasets_path_drive}Beans_X.npy", images)
    np.save(f"{datasets_path_drive}Beans_Y.npy", labels)


def import_stl():
    """Save STL-10 (train + test) as three 4-class subsets (stl_1 .. stl_3).

    The subsets cover classes 0-3, 4-7 and 6-9, so subsets 2 and 3 share
    classes 6 and 7. Samples are grouped by class in ascending order and labels
    are reduced modulo 4 (in subset 3 the classes 6, 7, 8, 9 become 2, 3, 0, 1).
    """
    samples = list(tfds.as_numpy(tfds.load('stl10', split='train+test')))
    images = np.array([sample['image'] for sample in samples])
    labels = np.array([sample['label'] for sample in samples])
    labels = labels.reshape((labels.shape[0], 1))

    # Build all subsets first, then write them out.
    subsets = []
    for low, high in ((0, 4), (4, 8), (6, 10)):
        rows = np.concatenate([np.where(labels == label)[0] for label in range(low, high)])
        subsets.append((images[rows], np.array(labels[rows] % 4)))

    for number, (part_x, part_y) in enumerate(subsets, start=1):
        np.save(f"{datasets_path_drive}stl_{number}_X.npy", part_x)
        np.save(f"{datasets_path_drive}stl_{number}_Y.npy", part_y)


def import_svhn():
    """Save SVHN cropped digits (train + test) as three 4-class subsets (svhn_1 .. svhn_3).

    The subsets cover classes 0-3, 4-7 and 6-9, so subsets 2 and 3 share
    classes 6 and 7. Samples are grouped by class in ascending order and labels
    are reduced modulo 4 (in subset 3 the classes 6, 7, 8, 9 become 2, 3, 0, 1).
    """
    samples = list(tfds.as_numpy(tfds.load('svhn_cropped', split='train+test')))
    images = np.array([sample['image'] for sample in samples])
    labels = np.array([sample['label'] for sample in samples])
    labels = labels.reshape((labels.shape[0], 1))

    # Build all subsets first, then write them out.
    subsets = []
    for low, high in ((0, 4), (4, 8), (6, 10)):
        rows = np.concatenate([np.where(labels == label)[0] for label in range(low, high)])
        subsets.append((images[rows], np.array(labels[rows] % 4)))

    for number, (part_x, part_y) in enumerate(subsets, start=1):
        np.save(f"{datasets_path_drive}svhn_{number}_X.npy", part_x)
        np.save(f"{datasets_path_drive}svhn_{number}_Y.npy", part_y)
  
# NOTE: this call runs every importer above, i.e. it loads all datasets again and
# rewrites the .npy files under datasets_path_drive. Skip this cell once the files exist.
data_load()