<a href="https://colab.research.google.com/github/stano45/DeepFlora/blob/main/DeepFlora.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import tensorflow as tf
import numpy as np
import IPython
import functools
from PIL import Image # image processing
from tqdm import tqdm # ranges are displayed 
import matplotlib.pyplot as plt
import zipfile
from google.colab import files
import random
import math
import shutil

In [None]:
from tensorflow.python.client import device_lib

# Report the second local device TensorFlow lists (index 1), if there is one.
# The list can hold a single entry, in which case indexing [1] would raise
# IndexError and abort the cell, so the length is checked first.
local_devices = device_lib.list_local_devices()
if len(local_devices) > 1:
    print(local_devices[1].physical_device_desc)
else:
    print("Only {} local device(s) reported by TensorFlow".format(len(local_devices)))

# Last expression of the cell: the notebook displays the visible GPU devices.
tf.config.list_physical_devices('GPU')

In [None]:
def mkdir(name):
    """Create the directory `name` if nothing exists at that path yet.

    Missing parent directories are created as well, so nested paths such as
    "processed_data/mushrooms/train" work in a single call.

    Args:
        name: Path of the directory to create.

    Returns:
        None. If `name` already exists (as a directory or a file) nothing is
        changed.
    """
    if os.path.exists(name):
        return
    # exist_ok guards against the directory appearing between the check above
    # and this call.
    os.makedirs(name, exist_ok=True)

In [None]:
from google.colab import files
import os

# Only ask for the Kaggle API token when it is not already present in the
# current working directory.
if not os.path.exists("kaggle.json"):
  files.upload() #upload kaggle.json

# Install the Kaggle command line client and copy the token into ~/.kaggle.
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
# Restrict the token file to owner read/write.
# NOTE(review): this path is hard-coded to /root while the lines above use ~ --
# confirm both resolve to the same directory on the target runtime.
!chmod 600 /root/.kaggle/kaggle.json

# Archive base names (without ".zip"); the extraction cell appends ".zip" to
# each entry to find the downloaded file.
DATASET_NAMES = ['mushrooms-classification-common-genuss-images']

# NOTE(review): the dataset slug is written out here rather than derived from
# DATASET_NAMES, so the two must be kept in sync by hand.
!kaggle datasets download -d maysee/mushrooms-classification-common-genuss-images --force

!ls

In [None]:
import zipfile

mkdir('datasets')
os.chdir("/content")

# Unpack every downloaded archive into "datasets", then delete the archive so
# it no longer takes up disk space.
for dataset in DATASET_NAMES:
    archive_name = dataset + ".zip"
    with zipfile.ZipFile(archive_name, "r") as archive:
        archive.extractall("datasets")
    os.remove(archive_name)

In [None]:
"""
process all datasets and clear space
"""

from PIL import ImageFile

# Allow PIL to load image files whose data is truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# path to datasets
DATASET_PATH = 'datasets'

# path to processed images
PROCESSED_PATH = "processed_data"
mkdir(PROCESSED_PATH)

# dataset name (lower-case) -> {category name (lower-case) -> images written}
dataset_info = dict()

# size of resized images, as (width, height)
IMG_SIZE = (128, 128)

# file-name suffixes accepted as input (the check is case-sensitive)
JPEG_SUFFIXES = (".jpg", ".jpeg")

# number of removed images due to insufficient size
# (avoid having to add padding, that is bad for training)
removed_total = 0

# iterate over all datasets
datasets = os.listdir(DATASET_PATH)
for dataset in datasets:

    # NOTE(review): the lower-case 'mushrooms' folder is skipped on purpose;
    # presumably it duplicates another folder of the archive -- confirm.
    if dataset == 'mushrooms':
        continue

    # iterate over all categories in dataset
    categories_path = os.path.join(DATASET_PATH, dataset)
    categories = os.listdir(categories_path)

    dataset_key = dataset.lower()
    processed_dataset = os.path.join(PROCESSED_PATH, dataset_key)
    mkdir(processed_dataset)

    dataset_info[dataset_key] = dict()
    for category in categories:
        category_key = category.lower()
        dataset_info[dataset_key][category_key] = 0

        # iterate over all images in category
        category_path = os.path.join(categories_path, category)

        processed_category = os.path.join(processed_dataset, category_key)
        mkdir(processed_category)

        images = os.listdir(category_path)
        t_images = tqdm(images)
        t_images.set_description("Parsing category {} from dataset {}".format(category, dataset))

        for image in t_images:

            # if the image format is not jpg then skip it
            if not image.endswith(JPEG_SUFFIXES):
                continue

            # The context manager closes the source file once the resized
            # copy has been written.
            with Image.open(os.path.join(category_path, image)) as img:

                # Compare width and height separately: comparing the size
                # tuples directly is lexicographic and would accept an image
                # that is wide enough but too short.
                width, height = img.size
                if width < IMG_SIZE[0] or height < IMG_SIZE[1]:
                    removed_total += 1
                    continue

                # resize to the target size and save under the same file
                # name; the original file is left in place
                img.resize(IMG_SIZE).save(os.path.join(processed_category, image))

            dataset_info[dataset_key][category_key] += 1


print(dataset_info)

In [None]:
"""
!!!Deletes all unprocessed datasets!!!
Execute this block if not enough space on disk
"""
# Recursively removes the "datasets" directory, resolved relative to the
# current working directory of the notebook.
!rm -R datasets

In [None]:
def get_count_in_dataset(dataset : str):
    """Return the total number of images recorded for `dataset`.

    The per-category counts are read from the module-level `dataset_info`
    mapping and added up. Raises KeyError if `dataset` is not a key of it.
    """
    per_category_counts = dataset_info[dataset].values()
    return sum(per_category_counts)

get_count_in_dataset("mushrooms")

In [None]:
# Share of each category's images that falls below the train cut-off index
# (validation images are taken out of this share), range (0,1)
TRAIN_PERCENT = 0.8
# Share of the train cut-off index that is assigned to validation, range (0,1)
VAL_PERCENT = 0.2

for dataset in dataset_info:

    # Locate the processed dataset and create its train, val and test folders.
    dataset_path = os.path.join('/content', PROCESSED_PATH, dataset)
    # NOTE(review): the working directory is left inside the dataset folder
    # after this cell; kept so that later cells see the same cwd as before.
    os.chdir(dataset_path)
    train_path = os.path.join(dataset_path, 'train')
    val_path = os.path.join(dataset_path, 'val')
    test_path = os.path.join(dataset_path, 'test')
    for split_path in (train_path, val_path, test_path):
        mkdir(split_path)

    for category in dataset_info[dataset]:
        category_path = os.path.join(dataset_path, category)

        # Index cut-offs into the shuffled file list:
        #   [0, val_threshold)               -> val
        #   [val_threshold, train_threshold) -> train
        #   [train_threshold, end)           -> test
        train_threshold = int((dataset_info[dataset][category] * TRAIN_PERCENT) - 1)
        val_threshold = int(train_threshold * VAL_PERCENT)

        shuffled_images = np.random.permutation(os.listdir(category_path))
        for i, image in enumerate(shuffled_images):
            if i < val_threshold:
                split_path = val_path
            elif i < train_threshold:
                split_path = train_path
            else:
                split_path = test_path

            # Files are renamed to "<category>_<index>".
            # NOTE(review): the original file extension is not carried over.
            source = os.path.join(category_path, image)
            target = os.path.join(split_path, category + "_{}".format(i))
            shutil.move(source, target)

        # The category folder is empty now; remove it by its full path so the
        # call does not depend on the current working directory.
        os.rmdir(category_path)





In [None]:
# Read cursor per category, shared by all get_batch calls: entry i is the index
# of the next image to read from category i.
# NOTE(review): sized from the module-level `categories` left behind by the
# preprocessing cell -- confirm it matches categories_per_dataset[dataset].
last_indices = np.zeros(len(categories), dtype=int)


def get_batch(dataset, num_samples, use='train'):
    """Read the next `num_samples` images, spread evenly over the categories.

    Images are taken in directory-listing order, starting at the positions
    stored in the module-level `last_indices`, which is advanced for every
    image read. Relies on the module-level names `num_per_dataset`,
    `categories_per_dataset` and `PROCESSED_PATH`.

    Args:
        dataset: Key into `num_per_dataset` / `categories_per_dataset` and
            name of the folder below PROCESSED_PATH.
        num_samples: Number of images requested.
        use: Currently not read by the function.

    Returns:
        (x, y): a list of tf.Variable image arrays and a list with the
        category index of each image. May hold fewer than `num_samples`
        entries if every category runs out of images first.

    Raises:
        Exception: if `num_per_dataset[dataset]` is smaller than `num_samples`.
    """
    images_left = num_per_dataset[dataset]
    if images_left < num_samples:
        raise Exception("Insufficient amount of unseen samples in dataset!")

    categories = categories_per_dataset[dataset]
    samples_per_category = math.ceil(num_samples / len(categories))

    if samples_per_category == 0:
        print("Warning: number of samples less than number of categories, last n categories won't be picked from!")
        samples_per_category = 1

    # List every category folder once instead of on every pass of the loop.
    category_files = []
    for category in categories:
        path = os.path.join(PROCESSED_PATH, dataset, category)
        category_files.append((path, os.listdir(path)))

    # x = batch, y = labels
    x = []
    y = []

    # iterate until all samples evenly distributed or no more left in dataset
    while num_samples > 0 and images_left > 0:
        picked_this_pass = 0

        # iterate starting from first category
        for i, (path, images) in enumerate(category_files):

            # pick from each category samples_per_category
            # or the remaining num_samples if less than samples_per_category
            for _ in range(min(samples_per_category, num_samples)):

                # category exhausted: nothing more to read from it
                if last_indices[i] >= len(images):
                    break

                # load the next image and close the file right away
                image_path = os.path.join(path, images[last_indices[i]])
                with Image.open(image_path) as image:
                    x.append(tf.Variable(tf.keras.preprocessing.image.img_to_array(image)))
                y.append(i)

                # move indices
                last_indices[i] += 1
                num_samples -= 1
                images_left -= 1
                picked_this_pass += 1

        # Every category is exhausted; without this the loop would never end
        # while images_left is still positive.
        if picked_this_pass == 0:
            break

    return x, y


for i in range(5):
    x, y = get_batch('mushrooms', 6500)
    print("y = ", y)
    print(last_indices)


In [None]:
def accuracy(y1: "tf.Variable", y2: "tf.Variable"):
    """Return the fraction of positions at which `y1` and `y2` are equal.

    Args:
        y1: Object exposing `.numpy()`, e.g. a tf.Variable.
        y2: Object exposing `.numpy()`, compared against `y1`.

    Returns:
        0.0 if the two arrays differ in shape or are empty; otherwise the
        number of equal entries along the first axis divided by its length.
    """
    a, b = y1.numpy(), y2.numpy()
    if a.shape != b.shape:
        return 0.0
    if len(a) == 0:
        # nothing to compare; avoids dividing by zero
        return 0.0
    matches = sum(left == right for left, right in zip(a, b))
    return matches / len(a)

def train_batch(model, x, y, loss_fn, optimizer_fn, metric_fn=None):
    """Run the model on one batch and, if an optimizer is given, update it.

    Args:
        model: Callable mapping `x` to predictions; must expose
            `trainable_variables` when `optimizer_fn` is given.
        x: Batch of inputs passed to `model`.
        y: Targets passed to `loss_fn` and `metric_fn` as first argument.
        loss_fn: Callable `(y, pred) -> loss`.
        optimizer_fn: Optimizer with `apply_gradients`, or None to only
            evaluate the batch without changing the model.
        metric_fn: Optional callable `(y, pred) -> metric`.

    Returns:
        (pred, loss, metric); `metric` is None when `metric_fn` is None.
    """
    tape = None
    if optimizer_fn is None:
        pred = model(x)
        loss = loss_fn(y, pred)
    else:
        # Both the forward pass and the loss must run inside the tape;
        # otherwise the loss is not recorded and the gradients come back None.
        with tf.GradientTape() as tape:
            pred = model(x)
            loss = loss_fn(y, pred)

    metric = None
    if metric_fn is not None:
        metric = metric_fn(y, pred)

    if optimizer_fn is not None:
        grad = tape.gradient(loss, model.trainable_variables)
        # apply_gradients expects (gradient, variable) pairs, in that order.
        optimizer_fn.apply_gradients(zip(grad, model.trainable_variables))
    return pred, loss, metric

def train(model, x, y, epochs, batch_size, loss_fn, optimizer_fn, metric_fn=None):
    """Train `model` for `epochs` epochs and evaluate it after every batch.

    Each epoch runs `len(y) // batch_size` steps. Every step fetches one
    training batch and one evaluation batch through `get_batch`, updates the
    model on the former and only evaluates it on the latter.

    NOTE(review): the batches come from the module-level names
    `train_dataset` and `test_dataset`, which are not defined in this cell;
    `x` is not read and `y` only determines the number of steps -- confirm
    the intended data source.

    Args:
        model: Model passed on to `train_batch`.
        x: Not read by the function.
        y: Only its length is used, to derive the number of steps per epoch.
        epochs: Number of epochs.
        batch_size: Number of samples requested per batch.
        loss_fn: Callable `(y, pred) -> loss`.
        optimizer_fn: Optimizer used for the training step.
        metric_fn: Optional callable `(y, pred) -> metric`.

    Returns:
        (losses, metrics, test_losses, test_metrics): four lists with one
        entry per epoch, each the sum of the per-batch values of that epoch.
        The metric lists stay at 0 when `metric_fn` is None.
    """
    # number of batches per epoch
    offset = len(y) // batch_size

    # per-epoch accumulators for losses and metrics
    losses = [0] * epochs
    metrics = [0] * epochs
    test_losses = [0] * epochs
    test_metrics = [0] * epochs

    for i in tqdm(range(epochs)):
        for _ in range(offset):
            # training batch
            tensors, labels = get_batch(train_dataset, batch_size)

            # evaluation batch
            test_tensors, test_labels = get_batch(test_dataset, batch_size)

            # training step: updates the model
            _, loss, metric = train_batch(
                model, tensors, labels, loss_fn, optimizer_fn, metric_fn)

            # evaluation step on the held-out batch: no optimizer, so the
            # model is left unchanged
            _, test_loss, test_metric = train_batch(
                model, test_tensors, test_labels, loss_fn, None, metric_fn)

            # add losses together
            losses[i] += loss
            test_losses[i] += test_loss

            # add metrics together (only available if metric_fn is defined)
            if metric_fn is not None:
                metrics[i] += metric
                test_metrics[i] += test_metric

    # Hand the histories back so the caller can plot them.
    return losses, metrics, test_losses, test_metrics

In [None]:
# Training hyperparameters
batch_size = 32
num_epochs = 2  # keep small to run faster
learning_rate = 5e-4

optimizer = tf.keras.optimizers.Adam(learning_rate) # define our optimizer

# Parameters of the CNN: base filter count; cnn_classifier uses 2x, 4x and 6x
# this value for its second to fourth convolution layers.
num_filters = 5

#make model
def cnn_classifier(num_outputs):
    """Assemble the convolutional classifier as a tf.keras.Sequential model.

    Four stride-2 convolution stages, each followed by batch normalization
    and (except after the last stage) 2x2 max pooling, then a flatten layer,
    a 512-unit ReLU layer and a softmax layer with `num_outputs` units.
    The filter counts of stages two to four scale with the module-level
    `num_filters`. No input shape is declared here.

    Args:
        num_outputs: Number of units of the final softmax layer.

    Returns:
        The tf.keras.Sequential model (not compiled).
    """
    layers = tf.keras.layers
    conv = functools.partial(layers.Conv2D, padding='same', activation='relu', strides=2)
    pool = functools.partial(layers.MaxPool2D, pool_size=(2, 2), strides=None, padding="same")

    # (filters, kernel_size) of the convolution stages, in order
    conv_stages = [
        (32, 5),
        (2 * num_filters, 5),
        (4 * num_filters, 3),
        (6 * num_filters, 3),
    ]

    stack = []
    last_stage = len(conv_stages) - 1
    for stage, (filters, kernel_size) in enumerate(conv_stages):
        stack.append(conv(filters=filters, kernel_size=kernel_size))
        stack.append(layers.BatchNormalization())
        # the last convolution stage feeds the flatten layer without pooling
        if stage != last_stage:
            stack.append(pool())

    stack.append(layers.Flatten())
    stack.append(layers.Dense(512, activation='relu'))
    stack.append(layers.Dense(num_outputs, activation="softmax"))
    return tf.keras.Sequential(stack)


cnn_model = cnn_classifier(5)