# **0. CONNECT TO DRIVE**

In [1]:
from google.colab import drive

In [2]:
# Attach Google Drive to the Colab filesystem; force_remount refreshes an
# existing mount so the cell can be re-run safely.
drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


# **1. IMPORT LIBRARIES**

In [3]:
!pip install split-folders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting split-folders
  Downloading split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split-folders
Successfully installed split-folders-0.5.1


In [4]:
import os
import splitfolders
import numpy as np
import matplotlib.pyplot as plt
import cv2 as cv
import tensorflow as tf
from PIL import Image
from google.colab import files

# **2. UNZIP THE DATASET**

Data sources:
<br>https://data.mendeley.com/datasets/t2r6rszp5c/1
<br>https://data.mendeley.com/datasets/tgv3zb82nd/1

In [None]:
!!unzip '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Dataset/Cerscospora-20210326T085017Z-001.zip' -d '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Dataset/Raw'

In [None]:
# Extract the Healthy archive from Drive into the shared Raw/ folder.
!unzip '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Dataset/Healthy-20210326T083815Z-001.zip' -d '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Dataset/Raw'

In [None]:
# Extract the Leaf rust archive from Drive into the shared Raw/ folder.
!unzip '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Dataset/Leaf rust-20210326T083416Z-001.zip' -d '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Dataset/Raw'

In [None]:
# Extract the Phoma archive from Drive into the shared Raw/ folder.
!unzip '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Dataset/Phoma-20210326T082051Z-001.zip' -d '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Dataset/Raw'

In [None]:
# Extract the Miner archive from Drive into the shared Raw/ folder.
!unzip '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Dataset/Miner-20210326T082341Z-001.zip' -d '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Dataset/Raw'

# **3. EXPLORE THE DATA**

In [5]:
# Dataset root on the mounted Drive; every other path is derived from it.
ROOT_PATH = (
    '/content/drive/MyDrive/Omdena/'
    'Challenge: Disease Detection in Coffee Plants/'
    'Dataset'
)

In [6]:
# Folder holding the extracted archives, one sub-folder per class.
raw_data = os.path.join(ROOT_PATH, 'Raw')

In [7]:
# Show the class folders found under Raw/. os.listdir returns entries in
# arbitrary order, so positions in this list should not be relied upon.
os.listdir(raw_data)

['Leaf rust', 'Phoma', 'Miner', 'Cerscospora', 'Healthy']

In [8]:
# Per-class raw image folders, addressed by folder name.
# Indexing into os.listdir() is unsafe here: the listing order is arbitrary
# (the recorded order is Leaf rust, Phoma, Miner, Cerscospora, Healthy), so
# positional lookups bind each variable to the wrong class.
cerc_raw = os.path.join(raw_data, 'Cerscospora')
healthy_raw = os.path.join(raw_data, 'Healthy')
lr_raw = os.path.join(raw_data, 'Leaf rust')
phoma_raw = os.path.join(raw_data, 'Phoma')
miner_raw = os.path.join(raw_data, 'Miner')

In [9]:
# Scan every image in every class folder and collect the unreadable ones.
#
# Image.open() is lazy: it only identifies the file and does not validate the
# image data, so verify() is called explicitly to detect broken files. The
# context manager closes each file handle; without it, tens of thousands of
# handles stay open until garbage collection.

corrupted = []
for category in sorted(os.listdir(raw_data)):
  category_dir = os.path.join(raw_data, category)
  for image in sorted(os.listdir(category_dir)):
    image_path = os.path.join(category_dir, image)
    try:
      with Image.open(image_path) as img:
        img.verify()
    except Exception:
      # `Exception` rather than a bare `except`, so that KeyboardInterrupt
      # can still stop this long scan.
      corrupted.append(image_path)
      print('Corrupted: {}'.format(image_path))

In [10]:
# Number of image paths collected in `corrupted` by the scan.
len(corrupted)

0

In [11]:
# Report how many files each class folder under Raw/ contains.

count_template = 'Number of image in {} class: {} images'
for category in os.listdir(raw_data):
  category_dir = os.path.join(raw_data, category)
  n_images = len(os.listdir(category_dir))
  print(count_template.format(category, n_images))

Number of image in Leaf rust class: 8336 images
Number of image in Phoma class: 6571 images
Number of image in Miner class: 16978 images
Number of image in Cerscospora class: 7681 images
Number of image in Healthy class: 18983 images


# **4. SPLIT IMAGES INTO TRAIN, VAL, AND TEST FOLDERS**

In [14]:
# Source (raw class folders) and destination (split output) directories.
in_data = os.path.join(ROOT_PATH, 'Raw')
out_data = os.path.join(ROOT_PATH, 'Train_Val_Test')

# Create the destination up front so that listing it works on a fresh Drive
# instead of raising FileNotFoundError; a no-op when it already exists.
os.makedirs(out_data, exist_ok=True)

In [15]:
# Show the contents of the source and destination folders side by side
# (the destination is expected to be empty before the split is run).
os.listdir(in_data), os.listdir(out_data)

(['Leaf rust', 'Phoma', 'Miner', 'Cerscospora', 'Healthy'], [])

In [16]:
# Copy each class folder into train/val/test sub-folders of out_data using an
# 80/10/10 ratio. The fixed seed keeps the assignment repeatable. The recorded
# run took roughly 16 minutes for 58,549 files.
splitfolders.ratio(
    in_data,
    output=out_data,
    seed=1337,
    ratio=(.8, .1, .1),
)

Copying files: 58549 files [16:28, 59.26 files/s]


In [17]:
# Show the split folders that now exist under the destination.
os.listdir(out_data)

['train', 'val', 'test']

In [18]:
# For each split folder, print its name followed by the file count of every
# class folder inside it, then a blank separator line.
for folder in os.listdir(out_data):
  split_dir = os.path.join(out_data, folder)
  print(folder)
  for category in os.listdir(split_dir):
    n_images = len(os.listdir(os.path.join(split_dir, category)))
    print('{}: {} images'.format(category, n_images))
  print('')

train
Leaf rust: 6668 images
Phoma: 5256 images
Miner: 13582 images
Cerscospora: 6144 images
Healthy: 15186 images

val
Leaf rust: 833 images
Phoma: 657 images
Miner: 1697 images
Cerscospora: 768 images
Healthy: 1898 images

test
Leaf rust: 835 images
Phoma: 658 images
Miner: 1699 images
Cerscospora: 769 images
Healthy: 1899 images



In [None]:
# Recursively archive the Train_Val_Test folder into Train_Val_Test.zip on Drive.
# NOTE(review): the input is given as an absolute path, so entries in the
# archive presumably carry the full content/drive/... prefix; confirm this is
# the layout downstream consumers expect.
!zip -r '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Dataset/Train_Val_Test.zip' '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Dataset/Train_Val_Test'

In [21]:
# Hand the zipped split dataset to the browser for download via Colab's
# `files` helper.
files.download('/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Dataset/Train_Val_Test.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>