<a href="https://colab.research.google.com/github/sovb/Garbage-Classification/blob/main/Garbage_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

We will begin by enabling autoreload so that we can "reimport" a module without having to restart Python completely. This is because, after the first import, the module is cached in the `sys.modules` dictionary and the interpreter will not import it again while it is present there.
As for matplotlib inline, with this backend, the output of plotting commands is displayed inline within frontends like the Jupyter notebook, directly below the code cell that produced it. The resulting plots will then also be stored in the notebook document.

In [1]:
## Load (or reload) the autoreload extension and reload all modules before
## each cell runs, so edited modules are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2
## Draw matplotlib figures inline, directly below the cell that produced them.
%matplotlib inline


In [2]:
from fastai.vision import *
from fastai.metrics import error_rate
from pathlib import Path
from glob2 import glob
from sklearn.metrics import confusion_matrix
#added by sovra
import scipy
from scipy import *
import skimage
from skimage import filters
from skimage import io 
import matplotlib.pyplot as plt  #added by sovra
import pandas as pd
import numpy as np
import os
import zipfile as zf
import shutil
import re
import seaborn as sns
import random

**Extract Data from TrashNet dataset zip file**
https://github.com/garythung/trashnet/blob/master/data/dataset-resized.zip

In [3]:
## Unpack the TrashNet archive into the current working directory.
## The `with` block closes the archive even when extraction raises.
## NOTE(review): the archive path assumes Google Drive is mounted under
## /content/drive - confirm before running outside Colab.
with zf.ZipFile("/content/drive/MyDrive/Colab Notebooks/dataset-resized.zip",'r') as files:
    files.extractall()

In [4]:
## Show the top-level entries of the extracted dataset folder.
os.listdir(os.path.abspath("dataset-resized"))

['.DS_Store', 'trash', 'plastic', 'glass', 'cardboard', 'paper', 'metal']

**Organize images into different folders**
1.   Cardboard
2.   Glass
3.   Paper
4.   Metal
5.   Plastic
6.   Trash

The train, validation and test ratio used here is 50-25-25; however, we can modify this to 70-15-15 later on and see if there is a difference.

In [5]:
def split_indices(folder,seed1,seed2,train_frac=.5,valid_frac=.5):
    """Randomly split the 1-based indices of a folder into train/valid/test.

    Indices run from 1 to n, where n is the number of entries in `folder`.

    Args:
        folder: path of the folder whose entries are counted.
        seed1: seed given to the `random` module before drawing the train sample.
        seed2: seed given to the `random` module before drawing the validation sample.
        train_frac: fraction of all indices drawn for training (default 0.5).
        valid_frac: fraction of the *remaining* indices drawn for validation
            (default 0.5); whatever is left becomes the test set.
            The defaults give the original 50-25-25 split.

    Returns:
        Tuple (train, valid, test) of index lists; the lists are pairwise
        disjoint and together cover 1..n.

    Raises:
        ValueError: if either fraction lies outside [0, 1].
    """
    if not 0 <= train_frac <= 1 or not 0 <= valid_frac <= 1:
        raise ValueError("train_frac and valid_frac must lie in [0, 1]")

    n = len(os.listdir(folder))
    full_set = list(range(1,n+1))

    ## train indices
    random.seed(seed1)
    train = random.sample(full_set,int(train_frac*n))

    ## everything that was not drawn for training
    remain = list(set(full_set)-set(train))

    ## separate remaining into validation and test
    random.seed(seed2)
    valid = random.sample(remain,int(valid_frac*len(remain)))
    test = list(set(remain)-set(valid))

    return train,valid,test

def get_names(waste_type,indices):
    """Build the image file names of one waste category.

    Every index `i` yields the name `<waste_type><i>.jpg`; the result keeps
    the order of `indices`.
    """
    file_names = []
    for index in indices:
        file_names.append(waste_type+str(index)+".jpg")
    return file_names

def move_files(source_files,destination_folder):
    """Move every file in `source_files` into `destination_folder`.

    The destination folder is created first when it does not exist yet.
    Without that, `shutil.move` treats a missing destination as the target
    file name, so files would be moved onto that one path instead of into
    a folder.

    Args:
        source_files: iterable of paths of the files to move.
        destination_folder: folder that receives the files.

    Returns:
        None.
    """
    os.makedirs(destination_folder,exist_ok=True)
    for file in source_files:
        shutil.move(file,destination_folder)

In [6]:
## subsets that get one sub-folder per class: train/cardboard, train/glass, ...
subsets = ['train','valid']
waste_types = ['cardboard','glass','metal','paper','plastic','trash']

## make sure every data/<subset>/<waste type> folder exists
for subset in subsets:
    for waste_type in waste_types:
        folder = os.path.join('data',subset,waste_type)
        if os.path.exists(folder):
            continue
        os.makedirs(folder)

## the test images share one flat folder
test_folder = os.path.join('data','test')
if not os.path.exists(test_folder):
    os.makedirs(test_folder)

## distribute the images of each waste type over train, valid and test
for waste_type in waste_types:
    source_folder = os.path.join('dataset-resized',waste_type)
    train_ind, valid_ind, test_ind = split_indices(source_folder,1,1)

    ## (indices, destination) pairs, handled in train -> valid -> test order;
    ## data/test has no class sub-folders because test images may be mixed
    destinations = (
        (train_ind,"data/train/"+waste_type),
        (valid_ind,"data/valid/"+waste_type),
        (test_ind,"data/test"),
    )
    for indices, destination in destinations:
        names = get_names(waste_type,indices)
        source_files = [os.path.join(source_folder,name) for name in names]
        move_files(source_files,destination)

In [7]:
## root folder of the organised images: <current working directory>/data
path = Path(os.getcwd()).joinpath("data")
path

PosixPath('/content/data')

https://concordiauniversity.on.worldcat.org/search?queryString=Recyclable%20waste%20image%20recognition%20based%20on%20deep%20learning&databaseList= 

We pre-process the training and validation images here: each input RGB image is converted into a single-channel gray image, Gaussian-blurred, cropped to its central region, rotated by 45 degrees, resized to 256x256 pixels, converted back to a three-channel RGB image and min-max normalized.
Test images are only resized to 256x256 pixels and normalized.

In [9]:
## Pre-process the organised images in place (every file is overwritten).
##   train / valid : gray -> Gaussian blur -> centre crop -> 45 degree rotation
##                   -> resize to 256x256 -> back to 3 channels -> min-max scaling
##   test          : resize to 256x256 -> min-max scaling
## NOTE(review): because the files are overwritten, re-running this cell applies
## the transforms a second time; re-create the data folder before re-running.
## NOTE(review): train/valid images are transformed far more heavily than the
## test images - confirm that this mismatch is intended.


def _min_max_normalize(image):
    """Scale `image` linearly to [0, 1]; a constant image becomes all zeros."""
    lowest = np.min(image)
    value_range = np.max(image) - lowest
    if value_range == 0:
        ## a single-colour image would otherwise produce 0/0 = NaN pixels
        return np.zeros_like(image, dtype=float)
    return (image - lowest) / value_range


def _augment_image(og_image):
    """Transform applied to every train/valid image (assumes an RGB array)."""
    image = skimage.color.rgb2gray(og_image) #colour => b&w
    image = skimage.filters.gaussian(image, sigma = 3, truncate = 3) #gaussian blur => removes noise
    lx, ly = image.shape
    image = image[lx//4:-lx//4, ly//4:-ly//4] #keep the central region
    image = scipy.ndimage.rotate(image, 45, reshape=False) #rotate by 45 degrees
    image = skimage.transform.resize(image, (256, 256)) #resize to 256x256 pixels
    image = skimage.color.gray2rgb(image) #from gray back to rgb
    return _min_max_normalize(image)


def _resize_image(og_image):
    """Transform applied to every test image: resize and normalize only."""
    image = skimage.transform.resize(og_image, (256, 256)) #resize to 256x256 pixels
    return _min_max_normalize(image)


def _preprocess_in_place(pattern, transform):
    """Apply `transform` to every file matching `pattern`, overwriting the file."""
    ## glob once, up front: listing the directory again on every iteration is
    ## quadratic and could skip or repeat files if the listing order changed
    for image_path in sorted(glob(pattern)):
        io.imsave(image_path, transform(skimage.io.imread(image_path)))


## train and valid have one sub-folder per class, test is a flat folder;
## `path` is the <cwd>/data folder the images were moved into
_preprocess_in_place(str(path/'train'/'*'/'*'), _augment_image)
_preprocess_in_place(str(path/'valid'/'*'/'*'), _augment_image)
_preprocess_in_place(str(path/'test'/'*'), _resize_image)

## Disabled plotting snippet, kept as a bare string; being the last
## expression of the cell it is echoed as the cell output.
"""#plotting code
i, (im1) = plt.subplots(1)
i.set_figwidth(15)
im1.imshow(image, cmap = 'gist_rainbow')
"""





"#plotting code\ni, (im1) = plt.subplots(1)\ni.set_figwidth(15)\nim1.imshow(image, cmap = 'gist_rainbow')\n"

Now that we have preprocessed, 
**LET'S START TRAINING!!!**

In [None]:
## `data` is not defined by any earlier cell visible in this notebook, so this
## cell raised NameError on a fresh kernel. Build the DataBunch from the folder
## layout created above: data/train/<class>, data/valid/<class> and the
## unlabelled data/test folder.
## NOTE(review): batch size and augmentations are left at the fastai defaults -
## confirm against the settings used for the reported results.
data = ImageDataBunch.from_folder(path,test="test")

## CNN learner on a resnet34 backbone, reporting the error rate as metric
learn = create_cnn(data,models.resnet34,metrics=error_rate)

In [None]:
## Echo the network held by the learner as the cell output.
learn.model

In [None]:
## Run the learning-rate finder between 1e-6 and 1e1, then plot what the
## recorder captured during that run.
learn.lr_find(start_lr=1e-6,end_lr=1e1)
learn.recorder.plot()

In [None]:
## Draw the recorder plot again (same call as at the end of the previous cell).
learn.recorder.plot()

In [None]:
## Train with the one-cycle schedule; the first argument (2) is presumably the
## number of epochs - TODO confirm against the fastai version in use.
## NOTE(review): max_lr=5.13e-03 was presumably read off the learning-rate
## finder plot - confirm, and re-check it whenever the data changes.
learn.fit_one_cycle(2,max_lr=5.13e-03)

In [None]:
## Build an interpretation object from the trained learner, then fetch the
## top losses together with their matching indices.
interp = ClassificationInterpretation.from_learner(learn)
losses,idxs = interp.top_losses()

In [None]:
## Plot the 6 samples with the highest loss.
interp.plot_top_losses(6, figsize=(15,11))

In [None]:
## Show the documentation of plot_top_losses, then draw the confusion matrix
## of the interpretation object.
doc(interp.plot_top_losses)
interp.plot_confusion_matrix(figsize=(12,12), dpi=60)

In [None]:
## List the most-confused class pairs, using a minimum count of 2 (min_val=2).
interp.most_confused(min_val=2)

In [None]:
## Predictions for the test set; preds[0] is used below as the per-class
## scores of every test image.
preds = learn.get_preds(ds_type=DatasetType.Test)

In [None]:
## Class names of the DataBunch; the predicted class indices computed below
## are positions in this list.
data.classes

In [None]:
## For every test image, store the index of the class with the highest
## predicted score (argmax along axis 1 of preds[0]).
## NOTE(review): assumes preds[0] is (n_images, n_classes) - TODO confirm.
max_idxs = np.asarray(np.argmax(preds[0],axis=1))

In [None]:
## translate every predicted class index back into its class name
result = [data.classes[max_idx] for max_idx in max_idxs]

In [None]:
## Echo the complete list of predicted labels as the cell output.
result

In [None]:
## Echo element [0][0] of the test dataset - presumably the first test image.
learn.data.test_ds[0][0]

In [None]:
## Recover the true label of every test image from its file name,
## e.g. ".../test/cardboard12.jpg" -> "cardboard".
## Only the base name is searched: matching against the whole path would return
## the first "<letters><digits>" run anywhere in it, so a parent directory such
## as "user1" would be mistaken for the label.
pattern = re.compile("([a-z]+)[0-9]+")

y = []
for label_path in data.test_ds.items:
    ## items are path objects, so convert to str before taking the base name
    file_name = os.path.basename(str(label_path))
    match = pattern.search(file_name)
    if match is None:
        ## fail with the offending name instead of an opaque AttributeError
        raise ValueError("cannot read a waste type from file name: "+file_name)
    y.append(match.group(1))

In [None]:
## first five predicted labels, followed by the first five actual labels
print(result[:5])
print(y[:5])

In [None]:
## Echo element [0][0] of the test dataset again - presumably the first test
## image, shown next to the labels printed above.
learn.data.test_ds[0][0]

In [None]:
## Confusion matrix of actual labels (rows) against predicted labels (columns).
## `labels=waste_types` pins the row/column order and keeps the matrix at one
## row and column per waste type even when a class never occurs, so it always
## lines up with the `waste_types` axis labels used for the heat map.
cm = confusion_matrix(y,result,labels=waste_types)
print(cm)

In [None]:
## label both axes of the confusion matrix with the waste types
df_cm = pd.DataFrame(data=cm,index=waste_types,columns=waste_types)

## annotated heat map showing the integer count of every cell
plt.figure(figsize=(10,8))
sns.heatmap(data=df_cm,annot=True,fmt="d",cmap="YlGnBu")

In [None]:
## correctly classified images sit on the diagonal of the confusion matrix
correct = 0
for k in range(len(cm)):
    correct += cm[k,k]

In [None]:
## accuracy = correctly classified images / all images in the confusion matrix
column_totals = sum(cm)
accuracy = correct/sum(column_totals)
accuracy