## Data Scraping
We will be scraping acne images from https://dermnetnz.org/

In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import shutil

def download_image(image_url, image_name):
    url = image_url
    response = requests.get(url, stream=True)
    with open(image_name, 'wb') as out_file:
        shutil.copyfileobj(response.raw, out_file)
    del response

In [4]:
def scrape_dermnet(url):
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.content, 'html.parser')
    textBlock = soup.find('section', class_="textBlock")
    imgs = textBlock.findAll('img')
    img_dataset = pd.DataFrame([[img.attrs['alt'], "https://dermnetnz.org/" + img.attrs['data-src']] for img in imgs], columns=['alt', 'image_url'])
    img_dataset['image_name'] = [f'data/image_{e}.jpg' for e in range(img_dataset.shape[0])]
    return img_dataset

In [5]:
# Download images
img_dataset = scrape_dermnet("https://dermnetnz.org/topics/acne-face-images")
for row in img_dataset.itertuples():
    download_image(row.image_url, row.image_name)

FileNotFoundError: [Errno 2] No such file or directory: 'data/image_0.jpg'

## Sliding Window

To construct smaller images based on size of sliding window

In [6]:
from os import listdir, rename
from os.path import isfile, join
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
import random

In [7]:
images = [join('scraped_data', f) for f in listdir('scraped_data') if isfile(join('scraped_data', f))]

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'scraped_data'

In [39]:
# Set size of sliding window
window_size = 50

In [68]:
# Slicing images
counter = 0
for image_path in images:
    img = np.asarray(Image.open(image_path))
    for i in range(0,img.shape[0],window_size):
        for j in range(0,img.shape[1],window_size):
            sub_img = img[i:i+window_size, j:j+window_size, :]
            # Save image
            img_name = image_path.split('/')[-1].split('.')[0]
            Image.fromarray(sub_img).save(f'temp_dataset/{counter}.jpg')
            counter += 1

In [81]:
# Create csv for tagging
cropped_images = sorted([f for f in listdir('temp_dataset') if isfile(join('temp_dataset', f))])
shuffled_images_name = cropped_images.copy()
random.shuffle(shuffled_images_name)
filenames_rows = list(zip(cropped_images, shuffled_images_name))

In [82]:
tagging_df = pd.DataFrame(filenames_rows, columns=['Image File', 'New Image Name'])

In [87]:
# Rename files with shuffled names
for idx, row in tagging_df.iterrows():
    rename(f'temp_dataset/{row["Image File"]}', f'dataset/{row["New Image Name"]}')

In [83]:
tagging_df.head()

Unnamed: 0,Image File,New Image Name
0,0.jpg,2193.jpg
1,1.jpg,4976.jpg
2,10.jpg,2599.jpg
3,100.jpg,6201.jpg
4,1000.jpg,1952.jpg


In [None]:
tagging_df.to_csv('original_filename.csv', index=False)

## Modelling

In [41]:
from os import listdir, rename
from os.path import isfile, join
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from collections import Counter
from skimage.transform import rotate, AffineTransform, warp
from skimage.util import random_noise

In [42]:
# !pip3 install scikit-image

In [32]:
labels = pd.read_csv('labels.csv')

In [33]:
labels.head()

Unnamed: 0,filename,label
0,0.jpg,1
1,1.jpg,1
2,2.jpg,5
3,3.jpg,4
4,4.jpg,1


In [34]:
labels.shape

(1501, 2)

### Helper Functions

In [35]:
def load_images(labels):
    X = []
    y = []
    for row in labels.itertuples():
        img_path = join('dataset', row.filename)
        img = np.asarray(Image.open(img_path))
        
        # Keep squares for now
        if img.shape[0] == img.shape[1]:
            y.append(row.label)
            X.append(img)
    return X, y


In [65]:
def augment_data(X_train, y_train):
    final_X_train = []
    final_y_train = []
    for i in range(len(X_train)):
        final_X_train.append(X_train[i])
        final_X_train.append(rotate(X_train[i], angle=45, mode = 'wrap'))
        final_X_train.append(np.fliplr(X_train[i]))
        final_X_train.append(np.flipud(X_train[i]))
        final_X_train.append(random_noise(X_train[i],var=0.2**2))
        
        final_y_train += [y_train[i]] * 5
    return final_X_train, final_y_train

In [66]:
def get_accuracy(scores, labels):
    num_data = scores.size(0)
    predicted_labels = scores.argmax(dim=1)
    indicator = (predicted_labels == labels)
    num_matches = indicator.sum()
    return 100*num_matches.float()/num_data  

def get_error( scores , labels ):

    bs=scores.size(0)
    predicted_labels = scores.argmax(dim=1)
    indicator = (predicted_labels == labels)
    num_matches=indicator.sum()
    
    return 1-num_matches.float()/bs   

def eval_on_test_set(test_data, test_label, net, mean, std, bs):

    running_error=0
    running_acc = 0
    num_batches=0

    for i in range(0,500,bs):

        minibatch_data =  test_data[i:i+bs]
        minibatch_label = test_label[i:i+bs]
        
        inputs = (minibatch_data - mean)/std

        scores= net( inputs ) 

        error = get_error( scores.detach() , minibatch_label)
        acc = get_accuracy( scores.detach() , minibatch_label)

        running_error += error.item()
        running_acc += acc.item()

        num_batches+=1

    total_error = running_error/num_batches
    total_acc = running_acc/num_batches
    print(running_error, num_batches)
    print( 'error rate on test set =', total_error*100 ,'percent')
    print('accuracy =', total_acc)

In [67]:
X, y = load_images(labels)

In [68]:
Counter(y)

Counter({5: 146, 4: 81, 2: 202, 1: 264, 3: 91, 0: 194})

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 13, stratify=y)

In [70]:
X_train, y_train = augment_data(X_train, y_train)

### Training

In [48]:
import torch

In [97]:
class convnet(torch.nn.Module):

    def __init__(self, input_channel, hidden_layer, num_class):

        super(convnet, self).__init__()
      
        self.conv1a = torch.nn.Conv2d(input_channel, hidden_layer,  kernel_size=5, padding=1 )
        self.pool1a  = torch.nn.MaxPool2d(2,2)
        self.conv1b = torch.nn.Conv2d(hidden_layer,  hidden_layer,  kernel_size=5, padding=1 )
        self.pool1b  = torch.nn.MaxPool2d(2,2)
        
        self.linear1 = torch.nn.Linear(12100, num_class)

    def forward(self, x):
        x = self.conv1a(x)
        x = torch.nn.functional.relu(x)
        x = self.pool1a(x)
        x = self.conv1b(x)
        x = torch.nn.functional.relu(x)
        x = self.pool1b(x)
        
        x = x.view(-1, 12100)
        x = self.linear1(x)
        
        return x

In [98]:
def train_network(X_train, y_train, X_test, y_test):
    X_train = torch.from_numpy(np.array(X_train).astype(np.float32))
    y_train = torch.from_numpy(np.array(y_train).astype(np.float32)).type(torch.LongTensor)
    X_test = torch.from_numpy(np.array(X_test).astype(np.float32))
    y_test = torch.from_numpy(np.array(y_test).astype(np.float32)).type(torch.LongTensor)
    X_train = X_train.view(-1, 3, 50, 50)
    X_test = X_test.view(-1, 3, 50, 50)
    
    my_lr = 0.01
    criterion = torch.nn.CrossEntropyLoss()
    bs = 128
    num_class = 6

    net = convnet(3, 100, num_class)
    mean = X_train.mean()
    std = X_train.std()
    num_train_data = X_train.size()[0]
    
    for epoch in range(5):
        # create a new optimizer at the beginning of each epoch: give the current learning rate.   
        optimizer=torch.optim.Adam( net.parameters() , lr=my_lr )

        # set the running quatities to zero at the beginning of the epoch
        running_loss=0
        running_error=0
        running_acc = 0
        num_batches=0

        # set the order in which to visit the image from the training set
        shuffled_indices = torch.randperm(num_train_data)

        for count in range(0,num_train_data,bs):

            # Set the gradients to zeros
            optimizer.zero_grad()

            # create a minibatch       
            indices = shuffled_indices[count:count+bs]
            minibatch_data =  X_train[indices]
            minibatch_label =  y_train[indices]


            # normalize the minibatch (this is the only difference compared to before!)
            inputs = (minibatch_data - mean)/std

            # tell Pytorch to start tracking all operations that will be done on "inputs"
            inputs.requires_grad_()

            # forward the minibatch through the net 
            scores=net( inputs ) 

            # Compute the average of the losses of the data points in the minibatch
            loss =  criterion( scores , minibatch_label) 

            # backward pass to compute dL/dU, dL/dV and dL/dW   
            loss.backward()

            # do one step of stochastic gradient descent: U=U-lr(dL/dU), V=V-lr(dL/dU), ...
            optimizer.step()


            # START COMPUTING STATS

            # add the loss of this batch to the running loss
            running_loss += loss.detach().item()

            # compute the error made on this batch and add it to the running error       
            error = get_error( scores.detach() , minibatch_label)
            acc = get_accuracy(scores.detach() , minibatch_label)
            running_error += error.item()
            running_acc += acc.item()

            num_batches+=1        


        # compute stats for the full training set
        total_loss = running_loss/num_batches
        total_error = running_error/num_batches
        total_acc = running_acc/num_batches


        print('epoch=',epoch, '\t loss=', total_loss , '\t error=', total_error*100 ,'percent')
        print('accuracy =', total_acc)
        print(' ')

    eval_on_test_set(X_test, y_test, net, mean, std, bs)


In [99]:
train_network(X_train, y_train, X_test, y_test)

epoch= 0 	 loss= 16.275446553384104 	 error= 79.02145731833673 percent
accuracy = 20.978542635517737
 
epoch= 1 	 loss= 1.7324328768637873 	 error= 74.63277655263101 percent
accuracy = 25.367223493514523
 
epoch= 2 	 loss= 1.713539638826924 	 error= 72.99467171392133 percent
accuracy = 27.0053283322242
 
epoch= 3 	 loss= 1.7133833054573304 	 error= 73.03643418896583 percent
accuracy = 26.96356564183389
 
epoch= 4 	 loss= 1.7130451394665627 	 error= 73.07819704855642 percent
accuracy = 26.921803012970955
 


NameError: name 'bs' is not defined