In [None]:
# install dependency for Google Colab
!pip install onedrivedownloader

In [None]:
from datetime import timedelta
import math
from time import time
import os
import shutil
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from PIL import Image

from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import *
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

import torch.nn as nn
import torch.optim as optim
import torch
import torchvision
from torchvision.datasets import MNIST
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms
import torchvision.transforms as transforms
from torch.utils.data import random_split, DataLoader
import torch.nn.functional as F

from onedrivedownloader import download

# Fitting a function

## Plotting a curve

In [None]:
# Evaluate sin(x) on an evenly spaced grid covering [0, 2*pi).
# Both arrays are column vectors of shape (n_points, 1).
granularity = 0.05
x = np.arange(0, 2 * np.pi, granularity)[:, np.newaxis]
y = np.sin(x)

In [None]:
# Number of grid points (x is a column vector, so this is its row count)
x.shape[0]

In [None]:
# Draw the noise-free sine curve
fig, ax = plt.subplots()
ax.plot(x, y)
plt.show()

In [None]:
# TODO: sample out some training data and add a bit of noise to it
# NOTE: this cell raises a NameError until `idx_train` is defined, i.e. until the
# commented-out `np.random.choice` line below has been completed and uncommented.

# select random indices (positions into the grid arrays `x` and `y`)
# idx_train = np.random.choice( ?, size=?) # Docs: https://numpy.org/doc/stable/reference/random/generated/numpy.random.choice.html
x_train = x[idx_train]
# add noise, as real world data is rarely perfect
# (the noise term is commented out, so y_train currently holds the exact sine values)
y_train = y[idx_train] # + np.random.normal(loc=?, scale=?, size=(len(idx_train), 1)) # Docs: https://numpy.org/doc/stable/reference/random/generated/numpy.random.normal.html

In [None]:
# Number of training samples selected in the previous cell
len(x_train)

In [None]:
# TODO: plot the training data with the original sine curve
# The line plot shows the noise-free curve; complete the scatter call to overlay the training samples.
plt.plot(x, y)
# plt.scatter( ?, ?, color='orange') # Docs: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.scatter.html
plt.show()

## Learning a curve

In [None]:
# TODO: what classifiers could we try? Can you find more sklearn regressors that work better?
# Can we improve them by changing the hyperparameters? (for instance the hidden_layer_sizes in the MLPRegressor)
# NOTE: the list is empty until at least one of the entries below is uncommented.
classifiers = [
    # RandomForestRegressor(), # Docs: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
    # LinearRegression(), # Docs: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
    # MLPRegressor(), # Docs: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html
]

In [None]:
for clf in classifiers:
  # TODO: train the classifier and evaluate it. We want to find the best one.
  # How can we see the performance of a classifier?
  # Print the estimator so each plot below can be matched to its model.
  print(clf)

  # clf.fit( ?, ? )
  # score = clf.score( ?, ?)
  # print(score)
  # y_pred = clf.predict(?)
  
  # TODO: plot sine, train data, and prediction
  # One figure is shown per estimator; until the calls below are completed it only contains the sine curve.
  plt.plot(x, y)
  # plt.scatter( ?, ?, color='orange')
  # plt.plot( ?, ?, color='green')
  plt.show()


# Image classification example: Character recognition

## Import data

In [None]:
# Load the MNIST training split (downloaded into data_mnist/ if requested files are missing);
# every image is converted to a tensor when it is accessed.
mnist_dataset = datasets.MNIST(
    root='data_mnist/',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)

In [None]:
# Total number of samples in the loaded MNIST split
len(mnist_dataset)

In [None]:
# Hold out 20% of the samples for testing. The explicitly seeded generator makes the
# split identical on every run, so accuracies stay comparable across kernel restarts.
train_data, test_data = random_split(
    mnist_dataset,
    [0.8, 0.2],
    generator=torch.Generator().manual_seed(42),
)

In [None]:
# Sizes of the training and test subsets, one per line
print(len(train_data), len(test_data), sep='\n')

In [None]:
# Batched iterators over both subsets; only the training batches are reshuffled each epoch
MNIST_BATCH_SIZE = 128
train_loader = DataLoader(dataset=train_data, batch_size=MNIST_BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=MNIST_BATCH_SIZE, shuffle=False)

## Plotting

In [None]:
# Display one sample in grayscale and print its label
image, label = mnist_dataset[1000]
to_pil_image = transforms.ToPILImage()
plt.imshow(to_pil_image(image), cmap='gray')
print(f'Label: {label}')

## Training

In [None]:
class MnistModel(nn.Module):
  """Single linear layer that maps a flattened 28x28 image to 10 class scores.

  The forward pass returns raw, unnormalized scores (logits). It is trained with
  nn.CrossEntropyLoss, which applies log-softmax itself, so no softmax is applied
  here; doing so would normalize twice and weaken the gradients. The predicted
  class is unaffected, because argmax over logits equals argmax over softmax.
  Use F.softmax(logits, dim=1) on the output if probabilities are needed.
  """

  def __init__(self):
    super().__init__()
    self.flatten = nn.Flatten()
    self.linear = nn.Linear(28*28, 10)

  def forward(self, x):
    """Return logits of shape (batch, 10) for a batch of images with 28*28 values each."""
    return self.linear(self.flatten(x))


In [None]:
def train(model, device, train_loader, optimizer, criterion, epochs, test_loader=None):
  """Train a classification model and print loss, accuracy and duration per epoch.

  NOTE: a later cell of this notebook defines another function that is also
  called `train` (for the dot-counting task). Once that cell has run, it
  replaces this one, so re-run this cell before training the MNIST model again.

  Args:
    model: module that maps a batch of inputs to per-class scores of shape (batch, n_classes).
    device: torch.device that every batch is moved to; the model must already be on it.
    train_loader: iterable of (data, target) batches used for the optimization steps.
    optimizer: optimizer that holds the model's parameters.
    criterion: loss called as criterion(output, target).
    epochs: number of passes over train_loader.
    test_loader: optional iterable of (data, target) batches; when given, the
      accuracy on it is measured after every epoch.
  """
  for epoch in range(epochs):
    start = time()

    # The evaluation at the end of the previous epoch switched to eval mode.
    model.train()
    loss_sum = 0.0
    n_batches = 0
    n_correct = 0
    n_total = 0
    for data, target in tqdm(train_loader):
      data, target = data.to(device), target.to(device)
      optimizer.zero_grad()
      output = model(data)
      loss = criterion(output, target)
      loss.backward()
      optimizer.step()

      loss_sum += loss.item()
      n_batches += 1
      n_correct += torch.sum(torch.argmax(output, dim=1) == target).item()
      n_total += len(output)
    # Report the mean over all batches instead of the loss of whichever batch came last;
    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    train_loss = loss_sum / max(n_batches, 1)
    train_acc = n_correct / max(n_total, 1)

    print(f'Epoch: {epoch} Loss: {train_loss:.2f} Train acc: {train_acc:.2f}', end='')

    # Explicit None check: the truth value of a loader depends on its length.
    if test_loader is not None:
      model.eval()  # disable training-only behavior such as dropout
      n_correct = 0
      n_total = 0
      with torch.no_grad():
        for data, target in test_loader:
          data, target = data.to(device), target.to(device)
          output = model(data)
          n_correct += torch.sum(torch.argmax(output, dim=1) == target).item()
          n_total += len(output)
      test_acc = n_correct / max(n_total, 1)
      print(f' Test acc: {test_acc:.2f}', end='')

    stop = time()

    print(f' Time: {timedelta(seconds=stop-start)}')
    print()

In [None]:
# Prefer an NVIDIA GPU, then an Apple GPU, and fall back to the CPU otherwise
if torch.cuda.is_available():
    device_id = 'cuda'
elif torch.backends.mps.is_available():
    device_id = 'mps'
else:
    device_id = 'cpu'
device = torch.device(device_id)
print(device)

In [None]:
# Train the linear MNIST classifier for 10 epochs, checking test accuracy after each one
model = MnistModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)
train(
    model=model,
    device=device,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    epochs=10,
    test_loader=test_loader,
)

# Finding our lost red dots

## Import data

In [None]:
# definition of some utility functions to import the data
class ImageDataset(Dataset):
  """Dataset over the image files of one directory.

  The label of an image is parsed from its file name: the integer in front of
  the first underscore.

  Args:
    root_dir: directory that contains the image files.
    transform: optional callable applied to every opened image.
    size: optional number of files to draw at random from the directory;
      a falsy value (None or 0) keeps all files.
    random_seed: seed of the private random generator used for drawing the subset.

  Raises:
    ValueError: if `size` is larger than the number of files in `root_dir`.
  """

  def __init__(self, root_dir, transform=None, size=None, random_seed=42):
    self.root_dir = root_dir
    self.transform = transform
    self.random = random.Random(random_seed)
    # os.listdir returns entries in arbitrary order; sorting makes the seeded
    # sample below pick the same files on every machine and every run.
    self.files = sorted(os.listdir(self.root_dir))
    if size:
      if len(self.files) < size:
        raise ValueError(f'Only found {len(self.files)} files in root directory, but the requested dataset size is {size}')
      self.files = self.random.sample(self.files, size)

  def __getitem__(self, idx):
    """Return (image, dot_count) for the file at position `idx`.

    `image` is the opened image after `transform` (if one was given) and
    `dot_count` is a one-element integer tensor parsed from the file name.
    """
    filename = self.files[idx]
    img_path = os.path.join(self.root_dir, filename)
    img = Image.open(img_path)
    img = self.transform(img) if self.transform is not None else img

    # the file name starts with the label, followed by an underscore
    dot_count = torch.tensor([int(filename.split('_')[0])])

    return img, dot_count

  def __len__(self):
    """Return the number of files in the dataset."""
    return len(self.files)

def prepare_data(path, size=1000, transform=None, batch_size=128, shuffle=True):
  """Build an ImageDataset for `path` and a DataLoader on top of it.

  Args:
    path: directory with the image files.
    size: number of images passed on to ImageDataset.
    transform: per-image transform; transforms.ToTensor() is used when None.
    batch_size: batch size of the returned loader.
    shuffle: whether the returned loader shuffles the data.

  Returns:
    Tuple (dataloader, dataset).
  """
  effective_transform = transforms.ToTensor() if transform is None else transform
  dataset = ImageDataset(path, effective_transform, size)
  return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle), dataset

def download_data_dots(root='data_dots'):
  """Download and unzip the dots dataset into `root`, unless `root` already exists."""
  url = 'https://goforeoy-my.sharepoint.com/:u:/g/personal/sebastian_landl_gofore_com/EcD5rowDcglFp6j1lSHRI2MBL2bPNssdXGyNRxEeh8cvRA?e=kjfNhN'
  filename = 'data_dots.zip'

  if not os.path.exists(root):
    download(url, filename=filename, unzip=True, unzip_path=root, clean=True)
    return

  # an existing folder is taken as a finished earlier download
  print(f'Folder {root} already exists. Not downloading.')

In [None]:
# Fetch the dataset archive and unpack it into the data_dots folder (skipped if that folder exists)
download_data_dots(root='data_dots')

In [None]:
# Data parameters: how many images to load and where to find them.
# More data may improve the model in exchange for longer training times.
train_data_size, test_data_size = 10_000, 1_000
train_data_path, test_data_path = 'data_dots/train', 'data_dots/test'

In [None]:
# Build dataset and loader for each split (this rebinds train_loader / test_loader from the MNIST section)
train_loader, train_dataset = prepare_data(
    path=train_data_path,
    size=train_data_size,
    batch_size=128,
)
test_loader, test_dataset = prepare_data(
    path=test_data_path,
    size=test_data_size,
    batch_size=128,
)

## Plotting data

In [None]:
# Inspect a single training sample: tensor shape, label, and the picture itself
x, y = train_dataset[42]
print(x.shape, y, sep='\n')
plt.imshow(transforms.ToPILImage()(x))

## Learning data

In [None]:
# How many of our predictions are correct?
def accuracy(model, device, data_loader):
  """Return the fraction of predictions that equal the target after rounding.

  The model is switched to eval mode for the measurement and put back into
  its previous mode afterwards.

  Args:
    model: module whose output is rounded and compared element-wise with the target.
    device: torch.device that every batch is moved to.
    data_loader: iterable of (data, target) batches.

  Returns:
    A float in [0, 1]; 0.0 if `data_loader` yields no samples.
  """
  was_training = model.training
  model.eval()  # disable training-only behavior such as dropout
  n_correct = 0
  n_total = 0
  try:
    with torch.no_grad():
      for data, target in data_loader:
        data, target = data.to(device), target.to(device)
        output = model(data)
        n_correct += torch.sum(torch.round(output) == target).item()
        n_total += len(output)
  finally:
    model.train(was_training)

  # an empty loader would otherwise raise ZeroDivisionError
  return n_correct / n_total if n_total else 0.0

def train(model, device, train_loader, optimizer, criterion, epochs, test_loader=None):
  """Train a regression model and print loss, accuracy and duration per epoch.

  A prediction counts as correct when it equals the target after rounding.

  Args:
    model: module that maps a batch of inputs to predictions shaped like the targets.
    device: torch.device that every batch is moved to; the model must already be on it.
    train_loader: iterable of (data, target) batches used for the optimization steps.
    optimizer: optimizer that holds the model's parameters.
    criterion: regression loss called as criterion(output, target).
    epochs: number of passes over train_loader.
    test_loader: optional iterable of (data, target) batches; when given, its
      accuracy is measured with `accuracy` after every epoch.
  """
  for epoch in range(epochs):
    start = time()

    # The evaluation at the end of the previous epoch switched to eval mode.
    model.train()
    loss_sum = 0.0
    n_batches = 0
    n_correct = 0
    n_total = 0
    for data, target in tqdm(train_loader):
      data, target = data.to(device), target.to(device)
      optimizer.zero_grad()
      output = model(data)
      # Targets may be integer tensors; several regression losses (e.g. nn.MSELoss)
      # reject mixed dtypes, so the target is cast to the dtype of the output.
      loss = criterion(output, target.to(output.dtype))
      loss.backward()
      optimizer.step()

      loss_sum += loss.item()
      n_batches += 1
      n_correct += torch.sum(torch.round(output) == target).item()
      n_total += len(output)
    # Report the mean over all batches instead of the loss of whichever batch came last;
    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    train_loss = loss_sum / max(n_batches, 1)
    train_acc = n_correct / max(n_total, 1)

    print(f'Epoch: {epoch} Loss: {train_loss:.2f} Train acc: {train_acc:.2f}', end='')

    # Explicit None check: the truth value of a loader depends on its length.
    if test_loader is not None:
      model.eval()  # measure without training-only behavior such as dropout
      test_acc = accuracy(model, device, test_loader)
      print(f' Test acc: {test_acc:.2f}', end='')

    stop = time()

    print(f' Time: {timedelta(seconds=stop-start)}')
    print()

In [None]:
# The training can run on different devices.
# The CPU is the fallback because it is always available; an NVIDIA GPU is
# preferred, then an Apple GPU. Checks run in that order and stop at the first hit.
device_id = next(
    (
        name
        for name, is_available in (
            ('cuda', torch.cuda.is_available),
            ('mps', torch.backends.mps.is_available),
        )
        if is_available()
    ),
    'cpu',
)
device = torch.device(device_id)
print(device)

Now let's get to training and improving our model.  
You can look into:
- Model architecture
  - [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html)  
    The Linear Layer is essentially a matrix that transforms a vector of a given size to another vector of the specified output size. 
  - [nn.Conv2d](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html)  
    2D convolution is very good for image tasks. It takes one (or more) 2D matrices as input and creates one (or more) 2D matrices as output. Our images are basically 3 layers (red, green and blue) of 32 by 32 matrices. [Here](https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md) are some illustrations of how the convolution operator works on a single layer. Keep in mind that passing data from a 2D convolution layer to a linear layer requires you to flatten the data to a 1D array. You can check out [this website](https://www.baeldung.com/cs/convolutional-layer-size) for an explanation of how to calculate the output size. The exact size depends on the hyperparameters you choose for the convolution layer; assuming a square image and the default stride of 1, you can start with this simple formula:  
    `conv_out_size = (image_size - conv_kernel_size + 2 * padding) + 1`  
    `linear_layer_in_size = conv_out_channels * conv_out_size * conv_out_size`
- Amount of data
- Number of training iterations
- Any other parameter you can see really

In [None]:
class Model(nn.Module):
  """Skeleton network for the dot-counting task; layers and forward pass are left as an exercise."""
  def __init__(self, image_size=32):
    super().__init__()
    # TODO: define the architecture of the network
    # NOTE: `image_size` is not used yet; it is available for computing layer sizes.
    # self.flatten = nn.Flatten() # takes input of any dimension (32x32x3 for our images) and flattens it into a 1D array for linear layers
    # self.layer1 = ?
    # ...
    
  def forward(self, x):
    # TODO: define the forward pass
    # NOTE: until the lines below are completed, the input is returned unchanged.
    # x = self.flatten(x)
    # x = F.relu(self.layer1(x))
    # ...
    return x

In [None]:
# TODO: run the training, see how your model performs, improve the model
model = Model().to(device)
criterion = nn.L1Loss() # List of pytorch loss functions: https://neptune.ai/blog/pytorch-loss-functions
optimizer = optim.Adam(params=model.parameters(), lr=0.001)
train(
    model=model,
    device=device,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    epochs=10,
    test_loader=test_loader,
)

In [None]:
# Compare the model's prediction with the true dot count for one test image
x, y = test_dataset[42]
batch = x[None].to(device)  # add a leading batch dimension of size 1
with torch.no_grad():
    pred = model(batch)
print('Actual number of dots:', y.item())
print('Prediction           :', pred.item())
plt.imshow(transforms.ToPILImage()(x))