In [1]:
# Project folder on Google Drive; the paths built later in this notebook are based on it.
notebook_dir = "/content/drive/MyDrive/Colab Notebooks/Cattle_Identification"

# Mount Google Drive into the Colab runtime so the project folder becomes reachable.
from google.colab import drive
drive.mount('/content/drive')
import sys
import os
# Put the project folder first on the import path and make it the working directory.
sys.path.insert(0, notebook_dir)
os.chdir(notebook_dir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
def individual_sample_dict(url):
  """
  Parse the dataset's URL and generate a dictionary whose key is the cattle ID and
  whose value is a list of pairs of sample number and image URL.

  The page text is split on '"name":', so every chunk starts with one entry's quoted
  name. The chunk is assumed to also hold that entry's '"contentType"' field, which
  is used to tell folders (individuals) from files (samples).

  Args:
      url: the URL of the dataset. It must end with '/', because the folder name of
          each individual is appended to it directly.
  Returns:
      A dictionary: {'cattle_000': [('000', image_url),('001', image_url) ...] ... }
  Raises:
      requests.HTTPError: if a listing page answers with an HTTP error status.
  """
  import requests

  # Base of the raw-file URLs that the returned pairs point at.
  raw_base = 'https://raw.githubusercontent.com/pwang697/Cattle_Identification/main/images/'

  response = requests.get(url)
  # Fail loudly instead of parsing an error page into an empty dataset.
  response.raise_for_status()
  samples_by_individual = {}
  for line in response.text.split('"name":'):
    # Each substring needs its own `in` test: a bare string literal is always truthy,
    # so `'...' and 'cattle_' in line` would only check the second one.
    if '"contentType":"directory"' in line and 'cattle_' in line:
      # The chunk starts with the quoted entry name, so it is the first quoted token.
      individual = line.split('"')[1]
      response1 = requests.get(url + individual)
      response1.raise_for_status()
      sample_list = []
      for line1 in response1.text.split('"name":'):
        if '"contentType":"file"' in line1 and '.png' in line1:
          sample = line1.split('"')[1].split('.png')[0]
          sample_list.append((sample, raw_base + individual + '/' + sample + '.png'))
      samples_by_individual[individual] = sample_list

  return samples_by_individual

# Build the {cattle ID: [(sample number, image URL), ...]} index from the repository's image folder.
dataset_dict = individual_sample_dict("https://github.com/pwang697/Cattle_Identification/tree/main/images/")


In [3]:
def retrieve_image(url, local_path):
  """
  Fetch the resource at `url` and store its bytes at `local_path`.

  The file is only written when the server answers with status 200; any other
  status is reported on stdout and nothing is written. Returns None either way.
  """
  import requests
  reply = requests.get(url)

  # Guard clause: report the failure and leave the disk untouched.
  if reply.status_code != 200:
    print(f"Failed to retrieve image. Status code: {reply.status_code}")
    return

  with open(local_path, 'wb') as out_file:
    out_file.write(reply.content)
  print(f"Image downloaded successfully: {local_path}")


In [4]:
def naive_train_test_split(dataset_dict):
  """
  Hold out two randomly chosen samples of every individual as the test set.

  The global random generator is seeded with 0 on every call, so the same input
  always produces the same split. The input lists are not modified.

  Returns:
      (train_dict, test_dict): two dictionaries with the same keys as `dataset_dict`;
      `test_dict` holds the two drawn samples, `train_dict` the remaining ones in
      their original order.
  """
  import random
  random.seed(0)
  train_dict, test_dict = {}, {}
  for cattle_id, samples in dataset_dict.items():
    remaining = samples.copy()
    held_out = random.sample(remaining, 2)
    test_dict[cattle_id] = held_out
    # Drop the drawn samples from the copy; what is left is the training part.
    for picked in held_out:
      remaining.remove(picked)
    train_dict[cattle_id] = remaining
  return train_dict, test_dict

In [5]:
# Hold out two samples per individual for testing; the remaining samples form the training set.
train_dict, test_dict = naive_train_test_split(dataset_dict)
# Show the held-out samples of one individual as a quick sanity check.
test_dict['cattle_000']

[('006',
  'https://raw.githubusercontent.com/pwang697/Cattle_Identification/main/images/cattle_000/006.png'),
 ('011',
  'https://raw.githubusercontent.com/pwang697/Cattle_Identification/main/images/cattle_000/011.png')]

In [6]:
def make_local_image_dataset(dataset_dict, directory):
  """
  Download every sample of `dataset_dict` to `directory/<individual>/<sample>.png`.

  Files that are already on disk are neither downloaded again nor removed, so a run
  that was interrupted half-way is completed by simply calling the function again.
  When everything is already present, no download is attempted.

  Args:
      dataset_dict: {'cattle_000': [('000', image_url), ('001', image_url) ...] ... }
      directory: root folder of the local dataset; created (with parents) if missing.
  """
  if os.path.isdir(directory):
    print(f"Directory existed: {directory}")
  else:
    os.makedirs(directory)
  for item in dataset_dict:
    item_dir = directory + '/' + item
    os.makedirs(item_dir, exist_ok=True)
    for sample in dataset_dict[item]:
      local_path = item_dir + '/' + sample[0] + '.png'
      # Only fetch what an earlier (possibly interrupted) run did not already store.
      if not os.path.exists(local_path):
        retrieve_image(sample[1], local_path)

# Local folders inside the project folder, one per split.
train_dir = notebook_dir + '/' + 'train'
test_dir = notebook_dir + '/' + 'test'
# Populate each split's folder with its images (see make_local_image_dataset).
make_local_image_dataset(train_dict, train_dir)
make_local_image_dataset(test_dict, test_dir)

Directory existed: /content/drive/MyDrive/Colab Notebooks/Cattle_Identification/train
Directory existed: /content/drive/MyDrive/Colab Notebooks/Cattle_Identification/test


In [7]:
def add_black_canva(img, height, width):
  """
  Centers the input image on a black canvas of the requested size

  Parameters
  -----------------------
  img: input image; its first two axes are rows and columns, and it is written into
       a 3-channel uint8 canvas
  height: height of canvas (height of output image)
  width: width of canvas (width of output image)

  Returns
  -----------------------
  A new uint8 array of shape (height, width, 3) holding the image, centered, with
  zeros (black) everywhere else

  Raises
  -----------------------
  ValueError: if the image is taller or wider than the canvas
  """
  import numpy as np
  rows, cols = img.shape[:2]
  # An oversized image would give negative slice offsets and an opaque broadcast error.
  if rows > height or cols > width:
    raise ValueError(
        f"Image of size {rows}x{cols} does not fit on a {height}x{width} canvas")
  black_canva = np.zeros((height,width,3), dtype=np.uint8)
  # Top-left corner that puts the image in the middle of the canvas.
  start_x, start_y = width//2 - cols//2, height//2 - rows//2
  black_canva[start_y:start_y+rows, start_x:start_x+cols] = img
  return black_canva

In [8]:
def generate_numpy_arr_dataset(directory, canvas_height=1000, canvas_width=1000):
  """
  Load every image below `directory/<prefix>_<number>/` into arrays.

  Each image is read with OpenCV and centered on a black canvas so that all images
  share one size. The label of an image is the integer after the first underscore in
  the name of the folder that contains it (e.g. 'cattle_007' -> 7).

  Args:
      directory: folder that contains one sub-folder per individual.
      canvas_height: height of every output image (default 1000).
      canvas_width: width of every output image (default 1000).
  Returns:
      (X_array, y_array): images as a uint8 array with one canvas per image, and the
      matching labels. Labels are uint8 when every label is at most 255, int64 otherwise.
  Raises:
      ValueError: if a file cannot be read as an image.
  """
  import os
  import cv2
  import numpy as np
  X_list = []
  y_list = []
  # os.listdir returns entries in arbitrary order; sort for a reproducible dataset.
  for individual in sorted(os.listdir(directory)):
    individual_dir = directory + '/' + individual
    label = int(individual.split('_')[1])
    for file_name in sorted(os.listdir(individual_dir)):
      img_path = individual_dir + '/' + file_name
      img_array = cv2.imread(img_path)
      # cv2.imread reports an unreadable file by returning None instead of raising.
      if img_array is None:
        raise ValueError(f"Could not read image: {img_path}")
      X_list.append(add_black_canva(img_array, canvas_height, canvas_width))
      y_list.append(label)
  X_array = np.array(X_list, dtype=np.uint8)
  # uint8 cannot represent labels above 255; widen the dtype only when that is needed.
  label_dtype = np.uint8 if max(y_list, default=0) <= 255 else np.int64
  y_array = np.array(y_list, dtype=label_dtype)
  return X_array, y_array

In [9]:
# Load both splits from their folders as (images, labels) array pairs.
X_train, y_train = generate_numpy_arr_dataset(train_dir)
X_test, y_test = generate_numpy_arr_dataset(test_dir)

In [12]:
import os
import numpy as np
# np.savez does not create missing folders, so make sure the output folder exists first.
os.makedirs(notebook_dir + '/datasets', exist_ok=True)
# Store each split as one .npz archive with the arrays under the keys 'images' and 'labels'.
np.savez(notebook_dir + '/datasets/train.npz', images = X_train, labels = y_train)
np.savez(notebook_dir + '/datasets/test.npz', images = X_test, labels = y_test)