In [1]:
### Import libraries
import torch
import pandas as pd
from PIL import Image
import torchvision.transforms as transforms
from torchvision import datasets, models
import numpy as np
import json
import requests
import matplotlib.pyplot as plt
import warnings
import os
from google.cloud import storage
warnings.filterwarnings('ignore')
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torchvision
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import splitfolders
# %load_ext tensorboard
import datetime
import time
import zipfile
import csv
from tqdm import tqdm

import random
import shutil

In [2]:
# Name of the zipped dataset; used both as the blob name and the local filename.
archive_name = "pytorch_images.zip"

# Open a Google Cloud Storage client for the "aicore-study" project.
storage_client = storage.Client(project="aicore-study")
# Fetch the bucket that contains the archive.
bucket = storage_client.get_bucket('pytorch_training_images_13_class')
# Point at the archive inside the bucket ...
blob = bucket.blob(archive_name)
# ... and download it next to the notebook under the same name.
blob.download_to_filename(archive_name)

In [3]:
# Unpack the downloaded archive into the current working directory.
with zipfile.ZipFile('pytorch_images.zip') as archive:
    archive.extractall()

In [4]:
# Absolute path of the extracted image root.
pytorch_images = os.path.join(os.getcwd(), 'pytorch_images')

# Remove the hidden .DS_Store file at the root only if it is really there, so
# that re-running this cell does not die with FileNotFoundError.
root_ds_store = os.path.join(pytorch_images, '.DS_Store')
if os.path.exists(root_ds_store):
    os.remove(root_ds_store)

# Image classes: one per sub-directory of the image root. Stray files at the
# root are ignored so the per-class loops below only ever see real folders.
class_list = [
    entry
    for entry in os.listdir(pytorch_images)
    if os.path.isdir(os.path.join(pytorch_images, entry))
]

In [5]:
## remove the hidden .DS_Store file from every class folder, when present
for img_class in class_list:
    ds_store_fp = os.path.join(pytorch_images, img_class, '.DS_Store')
    # A folder may not contain one (and never will on a re-run), so check
    # first instead of letting os.remove raise FileNotFoundError.
    if os.path.exists(ds_store_fp):
        os.remove(ds_store_fp)

In [6]:
# Display the class folder names; the order is whatever os.listdir returned.
class_list

['diytools',
 'office',
 'homegarden',
 'phones',
 'clothes',
 'computers',
 'booksfilmmusicgames',
 'videogames',
 'healthbeauty',
 'kidstoys',
 'appliances',
 'sportsleisure',
 'other']

In [7]:
# Assign every class label an integer code equal to its position in class_list.
encoder = {label: code for code, label in enumerate(class_list)}

In [12]:
# Delete every entry in the class folders whose name does not end in .jpg/.jpeg
# (case-insensitive).
jpeg_suffixes = ('.jpg', '.jpeg')
for img_class in class_list:
    class_dir = os.path.join(pytorch_images, img_class)
    for image in os.listdir(class_dir):
        if image.lower().endswith(jpeg_suffixes):
            continue
        os.remove(os.path.join(class_dir, image))

In [11]:
## Clean the crazy images: rewrite every image in place as RGB
for img_class in class_list:
    class_dir = os.path.join(pytorch_images, img_class)
    for image in os.listdir(class_dir):
        image_fp = os.path.join(class_dir, image)
        # Image.open is lazy; the context manager guarantees the source file
        # is closed once convert() has produced the RGB copy.
        with Image.open(image_fp) as source_image:
            rgb_image = source_image.convert('RGB')
        # Overwrite the original only after the source has been closed.
        rgb_image.save(image_fp)

In [13]:
# Map each image filename to the integer code of the class folder it lives in.
# Keys are bare filenames, so a name that occurs in several class folders keeps
# the code of the last folder visited.
img_class_dict = {
    image: encoder[img_class]
    for img_class in class_list
    for image in os.listdir(os.path.join(pytorch_images, img_class))
}

In [14]:
## Save the image -> class mapping as a two-column CSV (image, class)
col_order = ['image', 'class']
img_class_df = pd.DataFrame(
    {
        'image': list(img_class_dict.keys()),
        'class': list(img_class_dict.values()),
    },
    columns=col_order,
)
# index=False keeps the row numbers out of the file.
img_class_df.to_csv('pytorch_images_training_data.csv', index=False)

In [15]:
# Preview the first rows of the image/class table.
img_class_df.head()

Unnamed: 0,image,class
0,Speedy_Consumer_Image_Banners_290x267_43.jpg,0
1,51sZwwkkl9L.jpg,0
2,download-media.jpeg,0
3,drills-12v-18v-battery-bosch-diy--9f2ea6a88ddb...,0
4,1223-M020-PLP-stanley-fatmax-battery-desktab-v...,0


In [16]:
# Two-way 70/30 split of the full image set, written to a separate output
# folder; the fixed seed keeps the split repeatable.
splitfolders.ratio(
    'pytorch_images',
    output='pytorch_images_tv_split_2',
    seed=42,
    ratio=(0.7, 0.3),
)


Copying files: 0 files [00:00, ? files/s][A
Copying files: 428 files [00:00, 4241.11 files/s][A
Copying files: 853 files [00:00, 3716.17 files/s][A
Copying files: 1262 files [00:00, 3874.03 files/s][A
Copying files: 1653 files [00:00, 3672.98 files/s][A
Copying files: 2151 files [00:00, 4112.06 files/s][A
Copying files: 2621 files [00:00, 4287.86 files/s][A
Copying files: 3054 files [00:00, 3981.10 files/s][A


In [2]:
## Build a "lite" copy of the dataset: a random subset of each class folder
## (100 images per class) copied from the master dir into a second directory.
source_dir = 'pytorch_images/'
destination_dir = 'pytorch_images_lite'
# exist_ok lets this cell be re-run; os.mkdir would raise FileExistsError once
# the directory is already there.
os.makedirs(destination_dir, exist_ok=True)

In [3]:
# Upper bound on the number of images copied per class.
SAMPLES_PER_CLASS = 100
# Private, seeded generator: the subset is reproducible and the global
# `random` state is left alone.
SAMPLE_SEED = 42
sampler = random.Random(SAMPLE_SEED)

for folder in sorted(os.listdir(source_dir)):
    folder_path = os.path.join(source_dir, folder)
    if not os.path.isdir(folder_path):  # Only process class sub-directories
        continue

    # Sorted so the seeded draw does not depend on os.listdir's arbitrary order.
    files = sorted(
        name
        for name in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, name))
    )
    if not files:
        continue  # nothing to copy; avoid creating an empty class folder

    # Take at most SAMPLES_PER_CLASS files. A class with fewer images is copied
    # whole rather than being dropped from the lite dataset altogether.
    sampled_files = sampler.sample(files, min(SAMPLES_PER_CLASS, len(files)))

    # Mirror the class folder inside the destination directory.
    new_folder_path = os.path.join(destination_dir, folder)
    os.makedirs(new_folder_path, exist_ok=True)

    # Copy the sampled files (copy2 also carries over file metadata).
    for file_name in sampled_files:
        source_file_path = os.path.join(folder_path, file_name)
        destination_file_path = os.path.join(new_folder_path, file_name)
        shutil.copy2(source_file_path, destination_file_path)

In [4]:
# Two-way 70/30 split of the lite image set, written to a separate output
# folder; the fixed seed keeps the split repeatable.
splitfolders.ratio(
    'pytorch_images_lite',
    output='pytorch_images_lite_split',
    seed=42,
    ratio=(0.7, 0.3),
)

Copying files: 1300 files [00:00, 3917.78 files/s]
