## Preprocessing data
This class helps to load and preprocess saved car data.

In [None]:
import os, pickle, numpy as np
from PIL import Image

In [None]:
def make_directory_path(*directories):
    """Build a directory path string from a sequence of folder names.

    Every segment has its trailing '/' characters removed and exactly one
    '/' appended, so the result always ends with '/' (or is the empty
    string when no segment is given).
    """
    return ''.join(segment.rstrip('/') + '/' for segment in directories)

def load_pickle(file_path):
    """Open ``file_path`` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.
    """
    with open(file_path, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload

In [None]:
class data_preprocessor:
    """Index a directory tree of saved cars and stream them in batches.

    The directory is scanned as ``main_dir/<brand>/<car>/``. Inside a car
    folder, the file whose extension equals ``detail_ext`` is loaded as a
    pickle (the car's details) and files whose extension is listed in
    ``image_exts`` are collected as image paths. Car folders without a
    details file are ignored.
    """

    def __init__(self, main_dir, detail_ext='pkl', image_exts=('jpg', 'jpeg', 'png', 'bmp')):
        """Store the configuration and immediately index ``main_dir``.

        Args:
            main_dir: root folder that contains one sub-folder per brand.
            detail_ext: extension (without dot) of the pickled details file.
            image_exts: iterable of image extensions (without dot).
        """
        self.main_dir = make_directory_path(main_dir)
        self.detail_ext = detail_ext
        # Copy into a fresh list so instances never share or mutate the default.
        self.image_exts = list(image_exts)
        self.load_in()

    def load_in(self):
        """Scan the directory tree and populate ``self.cars``.

        Each entry of ``self.cars`` is a dict with the keys ``'brand'``
        (lower-cased folder name), ``'details'`` (the unpickled object) and
        ``'images'`` (list of image file paths).
        """
        # File extensions are lower-cased below, so compare against
        # lower-cased configuration values as well.
        detail_ext = self.detail_ext.lower()
        image_exts = {ext.lower() for ext in self.image_exts}

        # read all directories from the main directory
        brands = [brand for brand in os.listdir(self.main_dir)
                  if os.path.isdir(self.main_dir + brand)]

        cars_final = []
        for brand in brands:
            brand_dir = make_directory_path(self.main_dir, brand)
            cars = [car for car in os.listdir(brand_dir)
                    if os.path.isdir(brand_dir + car)]

            for car in cars:
                car_dir = make_directory_path(brand_dir, car)

                images = []
                details_file = None
                for file in os.listdir(car_dir):
                    file_path = car_dir + file
                    if os.path.isdir(file_path):
                        continue
                    # extension without the leading dot, case-insensitive
                    extension = os.path.splitext(file)[1][1:].lower()
                    if extension == detail_ext:
                        details_file = file_path
                    elif extension in image_exts:
                        images.append(file_path)

                # a car is only usable when its details file exists
                if details_file is not None:
                    details = load_pickle(details_file)
                    cars_final.append(
                        {'brand': brand.lower(), 'details': details, 'images': images}
                    )
        self.cars = cars_final

    def image_preprocess(self, file):
        """Load an image file and return it as a greyscale ``uint8`` array."""
        # The context manager releases the file handle once the pixel data
        # has been converted into an independent greyscale image.
        with Image.open(file) as image:
            grey = image.convert('L')  # convert image to greyscale
        return np.array(grey, dtype=np.uint8)

    def stream(self, count, brands=None, with_images=True):
        """Yield lists of at most ``count`` formatted cars.

        Args:
            count: maximum number of cars per yielded batch.
            brands: optional iterable of brand names (case-insensitive);
                ``None`` means every brand.
            with_images: when true, only cars with at least one image are
                returned and their preprocessed images are included.

        Yields:
            Non-empty lists of dicts with a ``'details'`` dict (keys
            ``brand``, ``euro``, ``km``, ``year``, ``fuel``) and, when
            ``with_images`` is true, an ``'images'`` list.
        """
        needed_details = [
            {'key': 'ár (eur)', 'as': 'euro'},
            {'key': 'kilométeróra állása', 'as': 'km'},
            {'key': 'évjárat', 'as': 'year'},
            {'key': 'üzemanyag', 'as': 'fuel'},
        ]
        i = 0
        while True:
            result = self.low_level_stream(i, count, needed_details, brands, with_images)
            if result is False:
                break
            cars, i = result
            # An empty batch means the index is exhausted (or count is not
            # positive); stop instead of yielding empty lists.
            if not cars:
                break

            for car in cars:
                details = car['details']
                # format details
                details['euro'] = details['euro'][2:].replace('.', '')  # drop 2-char prefix and dots
                details['km'] = details['km'][:-3].replace(' ', '')  # drop 3-char suffix and spaces
                details['year'] = details['year'][0:4]  # keep the first four characters
                details['fuel'] = details['fuel'].lower()

            yield cars

    def low_level_stream(self, i, count, needed_details, brands=None, with_images=True):
        """Collect up to ``count`` matching cars starting at index ``i``.

        Args:
            i: index into ``self.cars`` where the scan starts.
            count: maximum number of cars to collect.
            needed_details: list of ``{'key': ..., 'as': ...}`` dicts; each
                ``key`` must exist in a car's details and is copied to the
                result under the name ``as``.
            brands: optional iterable of brand names (case-insensitive).
            with_images: when true, skip cars without images and attach the
                preprocessed images to each result.

        Returns:
            ``False`` when ``i`` is already past the end of ``self.cars``
            (handy as a loop terminator), otherwise a tuple
            ``(cars, next_i)`` where ``next_i`` is the index to resume from.
        """
        if i >= len(self.cars):
            return False

        if brands is not None:
            brands = {brand.lower() for brand in brands}  # make brands lowercase

        cars = []
        while len(cars) < count and i < len(self.cars):
            car = self.cars[i]
            i += 1

            if brands is not None and car['brand'] not in brands:
                continue
            if with_images and not car['images']:
                continue
            # keep the car only if every needed detail exists
            if any(needed['key'] not in car['details'] for needed in needed_details):
                continue

            details = {'brand': car['brand']}
            for needed in needed_details:
                details[needed['as']] = car['details'][needed['key']]

            result = {'details': details}
            if with_images:
                result['images'] = [self.image_preprocess(image) for image in car['images']]
            cars.append(result)
        return cars, i


In [None]:
# Index the 'data' folder and print the cars in batches of ten,
# each batch surrounded by a blank line above and below.
directory = 'data'
DP = data_preprocessor(directory)
for cars in DP.stream(10):
    print(f'\n{cars}\n')