In [1]:
import pandas as pd
import os
import glob

In [2]:
data_path = '/Users/sonynka/HTW/IC/data/aboutyou/'

# Merge data
Merge all subcategories in a given category into a category.csv.

In [3]:
def merge_csv_data(path, csv_save_path):
    """ Find all csvs in the category's subfolders and merge them to one csv file in the data_path """
    
    # for each subfolder in category (subcagetogry) merge all csv files into one data_path/category.csv
    if os.path.isdir(path):
        csv_files = glob.glob(os.path.join(path, '*.csv'))
        df = pd.DataFrame()
        for file in csv_files:
            df_file = pd.read_csv(file, sep=';', encoding='utf-8')
            df = df.append(df_file)
        
        df.to_csv(csv_save_path, sep=';', encoding='utf-8', index=False)

In [4]:
for category in os.listdir(data_path):
    category_path = os.path.join(data_path, category)
    save_path = os.path.join(data_path, category + '.csv')
    merge_csv_data(category_path, save_path)

# Clean data
## Remove Duplicates

Drop duplicated rows which could occured during scraping data and appending to csvs.

If the count of img_path is more than unique, drop the duplicated img_path. This can occurr, if the same image is assigned to two different colors. In that case, just keep the first color.

*Note: We apply this to img_path, since it is acceptable that img_url is duplicated -> one image can be assigned to two different subcategories, such as Kurze Kleider und Cocktailkleider. If one image should only have one subcategory assigned, duplicated should be dropped on img_url not on img_path.*

In [5]:
def remove_duplicates(df):
    
    # drop duplicated rows
    print('REMOVING DUPLICATES')
    print('Shape', df.shape)
    df_clean = df.drop_duplicates()
    print('Without duplicates', df_clean.shape)
    df_clean.describe()
    
    return df_clean

## Check images and CSVs consistency
### CSV -> image path check
Make sure all the image paths in the csvs actually exist in the folder structure.

In [6]:
def check_csv_consistency(df):
    """ Check if all the image paths in the csv actually exist in the folder structure """
        
    print('CHECKING CSV CONSISTENCY')
    df['exists'] = [True if os.path.exists(x) else False for x in df['img_path']]
    not_exist_imgs_count = df[df['exists'] == False].shape[0]

    print('Not existing images included in CSV: ', not_exist_imgs_count)
    
    if not_exist_imgs_count > 0:
        print('Deleting not-existing images from csv...')
        df = df[df['exists'] == True]
    df = df.drop('exists', axis=1)

    return df

### image path -> CSV check
Make sure all the images in the folders have an entry in the csv.

In [7]:
def check_img_consistency(category, df_category):
    """ Check if all the existing images in the folder structure are included in the csv """
    
    print('CHECKING IMAGE CONSISTENCY')
    for subcategory in os.listdir(os.path.join(data_path, category)):
        
        subcategory_path = os.path.join(data_path, category, subcategory)
        
        if os.path.isdir(subcategory_path):
            img_names = [os.path.join(subcategory_path, o) for o in os.listdir(subcategory_path)]
            row_names = df_category[df_category['subcategory'] == subcategory]['img_path']
            diff_imgs = list(set(img_names).difference(row_names))

            # if more images in folders than in the csv
            if len(diff_imgs) > 0:
                print('Found non-matching images', diff_imgs)

                # ask user if non-matching images should be deleted
                while True:
                    delete_imgs = input('Delete non-matching images? (yes/no): ')
                    if delete_imgs == 'yes':
                        for img_path in diff_imgs:
                            os.remove(img_path)
                        imgs_count = len(os.listdir(subcategory_path))
                        print('Deleted non-matching images')
                        break
                    if delete_imgs == 'no':
                        print('Not deleting images')
                        break
            else:
                print('Subcategory {} OK'.format(subcategory))

In [8]:
for category in os.listdir(data_path):
    if os.path.isdir(os.path.join(data_path, category)):
        print('\nCleaning data for category', category)
        print('-' * 50)
        df_category = pd.read_csv(os.path.join(data_path, category + '.csv'), sep=';', encoding='utf-8')

        df_clean = remove_duplicates(df_category)
        df_consistent = check_csv_consistency(df_clean)
        check_img_consistency(category, df_consistent)

        print('Writing clean data to', os.path.join(data_path, category + '.csv'))
        df_consistent.to_csv(os.path.join(data_path, category + '.csv'), sep=';', encoding='utf-8', index=False)


Cleaning data for category jumpsuits & overalls
--------------------------------------------------
REMOVING DUPLICATES
Shape (476, 7)
Without duplicates (476, 7)
CHECKING CSV CONSISTENCY
Not existing images included in CSV:  0
CHECKING IMAGE CONSISTENCY
Subcategory kurze jumpsuits OK
Subcategory lange jumpsuits OK
Writing clean data to /Users/sonynka/HTW/IC/data/aboutyou/jumpsuits & overalls.csv

Cleaning data for category jacken
--------------------------------------------------
REMOVING DUPLICATES
Shape (3610, 7)
Without duplicates (3610, 7)
CHECKING CSV CONSISTENCY
Not existing images included in CSV:  0
CHECKING IMAGE CONSISTENCY
Subcategory übergangsjacken OK
Subcategory winterjacken OK
Subcategory bomberjacken OK
Subcategory jeansjacken OK
Subcategory steppjacken OK
Subcategory cargojacken OK
Subcategory lederjacken OK
Subcategory regenjacken OK
Subcategory blousons OK
Subcategory daunenjacken OK
Subcategory parka OK
Subcategory outdoorjacken OK
Writing clean data to /Users/sony

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Not existing images included in CSV:  0
CHECKING IMAGE CONSISTENCY
Subcategory boleros OK
Subcategory strickjacken OK
Subcategory grobstrickpullover OK
Subcategory feinstrickpullover OK
Writing clean data to /Users/sonynka/HTW/IC/data/aboutyou/strick.csv

Cleaning data for category jeans
--------------------------------------------------
REMOVING DUPLICATES
Shape (1321, 7)
Without duplicates (1321, 7)
CHECKING CSV CONSISTENCY
Not existing images included in CSV:  0
CHECKING IMAGE CONSISTENCY
Subcategory straight leg OK
Subcategory latzhosen OK
Subcategory bootcut OK
Subcategory boyfriend OK
Subcategory slim fit OK
Subcategory jeggings OK
Subcategory jeans shorts OK
Writing clean data to /Users/sonynka/HTW/IC/data/aboutyou/jeans.csv

Cleaning data for category blusen & tuniken
--------------------------------------------------
REMOVING DUPLICATES
Shape (4301, 7)
Without duplicates (4301, 7)
CHECKING CSV CONSISTENCY
Not existing images included in CSV:  0
CHECKING IMAGE CONSISTENCY
Subca

# Merge all data
Merge all category csv into once csv -> aboutyou.csv

In [9]:
merge_csv_data(data_path, '/Users/sonynka/HTW/IC/data/aboutyou.csv')