In [1]:
import yaml
import pymongo
from urllib.parse import quote_plus as quote
import hashlib

In [36]:
# Connect to our collection

# Credentials are kept outside the notebook in a YAML file; the code below reads
# the keys 'username', 'password', 'host' and 'database' from it.
with open('../env/credsw.yaml', 'r') as file:
    creds_dict = yaml.safe_load(file)

# Both the username and the password are percent-encoded before being embedded
# in the URI: reserved characters such as '@', ':', '/' or '%' in either of them
# would otherwise break connection-string parsing.
url = 'mongodb://{user}:{pw}@{hosts}/?{rs}&authSource={auth_src}&{am}&tls=true&tlsCAFile={cert_file}'.format(
    user=quote(creds_dict['username']),
    pw=quote(creds_dict['password']),
    hosts=creds_dict['host'],
    rs='replicaSet=rs01',
    auth_src=creds_dict['database'],
    am='authMechanism=DEFAULT',
    cert_file='../env/root.crt'
    )

dbs = pymongo.MongoClient(url)

# The database named in the credentials file doubles as the authSource above.
db = dbs[creds_dict['database']]

# Source collection that is read from, and target collection that is written to.
collection = db['initial_dataset']
unique_collection = db['unique_dataset']

In [3]:
# Export the data from the collection.
# The pipeline unwinds data_result.boxes, projects the per-box fields together
# with the cargo-space fields, and then groups the boxes back by the source
# document _id.
pipeline = [
    {"$unwind": "$data_result.boxes"},
    {
        "$project": {
            "size": "$data_result.boxes.size",
            "stacking": "$data_result.boxes.stacking",
            "turnover": "$data_result.boxes.turnover",
            "loading_size": "$data_result.cargo_space.loading_size",
            "filling_space_percent": "$data_result.cargo_space.calculation_info.filling_space_percent",
        }
    },
    {
        "$group": {
            "_id": "$_id",
            # Cargo-space fields are taken from the first unwound row of each group.
            "loading_size": {"$first": "$loading_size"},
            "filling_space_percent": {"$first": "$filling_space_percent"},
            "boxes": {
                "$push": {
                    "size": "$size",
                    "stacking": "$stacking",
                    "turnover": "$turnover",
                }
            },
        }
    },
]
result = collection.aggregate(pipeline)
# Drain the cursor into a plain list so the data can be reused by later cells.
dataset = list(result)

In [4]:
def as_is_hash(box):
    """Return an MD5 hex digest identifying a box by its scaled size and flags.

    ``box`` is one element of an item's ``boxes`` list, e.g.
    ``dataset[0]['boxes'][i]``, and must already carry ``size_scale``.

    When ``box['turnover']`` is truthy, all values of ``box['size_scale']`` are
    sorted ascending and the first three are used. Otherwise the height stays
    in first position and width/length follow as (smaller, larger).

    The digest is taken over the text
    ``h<d0>w<d1>l<d2>s<stacking>t<turnover>``.
    """
    rotatable = box['turnover']
    if rotatable:
        dims = sorted(box['size_scale'].values())
    else:
        scaled = box['size_scale']
        height = scaled['height']
        width = scaled['width']
        length = scaled['length']
        dims = [height, min(width, length), max(width, length)]

    # Each piece goes through str(), exactly as plain concatenation would.
    fingerprint = (
        f"h{dims[0]!s}"
        f"w{dims[1]!s}"
        f"l{dims[2]!s}"
        f"s{box['stacking']!s}"
        f"t{box['turnover']!s}"
    )
    return hashlib.md5(fingerprint.encode()).hexdigest()

In [5]:
def hash_cont(loading_size):
    """Return an MD5 hex digest of a container's dimensions.

    ``loading_size`` is a mapping with ``height``, ``width`` and ``length``
    keys; the digest is taken over the text ``h<height>w<width>l<length>``.
    Any other keys are ignored.
    """
    fingerprint = ''.join((
        'h', str(loading_size['height']),
        'w', str(loading_size['width']),
        'l', str(loading_size['length']),
    ))
    return hashlib.md5(fingerprint.encode()).hexdigest()

In [13]:
def scale_item(item):
    """Add scaled dimensions and hashes to ``item`` in place and return it.

    All dimensions are rescaled so that the longest side of the container
    (``item['loading_size']``) becomes 100, rounded to two decimals.

    Keys written:
      * ``item['loading_size_scale']`` - rescaled container dimensions;
      * ``box['size_scale']`` and ``box['hash']`` for every box in
        ``item['boxes']``;
      * ``item['hash']`` - MD5 of ``<container hash>_<sorted box hashes joined
        by ','>``.

    The same ``item`` object that was passed in is returned.
    """
    container = item['loading_size']
    longest_side = max(container['width'], container['height'], container['length'])

    def _rescale(dims):
        # Express every dimension as a percentage of the container's longest side.
        return {axis: round(value*100/longest_side, 2) for axis, value in dims.items()}

    item['loading_size_scale'] = _rescale(container)

    box_hashes = []
    for box in item['boxes']:
        box['size_scale'] = _rescale(box['size'])
        box['hash'] = as_is_hash(box)
        box_hashes.append(box['hash'])
    # Sorting makes the item hash independent of the order of the boxes.
    box_hashes.sort()

    # NOTE(review): the container part is hashed from the raw loading_size,
    # whereas box hashes use the scaled sizes - confirm this is intended.
    fingerprint = hash_cont(container) + '_' + ','.join(box_hashes)
    item['hash'] = hashlib.md5(fingerprint.encode()).hexdigest()
    return item

In [14]:
# Scale and hash every exported item (scale_item updates the items in place).
dataset_scaled = list(map(scale_item, dataset))

In [15]:
len(dataset_scaled) # total number of containers in the scaled dataset

628

In [16]:
len(set(item['hash'] for item in dataset_scaled)) # number of unique container fillings (distinct item hashes)

493

In [32]:
def clean_item(item):
    """Build a trimmed copy of a scaled item, keeping only the fields to store.

    The result has the item's ``hash``, ``filling_space_percent`` and
    ``loading_size_scale``, plus ``boxes``: one dict per box with its ``hash``,
    ``size_scale``, ``volume`` (width * height * length of the scaled size),
    ``turnover`` and ``stacking``. Boxes are ordered by volume, largest first;
    boxes of equal volume keep their original relative order.
    """
    def _clean_box(box):
        dims = box['size_scale']
        return {
            'hash': box['hash'],
            'size_scale': dims,
            'volume': dims['width'] * dims['height'] * dims['length'],
            'turnover': box['turnover'],
            'stacking': box['stacking'],
        }

    cleaned = {
        'hash': item['hash'],
        'filling_space_percent': item['filling_space_percent'],
        'loading_size_scale': item['loading_size_scale'],
    }
    cleaned_boxes = [_clean_box(box) for box in item['boxes']]
    # Stable in-place sort, biggest volume first.
    cleaned_boxes.sort(key=lambda entry: entry['volume'], reverse=True)
    cleaned['boxes'] = cleaned_boxes
    return cleaned
    

In [33]:
# Reduce every scaled item to the fields that will be stored.
dataset_cleaned = list(map(clean_item, dataset_scaled))

In [41]:
# Write each cleaned item into the unique collection, using the item hash as
# the document _id. Items that share a hash target the same document, so the
# one written last is the one that remains.
for item in dataset_cleaned:
    selector = {'_id': item['hash']}
    changes = {
        '$set': {
            'filling_space_percent': item['filling_space_percent'],
            'loading_size_scale': item['loading_size_scale'],
            'boxes': item['boxes'],
        }
    }
    result = unique_collection.update_one(selector, changes, upsert=True)