In [1]:
import yaml
import pymongo
from urllib.parse import quote_plus as quote
import hashlib

In [2]:
# Connect to our collection

# Credentials are kept outside the notebook in a YAML file.
with open('../env/credsw.yaml', 'r') as file:
    creds_dict = yaml.safe_load(file)

# Both the username and the password are percent-escaped: a reserved character
# (':', '/', '@', '%', ...) in either of them would otherwise corrupt the URI.
url = 'mongodb://{user}:{pw}@{hosts}/?{rs}&authSource={auth_src}&{am}&tls=true&tlsCAFile={cert_file}'.format(
    user=quote(creds_dict['username']),
    pw=quote(creds_dict['password']),
    hosts=creds_dict['host'],
    rs='replicaSet=rs01',
    auth_src=creds_dict['database'],
    am='authMechanism=DEFAULT',
    cert_file='../env/root.crt'
    )

dbs = pymongo.MongoClient(url)

# The database named in the credentials file is also used as authSource above.
db = dbs[creds_dict['database']]

# Source collection that is read below and target collection that receives the upserts.
collection = db['initial_dataset']
unique_collection = db['unique_dataset']

In [3]:
# Export the data from the collection
pipeline = [
    # Emit one document per element of data_result.boxes.
    {"$unwind": "$data_result.boxes"},
    # Lift the box fields and the cargo-space fields to the top level.
    {
        "$project": {
            "size": "$data_result.boxes.size",
            "stacking": "$data_result.boxes.stacking",
            "turnover": "$data_result.boxes.turnover",
            "loading_size": "$data_result.cargo_space.loading_size",
            "filling_space_percent": "$data_result.cargo_space.calculation_info.filling_space_percent",
            "density_percent": "$data_result.cargo_space.calculation_info.density_percent",
        }
    },
    # Regroup by the source document _id, collecting its boxes into one list.
    {
        "$group": {
            "_id": "$_id",
            "loading_size": {"$first": "$loading_size"},
            "filling_space_percent": {"$first": "$filling_space_percent"},
            "density_percent": {"$max": "$density_percent"},
            "boxes": {
                "$push": {
                    "size": "$size",
                    "stacking": "$stacking",
                    "turnover": "$turnover",
                }
            },
        }
    },
]
result = collection.aggregate(pipeline)
# Drain the cursor so the documents are held in memory as a list.
dataset = list(result)

In [4]:
def as_is_hash(box):  # box_i = dataset[0]['boxes'][i]
    """Return an MD5 hex digest built from a box's scaled size and flags.

    Reads ``box['size_scale']``, ``box['turnover']`` and ``box['stacking']``.
    When ``turnover`` is truthy the scaled dimensions are sorted ascending,
    so the digest does not depend on which dimension holds which value.
    Otherwise height stays first and only width and length are ordered
    (smaller one first).
    """
    scaled = box['size_scale']
    if box['turnover']:
        dims = sorted(scaled.values())
    else:
        height = scaled['height']
        shorter = min(scaled['width'], scaled['length'])
        longer = max(scaled['width'], scaled['length'])
        dims = [height, shorter, longer]
    # The tags 'h', 'w', 'l' label positions in the ordered list, not the
    # original dimension names.
    key = ''.join([
        'h', str(dims[0]),
        'w', str(dims[1]),
        'l', str(dims[2]),
        's', str(box['stacking']),
        't', str(box['turnover']),
    ])
    return hashlib.md5(key.encode()).hexdigest()

In [5]:
def hash_cont(loading_size):
    """Return an MD5 hex digest of a container's height, width and length.

    ``loading_size`` must provide the keys 'height', 'width' and 'length';
    the values are stringified as-is, in that order.
    """
    parts = []
    for tag, dimension in (('h', 'height'), ('w', 'width'), ('l', 'length')):
        parts.append(tag + str(loading_size[dimension]))
    return hashlib.md5(''.join(parts).encode()).hexdigest()

In [6]:
def scale_item(item):
    """Add scaled sizes and hashes to a container item; return the same item.

    All dimensions are rescaled so that the container's longest side is 100
    (rounded to 2 decimals). The item is mutated in place:

    * ``item['loading_size_scale']`` - scaled container dimensions;
    * ``box['size_scale']`` and ``box['hash']`` for every box in ``item['boxes']``;
    * ``item['hash']`` - MD5 of the container hash joined with the sorted box hashes.
    """
    loading_size = item['loading_size']
    norm_base = max(loading_size['width'], loading_size['height'], loading_size['length'])

    def rescale(dimensions):
        # Express every dimension as a percentage of the container's longest side.
        return {name: round(value*100/norm_base, 2) for name, value in dimensions.items()}

    item['loading_size_scale'] = rescale(loading_size)

    box_hashes = []
    for box in item['boxes']:
        box['size_scale'] = rescale(box['size'])
        box['hash'] = as_is_hash(box)
        box_hashes.append(box['hash'])
    # Sorting makes the item hash independent of the order of the boxes.
    box_hashes.sort()

    # NOTE(review): the container part hashes the raw loading_size, whereas the
    # box hashes are built from scaled sizes - confirm this is intended.
    signature = hash_cont(loading_size) + '_' + ','.join(box_hashes)
    item['hash'] = hashlib.md5(signature.encode()).hexdigest()
    return item

In [7]:
# scale_item mutates and returns each item, so this list holds the same objects as dataset.
dataset_scaled = list(map(scale_item, dataset))

In [8]:
len(dataset_scaled) # total number of containers (one per aggregated document)

628

In [9]:
len({x['hash'] for x in dataset_scaled}) # number of unique container fillings (distinct item hashes)

493

In [10]:
def clean_item(item):
    """Build a trimmed copy of a scaled item that keeps only the stored fields.

    The result carries the item's hash, both percentage metrics, the scaled
    container size and a list of boxes. Each box keeps its hash, scaled size,
    turnover and stacking flags and gains a 'volume' (product of the scaled
    width, height and length). Boxes are ordered by volume, largest first;
    boxes with equal volume keep their original relative order.
    """
    def clean_box(box):
        # Reduce one box to the stored fields and add its scaled volume.
        scaled = box['size_scale']
        return {
            'hash': box['hash'],
            'size_scale': scaled,
            'volume': scaled['width'] * scaled['height'] * scaled['length'],
            'turnover': box['turnover'],
            'stacking': box['stacking'],
        }

    cleaned = {
        'hash': item['hash'],
        'filling_space_percent': item['filling_space_percent'],
        'density_percent': item['density_percent'],
        'loading_size_scale': item['loading_size_scale'],
    }
    cleaned_boxes = [clean_box(box) for box in item['boxes']]
    cleaned_boxes.sort(key=lambda entry: entry['volume'], reverse=True)
    cleaned['boxes'] = cleaned_boxes
    return cleaned
    

In [11]:
# Reduce every scaled item to the fields that are stored later on.
dataset_cleaned = list(map(clean_item, dataset_scaled))

In [18]:
# For every item hash, collect the distinct density_percent values in first-seen order.
densities = {}
for item in dataset_cleaned:
    # Start an empty list the first time a hash is encountered.
    densities.setdefault(item['hash'], [])
    if item['density_percent'] not in densities[item['hash']]:
        densities[item['hash']].append(item['density_percent'])

densities

{'ebd3f12f2cafb3c2170bfb4749afc299': [94.59571],
 'd589af23443b249ede2b42bd4363d2fc': [81.91095],
 '881a0402727af3bb0ab0b5385c5cf450': [70.83333, 70.3125],
 '9ab627ac1e331dc9b1fb57e7325d4cb2': [88.0],
 '5aac61c2e9b5dc09a0293b7c6a39b08f': [100.0],
 '94fb0840b520c748532aa15cbec4e8c9': [76.33322],
 'e24c89800200d51999051f657f75f36f': [83.807],
 '56c3e497a3deac83fa8b140ef19603cb': [94.98881],
 'c2a81e8b56d4ceecf99c61f34d160ada': [63.5741],
 '5e6f939c0d03d14763f51017976dae1c': [80.63824, 63.38312],
 'f35628479cfac98e192d28bfec748ea5': [100.0],
 '570444c63b8a7bf9fb61678aec36a525': [34.03689],
 'e3b8057b33960053a7d2282cb3289a50': [33.7685],
 '1d08f6b40fb94aa78ba06403e7c1bc8c': [91.90237],
 '1a7a8051de835189d9bfdc474cdf6f9c': [58.79771],
 'a3c02f2e3414e1948b55263d696e73f0': [81.90553],
 'c7f2c3c97974fa642ff5535a029efa15': [89.15496],
 '0037910ad307e15481162e5d62a2520e': [92.61204],
 'd62dad5a4de9e2a6305395af975d77c2': [56.64959],
 '1df1b615204fd0f843cf3b0ffe185019': [74.73759, 64.10234],
 '0fb

In [20]:
# Best (highest) density observed for each item hash.
densities_max = {item_hash: max(values) for item_hash, values in densities.items()}

In [21]:
# Keep only the items whose density equals the best density recorded for their hash.
# Several items may still share one hash when they tie on that best density.
dataset_cleaned_short = [
    item
    for item in dataset_cleaned
    if item['density_percent'] == densities_max[item['hash']]
]
len(dataset_cleaned_short)

608

In [29]:
# Upsert every retained item into the unique collection, using its hash as _id.
# Items that share a hash target the same document, so the last one written wins.
for item in dataset_cleaned_short:
    result = unique_collection.update_one(
        {'_id': item['hash']},
        {
            '$set': {
                field: item[field]
                for field in (
                    'filling_space_percent',
                    'density_percent',
                    'loading_size_scale',
                    'boxes',
                )
            }
        },
        upsert=True,
    )