In [183]:
import os, sys, traceback, re
import openpyxl
from openpyxl.compat import range
from openpyxl.cell import get_column_letter

# ----------------------------------------------------------------------
# Data config
# ----------------------------------------------------------------------

# Zero-based positions of the interesting cells inside a raw sheet row
# (rows are handled as plain lists of cell values).
INDEX = 0
NAME = 1
SIZE = 6
AMOUNT = 11
MATERIAL = 12
UNITS = 17
MATERIAL_AMOUNT = 19
STANDART = 21
COMMENT = 24

# Columns copied, in this order, from a raw row into a payload row.
PAYLOAD_DATA_INDEXES = [
    INDEX,
    NAME,
    SIZE,
    AMOUNT,
    MATERIAL,
    UNITS,
    MATERIAL_AMOUNT,
    STANDART,
    COMMENT,
]

# Title of the sheet whose data starts lower than on the other sheets,
# followed by the 1-based first data row for each kind of sheet.
FISRT_LIST = 'Лист1'
FISRT_LIST_FIRST_DATA_ROW = 22
OTHER_LISTS_FIRST_DATA_ROW = 2

# Cell values that stand for "same value as in the previous row".
REPEAT_SYMBOLS = ['——ıı——']

# ----------------------------------------------------------------------
# Script config
# ----------------------------------------------------------------------

# Directory of the launching script, as reported by sys.argv[0].
DEFAULT_DIR_PATH = os.path.dirname(sys.argv[0])
DEFAULT_RESULT_FILE_NAME = 'output.xlsx'
DEFAULT_RESULT_FILE_PATH = os.path.join(
    DEFAULT_DIR_PATH, DEFAULT_RESULT_FILE_NAME
)

"""
LOGIC
""" 
def remove_spaces(value):
    """
    Returns the string form of the given value with all whitespace
    (spaces, tabs, line breaks) removed.

    The value is converted with ``str`` first, so non-string input
    such as numbers is accepted. Every other character, including
    a plus sign, is kept as is.
    """
    # Raw string and no brackets: inside a character class such as
    # '[\s+]' the '+' is a literal plus sign, not a quantifier, so the
    # bracketed form would also strip '+' from the text.
    return re.sub(r'\s+', '', str(value))
            
def get_files(dir_path=DEFAULT_DIR_PATH):
    """
    Returns list of excel file paths found in a given directory
    and, recursively, in all of its subdirectories.

    A file is taken when its extension is ``.xls`` or ``.xlsx``;
    the comparison ignores letter case, so ``REPORT.XLSX`` counts too.
    Files whose name starts with ``~$`` are skipped: that prefix marks
    the temporary lock file Excel keeps next to an opened workbook,
    and it is not a readable workbook itself.

    Each returned path is the walked directory joined with the file
    name. The list is empty when nothing matches.
    """
    # NOTE(review): '.xls' is still collected as before, but openpyxl
    # is documented for the xlsx family only - confirm that legacy
    # .xls files are really expected downstream.
    excel_extensions = ('.xls', '.xlsx')
    result = []
    for path, _subdirs, files in os.walk(dir_path):
        for file_name in files:
            if file_name.startswith('~$'):
                continue
            extension = os.path.splitext(file_name)[1]
            if extension.lower() in excel_extensions:
                result.append(os.path.join(path, file_name))
    return result

def getValueWithMergeLookup(sheet, cell):
    """
    Returns the value that should be used for the given cell,
    taking merged ranges of the sheet into account.

    When the cell's coordinate belongs to one of the sheet's merged
    ranges, the value of the first cell of that range is returned.
    Otherwise the value stored at the cell's own coordinate is returned.
    """
    coordinate = cell.coordinate
    for merged_range in sheet.merged_cell_ranges:
        # Expand the range into rows of coordinates once, so the rows
        # can be both searched and used to locate the first cell.
        range_rows = list(openpyxl.utils.rows_from_range(merged_range))
        if any(coordinate in range_row for range_row in range_rows):
            first_coordinate = range_rows[0][0]
            return sheet.cell(first_coordinate).value
    return sheet.cell(coordinate).value

def pre_process(rows):
    """
    Filters and preprocess data to make it ready for the merge.

    A raw row is kept only when both its INDEX and NAME cells are
    present (not None) and the INDEX cell holds an ``int``. Kept rows
    are reduced to the PAYLOAD_DATA_INDEXES columns, in that order.
    A raw row that is too short to contain a payload column is treated
    as having an empty cell there.

    Afterwards every payload value is normalised to a string:
      * a value listed in REPEAT_SYMBOLS is replaced by the value of
        the same column in the previous payload row; in the very first
        row, where there is nothing to repeat, it becomes '-';
      * None becomes '-';
      * anything else is passed through ``str``.

    Returns a list of payload rows (lists of strings).
    """
    def cell_at(row, idx):
        # Missing trailing cells count as empty instead of raising.
        return row[idx] if idx < len(row) else None

    payloaded = []
    for row in rows:
        if cell_at(row, NAME) is None or cell_at(row, INDEX) is None:
            continue
        # Exact type comparison kept from the original check: only
        # plain int position numbers mark a data row.
        if type(row[INDEX]) != int:
            continue
        payloaded.append([cell_at(row, idx) for idx in PAYLOAD_DATA_INDEXES])

    # Handle repeat symbols. Rows are processed top to bottom, so the
    # previous row is already fully normalised when it is copied from.
    for row_index, row in enumerate(payloaded):
        for value_index, value in enumerate(row):
            if value in REPEAT_SYMBOLS:
                # Guard the first row: index -1 would wrap around to the
                # last (still unprocessed) row of the list.
                if row_index > 0:
                    value = payloaded[row_index - 1][value_index]
                else:
                    value = '-'
            elif value is None:
                value = '-'
            row[value_index] = str(value)
    return payloaded


class ExtraParamsObject:
    """
    Plain holder that pairs a size with its amount.
    """

    def __init__(self, size, amount):
        self.size, self.amount = size, amount

        
class OutputRow:
    """
    One line of the resulting table, built from a payload row.

    The payload row is read by position: 1 - name, 2 - size,
    3 - amount, 4 - material, 5 - units, 7 - standard. The remaining
    positions are not used here.
    """

    def __init__(self, data):
        name = data[1]
        size = data[2]
        amount = data[3]
        material = data[4]
        units = data[5]
        standart = data[7]

        # Columns of the output table.
        self.name_material = '%s - %s' % (name, material)
        self.nomen = '-'
        self.extra_params = self.form_extra_params(size, amount)
        self.nomen_number = '-'
        self.code = '-'
        self.category = '-'
        self.standart = standart
        self.units = units
        self.amount = '-'

        # Service props, used while looking for rows to merge with.
        self.material = material
        # Part of the whitespace-free size that precedes the first '×'.
        compact_size = remove_spaces(str(size))
        self.primary_size = compact_size.split('×')[0]
        self.name = name

    def form_extra_params(self, size, amount):
        """
        Returns a new one-element list holding the size/amount pair.
        """
        return [ExtraParamsObject(size, amount)]

def merge_row(output, row):
    """
    Merges given row with existing output if need.

    The row is wrapped into an OutputRow and compared with the rows
    already collected in ``output``; two rows match when their
    standard, material and primary size are all equal.

      * No match: the new row is appended to ``output``.
      * More than one match: a message is printed and the row is
        left out.
      * Exactly one match: the new row is folded into the matched one.
        Its name is put in front of the matched row's ``name_material``
        unless that name was already merged in. Then, if the matched
        row already has an extra param with the same size, the amounts
        are added together; otherwise the new size/amount pair is
        appended to the matched row's extra params.

    Amounts equal to '-' (the placeholder used for empty cells), ''
    or None are treated as "not given" and do not take part in the
    sum; when neither amount is given the stored one is left as is.
    Any other non-integer amount raises ValueError.

    ``output`` is modified in place and also returned.
    """
    missing_amounts = ('-', '', None)

    def is_match(existed, new):
        return (existed.standart == new.standart
                and existed.material == new.material
                and existed.primary_size == new.primary_size)

    def add_amounts(existing_amount, new_amount):
        known = [int(amount) for amount in (new_amount, existing_amount)
                 if amount not in missing_amounts]
        return str(sum(known)) if known else existing_amount

    new = OutputRow(row)
    match = [existed for existed in output if is_match(existed, new)]
    if not match:
        output.append(new)
        return output
    if len(match) > 1:
        print('Invalid parsing algorythm')
        return output

    merge_target = match[0]
    # Remember which names were already merged into this row, so a name
    # that shows up again is not prepended a second time.
    merged_names = getattr(merge_target, 'merged_names', None)
    if merged_names is None:
        merged_names = merge_target.merged_names = [merge_target.name]
    if new.name not in merged_names:
        merged_names.append(new.name)
        merge_target.name_material = ', '.join([new.name, merge_target.name_material])
    print("match on primary size detected")

    new_param = new.extra_params[0]
    for param in merge_target.extra_params:
        if param.size == new_param.size:
            # Same full size: merge like [s-a1+a2, ...],
            # the numbers of blanks are added together.
            print('size equal match detected')
            param.amount = add_amounts(param.amount, new_param.amount)
            return output
    # Different size: merge in list [s-a; s-a...]
    merge_target.extra_params += new.extra_params
    return output

def merge(rows):
    """
    Pre-processes the raw rows, merges them one by one and returns
    the merged rows in the list-of-lists form used for the output.

    Every returned row holds, in order: name with material, nomen,
    the extra params rendered as 'size, amount' pairs joined by '; ',
    nomen number, code, category, standard, units and amount.
    """
    print('Processing output...')
    print(' ')

    merged = []
    for prepared_row in pre_process(rows):
        merged = merge_row(merged, prepared_row)

    def render_extra_params(item):
        pairs = (', '.join([param.size, param.amount])
                 for param in item.extra_params)
        return '; '.join(pairs)

    return [
        [
            item.name_material,
            item.nomen,
            render_extra_params(item),
            item.nomen_number,
            item.code,
            item.category,
            item.standart,
            item.units,
            item.amount,
        ]
        for item in merged
    ]

def build_results_file(rows, result_file_path):
    """
    Build an excel file based on result rows and a given path.

    The workbook is loaded from 'template.xlsx' (resolved against the
    current working directory), every result row is appended to its
    active sheet with all values converted to strings, and the
    workbook is saved.

    ``result_file_path`` may be either
      * an existing directory - the file is then saved inside it under
        DEFAULT_RESULT_FILE_NAME, or
      * the full path of the file to write.

    ``rows`` is left unmodified.
    """
    wb = openpyxl.load_workbook('template.xlsx')
    # DEFAULT_RESULT_FILE_PATH already ends with the file name, so the
    # name must only be added when a directory was passed in; joining
    # unconditionally would yield '<dir>/output.xlsx/output.xlsx'.
    if os.path.isdir(result_file_path):
        dest_filename = os.path.join(result_file_path, DEFAULT_RESULT_FILE_NAME)
    else:
        dest_filename = result_file_path
    ws = wb.active
    for row in rows:
        # Plain str values are written; a fresh list is built so the
        # caller's rows are not overwritten.
        ws.append([str(value) for value in row])
    wb.save(filename=dest_filename)
   
def process_files(dir_path=DEFAULT_DIR_PATH, result_file_path=DEFAULT_RESULT_FILE_PATH):
    """
    Application level logic.

    Loads every workbook returned by ``get_files(dir_path)`` and
    collects the data rows of each of its sheets: rows starting at
    FISRT_LIST_FIRST_DATA_ROW on the sheet titled FISRT_LIST and rows
    starting at OTHER_LISTS_FIRST_DATA_ROW on any other sheet. Cell
    values are read through ``getValueWithMergeLookup`` so that merged
    cells are resolved. The collected rows are merged and the merged
    rows are printed, preceded by their count.

    ``result_file_path`` is only needed by the ``build_results_file``
    call, which is currently commented out.

    Any exception is caught and reported on stdout together with its
    traceback; it is not re-raised. Nothing is returned.
    """
    try:
        files = get_files(dir_path)
        if not files:
            print('No files to process')
            return

        rows_to_process = []
        for file in files:
            workbook = openpyxl.load_workbook(filename=file)
            for sheet in workbook:
                # Compare by title instead of looking the sheet up with
                # workbook[FISRT_LIST]: the lookup raises KeyError for a
                # workbook that has no sheet with that title.
                if sheet.title == FISRT_LIST:
                    first_data_row = FISRT_LIST_FIRST_DATA_ROW
                else:
                    first_data_row = OTHER_LISTS_FIRST_DATA_ROW
                for row in sheet:
                    # The row number is taken from the row's first cell.
                    if not row or row[0].row < first_data_row:
                        continue
                    rows_to_process.append(
                        [getValueWithMergeLookup(sheet, cell) for cell in row]
                    )

        result = merge(rows_to_process)
        print(len(result))
        for row in result:
            print(row)
        #build_results_file(result, result_file_path)
        print(' ')
        print('Success')
    except Exception as ex:
        print('Error while processing')
        print(ex)
        traceback.print_exc()


In [184]:
# Run the whole pipeline over the workbooks found under c:/kub.
# NOTE(review): hard-coded, machine-specific path - consider keeping the
# input directory in a configurable constant instead.
process_files('c:/kub')

Processing output...
 
match on primary size detected
match on primary size detected
match on primary size detected
match on primary size detected
match on primary size detected
match on primary size detected
match on primary size detected
match on primary size detected
size equal match detected
match on primary size detected
match on primary size detected
size equal match detected
68
['Молот Тора, Лист - Ст 20', '-', '≠1×15×20, 2', '-', '-', '-', 'Г.1577-93', 'м²', '-']
['Круг - Ст 12Х18Н9Т', '-', 'Ø20×30, 1', '-', '-', '-', 'Г.5632-72', '-', '-']
['Круг - Ст 12Х18Н9Т', '-', 'Ø22×17, 1', '-', '-', '-', 'Г.5632-72', '-', '-']
['Шестигранник 5,5h-12 - Ст 45', '-', 'L=270, -', '-', '-', '-', 'Г.1051-73', 'мм', '-']
['Круг - Ст 12Х18Н10Т', '-', 'Ø6×25, -', '-', '-', '-', 'Г.5632-72', '-', '-']
['QWERTY, Альтрон, Круг - Ст 20', '-', 'Ø7×120, -; Ø7×180, -; Ø7×600, -; Ø7×120000, 125', '-', '-', '-', 'Г.1050-88', '-', '-']
['Круг - Ст 20', '-', 'Ø8×40, -', '-', '-', '-', 'Г.1050-88', '-', '-'