To do list:

- [x] - List all xlsx files in dir

> ReadAnchorData.get_xlsx_files()
---
- [x] - Create dict to identify anchor file category

> AnchorCategories.types
---
- [x] - Create method for reading each file type and appropriate sheet name
    - [x] - BOM
    > ReadAnchorData.read_bom()

    - [x] - Forecast
    > ReadAnchorData.readfcst()
---
- [x] - Create method to run through all files and run appropriate method to get data

> ReadAnchorData.read_files()
---
- [x] - Consolidate data into appropriate dataframes with identifier info to separate out each source/release
> ReadAnchorData.prep_group()
---
- [x] - Write data to file
---

- [ ] - Stretch - write function to read only newly added files

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import random

In [76]:
class ReadAnchorData:
    '''Namespace of static helpers that load "anchor" Excel exports.

    Usage
    -----
    1. List the workbooks that sit exactly one folder below a base directory:

    >>> paths = ReadAnchorData.get_xlsx_files(base_dir)

    2. Read every workbook whose path contains one of the keys of
       ``AnchorCategories.types``. Each key maps to an output group name and
       to the reader (``read_bom`` or ``readfcst``) used for that file kind:

    >>> frames = ReadAnchorData.read_files(paths)

    3. Reshape the grouped frames for output:

    >>> result = ReadAnchorData.prep_group(frames)
    '''

    @staticmethod
    def get_xlsx_files(base_dir):
        '''Return the resolved paths (as strings) of ``<base_dir>/*/*.xlsx``.

        Only workbooks exactly one folder below ``base_dir`` are matched; the
        readers derive the release from that folder's name.
        '''
        return [str(p.resolve()) for p in Path(base_dir).glob("*/*.xlsx")]

    @staticmethod
    def read_files(file_paths):
        '''Read all recognised workbooks and group the frames by category name.

        Returns a dict keyed by the group names found in
        ``AnchorCategories.types``. A group with no readable file maps to an
        empty DataFrame. Within a group the most recently read file comes
        first.
        '''
        collected = {name: [] for name, _ in AnchorCategories.types.values()}
        for key, (name, method) in AnchorCategories.types.items():
            print(name)
            for file in ReadAnchorData.get_type_of_file(key, file_paths):
                frame = method(file, key)
                # Readers return None when no sheet has the indicator column.
                if frame is not None:
                    collected[name].append(frame)
        # One concat per group instead of re-concatenating inside the loop;
        # the list is reversed so the newest-first row order is kept.
        return {
            name: pd.concat(frames[::-1]) if frames else pd.DataFrame()
            for name, frames in collected.items()
        }

    @staticmethod
    def get_type_of_file(key, file_paths):
        '''Return the paths from ``file_paths`` that contain ``key``.'''
        return [x for x in file_paths if key in x]

    @staticmethod
    def get_release(filepath: Path):
        '''Return the last 7 characters of the absolute parent folder path.'''
        # The release is given by the name of the parent folder
        return str(filepath.parent.absolute())[-7:]

    @staticmethod
    def readfcst(filepath, category):
        '''Read the first sheet containing 'Total Forecast' in long format.

        Workbooks can hold several sheets with no standard naming, so sheets
        are tried in order until one has the indicator column. Its forecast
        columns (headers containing '20', e.g. 202101) are melted into
        'YYYYMM' / 'SetForecast' rows and tagged with 'release',
        'sourceFile', 'keyfigure' (= ``category``) and 'isForecast'.

        Returns None when no sheet has the indicator column.
        '''
        parent, filename = ReadAnchorData.file_info(filepath)
        # Parent folder is expected to look like 'YYYY-MM'; dropping the 5th
        # character yields 'YYYYMM'.
        release = parent[:4] + parent[5:]
        # Context manager closes the workbook handle; the file is opened once.
        with pd.ExcelFile(filepath) as workbook:
            for sheet in workbook.sheet_names:
                # keep_default_na=False disables pandas' default NaN markers.
                df: pd.DataFrame = pd.read_excel(
                    workbook, sheet_name=sheet, keep_default_na=False
                )
                if 'Total Forecast' not in df.columns:
                    continue
                df = df.drop('Total Forecast', axis=1)
                # str() guards against headers parsed as numbers.
                dates = [c for c in df.columns if '20' in str(c)]
                dims = [c for c in df.columns if '20' not in str(c)]
                df = df.melt(
                    id_vars=dims,
                    value_vars=dates,
                    var_name='YYYYMM',
                    value_name='SetForecast',
                )
                df['release'] = release
                df['sourceFile'] = filename
                df['keyfigure'] = category
                # True for periods at or after the release month.
                df['isForecast'] = (
                    df['YYYYMM'].astype('int32') >= df['release'].astype('int32')
                )
                return df
        return None

    @staticmethod
    def read_bom(filepath, category):
        '''Read the first sheet containing a 'BOM' column.

        The frame is tagged with 'release' and 'sourcefile' (lower-case 'f',
        unlike ``readfcst``; kept so the output schema does not change).
        ``category`` is accepted for signature parity with ``readfcst`` and is
        not used. Returns None when no sheet has a 'BOM' column.
        '''
        parent, filename = ReadAnchorData.file_info(filepath)
        with pd.ExcelFile(filepath) as workbook:
            for sheet in workbook.sheet_names:
                df: pd.DataFrame = pd.read_excel(workbook, sheet_name=sheet)
                if 'BOM' not in df.columns:
                    continue
                df['release'] = parent[:4] + parent[5:]
                df['sourcefile'] = filename
                return df
        return None

    @staticmethod
    def file_info(pth):
        '''Return ``(parent_folder_name, file_name)`` for a path string.

        PureWindowsPath treats both backslash and forward slash as
        separators, so paths from either platform are split correctly.
        '''
        from pathlib import PureWindowsPath

        location = PureWindowsPath(pth)
        return location.parent.name, location.name

    @staticmethod
    def prep_group(data: dict):
        '''Prepare the grouped frames from ``read_files`` for output.

        'BOM' is passed through unchanged. 'SetForecast' is pivoted so every
        'keyfigure' value becomes its own column holding the summed
        'SetForecast'. Any other key is dropped.
        '''
        # Columns that must not become part of the pivot index. Not every
        # export has all of them, so missing labels are ignored.
        non_key_cols = [
            'keyfigure', 'sourceFile', 'SetForecast',
            'CAPEX Cost$"', 'CAPEX Cost$', 'Cost$',
        ]
        result = {}
        for key, value in data.items():
            if key == 'BOM':
                result[key] = value
            elif key == 'SetForecast':
                if value.empty:
                    # Nothing was read for this group; nothing to pivot.
                    result[key] = value
                    continue
                df = value.reset_index(drop=True).convert_dtypes()
                key_cols = list(df.columns.drop(non_key_cols, errors='ignore'))
                result[key] = pd.pivot_table(
                    df,
                    values='SetForecast',
                    index=key_cols,
                    columns=['keyfigure'],
                    aggfunc='sum',
                ).reset_index()
        return result


class AnchorCategories:
    '''Lookup table used by ReadAnchorData.read_files.

    ``types`` maps a marker searched for in each file path to a pair of
    (output group name, reader function for that kind of workbook).
    '''

    # All three forecast variants share one group name and one reader.
    _FORECAST = ('SetForecast', ReadAnchorData.readfcst)

    types = {
        'Bom_Report': ('BOM', ReadAnchorData.read_bom),
        'SetsPerMonthcombined': _FORECAST,
        'SetsPerMonthConst': _FORECAST,
        'SetsPerMonthUnConst': _FORECAST,
    }


In [3]:
class GetAnchorData:
    '''Collect anchor workbooks, read them, and write the results as CSV.

    Typical flow: ``get_xlsx_files`` -> ``read_files(self.filepaths)`` ->
    ``write_files``. The Excel parsing lives in the nested ``ReadMethods``.
    '''

    def __init__(self):
        self.filepaths = []  # resolved workbook paths found by get_xlsx_files
        self.files = {}
        self.set_forecast = pd.DataFrame()  # filled by read_files
        self.set_definitions = pd.DataFrame()  # filled by read_files

    def get_xlsx_files(self, base_dir):
        '''Store the resolved paths of ``<base_dir>/*/*.xlsx`` in ``self.filepaths``.

        Only workbooks exactly one folder below ``base_dir`` are matched; the
        readers derive the release from that folder's name.
        '''
        self.filepaths = [str(p.resolve()) for p in Path(base_dir).glob("*/*.xlsx")]

    def read_files(self, filepaths):
        '''Read forecast and BOM workbooks out of ``filepaths``.

        Paths containing 'GLOBAL_Sets_PerMonth_' feed ``self.set_forecast``;
        paths containing 'Bom_Report' feed ``self.set_definitions``. A kind
        with no matching file yields an empty DataFrame.
        '''
        reader = self.ReadMethods()
        forecast_paths = [x for x in filepaths if 'GLOBAL_Sets_PerMonth_' in x]
        self.set_forecast = reader.readfcsts(forecast_paths)
        bom_paths = [x for x in filepaths if 'Bom_Report' in x]
        self.set_definitions = reader.readboms(bom_paths)

    def write_files(self, path):
        '''Write the non-empty frames to ``setforecast.csv`` / ``setdefinitions.csv`` in ``path``.'''
        # Path joins with the platform separator instead of a literal '\\'.
        out_dir = Path(path)
        if not self.set_forecast.empty:
            self.set_forecast.to_csv(out_dir / 'setforecast.csv', index=False)
        if not self.set_definitions.empty:
            self.set_definitions.to_csv(out_dir / 'setdefinitions.csv', index=False)

    class ReadMethods:
        '''Excel readers used by GetAnchorData.'''

        def readfcsts(self, filepaths: list):
            '''Read forecast workbooks into one aggregated long-format frame.

            Workbooks can hold several sheets with no standard naming. Every
            non-empty sheet that has both 'Total Forecast' and
            'Display Option' columns is used; other sheets are skipped. The
            forecast columns (headers containing '20', e.g. 202101) are
            melted into 'YYYYMM' rows whose value column is named after the
            sheet's first 'Display Option' entry. Zero and missing values
            are dropped, then rows are summed over all remaining columns.

            Returns an empty DataFrame when nothing could be read.
            '''
            all_dfs = []
            for filepath in filepaths:
                parent, filename = self.file_info(filepath)
                # Parent folder is expected to look like 'YYYY-MM'.
                release = parent[:4] + parent[5:]
                release = ''.join([x for x in release if x.isalnum()])
                dfs = []
                # Context manager closes the workbook; it is opened only once.
                with pd.ExcelFile(filepath) as workbook:
                    for sheet in workbook.sheet_names:
                        df: pd.DataFrame = pd.read_excel(
                            workbook, sheet_name=sheet, keep_default_na=False
                        )
                        required = {'Total Forecast', 'Display Option'}
                        if df.empty or not required.issubset(df.columns):
                            continue
                        df = df.drop('Total Forecast', axis=1)
                        displayoption = df.iloc[0].loc['Display Option']
                        # str() guards against headers parsed as numbers.
                        dates = [i for i in df.columns if '20' in str(i)]
                        dims = [i for i in df.columns if '20' not in str(i)]
                        df = df.melt(
                            id_vars=dims,
                            value_vars=dates,
                            var_name='YYYYMM',
                            value_name=displayoption,
                        )
                        df = df[df[displayoption] != 0]
                        df = df[~df[displayoption].isna()]
                        dfs.append(df)
                print(filepath)
                if not dfs:
                    continue
                df = pd.concat(dfs)
                df['release'] = release
                df['sourceFile'] = filename
                # True for periods at or after the release month.
                df['isForecast'] = (
                    df['YYYYMM'].astype('int32') >= df['release'].astype('int32')
                )
                all_dfs.append(df)
            if not all_dfs:
                # pd.concat raises on an empty list; callers test `.empty`.
                return pd.DataFrame()
            df = pd.concat(all_dfs)
            df = df.drop(['Display Option', 'sourceFile'], axis=1)
            # Group by every column except the value columns being summed.
            cols = df.columns.to_list()
            for name in ['Combined', 'Constrained', 'Supply Update', 'Un-Constrained',
                         'Display Option', 'Total Forecast', 'sourceFile']:
                self.remove_ignore_error(cols, name)
            df = df.groupby(by=cols, dropna=False).sum().reset_index()
            return df

        @staticmethod
        def remove_ignore_error(list_obj, x):
            '''Remove the first ``x`` from ``list_obj`` in place; do nothing if absent.'''
            try:
                list_obj.remove(x)
            except ValueError:
                pass  # do nothing!

        def readboms(self, filepaths: list):
            '''Read every sheet of every BOM workbook into one frame.

            Each file's rows are tagged with 'release' and 'sourceFile'.
            Returns an empty DataFrame when ``filepaths`` is empty.
            '''
            all_dfs = []
            for filepath in filepaths:
                parent, filename = self.file_info(filepath)
                release = parent[:4] + parent[5:]
                release = ''.join([x for x in release if x.isalnum()])
                # sheet_name=None loads all sheets (in workbook order) at once.
                sheets = pd.read_excel(filepath, sheet_name=None, keep_default_na=False)
                df = pd.concat(list(sheets.values()))
                df['release'] = release
                df['sourceFile'] = filename
                all_dfs.append(df)
                print(filepath)
            if not all_dfs:
                # pd.concat raises on an empty list; callers test `.empty`.
                return pd.DataFrame()
            return pd.concat(all_dfs)

        def file_info(self, pth):
            '''Return ``(parent_folder_name, file_name)`` for a path string.

            PureWindowsPath treats both backslash and forward slash as
            separators, so paths from either platform are split correctly.
            '''
            from pathlib import PureWindowsPath

            location = PureWindowsPath(pth)
            return location.parent.name, location.name

    # Marker in the file path -> (output group name, reader).
    methods = {
        # 'Bom_Report': ('BOM', ReadAnchorData.read_bom),
        'GLOBAL_Sets_PerMonth': ('SetForecast', ReadMethods.readfcsts)
        }

In [4]:
# Folder (relative to the working directory) whose sub-folders hold the
# workbooks. No trailing separator: it is only ever wrapped in Path(), and
# the previous raw string embedded two literal, Windows-only backslashes.
base_dir = 'data'

In [5]:
# Discover the workbooks, then read only the BOM reports into `data`.
test = GetAnchorData()
test.get_xlsx_files(base_dir)
paths_to_use = list(filter(lambda found: 'Bom_Report' in found, test.filepaths))
bom_reader = test.ReadMethods()
data = bom_reader.readboms(paths_to_use)

C:\Users\jonny\Documents\Programming\data-utility\data\2022-05\SP Bom_Report05-09-2022.xlsx


  warn("Workbook contains no default style, apply openpyxl's default")


C:\Users\jonny\Documents\Programming\data-utility\data\2022-06\SP Bom_Report06-06-2022.xlsx


In [6]:
data

Unnamed: 0,Business Unit,Product Line,Brand,Sub Brand,BOM,BOM Description,BOM Start Date,BOM End Date,BOM Category,Business Category,...,Comments,Anchor BOM,NaEventId,BOM Region,Created By,Created At,Updated By,Updated At,release,sourceFile
0,SPINE,MIS FIXATION,TELIGEN,SPI_BI_ASPAC_TELIGEN_001,001_ASPAC_TELIGEN,WW PROCEDURE KIT PRO,202204,209912,CORE,NPI,...,,,,ASPAC,SJain110,"Thu, 07 Apr 2022 14:15:49 GMT",,"Thu, 01 Jan 1970 00:00:00 GMT",202205,SP Bom_Report05-09-2022.xlsx
1,SPINE,ANTERIOR CERVICAL,PROTI 360 ACIS,SPI_BI_AU_ACIS_001,001_AU_ACIS,DPY/SYN_INS_AU_ACIS PROTI 360,202106,209912,CORE,BASE BUSINESS,...,,NO MIXED,,ASPAC,BRanawat,"Fri, 02 Jul 2021 12:04:35 GMT",SSunkar8,"Tue, 09 Nov 2021 03:49:10 GMT",202205,SP Bom_Report05-09-2022.xlsx
2,SPINE,ANTERIOR CERVICAL,PROTI 360 ACIS,SPI_BI_AU_ACIS_001,001_AU_ACIS,DPY/SYN_INS_AU_ACIS PROTI 360,202106,209912,CORE,BASE BUSINESS,...,,NO MIXED,,ASPAC,BRanawat,"Fri, 02 Jul 2021 13:00:30 GMT",BACKGROUND JOB,"Sat, 11 Dec 2021 04:27:42 GMT",202205,SP Bom_Report05-09-2022.xlsx
3,SPINE,ANTERIOR CERVICAL,PROTI 360 ACIS,SPI_BI_AU_ACIS_001,001_AU_ACIS,DPY/SYN_INS_AU_ACIS PROTI 360,202106,209912,CORE,BASE BUSINESS,...,,NO MIXED,,ASPAC,BRanawat,"Fri, 02 Jul 2021 13:00:30 GMT",BACKGROUND JOB,"Sat, 11 Dec 2021 04:27:43 GMT",202205,SP Bom_Report05-09-2022.xlsx
4,SPINE,ANTERIOR CERVICAL,PROTI 360 ACIS,SPI_BI_AU_ACIS_001,001_AU_ACIS,DPY/SYN_INS_AU_ACIS PROTI 360,202106,209912,CORE,BASE BUSINESS,...,,NO MIXED,,ASPAC,BRanawat,"Fri, 02 Jul 2021 13:00:30 GMT",BACKGROUND JOB,"Sat, 11 Dec 2021 04:27:43 GMT",202205,SP Bom_Report05-09-2022.xlsx
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48658,SPINE,IB FUSION - POSTERIOR,CONDUIT PLIF,SPI_BI_KR_CONDUIT_PLIF_,_KR_CONDUIT_PLIF,DS_INS_KR_CONDUIT PLIF,202106,209912,CORE,NPI,...,,,,ASPAC,BRanawat,"Mon, 28 Jun 2021 14:14:19 GMT",SSunkar8,"Tue, 09 Nov 2021 03:50:01 GMT",202206,SP Bom_Report06-06-2022.xlsx
48659,SPINE,IB FUSION - POSTERIOR,CONDUIT TLIF-C,SPI_BI_KR_CONDUIT_TLIF_,_KR_CONDUIT_TLIF,DS_INS_KR_CONDUIT TLIF,202106,209912,CORE,NPI,...,,,,ASPAC,BRanawat,"Mon, 28 Jun 2021 14:14:19 GMT",SSunkar8,"Tue, 09 Nov 2021 03:50:01 GMT",202206,SP Bom_Report06-06-2022.xlsx
48660,SPINE,ANTERIOR CERVICAL,CONDUIT CIF,SPI_BI_SG_CONDUIT_CIF_,_SG_CONDUIT_CIF,SG_CONDUIT - CIF,202106,209912,CORE,NPI,...,,,,ASPAC,BRanawat,"Mon, 28 Jun 2021 14:14:19 GMT",SSunkar8,"Tue, 09 Nov 2021 03:50:01 GMT",202206,SP Bom_Report06-06-2022.xlsx
48661,SPINE,IB FUSION - POSTERIOR,CONDUIT PLIF,SPI_BI_SG_CONDUIT_PLIF_,_SG_CONDUIT_PLIF,SG_CONDUIT - PLIF,202106,209912,CORE,NPI,...,,,,ASPAC,BRanawat,"Mon, 28 Jun 2021 14:14:19 GMT",SSunkar8,"Tue, 09 Nov 2021 03:50:01 GMT",202206,SP Bom_Report06-06-2022.xlsx


In [1]:
# Work on a trimmed copy of the loaded frame; drop() returns a new frame, so
# `data` keeps all of its columns.
df = data.drop(columns=['Display Option', 'sourceFile'])
cols = list(df.columns)
print(cols)
def remove_ignore_error(list_obj, x):
    """Delete the first occurrence of ``x`` from ``list_obj`` in place.

    A value that is not present is silently ignored. Always returns None.
    """
    try:
        position = list_obj.index(x)
    except ValueError:
        return  # not present: nothing to remove
    del list_obj[position]
# Take the value columns (and bookkeeping columns) out of the group-by keys,
# then sum the remaining columns per unique key combination.
for _excluded in ['Combined', 'Constrained', 'Supply Update', 'Un-Constrained', 'Display Option', 'Total Forecast', 'sourceFile']:
    remove_ignore_error(cols, _excluded)
df2 = df.groupby(cols, dropna=False).sum().reset_index()

NameError: name 'data' is not defined

In [None]:
# Sum every non-key column per unique combination of the `cols` keys
# (dropna=False keeps groups whose key contains NaN).
df2 = df.groupby(cols, dropna=False).sum().reset_index(drop=False)

In [None]:
# Row-wise total across the four key-figure columns.
df2['Value'] = (
    df2[['Combined', 'Constrained', 'Supply Update', 'Un-Constrained']]
    .sum(axis='columns')
)

In [None]:
# Discard rows whose total is exactly zero.
df2 = df2.loc[df2['Value'].ne(0)]

In [None]:
# Export the aggregated frame as <write_dir>\anchorpull.csv, without the index.
key = 'anchorpull'
write_dir = r'C:\Users\jgreenw9\Desktop\Waterfall'
df2.to_csv(path_or_buf=f'{write_dir}\\{key}.csv', index=False)

In [None]:
df2

Unnamed: 0,Business Unit,Business Category,Region,Country,Product Line,Brand,Sub Brand/Event ID,BOM,BOM Description,BOM Category,...,release,isForecast,Market,shortFall,carryover,Combined,Constrained,Supply Update,Un-Constrained,Value
87,JOINTS,BASE BUSINESS,ASPAC,95,KNEES,ATTUNE PRIMARY,ATTUNE PRIMARY CEMENTED,AP_9501_ATTUNE STANDARD,ATTUNE STANDARD,CORE,...,202203,True,,,,15.0,15.0,0.0,15.0,45.0
369,JOINTS,BASE BUSINESS,ASPAC,95,SHOULDERS,GLOBAL UNITE,GLOBAL UNITE SHORT STEM,AP_9501_GLOBAL UNITE SHORT STEM,GLOBAL UNITE SHORT STEM,CORE,...,202203,False,,,,0.0,0.0,0.0,10.0,10.0
370,JOINTS,BASE BUSINESS,ASPAC,95,SHOULDERS,GLOBAL UNITE,GLOBAL UNITE SHORT STEM,AP_9501_GLOBAL UNITE SHORT STEM,GLOBAL UNITE SHORT STEM,CORE,...,202203,False,,,,0.0,0.0,0.0,10.0,10.0
442,JOINTS,BASE BUSINESS,ASPAC,95,SHOULDERS,GLOBAL UNITE,GLOBAL UNITE SHORT STEM,AP_9501_GLOBAL UNITE SHORT STEM_IMPLANTS,GLOBAL UNITE SHORT STEM_IMPLANTS,CORE,...,202203,False,,,,0.0,0.0,0.0,10.0,10.0
505,JOINTS,BASE BUSINESS,ASPAC,AU,HIPS,AAI,AAI INSTRUMENTS,AP_AU01_ANTERIOR APPROACH RETRACTOR INSTRUMENT...,AAI INSTRUMENTS,CORE,...,202203,False,,,,0.0,0.0,0.0,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13691139,TRAUMA,NPI,,,TR94_NEW PRODUCTS,TR949999_NEW PRODUCTS,TR_NA_US_Mini Frag,TI PLATES,TI PLATES,CORE,...,202206,False,91,,,0.0,3.0,0.0,3.0,6.0
13691148,TRAUMA,NPI,,,TR94_NEW PRODUCTS,TR949999_NEW PRODUCTS,TR_NA_US_Mini Frag,TI PLATES,TI PLATES,CORE,...,202204,True,91,,,5.0,5.0,0.0,5.0,15.0
13691153,TRAUMA,NPI,,,TR94_NEW PRODUCTS,TR949999_NEW PRODUCTS,TR_NA_US_Mini Frag,TI PLATES,TI PLATES,CORE,...,202205,True,91,,,5.0,5.0,0.0,5.0,15.0
13691154,TRAUMA,NPI,,,TR94_NEW PRODUCTS,TR949999_NEW PRODUCTS,TR_NA_US_Mini Frag,TI PLATES,TI PLATES,CORE,...,202206,False,91,0.0,,5.0,0.0,0.0,0.0,5.0


In [None]:
# Cast the 'Value' column of df to 64-bit integers.
# NOTE(review): in the cells visible here 'Value' is only ever assigned on
# df2, not on df — confirm df really carries a 'Value' column before running.
df['Value'] = df['Value'].astype('int64')

In [None]:
# Keep only the rows of df whose 'Value' is non-zero.
# NOTE(review): in the cells visible here 'Value' is only ever assigned on
# df2, not on df — confirm which frame this filter is meant for.
data2 = df[df['Value'] != 0]