To do list:

- [x] - List all xlsx files in dir

> ReadAnchorData.get_xlsx_files()
---
- [x] - Create dict to identify anchor file category

>ReadAnchorData.types
---
- [x] - Create method for reading each file type and appropiate sheet name
    - [x] - BOM
    > ReadAnchorData.read_bom()

    - [x] - Forecast
    > ReadAnchorData.read_fcst()
---
- [x] - Create method to run through all files and run appropiate method to get data

> ReadAnchorData.read_files()
---
- [x] - Consolidate data into appropaite dataframes with identifier info to seperate out each source/release
> ReadAnchorData.prep_group()
---
- [x] - Write data to file
---

[ ] - Stretch - write function only read newly added files

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import random

In [63]:
class ReadAnchorData:
    '''Usage
    -
    - Start with reading all xlsx files within a directory

    >>> def get_xlsx_files(base_dir)

    - Loop through types category, filtering to files that fit within cat and then running the method for pulling the data
    
    >>> def read_files(file_paths)
    This will run the following methods based on the types attr in AnchorCategories
    >>> class AnchorCategories.types
        >>> def read_fcst()
        >>> def read_bom()

    - Merge data where applicable
    '''

    def get_xlsx_files(base_dir):
        # Returns all xlsx files (incl subdirs)
        file_paths = Path(base_dir).glob("*/*.xlsx")
        return [str(x.resolve()) for x in file_paths]


    def read_files(file_paths):
        dfs = {}
        for key, value in AnchorCategories.types.items():
            name, method = value
            dfs[name] = pd.DataFrame()
        for key, value in AnchorCategories.types.items():
            name, method = value
            print(name)
            applicable_files = [x for x in file_paths if key in x]
            # applicable_files = ReadAnchorData.get_type_of_file(key, file_paths)
            for file in applicable_files:
                dfs[name] = pd.concat([method(file, key), dfs[name]])
        return dfs

    def get_type_of_file(key, file_paths):
        return [x for x in file_paths if key in x]

    def get_release(filepath : Path):
        # The release is given by the name of the parent folder
        parentfol = str(filepath.parent.absolute())
        return parentfol[len(parentfol)-7:]

    def readfcst(filepath, category):
        # The xlsx files can contain multiple sheets and no std naming convention. This will open each sheet until it finds the matching indicator columns
        # Each forecast will have these indicator columns, but the forecast column headers (202101, 202102 etc) can be different. These are transposed to the YYYYMM column
        sheetnames = pd.ExcelFile(filepath).sheet_names
        for sheet in sheetnames:
            df : pd.DataFrame = pd.read_excel(filepath, sheet, keep_default_na=False)
            indicator_cols = ['Total Forecast']
            if indicator_cols[0] in df.columns:
                df = df.drop('Total Forecast', axis=1)
                dates = [i for i in df.columns if '20' in i] # This will get all forecast column names
                dims = [i for i in df.columns if '20' not in i] # Inverse of the above
                df = df.melt(id_vars=dims, value_vars=dates, var_name='YYYYMM', value_name='SetForecast')
                parent, filename = ReadAnchorData.file_info(filepath)
                df['release'] = parent[:4]+parent[5:]
                df['sourceFile'] = filename
                df['keyfigure'] = category
                df['isForecast'] = df['YYYYMM'].astype('int32') >= df['release'].astype('int32')
                return df

    def read_bom(filepath, category):
        sheetnames = pd.ExcelFile(filepath).sheet_names
        for sheet in sheetnames:
            df : pd.DataFrame = pd.read_excel(filepath, sheet)
            indicator_cols = ['BOM']
            if indicator_cols[0] in df.columns:
                parent, filename = ReadAnchorData.file_info(filepath)
                df['release'] = parent[:4]+parent[5:]
                df['sourcefile'] = filename
                return df

    def file_info(pth):
        start = -1
        stop = -1
        for _ in range(0, 2):
            start = stop
            stop = pth[::-1].find(f'\\', stop + 1)
        parent = pth[len(pth)-stop:len(pth)-start-1]
        filename = pth[len(pth)-start:]
        return parent, filename

    def prep_group(data : dict):
        result = {}
        for key, value in data.copy().items():
            if key == 'BOM':
                result[key] = value
            elif key == 'SetForecast':
                df = value
                df = df.reset_index(drop=True)
                cols = ['keyfigure', 'sourceFile', 'SetForecast', 'CAPEX Cost$"', 'CAPEX Cost$', 'Cost$']
                df = df.convert_dtypes('string')
                key_cols = list(df.columns.drop(cols))
                result[key] = pd.pivot_table(df,values='SetForecast',index=key_cols,columns=['keyfigure'],aggfunc=np.sum).reset_index()
        return result


class AnchorCategories:
    types = {
        'Bom_Report': ('BOM', ReadAnchorData.read_bom),
        'SetsPerMonthcombined': ('SetForecast', ReadAnchorData.readfcst),
        'SetsPerMonthConst': ('SetForecast', ReadAnchorData.readfcst),
        'SetsPerMonthUnConst': ('SetForecast', ReadAnchorData.readfcst)
        }


In [60]:
class GetAnchorData:
    def __init__(self):
        filepaths = []
        files = {}

    def get_xlsx_files(self, base_dir):
        # Returns all xlsx files (incl subdirs)
        file_paths = Path(base_dir).glob("*/*.xlsx")
        result = [str(x.resolve()) for x in file_paths]
        self.filepaths = result

    
    class ReadMethods:
        def readfcst(self, filepath, category):
            # The xlsx files can contain multiple sheets and no std naming convention. This will open each sheet until it finds the matching indicator columns
            # Each forecast will have these indicator columns, but the forecast column headers (202101, 202102 etc) can be different. These are transposed to the YYYYMM column
            sheetnames = pd.ExcelFile(filepath).sheet_names
            dfs = []
            for sheet in sheetnames:
                df : pd.DataFrame = pd.read_excel(filepath, sheet, keep_default_na=False)
                df = df.drop('Total Forecast', axis=1)
                displayoption = df.iloc[0].loc['Display Option']
                dates = [i for i in df.columns if '20' in i] # This will get all forecast column names
                dims = [i for i in df.columns if '20' not in i] # Inverse of the above
                df = df.melt(id_vars=dims, value_vars=dates, var_name='YYYYMM', value_name=displayoption)
                dfs.append(df)
            df = pd.concat(dfs)
            parent, filename = self.file_info(filepath)
            df['release'] = parent[:4]+parent[5:]
            df['sourceFile'] = filename
            df['isForecast'] = df['YYYYMM'].astype('int32') >= df['release'].astype('int32')
            return df

        def file_info(pth):
            start = -1
            stop = -1
            for _ in range(0, 2):
                start = stop
                stop = pth[::-1].find(f'\\', stop + 1)
            parent = pth[len(pth)-stop:len(pth)-start-1]
            filename = pth[len(pth)-start:]
            return parent, filename

In [6]:
base_dir = r'data\\'
all_files = ReadAnchorData.get_xlsx_files(base_dir)

In [61]:
test = GetAnchorData()
test2 = test.ReadMethods.readfcst(all_files[0],'SetForecast')

In [62]:
test2

Unnamed: 0,Business Unit,Business Category,Region,Market,Product Line,Brand,Sub Brand/Event ID,BOM,BOM Description,BOM Category,Demand StreamId,CAPEX Cost$,YYYYMM,Combined,release,sourceFile,isForecast
0,JOINTS,BASE BUSINESS,EMEA,IR,KNEES,ATTUNE,ATTUNE PRIMARY CEMENTED,(IR) ATTUNE,ATTUNE PRIMARY TKR,CORE,Non Revenue,20251.30,202101,0,202205,GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx,False
1,JOINTS,BASE BUSINESS,EMEA,IR,HIPS,SELF CENTERING,SELF CENTERING,(IR) BIPOLAR,BIPOLAR SELF CENTRING,CORE,Non Revenue,2229.99,202101,0,202205,GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx,False
2,JOINTS,BASE BUSINESS,EMEA,94,KNEES,SIGMA HP TC3 REVISION,SIGMA HP TC3 REVISION,(IR) DIFFICULT PRIMARY KNEE,DIFFICULT PRIMARY KNEE,CORE,Non Revenue,3796.34,202101,0,202205,GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx,False
3,JOINTS,BASE BUSINESS,EMEA,IR,KNEES,SIGMA HP TC3 REVISION,SIGMA HP TC3 REVISION,(IR) DIFFICULT PRIMARY KNEE,DIFFICULT PRIMARY KNEE,CORE,Non Revenue,18445.17,202101,0,202205,GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx,False
4,JOINTS,BASE BUSINESS,EMEA,IR,HIPS,CORAIL PRIMARY,CORAIL PRIMARY,(IR)CORAIL,CORAIL,CORE,Non Revenue,8740.83,202101,0,202205,GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
716179,TRAUMA,NPI,EMEA,SE,TR14_FOOT & ANKLE,TR140500_2.7 CORTEX SCREWS,5414_Patella Fracture Fixation,YVAPA06O,VA PATELLA IMPLANT SET SCREWS TI - STERILE,OPTION,Non Revenue,0.00,202612,0,202205,GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx,True
716180,TRAUMA,NPI,EMEA,IL,TR03_SMALL FRAGMENTS,TR030127_SF - VA-LCP PATELLA,5414_Patella Fracture Fixation,YVAPA07O,VA PATELLA CUSTOMIZED SET,OPTION,Non Revenue,0.00,202612,0,202205,GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx,True
716181,TRAUMA,NPI,EMEA,IT,TR03_SMALL FRAGMENTS,TR030127_SF - VA-LCP PATELLA,5414_Patella Fracture Fixation,YVAPA08O,VA PATELLA CUSTOMIZED SET,OPTION,Non Revenue,0.00,202612,0,202205,GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx,True
716182,TRAUMA,NPI,EMEA,ES,TR03_SMALL FRAGMENTS,TR030127_SF - VA-LCP PATELLA,5414_Patella Fracture Fixation,YVAPA09O,VA PATELLA CUSTOMIZED SET,OPTION,Non Revenue,0.00,202612,0,202205,GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx,True


In [44]:
df = test2
cols = df.columns.to_list()
print(cols)
cols.remove('SetForecast')
cols.remove('')
df.pivot(columns=cols,values='SetForecast')

['Business Unit', 'Business Category', 'Region', 'Market', 'Product Line', 'Brand', 'Sub Brand/Event ID', 'BOM', 'BOM Description', 'BOM Category', 'Display Option', 'Demand StreamId', 'CAPEX Cost$', 'YYYYMM', 'SetForecast', 'release', 'sourceFile', 'isForecast']


ValueError: list.remove(x): x not in list

In [15]:
paths_to_use = random.choices(all_files,k=4)
paths_to_use

['C:\\Users\\jonny\\Documents\\Programming\\data-utility\\data\\2022-05\\GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx',
 'C:\\Users\\jonny\\Documents\\Programming\\data-utility\\data\\2022-05\\GLOBAL_Sets_PerMonth_CONSTRAINED_REPORT.xlsx',
 'C:\\Users\\jonny\\Documents\\Programming\\data-utility\\data\\2022-05\\GLOBAL_Sets_PerMonth_UNCONSTRAINED_REPORT.xlsx',
 'C:\\Users\\jonny\\Documents\\Programming\\data-utility\\data\\2022-05\\GLOBAL_Sets_PerMonth_SUPPLYUPDATE_REPORT.xlsx']

In [16]:
data = ReadAnchorData.read_files(all_files)

BOM


  warn("Workbook contains no default style, apply openpyxl's default")


SetForecast
SetForecast
SetForecast


In [6]:
data_prepped = ReadAnchorData.prep_group(data)

In [7]:
write_dir = r'C:\Users\jgreenw9\Desktop\Waterfall'
for key, value in data_prepped.items():
    print(f'{write_dir}\\{key}.csv')
    value.to_csv(f'{write_dir}\\{key}.csv', index=False)


C:\Users\jgreenw9\Desktop\Waterfall\BOM.csv
C:\Users\jgreenw9\Desktop\Waterfall\SetForecast.csv
