To do list:

- [x] - List all xlsx files in dir

> ReadAnchorData.get_xlsx_files()
---
- [x] - Create dict to identify anchor file category

>ReadAnchorData.types
---
- [x] - Create method for reading each file type and appropriate sheet name
    - [x] - BOM
    > ReadAnchorData.read_bom()

    - [x] - Forecast
    > ReadAnchorData.readfcst()
---
- [x] - Create method to run through all files and run appropriate method to get data

> ReadAnchorData.read_files()
---
- [x] - Consolidate data into appropriate dataframes with identifier info to separate out each source/release
> ReadAnchorData.prep_group()
---
- [x] - Write data to file
---

- [ ] - Stretch - write a function that only reads newly added files

In [5]:
import random
from pathlib import Path, PureWindowsPath

import numpy as np
import pandas as pd

In [6]:
class ReadAnchorData:
    '''Read anchor-report xlsx exports and consolidate them into DataFrames.

    Usage
    -
    - Start with listing the xlsx files found below a directory:
      ``ReadAnchorData.get_xlsx_files(base_dir)``

    - Read them with ``ReadAnchorData.read_files(file_paths)``. This loops
      over ``AnchorCategories.types``, keeps the files whose path contains
      the category key and runs the reader registered for that category
      (``readfcst`` or ``read_bom``).

    - Merge data where applicable with ``ReadAnchorData.prep_group(data)``.

    All methods are static: they are called on the class and keep no state.
    '''

    @staticmethod
    def get_xlsx_files(base_dir):
        """Return the resolved paths (as strings) of the xlsx files found
        exactly one directory level below ``base_dir``."""
        # "*/*.xlsx" only matches <base_dir>/<folder>/<file>.xlsx; files
        # directly in base_dir or nested deeper are not returned.
        file_paths = Path(base_dir).glob("*/*.xlsx")
        return [str(x.resolve()) for x in file_paths]

    @staticmethod
    def read_files(file_paths):
        """Read every file matching a category in ``AnchorCategories.types``.

        Returns a dict mapping each dataset name to one DataFrame holding the
        rows of all files read for it (an empty DataFrame when none matched).
        """
        frames = {name: [] for name, _ in AnchorCategories.types.values()}
        for key, (name, method) in AnchorCategories.types.items():
            print(name)
            for file in ReadAnchorData.get_type_of_file(key, file_paths):
                df = method(file, key)
                # Readers return None when no sheet has the indicator column.
                if df is not None:
                    frames[name].append(df)
        # Concatenate once per dataset instead of once per file. The list is
        # reversed so the most recently read file still comes first, as it
        # did when each new frame was prepended.
        return {
            name: pd.concat(parts[::-1]) if parts else pd.DataFrame()
            for name, parts in frames.items()
        }

    @staticmethod
    def get_type_of_file(key, file_paths):
        """Return the entries of ``file_paths`` that contain ``key``."""
        return [x for x in file_paths if key in x]

    @staticmethod
    def get_release(filepath: Path):
        """Return the last 7 characters of the absolute parent folder path.

        The release is given by the name of the parent folder.
        """
        # A negative-index slice also behaves when the path is shorter than
        # 7 characters (the whole string is returned).
        return str(filepath.parent.absolute())[-7:]

    @staticmethod
    def readfcst(filepath, category):
        """Read the forecast sheet of ``filepath`` in long format.

        The xlsx files can contain multiple sheets and have no standard
        naming convention, so sheets are opened in order until one with a
        'Total Forecast' column is found. The forecast column headers
        (202101, 202102 etc) differ between files; they are unpivoted into
        the 'YYYYMM' column with their values in 'SetForecast'.

        Returns the reshaped DataFrame with 'release', 'sourceFile',
        'keyfigure' (= ``category``) and 'isForecast' added, or None when no
        sheet has the indicator column.
        """
        parent, filename = ReadAnchorData.file_info(filepath)
        # Open the workbook once and close it when done.
        with pd.ExcelFile(filepath) as workbook:
            for sheet in workbook.sheet_names:
                df = pd.read_excel(workbook, sheet_name=sheet, keep_default_na=False)
                if 'Total Forecast' not in df.columns:
                    continue
                df = df.drop('Total Forecast', axis=1)
                # str() keeps the test working when a header was parsed as a
                # number instead of text.
                dates = [i for i in df.columns if '20' in str(i)]  # forecast columns
                dims = [i for i in df.columns if '20' not in str(i)]  # everything else
                df = df.melt(id_vars=dims, value_vars=dates, var_name='YYYYMM', value_name='SetForecast')
                # Drops the 5th character of the folder name
                # ("2022-03" -> "202203").
                df['release'] = parent[:4] + parent[5:]
                df['sourceFile'] = filename
                df['keyfigure'] = category
                df['isForecast'] = df['YYYYMM'].astype('int32') >= df['release'].astype('int32')
                return df
        return None

    @staticmethod
    def read_bom(filepath, category):
        """Read the first sheet of ``filepath`` that has a 'BOM' column.

        ``category`` is accepted so the signature matches ``readfcst``; it is
        not used. Returns the sheet with 'release' and 'sourcefile' columns
        added, or None when no sheet has a 'BOM' column.
        """
        parent, filename = ReadAnchorData.file_info(filepath)
        with pd.ExcelFile(filepath) as workbook:
            for sheet in workbook.sheet_names:
                df = pd.read_excel(workbook, sheet_name=sheet)
                if 'BOM' not in df.columns:
                    continue
                df['release'] = parent[:4] + parent[5:]
                # NOTE(review): readfcst names this column 'sourceFile';
                # the lower-case spelling is kept so existing output columns
                # do not change.
                df['sourcefile'] = filename
                return df
        return None

    @staticmethod
    def file_info(pth):
        """Return ``(parent folder name, file name)`` for the path ``pth``."""
        # PureWindowsPath accepts both "\\" and "/" as separators, so the
        # Windows-style paths this was written for and POSIX-style paths
        # are both split correctly.
        path = PureWindowsPath(pth)
        return path.parent.name, path.name

    @staticmethod
    def prep_group(data: dict):
        """Prepare the datasets returned by ``read_files`` for writing.

        'BOM' is passed through unchanged. 'SetForecast' is pivoted so each
        'keyfigure' becomes its own column of summed 'SetForecast' values,
        keyed by every remaining column. Other keys are ignored.
        """
        result = {}
        for key, value in data.items():
            if key == 'BOM':
                result[key] = value
            elif key == 'SetForecast':
                df = value.reset_index(drop=True).convert_dtypes()
                # Columns that must not be part of the pivot index.
                cols = ['keyfigure', 'sourceFile', 'SetForecast', 'CAPEX Cost$"', 'CAPEX Cost$', 'Cost$']
                # Not every export has every cost column; ignore the missing
                # ones instead of raising KeyError.
                key_cols = list(df.columns.drop(cols, errors='ignore'))
                result[key] = pd.pivot_table(
                    df,
                    values='SetForecast',
                    index=key_cols,
                    columns=['keyfigure'],
                    aggfunc='sum',
                ).reset_index()
        return result


class AnchorCategories:
    """Registry of the anchor file categories.

    ``types`` maps a marker looked for in each file path to a
    ``(dataset name, reader function)`` pair.
    """

    types = {
        'Bom_Report': ('BOM', ReadAnchorData.read_bom),
        # The three forecast variants share one dataset name and one reader.
        **{
            marker: ('SetForecast', ReadAnchorData.readfcst)
            for marker in (
                'SetsPerMonthcombined',
                'SetsPerMonthConst',
                'SetsPerMonthUnConst',
            )
        },
    }


In [7]:
class GetAnchorData:
    """Locate anchor-report xlsx files and read them into DataFrames.

    Attributes:
        filepaths: paths found by the last ``get_xlsx_files`` call.
        files: initialised as an empty dict; not used by the methods here.
        methods: maps a marker looked for in each file path to a
            ``(dataset name, ReadMethods function)`` pair.
    """

    def __init__(self):
        # Stored on the instance so the attributes exist before
        # get_xlsx_files() has been called.
        self.filepaths = []
        self.files = {}

    def get_xlsx_files(self, base_dir):
        """Store and return the resolved paths (as strings) of the xlsx
        files found exactly one directory level below ``base_dir``."""
        file_paths = Path(base_dir).glob("*/*.xlsx")
        self.filepaths = [str(x.resolve()) for x in file_paths]
        return self.filepaths

    def read_files(self, filepaths):
        """Run every reader in ``methods`` on the files matching its marker.

        Returns a dict mapping each dataset name to one DataFrame (an empty
        DataFrame when no file matched).
        """
        reader = self.ReadMethods()
        frames = {name: [] for name, _ in self.methods.values()}
        for key, (name, method) in self.methods.items():
            print(name)
            applicable_files = [x for x in filepaths if key in x]
            if not applicable_files:
                continue
            # The registered readers are ReadMethods functions taking
            # (self, filepaths): they get the whole list of matching files.
            frames[name].append(method(reader, applicable_files))
        # Most recently read frame first.
        return {
            name: pd.concat(parts[::-1]) if parts else pd.DataFrame()
            for name, parts in frames.items()
        }

    class ReadMethods:
        """Readers for the individual report types."""

        def readfcsts(self, filepaths: list):
            """Read forecast workbooks into one long-format DataFrame.

            The xlsx files can contain multiple sheets with no standard
            naming convention. Every sheet that has the 'Total Forecast' and
            'Display Option' columns is used; other sheets and empty sheets
            are skipped. The forecast column headers (202101, 202102 etc)
            are unpivoted into 'YYYYMM', with the values stored in a column
            named after the sheet's first 'Display Option' value.

            Returns the concatenation of all files with 'release',
            'sourceFile' and 'isForecast' added, or an empty DataFrame when
            nothing could be read.
            """
            all_dfs = []
            for filepath in filepaths:
                parent, filename = self.file_info(filepath)
                # Drops the 5th character of the folder name
                # ("2022-03" -> "202203"), then any other punctuation.
                release = parent[:4] + parent[5:]
                release = ''.join(ch for ch in release if ch.isalnum())
                dfs = []
                # Open the workbook once and close it when done.
                with pd.ExcelFile(filepath) as workbook:
                    for sheet in workbook.sheet_names:
                        df = pd.read_excel(workbook, sheet_name=sheet, keep_default_na=False)
                        has_indicators = {'Total Forecast', 'Display Option'}.issubset(df.columns)
                        if df.empty or not has_indicators:
                            continue
                        df = df.drop('Total Forecast', axis=1)
                        displayoption = df.iloc[0].loc['Display Option']
                        # str() keeps the test working when a header was
                        # parsed as a number instead of text.
                        dates = [i for i in df.columns if '20' in str(i)]  # forecast columns
                        dims = [i for i in df.columns if '20' not in str(i)]  # everything else
                        df = df.melt(id_vars=dims, value_vars=dates, var_name='YYYYMM', value_name=displayoption)
                        dfs.append(df)
                if not dfs:
                    # No usable sheet in this workbook.
                    continue
                df = pd.concat(dfs)
                df['release'] = release
                df['sourceFile'] = filename
                df['isForecast'] = df['YYYYMM'].astype('int32') >= df['release'].astype('int32')
                all_dfs.append(df)
                print(filepath)
            if not all_dfs:
                return pd.DataFrame()
            return pd.concat(all_dfs)

        def file_info(self, pth):
            """Return ``(parent folder name, file name)`` for ``pth``."""
            # PureWindowsPath accepts both "\\" and "/" as separators.
            path = PureWindowsPath(pth)
            return path.parent.name, path.name

    # Entries must be ReadMethods functions taking (self, filepaths).
    methods = {
        # 'Bom_Report': ('BOM', ReadAnchorData.read_bom),
        'GLOBAL_Sets_PerMonth': ('SetForecast', ReadMethods.readfcsts)
        }

In [8]:
# Folder (relative to the working directory) holding one sub-folder of xlsx
# files per release. No trailing separator: the raw string r'data\\' held two
# literal backslashes, which only resolves on Windows.
base_dir = 'data'

In [40]:
# Find the workbooks, keep the GLOBAL reports and read them into one frame.
test = GetAnchorData()
test.get_xlsx_files(base_dir)
paths_to_use = list(filter(lambda path: 'GLOBAL' in path, test.filepaths))
data = GetAnchorData.ReadMethods().readfcsts(paths_to_use)

C:\Users\JGreenw9\Desktop\Python\data-utility\data\2022-03\GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx
C:\Users\JGreenw9\Desktop\Python\data-utility\data\2022-03\GLOBAL_Sets_PerMonth_CONSTRAINED_REPORT.xlsx
C:\Users\JGreenw9\Desktop\Python\data-utility\data\2022-03\GLOBAL_Sets_PerMonth_SUPPLYUPDATE_REPORT.xlsx
C:\Users\JGreenw9\Desktop\Python\data-utility\data\2022-03\GLOBAL_Sets_PerMonth_UNCONSTRAINED_REPORT.xlsx
C:\Users\JGreenw9\Desktop\Python\data-utility\data\2022-04\GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx
C:\Users\JGreenw9\Desktop\Python\data-utility\data\2022-04\GLOBAL_Sets_PerMonth_CONSTRAINED_REPORT.xlsx
C:\Users\JGreenw9\Desktop\Python\data-utility\data\2022-04\GLOBAL_Sets_PerMonth_SUPPLYUPDATE_REPORT.xlsx
C:\Users\JGreenw9\Desktop\Python\data-utility\data\2022-04\GLOBAL_Sets_PerMonth_UNCONSTRAINED_REPORT.xlsx
C:\Users\JGreenw9\Desktop\Python\data-utility\data\2022-05\GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx
C:\Users\JGreenw9\Desktop\Python\data-utility\data\2022-05\GLOBAL_S

In [48]:
# Work on a copy of the data without the per-file descriptor columns, and
# list the remaining column names.
df = data.drop(columns=['Display Option', 'sourceFile'])
cols = list(df.columns)
print(cols)
def remove_ignore_error(list_obj, x):
    """Remove the first occurrence of ``x`` from ``list_obj`` in place.

    Nothing happens when ``x`` is not in the list.
    """
    if x in list_obj:
        list_obj.remove(x)
# Strip the value columns (and descriptor columns, if still present) from
# `cols` so only the grouping keys remain. A plain loop is used because the
# call is made for its side effect only.
for x in ['Combined', 'Constrained', 'Supply Update', 'Un-Constrained', 'Display Option', 'Total Forecast', 'sourceFile']:
    remove_ignore_error(cols, x)

['Business Unit', 'Business Category', 'Region', 'Country', 'Product Line', 'Brand', 'Sub Brand/Event ID', 'BOM', 'BOM Description', 'BOM Category', 'Demand StreamId', 'CAPEX Cost$', 'YYYYMM', 'Combined', 'release', 'isForecast', 'Constrained', 'Supply Update', 'Un-Constrained', 'Market', 'shortFall', 'carryover']


['Business Unit',
 'Business Category',
 'Region',
 'Country',
 'Product Line',
 'Brand',
 'Sub Brand/Event ID',
 'BOM',
 'BOM Description',
 'BOM Category',
 'Demand StreamId',
 'CAPEX Cost$',
 'YYYYMM',
 'release',
 'isForecast',
 'Market',
 'shortFall',
 'carryover']

In [52]:
# Sum the remaining columns per combination of key columns; dropna=False
# keeps the groups whose key contains a missing value.
df2 = (
    df.groupby(by=cols, dropna=False)
    .sum()
    .reset_index()
)

In [54]:
# Row-wise total of the four forecast columns.
df2['Value'] = df2.loc[
    :, ['Combined', 'Constrained', 'Supply Update', 'Un-Constrained']
].sum(axis='columns')

In [55]:
# Discard the rows whose total is zero.
df2 = df2.loc[df2['Value'].ne(0)]

In [57]:
# Write the aggregated data to <write_dir>/<key>.csv without the index.
write_dir = r'C:\Users\jgreenw9\Desktop\Waterfall'
key = 'anchorpull'
# Let pathlib join the folder and file name instead of gluing separators
# into the string by hand.
df2.to_csv(Path(write_dir) / f'{key}.csv', index=False)

In [56]:
# Show df2 as the cell output.
df2

Unnamed: 0,Business Unit,Business Category,Region,Country,Product Line,Brand,Sub Brand/Event ID,BOM,BOM Description,BOM Category,...,release,isForecast,Market,shortFall,carryover,Combined,Constrained,Supply Update,Un-Constrained,Value
87,JOINTS,BASE BUSINESS,ASPAC,95,KNEES,ATTUNE PRIMARY,ATTUNE PRIMARY CEMENTED,AP_9501_ATTUNE STANDARD,ATTUNE STANDARD,CORE,...,202203,True,,,,15.0,15.0,0.0,15.0,45.0
369,JOINTS,BASE BUSINESS,ASPAC,95,SHOULDERS,GLOBAL UNITE,GLOBAL UNITE SHORT STEM,AP_9501_GLOBAL UNITE SHORT STEM,GLOBAL UNITE SHORT STEM,CORE,...,202203,False,,,,0.0,0.0,0.0,10.0,10.0
370,JOINTS,BASE BUSINESS,ASPAC,95,SHOULDERS,GLOBAL UNITE,GLOBAL UNITE SHORT STEM,AP_9501_GLOBAL UNITE SHORT STEM,GLOBAL UNITE SHORT STEM,CORE,...,202203,False,,,,0.0,0.0,0.0,10.0,10.0
442,JOINTS,BASE BUSINESS,ASPAC,95,SHOULDERS,GLOBAL UNITE,GLOBAL UNITE SHORT STEM,AP_9501_GLOBAL UNITE SHORT STEM_IMPLANTS,GLOBAL UNITE SHORT STEM_IMPLANTS,CORE,...,202203,False,,,,0.0,0.0,0.0,10.0,10.0
505,JOINTS,BASE BUSINESS,ASPAC,AU,HIPS,AAI,AAI INSTRUMENTS,AP_AU01_ANTERIOR APPROACH RETRACTOR INSTRUMENT...,AAI INSTRUMENTS,CORE,...,202203,False,,,,0.0,0.0,0.0,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13691139,TRAUMA,NPI,,,TR94_NEW PRODUCTS,TR949999_NEW PRODUCTS,TR_NA_US_Mini Frag,TI PLATES,TI PLATES,CORE,...,202206,False,91,,,0.0,3.0,0.0,3.0,6.0
13691148,TRAUMA,NPI,,,TR94_NEW PRODUCTS,TR949999_NEW PRODUCTS,TR_NA_US_Mini Frag,TI PLATES,TI PLATES,CORE,...,202204,True,91,,,5.0,5.0,0.0,5.0,15.0
13691153,TRAUMA,NPI,,,TR94_NEW PRODUCTS,TR949999_NEW PRODUCTS,TR_NA_US_Mini Frag,TI PLATES,TI PLATES,CORE,...,202205,True,91,,,5.0,5.0,0.0,5.0,15.0
13691154,TRAUMA,NPI,,,TR94_NEW PRODUCTS,TR949999_NEW PRODUCTS,TR_NA_US_Mini Frag,TI PLATES,TI PLATES,CORE,...,202206,False,91,0.0,,5.0,0.0,0.0,0.0,5.0


In [26]:
# Cast the 'Value' column of df to 64-bit integers.
# NOTE(review): no visible cell adds a 'Value' column to `df` (only `df2`
# gets one) — confirm whether this cell is stale or should target df2.
df['Value'] = df['Value'].astype('int64')

In [28]:
# Keep the rows of df whose 'Value' is not zero.
# NOTE(review): relies on df having a 'Value' column, which no visible cell
# creates — confirm against the intended execution order.
data2 = df[df['Value'] != 0]

In [10]:
# Show the discovered paths that contain 'GLOBAL'.
[path for path in test.filepaths if 'GLOBAL' in path]

['C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2022-03\\GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2022-03\\GLOBAL_Sets_PerMonth_CONSTRAINED_REPORT.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2022-03\\GLOBAL_Sets_PerMonth_SUPPLYUPDATE_REPORT.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2022-03\\GLOBAL_Sets_PerMonth_UNCONSTRAINED_REPORT.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2022-04\\GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2022-04\\GLOBAL_Sets_PerMonth_CONSTRAINED_REPORT.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2022-04\\GLOBAL_Sets_PerMonth_SUPPLYUPDATE_REPORT.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2022-04\\GLOBAL_Sets_PerMonth_UNCONSTRAINED_REPORT.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2022-05\\GLO

In [None]:
# Write every prepared dataset to <write_dir>/<dataset name>.csv.
# NOTE(review): `data_prepped` is not defined in the visible cells —
# presumably the dict returned by ReadAnchorData.prep_group(); confirm.
write_dir = r'C:\Users\jgreenw9\Desktop\Waterfall'
for key, value in data_prepped.items():
    # Build the target once with pathlib so the printed path and the path
    # written to cannot drift apart.
    out_path = Path(write_dir) / f'{key}.csv'
    print(out_path)
    value.to_csv(out_path, index=False)


C:\Users\jgreenw9\Desktop\Waterfall\BOM.csv
C:\Users\jgreenw9\Desktop\Waterfall\SetForecast.csv
