To do list:

- [x] - List all xlsx files in dir

> ReadAnchorData.get_xlsx_files()
---
- [x] - Create dict to identify anchor file category

>ReadAnchorData.types
---
- [x] - Create method for reading each file type and appropriate sheet name
    - [x] - BOM
    > ReadAnchorData.read_bom()

    - [x] - Forecast
    > ReadAnchorData.readfcst()
---
- [x] - Create method to run through all files and run the appropriate method to get data

> ReadAnchorData.read_files()
---
- [x] - Consolidate data into appropriate dataframes with identifier info to separate out each source/release
> ReadAnchorData.prep_group()
---
- [x] - Write data to file
---

- [ ] - Stretch - write a function that only reads newly added files

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import random

In [2]:
class ReadAnchorData:
    '''Read Anchor xlsx exports from release folders into DataFrames.

    Usage
    -----
    1. Collect the xlsx files that sit one folder below a base directory::

           files = ReadAnchorData.get_xlsx_files(base_dir)

    2. Read them. ``read_files`` walks ``AnchorCategories.types``, keeps the
       paths that contain each category marker and runs the reader
       registered for that category (``readfcst`` or ``read_bom``)::

           data = ReadAnchorData.read_files(files)

    3. Reshape the result for export::

           prepped = ReadAnchorData.prep_group(data)

    Every method is static; the class only serves as a namespace.
    '''

    @staticmethod
    def get_xlsx_files(base_dir):
        '''Return absolute paths of the xlsx files one folder below *base_dir*.

        Only ``<base_dir>/<folder>/<file>.xlsx`` is matched (not *base_dir*
        itself and not deeper levels), because the readers derive the release
        from the name of the folder that directly contains the file.

        Names starting with ``~$`` are skipped: Excel creates such lock files
        while a workbook is open, and they are not readable workbooks.
        The result is sorted so that repeated runs see the same order.
        '''
        file_paths = Path(base_dir).glob("*/*.xlsx")
        return sorted(
            str(x.resolve())
            for x in file_paths
            if not x.name.startswith('~$')
        )

    @staticmethod
    def read_files(file_paths):
        '''Read every path in *file_paths* with the reader of its category.

        Returns a dict keyed by the category names found in
        ``AnchorCategories.types`` (always present, possibly holding an empty
        DataFrame). Files for which the reader finds no matching sheet
        (reader returns ``None``) contribute nothing.
        '''
        frames = {}
        for name, _ in AnchorCategories.types.values():
            frames.setdefault(name, [])

        for key, (name, method) in AnchorCategories.types.items():
            print(name)
            for file in ReadAnchorData.get_type_of_file(key, file_paths):
                df = method(file, key)
                if df is not None:
                    frames[name].append(df)

        # One concat per category instead of one per file. The parts are
        # reversed because the previous implementation prepended each new
        # file, so the last file read ended up first in the result.
        return {
            name: pd.concat(parts[::-1]) if parts else pd.DataFrame()
            for name, parts in frames.items()
        }

    @staticmethod
    def get_type_of_file(key, file_paths):
        '''Return the paths from *file_paths* that contain the text *key*.

        The test is a case-sensitive substring match on the whole path, not
        only on the file name.
        '''
        return [x for x in file_paths if key in x]

    @staticmethod
    def get_release(filepath: Path):
        '''Return the last seven characters of the absolute parent folder path.

        The release is given by the name of the parent folder, which is
        expected to look like ``2021-12``. Shorter paths are returned whole.
        '''
        parentfol = str(Path(filepath).parent.absolute())
        return parentfol[-7:]

    @staticmethod
    def readfcst(filepath, category):
        '''Read the forecast sheet of *filepath* in long format.

        The xlsx files can contain several sheets without a standard naming
        convention, so each sheet is opened until one with a
        ``Total Forecast`` column is found. The forecast column headers
        (202101, 202102, ...) differ between files; they are unpivoted into
        a ``YYYYMM`` column with the values in ``SetForecast``.

        Added columns: ``release`` (parent folder name without its fifth
        character, e.g. ``2021-12`` -> ``202112``), ``sourceFile``,
        ``keyfigure`` (set to *category*) and ``isForecast``
        (``YYYYMM >= release``).

        Returns ``None`` when no sheet has the indicator column.
        '''
        indicator_col = 'Total Forecast'
        # Open the workbook once and close it when done.
        with pd.ExcelFile(filepath) as workbook:
            for sheet in workbook.sheet_names:
                df: pd.DataFrame = pd.read_excel(
                    workbook, sheet_name=sheet, keep_default_na=False
                )
                if indicator_col not in df.columns:
                    continue
                df = df.drop(indicator_col, axis=1)
                # Headers may be read as numbers, hence str(). NOTE(review):
                # the substring test also catches any dimension column whose
                # name happens to contain "20" - confirm none exists.
                dates = [c for c in df.columns if '20' in str(c)]
                dims = [c for c in df.columns if '20' not in str(c)]
                df = df.melt(
                    id_vars=dims,
                    value_vars=dates,
                    var_name='YYYYMM',
                    value_name='SetForecast',
                )
                parent, filename = ReadAnchorData.file_info(filepath)
                df['release'] = parent[:4] + parent[5:]
                df['sourceFile'] = filename
                df['keyfigure'] = category
                df['isForecast'] = (
                    df['YYYYMM'].astype('int32') >= df['release'].astype('int32')
                )
                return df
        return None

    @staticmethod
    def read_bom(filepath, category):
        '''Read the first sheet of *filepath* that has a ``BOM`` column.

        Adds ``release`` (parent folder name without its fifth character)
        and ``sourcefile``. *category* is accepted so that all readers share
        one signature; it is not used here.

        Returns ``None`` when no sheet has the indicator column.
        '''
        indicator_col = 'BOM'
        with pd.ExcelFile(filepath) as workbook:
            for sheet in workbook.sheet_names:
                df: pd.DataFrame = pd.read_excel(workbook, sheet_name=sheet)
                if indicator_col not in df.columns:
                    continue
                parent, filename = ReadAnchorData.file_info(filepath)
                df['release'] = parent[:4] + parent[5:]
                # NOTE: spelled 'sourcefile' here but 'sourceFile' in
                # readfcst. Left unchanged because the exported column
                # header depends on it.
                df['sourcefile'] = filename
                return df
        return None

    @staticmethod
    def file_info(pth):
        '''Return ``(parent_folder_name, file_name)`` for the path *pth*.

        Both ``\\`` and ``/`` are treated as separators, so Windows, UNC and
        POSIX style paths all work regardless of the platform running the
        code. A path without a parent folder yields an empty parent name.
        '''
        from pathlib import PureWindowsPath

        parts = PureWindowsPath(str(pth))
        return parts.parent.name, parts.name

    @staticmethod
    def prep_group(data: dict):
        '''Prepare the frames returned by ``read_files`` for export.

        ``BOM`` is passed through untouched. ``SetForecast`` is pivoted so
        that each ``keyfigure`` becomes its own column holding the summed
        ``SetForecast``; all columns other than the value, cost and source
        file columns form the row key. Any other key is dropped.
        '''
        result = {}
        for key, value in data.items():
            if key == 'BOM':
                result[key] = value
            elif key == 'SetForecast':
                df = value.reset_index(drop=True)
                cols = ['keyfigure', 'sourceFile', 'SetForecast', 'CAPEX Cost$"', 'CAPEX Cost$', 'Cost$']
                # The former argument 'string' was only a truthy value for
                # infer_objects, i.e. the default behaviour.
                df = df.convert_dtypes()
                # Not every export carries every cost column variant, so
                # missing ones must not raise.
                key_cols = list(df.columns.drop(cols, errors='ignore'))
                result[key] = pd.pivot_table(
                    df,
                    values='SetForecast',
                    index=key_cols,
                    columns=['keyfigure'],
                    aggfunc='sum',
                ).reset_index()
        return result


class AnchorCategories:
    '''Registry that maps a file-name marker to its category and reader.

    ``types`` is keyed by the text that must occur in a file path; each
    value is ``(category name, reader function)``. The three
    ``SetsPerMonth*`` markers all feed the ``SetForecast`` category and
    share the forecast reader.
    '''
    types = {
        'Bom_Report': ('BOM', ReadAnchorData.read_bom),
        **{
            marker: ('SetForecast', ReadAnchorData.readfcst)
            for marker in (
                'SetsPerMonthcombined',
                'SetsPerMonthConst',
                'SetsPerMonthUnConst',
            )
        },
    }


In [17]:
# Directory that holds the release folders (e.g. data/2021-12).
# A plain relative name is used: the earlier r'data\\' was a raw string, so
# it carried two literal backslashes rather than one path separator.
base_dir = 'data'
all_files = ReadAnchorData.get_xlsx_files(base_dir)

In [19]:
# Bare expression: displays the collected paths as the notebook cell output.
all_files

['C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2021-12\\GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2021-12\\GLOBAL_Sets_PerMonth_CONSTRAINED_REPORT.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2021-12\\GLOBAL_Sets_PerMonth_SUPPLYUPDATE_REPORT.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2021-12\\GLOBAL_Sets_PerMonth_UNCONSTRAINED_REPORT.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2021-12\\SP Bom_Report12-06-2021.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2022-02\\GLOBAL_Sets_PerMonth_COMBINED_REPORT.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2022-02\\GLOBAL_Sets_PerMonth_CONSTRAINED_REPORT.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2022-02\\GLOBAL_Sets_PerMonth_SUPPLYUPDATE_REPORT.xlsx',
 'C:\\Users\\JGreenw9\\Desktop\\Python\\data-utility\\data\\2022-02\\GLOBAL_Sets_PerMonth_

In [6]:
# Pick up to four distinct files for a quick trial run. random.sample draws
# without replacement (random.choices could return the same file twice); k is
# capped because sample raises ValueError when k exceeds the population size.
paths_to_use = random.sample(all_files, k=min(4, len(all_files)))
paths_to_use

['\\\\na.jnj.com\\dpyusdfsroot\\RY_Company\\Supply Chain Mgmt\\Spine Plan-NPI\\Conduit\\Jonny\\Powerbi\\Anchor\\2022-02\\SP WW 202202-202401 SetsPerMonthcombined02-08-2022.xlsx',
 '\\\\na.jnj.com\\dpyusdfsroot\\RY_Company\\Supply Chain Mgmt\\Spine Plan-NPI\\Conduit\\Jonny\\Powerbi\\Anchor\\2022-01\\SP WW 202201-202312 SetsPerMonthUnConst01-10-2022.xlsx',
 '\\\\na.jnj.com\\dpyusdfsroot\\RY_Company\\Supply Chain Mgmt\\Spine Plan-NPI\\Conduit\\Jonny\\Powerbi\\Anchor\\2021-09\\SP WW 202109 - 202308 SetsPerMonthcombined09-06-2021.xlsx',
 '\\\\na.jnj.com\\dpyusdfsroot\\RY_Company\\Supply Chain Mgmt\\Spine Plan-NPI\\Conduit\\Jonny\\Powerbi\\Anchor\\2022-02\\SP WW 202202-202401 SetsPerMonthConst02-08-202202-08-2022.xlsx']

In [8]:
# Read every discovered file; the result is a dict of DataFrames keyed by
# category name. Note this uses all_files, not the paths_to_use sample.
data = ReadAnchorData.read_files(all_files)

BOM


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


ValueError: Excel file format cannot be determined, you must specify an engine manually.

In [6]:
# Reshape the raw frames for export (BOM passed through, SetForecast pivoted).
data_prepped = ReadAnchorData.prep_group(data)

In [7]:
# Write each prepared frame to <write_dir>/<key>.csv.
write_dir = Path(r'C:\Users\jgreenw9\Desktop\Waterfall')
# Create the target folder up front so to_csv does not fail on a fresh machine.
write_dir.mkdir(parents=True, exist_ok=True)
for key, value in data_prepped.items():
    out_path = write_dir / f'{key}.csv'
    print(out_path)
    value.to_csv(out_path, index=False)


C:\Users\jgreenw9\Desktop\Waterfall\BOM.csv
C:\Users\jgreenw9\Desktop\Waterfall\SetForecast.csv
