To do list:

- [x] - List all xlsx files in dir

> ReadAnchorData.get_xlsx_files()
---
- [x] - Create dict to identify anchor file category

>ReadAnchorData.types
---
- [x] - Create method for reading each file type and appropiate sheet name
    - [x] - BOM
    > ReadAnchorData.read_bom()

    - [x] - Forecast
    > ReadAnchorData.read_fcst()
---
- [x] - Create method to run through all files and run appropiate method to get data

> ReadAnchorData.read_files()
---
- [ ] - Consolidate data into appropaite dataframes with identifier info to seperate out each source/release
---
- [ ] - Write data to file
---

[ ] - Stretch - write function only read newly added files

In [169]:
import pandas as pd
import numpy as np
from pathlib import Path
import random

In [223]:
class ReadAnchorData:
    '''Usage
    -
    - Start with reading all xlsx files within a directory

    >>> def get_xlsx_files(base_dir)

    - Loop through types category, filtering to files that fit within cat and then running the method for pulling the data
    
    >>> def read_files(file_paths)
    This will run the following methods based on the types attr in AnchorCategories
    >>> class AnchorCategories.types
        >>> def read_fcst()
        >>> def read_bom()

    - Merge data where applicable
    '''

    def get_xlsx_files(base_dir):
        # Returns all xlsx files (incl subdirs)
        file_paths = Path(base_dir).glob("*/*.xlsx")
        return [str(x.resolve()) for x in file_paths]


    def read_files(file_paths):
        dfs = {}
        for key, value in AnchorCategories.types.items():
            name, method = value
            dfs[name] = pd.DataFrame()
        for key, value in AnchorCategories.types.items():
            name, method = value
            print(name)
            applicable_files = [x for x in file_paths if key in x]
            # applicable_files = ReadAnchorData.get_type_of_file(key, file_paths)
            for file in applicable_files:
                dfs[name] = pd.concat([method(file, key), dfs[name]])
                print(file)
        return dfs

    def get_type_of_file(key, file_paths):
        return [x for x in file_paths if key in x]

    def get_release(filepath : Path):
        # The release is given by the name of the parent folder
        parentfol = str(filepath.parent.absolute())
        return parentfol[len(parentfol)-7:]

    def readfcst(filepath, category):
        # The xlsx files can contain multiple sheets and no std naming convention. This will open each sheet until it finds the matching indicator columns
        # Each forecast will have these indicator columns, but the forecast column headers (202101, 202102 etc) can be different. These are transposed to the YYYYMM column
        sheetnames = pd.ExcelFile(filepath).sheet_names
        for sheet in sheetnames:
            df : pd.DataFrame = pd.read_excel(filepath, sheet)
            indicator_cols = ['Total Forecast']
            if indicator_cols[0] in df.columns:
                df = df.drop('Total Forecast', axis=1)
                dates = [i for i in df.columns if '20' in i] # This will get all forecast column names
                dims = [i for i in df.columns if '20' not in i] # Inverse of the above
                df = df.melt(id_vars=dims, value_vars=dates, var_name='YYYYMM', value_name='SetForecast')
                parent, filename = ReadAnchorData.file_info(filepath)
                df['release'] = parent[:4]+parent[5:]
                df['sourceFile'] = filename
                df['keyfigure'] = category
                df['isForecast'] = df['YYYYMM'].astype('int32') >= df['release'].astype('int32')
                return df

    def read_bom(filepath, category):
        sheetnames = pd.ExcelFile(filepath).sheet_names
        for sheet in sheetnames:
            df : pd.DataFrame = pd.read_excel(filepath, sheet)
            indicator_cols = ['BOM']
            if indicator_cols[0] in df.columns:
                parent, filename = ReadAnchorData.file_info(filepath)
                df['release'] = parent[:4]+parent[5:]
                df['sourcefile'] = filename
                return df

    def file_info(pth):
        start = -1
        stop = -1
        for _ in range(0, 2):
            start = stop
            stop = pth[::-1].find(f'\\', stop + 1)
        parent = pth[len(pth)-stop:len(pth)-start-1]
        filename = pth[len(pth)-start:]
        return parent, filename

    def prep_group(data : dict):
        result = {}
        for key, value in data.copy().items():
            if key == 'BOM':
                result[key] = value
            elif key == 'SetForecast':
                df = value
                df = df.reset_index(drop=True)
                cols = ['keyfigure', 'sourceFile', 'SetForecast']
                print(df)
                print(pd.pivot_table(df,index=list(df.columns.drop(cols)),columns='keyfigure',values='SetForecast',aggfunc=np.sum).reset_index())
                result[key] = pd.pivot_table(df,index=list(df.columns.drop(cols)),columns='keyfigure',values='SetForecast',aggfunc=np.sum).reset_index()
        return result


class AnchorCategories:
    types = {
        'Bom_Report': ('BOM', ReadAnchorData.read_bom),
        'SetsPerMonthcombined': ('SetForecast', ReadAnchorData.readfcst),
        'SetsPerMonthConst': ('SetForecast', ReadAnchorData.readfcst),
        'SetsPerMonthUnConst': ('SetForecast', ReadAnchorData.readfcst)
        }


In [166]:
base_dir = r'\\na.jnj.com\dpyusdfsroot\RY_Company\Supply Chain Mgmt\Spine Plan-NPI\Conduit\Jonny\Powerbi\Anchor'
all_files = ReadAnchorData.get_xlsx_files(base_dir)

In [194]:
paths_to_use = random.choices(all_files,k=4)
paths_to_use

['\\\\na.jnj.com\\dpyusdfsroot\\RY_Company\\Supply Chain Mgmt\\Spine Plan-NPI\\Conduit\\Jonny\\Powerbi\\Anchor\\2021-11\\SP LATAM SetsPerMonthConst20211-202310 11-08-2021 .xlsx',
 '\\\\na.jnj.com\\dpyusdfsroot\\RY_Company\\Supply Chain Mgmt\\Spine Plan-NPI\\Conduit\\Jonny\\Powerbi\\Anchor\\2021-10\\SP WW 202110-202309 SetsPerMonthcombined.xlsx',
 '\\\\na.jnj.com\\dpyusdfsroot\\RY_Company\\Supply Chain Mgmt\\Spine Plan-NPI\\Conduit\\Jonny\\Powerbi\\Anchor\\2021-12\\SP WW 202112-202311 SetsPerMonthUnConst12-06-2021.xlsx',
 '\\\\na.jnj.com\\dpyusdfsroot\\RY_Company\\Supply Chain Mgmt\\Spine Plan-NPI\\Conduit\\Jonny\\Powerbi\\Anchor\\2021-08\\SP WW 202108-202307 SetsPerMonthUnConst08-09-2021.xlsx']

In [216]:
data = ReadAnchorData.read_files(paths_to_use)

BOM
SetForecast
\\na.jnj.com\dpyusdfsroot\RY_Company\Supply Chain Mgmt\Spine Plan-NPI\Conduit\Jonny\Powerbi\Anchor\2021-10\SP WW 202110-202309 SetsPerMonthcombined.xlsx
SetForecast
\\na.jnj.com\dpyusdfsroot\RY_Company\Supply Chain Mgmt\Spine Plan-NPI\Conduit\Jonny\Powerbi\Anchor\2021-11\SP LATAM SetsPerMonthConst20211-202310 11-08-2021 .xlsx
SetForecast
\\na.jnj.com\dpyusdfsroot\RY_Company\Supply Chain Mgmt\Spine Plan-NPI\Conduit\Jonny\Powerbi\Anchor\2021-12\SP WW 202112-202311 SetsPerMonthUnConst12-06-2021.xlsx
\\na.jnj.com\dpyusdfsroot\RY_Company\Supply Chain Mgmt\Spine Plan-NPI\Conduit\Jonny\Powerbi\Anchor\2021-08\SP WW 202108-202307 SetsPerMonthUnConst08-09-2021.xlsx


In [224]:
data_prepped = ReadAnchorData.prep_group(data)

       Business Unit Business Category Region Country  \
0              SPINE     BASE BUSINESS  ASPAC      AU   
1              SPINE     BASE BUSINESS  ASPAC      AU   
2              SPINE     BASE BUSINESS  ASPAC      AU   
3              SPINE     BASE BUSINESS  ASPAC      AU   
4              SPINE               NPI  ASPAC      AU   
...              ...               ...    ...     ...   
185275         SPINE     BASE BUSINESS   EMEA      RU   
185276         SPINE     BASE BUSINESS   EMEA      SA   
185277         SPINE     BASE BUSINESS   EMEA      IT   
185278         SPINE     BASE BUSINESS   EMEA      PT   
185279         SPINE     BASE BUSINESS   EMEA      SK   

                   Product Line             Brand      Sub Brand/Event ID  \
0             ANTERIOR CERVICAL    PROTI 360 ACIS      SPI_BI_AU_ACIS_001   
1                        IBFANT             AEGIS     SPI_BI_AU_AEGIS_001   
2       POSTERIOR THORACOLUMBAR            EXP TI    SPI_BI_AU_EXP TI_001   
3      

In [225]:
data_prepped

{'BOM': Empty DataFrame
 Columns: []
 Index: [],
 'SetForecast': Empty DataFrame
 Columns: [Business Unit, Business Category, Region, Country, Product Line, Brand, Sub Brand/Event ID, BOM, BOM Description, BOM Category, Display Option, Demand StreamId, Cost$, YYYYMM, release, isForecast, CAPEX Cost$]
 Index: []}

In [253]:
data['SetForecast'].convert_dtypes()

Unnamed: 0,Business Unit,Business Category,Region,Country,Product Line,Brand,Sub Brand/Event ID,BOM,BOM Description,BOM Category,Display Option,Demand StreamId,Cost$,YYYYMM,SetForecast,release,sourceFile,keyfigure,isForecast,CAPEX Cost$
0,SPINE,BASE BUSINESS,ASPAC,AU,ANTERIOR CERVICAL,PROTI 360 ACIS,SPI_BI_AU_ACIS_001,001_AU_ACIS,DPY/SYN_INS_AU_ACIS PROTI 360,CORE,Un-Constrained,Non Revenue,0,202108,0,202108,SP WW 202108-202307 SetsPerMonthUnConst08-09-2...,SetsPerMonthUnConst,True,
1,SPINE,BASE BUSINESS,ASPAC,AU,IBFANT,AEGIS,SPI_BI_AU_AEGIS_001,001_AU_AEGIS,DPY_INS_AU_AEGIS,CORE,Un-Constrained,Non Revenue,0,202108,0,202108,SP WW 202108-202307 SetsPerMonthUnConst08-09-2...,SetsPerMonthUnConst,True,
2,SPINE,BASE BUSINESS,ASPAC,AU,POSTERIOR THORACOLUMBAR,EXP TI,SPI_BI_AU_EXP TI_001,001_AU_EXP TI,DPY_IMP_AU_EXP 5.5,CORE,Un-Constrained,Non Revenue,0,202108,0,202108,SP WW 202108-202307 SetsPerMonthUnConst08-09-2...,SetsPerMonthUnConst,True,
3,SPINE,BASE BUSINESS,ASPAC,AU,IB FUSION - POSTERIOR,PIPELINE,SPI_BI_AU_PIPELINE_001,001_AU_PIPELINE,DPY_INS_AU_PIPELINE,CORE,Un-Constrained,Non Revenue,0,202108,0,202108,SP WW 202108-202307 SetsPerMonthUnConst08-09-2...,SetsPerMonthUnConst,True,
4,SPINE,NPI,ASPAC,AU,POSTERIOR CERVICAL,SYMPHONY(UNITED),SPI_BI_AU_SATURN_001,001_AU_SATURN,DPY_INS_AU_SATURN_UNAS-MEDTRONIC,CORE,Un-Constrained,Non Revenue,0,202108,0,202108,SP WW 202108-202307 SetsPerMonthUnConst08-09-2...,SetsPerMonthUnConst,True,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64555,SPINE,BASE BUSINESS,EMEA,RU,ANTERIOR CERVICAL,ZERO-P VA,SPI_AA_ZeroP,YZPVA05C,ZERO-P VA INSTRUMENT SET - WITH VARIO CASE,CORE,Combined,Non Revenue,,202309,0,202110,SP WW 202110-202309 SetsPerMonthcombined.xlsx,SetsPerMonthcombined,True,7138.28
64556,SPINE,BASE BUSINESS,EMEA,SA,ANTERIOR CERVICAL,ZERO-P VA,SPI_AA_ZeroP,YZPVA05C,ZERO-P VA INSTRUMENT SET - WITH VARIO CASE,CORE,Combined,Non Revenue,,202309,0,202110,SP WW 202110-202309 SetsPerMonthcombined.xlsx,SetsPerMonthcombined,True,12671.42
64557,SPINE,BASE BUSINESS,EMEA,IT,ANTERIOR CERVICAL,ZERO-P VA,SPI_AA_ZeroP,YZPVA06C,ZERO-P VA INSTRUMENT SET - NO CASE,CORE,Combined,Non Revenue,,202309,0,202110,SP WW 202110-202309 SetsPerMonthcombined.xlsx,SetsPerMonthcombined,True,13994.6
64558,SPINE,BASE BUSINESS,EMEA,PT,ANTERIOR CERVICAL,ZERO-P VA,SPI_AA_ZeroP,YZPVA06S,ZERO-P VA STERILE SCREWS SET,SUPPORT,Combined,Non Revenue,,202309,0,202110,SP WW 202110-202309 SetsPerMonthcombined.xlsx,SetsPerMonthcombined,True,0.0


In [282]:
df : pd.DataFrame = data['SetForecast']
df = df.reset_index()
cols = ['keyfigure', 'sourceFile', 'SetForecast']
df = df.convert_dtypes(dict.fromkeys(list(df.columns.drop(cols)), 'string'))
key_cols = list(df.columns.drop(cols))
val = pd.pivot_table(df,values='SetForecast',index=list(df.columns),columns=['keyfigure'],aggfunc=np.sum)
print(val)
print(df.info())
# for i in list(df.columns.drop(cols)):
#     print(pd.pivot_table(df,values='SetForecast',index=cols,columns=['keyfigure'],aggfunc=np.sum))


Empty DataFrame
Columns: []
Index: []
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185280 entries, 0 to 185279
Data columns (total 21 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   index               185280 non-null  Int64  
 1   Business Unit       185280 non-null  string 
 2   Business Category   185280 non-null  string 
 3   Region              164976 non-null  string 
 4   Country             185280 non-null  string 
 5   Product Line        185280 non-null  string 
 6   Brand               185280 non-null  string 
 7   Sub Brand/Event ID  185280 non-null  string 
 8   BOM                 185280 non-null  string 
 9   BOM Description     185280 non-null  string 
 10  BOM Category        185280 non-null  string 
 11  Display Option      185280 non-null  string 
 12  Demand StreamId     185280 non-null  string 
 13  Cost$               55680 non-null   Int64  
 14  YYYYMM              185280 non-null  string 
 

In [226]:
write_dir = r'C:\Users\jgreenw9\Desktop\Waterfall'
for key, value in data.items():
    print(f'{write_dir}\\{key}.csv')
    value.to_csv(f'{write_dir}\\{key}.csv', index=False)


C:\Users\jgreenw9\Desktop\Waterfall\BOM.csv
C:\Users\jgreenw9\Desktop\Waterfall\SetForecast.csv
