# In [140]:
import pandas as pd
import numpy as np
from pathlib import Path

# In [141]:
class ReadAnchorData:
    """Readers for Anchor forecast and BOM exports stored as xlsx workbooks.

    The class is a stateless namespace: every method is a static method and is
    called as ``ReadAnchorData.<method>(...)``.

    Usage
    -----
    1. Collect the xlsx files that sit one folder level below a directory::

           files = ReadAnchorData.get_xlsx_files(base_dir)

    2. Read them. Every key of ``AnchorCategories.types`` is matched as a
       substring of each file path, and matching files are passed to the
       reader mapped to that key (``readfcst`` or ``read_bom``)::

           frames = ReadAnchorData.read_files(files)

    3. Merge the data where applicable (not implemented in this class).
    """

    @staticmethod
    def get_xlsx_files(base_dir):
        """Return the resolved paths of the xlsx files one level below *base_dir*.

        The pattern ``*/*.xlsx`` only matches files located directly inside an
        immediate subdirectory of *base_dir*; files in *base_dir* itself or in
        deeper folders are not returned.

        Args:
            base_dir: Directory (str or Path) whose subfolders are searched.

        Returns:
            A list of absolute path strings, sorted so that positional access
            (``files[1:2]``) is reproducible between runs.
        """
        return sorted(str(p.resolve()) for p in Path(base_dir).glob("*/*.xlsx"))

    @staticmethod
    def read_files(file_paths):
        """Read every file whose path matches a key of ``AnchorCategories.types``.

        Args:
            file_paths: Path strings to consider.

        Returns:
            A list with one entry per (category, matching file) pair, in the
            order of the categories. An entry is ``None`` when the reader
            found no sheet with its indicator column.
        """
        dfs = []
        for key, reader in AnchorCategories.types.items():
            matching = ReadAnchorData.get_type_of_file(key, file_paths)
            dfs.extend(reader(file) for file in matching)
        return dfs

    @staticmethod
    def get_type_of_file(key, file_paths):
        """Return the entries of *file_paths* that contain *key* as a substring."""
        return [x for x in file_paths if key in x]

    @staticmethod
    def get_release(filepath: Path):
        """Return the release taken from the folder that contains *filepath*.

        The release is the last seven characters of the parent folder's
        absolute path (for example ``'2021-07'``).

        Args:
            filepath: Path (or path string) of a file inside a release folder.

        Returns:
            The trailing seven characters of the parent path, or the whole
            parent path when it is shorter than that.
        """
        parent_dir = str(Path(filepath).parent.absolute())
        # A plain negative slice stays correct for paths shorter than seven
        # characters, unlike ``len(...) - 7`` which turns into a negative start.
        return parent_dir[-7:]

    @staticmethod
    def _first_sheet_with_column(filepath, column):
        """Return the first sheet of *filepath* whose header has *column*, else None."""
        # The workbook is opened once and closed on exit instead of being
        # re-opened for every sheet.
        with pd.ExcelFile(filepath) as workbook:
            for sheet in workbook.sheet_names:
                df: pd.DataFrame = workbook.parse(sheet)
                if column in df.columns:
                    return df
        return None

    @staticmethod
    def _melt_forecast(df):
        """Unpivot the forecast columns of *df* into ``YYYYMM`` / ``SetForecast``."""
        df = df.drop('Total Forecast', axis=1)
        # Header cells such as 202101 can be read from Excel as ints, so the
        # label is converted to text before the substring test.
        dates = [c for c in df.columns if '20' in str(c)]
        dims = [c for c in df.columns if '20' not in str(c)]
        return df.melt(id_vars=dims, value_vars=dates, var_name='YYYYMM', value_name='SetForecast')

    @staticmethod
    def readfcst(filepath):
        """Read a forecast workbook into long format.

        The workbooks have several sheets and no standard sheet names, so each
        sheet is opened in turn until one has a ``'Total Forecast'`` column.
        That column is dropped, every column whose label contains ``'20'``
        (202101, 202102, ...) is treated as a forecast period, and all other
        columns are kept as dimensions.

        Args:
            filepath: Path of the xlsx workbook.

        Returns:
            A DataFrame with the dimension columns plus ``YYYYMM`` (the
            original period label) and ``SetForecast`` (the value), or
            ``None`` when no sheet has a ``'Total Forecast'`` column.
        """
        df = ReadAnchorData._first_sheet_with_column(filepath, 'Total Forecast')
        if df is None:
            return None
        return ReadAnchorData._melt_forecast(df)

    @staticmethod
    def read_bom(filepath):
        """Return the first sheet of *filepath* that has a ``'BOM'`` column.

        Args:
            filepath: Path of the xlsx workbook.

        Returns:
            The sheet as a DataFrame, or ``None`` when no sheet has the column.
        """
        return ReadAnchorData._first_sheet_with_column(filepath, 'BOM')

    @staticmethod
    def file_info(path):
        """Split *path* into the name of its containing folder and its file name.

        Args:
            path: A path using backslash or forward-slash separators.

        Returns:
            A ``(parent, file)`` tuple, e.g. ``('2021-07', 'Bom_Report.xlsx')``.
            ``parent`` is an empty string when *path* has no separator.
        """
        # Work on the argument (not a module-level variable) and accept both
        # separator styles by normalising to backslashes first.
        parts = str(path).replace('/', '\\').rsplit('\\', 2)
        file = parts[-1]
        parent = parts[-2] if len(parts) > 1 else ''
        return parent, file

class AnchorCategories:
    """Lookup from a file-name fragment to the reader used for that file.

    ``types`` maps each fragment to a ``ReadAnchorData`` function: the BOM
    report goes to ``read_bom`` and the three sets-per-month variants go to
    ``readfcst``.
    """

    # The BOM entry comes first so the iteration order of the mapping is
    # Bom_Report followed by the three forecast variants.
    types = {'Bom_Report': ReadAnchorData.read_bom}
    types.update(
        dict.fromkeys(
            (
                'SetsPerMonthcombined',
                'SetsPerMonthConst',
                'SetsPerMonthUnConst',
            ),
            ReadAnchorData.readfcst,
        )
    )


# In [142]:
# Root folder to search, given as a raw-string UNC network path.
base_dir = r'\\na.jnj.com\dpyusdfsroot\RY_Company\Supply Chain Mgmt\Spine Plan-NPI\Conduit\Jonny\Powerbi\Anchor'
# Paths of the xlsx files located in the immediate subfolders of base_dir.
all_files = ReadAnchorData.get_xlsx_files(base_dir)

# In [143]:
# Read only the file at index 1 (the slice [1:2] keeps a one-element list);
# data is the list returned by read_files.
data = ReadAnchorData.read_files(all_files[1:2])

# In [190]:
# Path at index 2 of the discovered files, kept in a module-level name.
pth = all_files[2]


# NOTE(review): the bare string below looks like captured notebook output
# (presumably a release folder name) - it has no effect when the file runs.
'2021-07'