In [1]:
import pandas as pd
import datetime
import numpy as np
import os

txn_dirPath = '/Users/rcheung/Documents/inv_mgmt_report/txns/'


txn_converter = {'TXN - Transaction Type': str,
                 'TXN - Transaction Date': str,
                 'TXN - Item ID': str,
                 'TXN - Unit': str,
                 'TXN - Qty': float,
                 'TXN - Total Cost': float,
                 'TXN - Adjust Type': str}

#RC: add TXN-Sequence Nbr
transaction_col = ['TXN - Sequence Nbr', 'TXN - Transaction Type', 'TXN - Transaction Date', 'TXN - Unit',
                   'TXN - Item ID', 'TXN - Qty', 'TXN - Total Cost', 'TXN - Adjust Type']

out_types = ['051', '054,', '030', '031', '012']
positive_types = ['041', '022', '024', '050', '010', '020']
drop_types = ['053', '060']


# return all excel files in the directory as a list
def get_lof(directory):
    lof = []
    for file in os.listdir(directory):
        if file.endswith('.xlsx'):
            lof.append(directory + file)
    return lof


def read_txn_to_df(lof, converter):
    dataframe = pd.DataFrame()
    for f in lof:
        next_txn = pd.read_excel(f, converters=converter, parse_dates=['TXN - Transaction Date'])
        #RC: changed to concat
        dataframe = pd.concat([dataframe, next_txn])
        print(f + ' is appended')
    return dataframe


# handle the increase/decrease column
def handle_I_D(txn_df):
    txn_df['TXN - Qty'] = np.where(txn_df['TXN - Adjust Type'] == 'D', 0 - abs(txn_df['TXN - Qty']),
                                   txn_df['TXN - Qty'])
    txn_df['TXN - Qty'] = np.where(txn_df['TXN - Adjust Type'] == 'M', 0 - abs(txn_df['TXN - Qty']),
                                   txn_df['TXN - Qty'])
    return txn_df

# eg: date = '2021-11-01 00:00:00'
# return all txn before 2021-11-01
def read_txn_by_date(txn_df, date):
    txn_df = txn_df.loc[txn_df['TXN - Transaction Date'] < date]
    #txn_df = txn_df.drop(columns=['TXN - Transaction Date'])
    # We don't want to drop the Transaction Date column.
    return txn_df


In [None]:
txn_df = read_txn_to_df(get_lof(txn_dirPath), txn_converter)

In [44]:
# dataframe = txn_df.copy(deep = True)
#RC: switched order of filter and drop duplicate
# filter dataframe
dataframe = txn_df.loc[txn_df['TXN - Transaction Type'].isin(['010', '020', '012', '022', '024', '030', '031', '041', '050', '051', '054', '053', '060'])]

# drop duplication
dataframe = dataframe.drop_duplicates()

#keep relevant rows
dataframe = dataframe[transaction_col]

#dataframe = read_txn_by_date(dataframe,d)
dataframe = handle_I_D(dataframe)
dataframe

Unnamed: 0,TXN - Sequence Nbr,TXN - Transaction Type,TXN - Transaction Date,TXN - Unit,TXN - Item ID,TXN - Qty,TXN - Total Cost,TXN - Adjust Type
0,4794,020,2004-04-02,BS011,04567215,2.0,179.78,
1,13692,020,2004-04-02,BS014,04260657,10.0,46.11,
2,8907,020,2004-04-02,BS013,02005067,50.0,7.03,
3,8704,020,2004-04-02,BS012,05180130,2.0,40.44,
4,4899,020,2004-04-02,BS011,04587046,3.0,857.55,
...,...,...,...,...,...,...,...,...
146084,1,041,2022-09-30,CS004,04295340,0.0,0.00,
146085,1,041,2022-09-30,CS004,02441047,0.0,0.00,
146086,1,041,2022-09-30,CS004,05292187,0.0,0.00,
146087,3,022,2022-09-30,BS014,02595071,1.0,79.07,


In [45]:
# calculate outs
df_outgoing = dataframe.loc[dataframe['TXN - Transaction Type'].isin(out_types)]
# group by date and item ID fields
df_outgoing = df_outgoing.groupby(['TXN - Item ID', 'TXN - Transaction Date']).agg(
                        {'TXN - Qty': 'sum' }).reset_index()

# calculate ins
df_incoming = dataframe.loc[dataframe['TXN - Transaction Type'].isin(positive_types)]
# group by date and item ID fields
df_incoming = df_incoming.groupby(['TXN - Item ID', 'TXN - Transaction Date']).agg(
                        {'TXN - Qty': 'sum' }).reset_index()

In [47]:
# combine ingoing and outgoing dfs by item and date
df_txn_merged = pd.merge(df_outgoing, df_incoming, on=['TXN - Item ID', 'TXN - Transaction Date'], how='outer')
df_txn_merged.update(df_txn_merged[['TXN - Qty_x', 'TXN - Qty_y']].fillna(0))

rename_dict = {'TXN - Qty_x':'TXN - Outgoing Qty', 'TXN - Qty_y':'TXN - Incoming Qty'}

# call rename () method
df_txn_merged.rename(columns= rename_dict, inplace=True)

In [48]:
df_txn_merged

Unnamed: 0,TXN - Item ID,TXN - Transaction Date,TXN - Outgoing Qty,TXN - Incoming Qty
0,00118003,2007-02-07,13.0,0.0
1,00118003,2007-05-07,13.0,13.0
2,00118003,2014-03-25,5.0,0.0
3,00118004,2004-09-09,1.0,0.0
4,00118004,2005-01-21,1.0,0.0
...,...,...,...,...
5063605,YAV95M6,2019-04-02,0.0,-150.0
5063606,Z53921,2017-09-27,0.0,173.0
5063607,Z53921,2019-04-02,0.0,-173.0
5063608,ZX0201015,2007-09-25,0.0,51.0


In [49]:
df_byitem = df_txn_merged.sort_values(by = ['TXN - Item ID', 'TXN - Transaction Date'], ascending = [True, True]).reset_index(drop=True)


In [50]:
original_len = len(df_byitem)

for i in range(0, original_len):
    if i > 0 and df_byitem.loc[i, 'TXN - Item ID'] == df_byitem.loc[i-1, 'TXN - Item ID']:
        df_byitem.loc[i, 'TXN - Overall Qty'] = df_byitem.loc[i-1, 'TXN - Overall Qty'] + df_byitem.loc[i, 'TXN - Incoming Qty'] - df_byitem.loc[i, 'TXN - Outgoing Qty']
        #df_byitem.loc[i, 'TXN - Overall Total Cost'] = df_byitem.loc[i-1, 'TXN - Overall Total Cost'] + df_byitem.loc[i, 'TXN - Incoming Total Cost'] - df_byitem.loc[i, 'TXN - Outgoing Total Cost']
    else:
        df_byitem.loc[i, 'TXN - Overall Qty'] = df_byitem.loc[i, 'TXN - Incoming Qty'] - df_byitem.loc[i, 'TXN - Outgoing Qty']
        #df_byitem.loc[i, 'TXN - Overall Total Cost'] = df_byitem.loc[i, 'TXN - Incoming Total Cost'] -  df_byitem.loc[i, 'TXN - Outgoing Total Cost']
    if i % 100000 == 0:
        print(original_len - i)

5063610
4963610
4863610
4763610
4663610
4563610
4463610
4363610
4263610
4163610
4063610
3963610
3863610
3763610
3663610
3563610
3463610
3363610
3263610
3163610
3063610
2963610
2863610
2763610
2663610
2563610
2463610
2363610
2263610
2163610
2063610
1963610
1863610
1763610
1663610
1563610
1463610
1363610
1263610
1163610
1063610
963610
863610
763610
663610
563610
463610
363610
263610
163610
63610


In [51]:
df_byitem

Unnamed: 0,TXN - Item ID,TXN - Transaction Date,TXN - Outgoing Qty,TXN - Incoming Qty,TXN - Overall Qty
0,00118002,2004-04-03,0.0,1.0,1.0
1,00118002,2013-01-23,0.0,-1.0,0.0
2,00118003,2004-04-03,0.0,13.0,13.0
3,00118003,2006-11-16,0.0,1.0,14.0
4,00118003,2007-02-07,13.0,0.0,1.0
...,...,...,...,...,...
5063605,YAV95M6,2019-04-02,0.0,-150.0,0.0
5063606,Z53921,2017-09-27,0.0,173.0,173.0
5063607,Z53921,2019-04-02,0.0,-173.0,0.0
5063608,ZX0201015,2007-09-25,0.0,51.0,51.0


In [52]:
df_byitem.to_csv('output/txns_by_item.csv', index = None)

In [37]:
dataframe.to_csv('output/raw.csv')