# 12_join_table_accre

## Environment

In [None]:
# import packages
import pandas as pd
import numpy as np
import os
import janitor
from glob import glob

## Set Directories

In [None]:
# Directory holding the raw source tables (note the trailing slash).
data_path = '/data/p_dsi/teams2022/bridgestone_data/data/'
# glob yields matches in arbitrary, filesystem-dependent order; sorting makes
# the processing order (and the row order of the combined output) reproducible.
sales_file_list = sorted(glob(data_path + 'sales_2*.csv'))

In [None]:
# Make sure the output directory for the merged files exists.
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists (no check-then-create race).
os.makedirs("/data/merged_data", exist_ok=True)

## Data Read Functions

In [None]:
def read_sales(file):
    """Read one pipe-delimited sales file with compact column dtypes.

    Args:
        file: Path of the sales CSV to read (fields separated by ``|``).

    Returns:
        A DataFrame holding the selected sales columns, with column names
        passed through ``clean_names()`` (downstream code refers to them in
        lower case). DATE is kept as a categorical string; it is not parsed.
    """
    col_list = ['STORE_ID', 'TRAN_ID', 'DATE', 'ARTICLE_ID', 'INDIV_ID',
                'VEHICLE_ID', 'UNITS', 'SALES']
    dtypes = {
        'STORE_ID': 'category',
        'TRAN_ID': np.int32,
        'DATE': 'category',
        'ARTICLE_ID': np.int32,
        'VEHICLE_ID': np.int32,
        'UNITS': np.int8,
        # float16 tops out at 65504 and keeps roughly 3 significant digits,
        # which distorts monetary amounts; float32 keeps about 7.
        'SALES': np.float32,
        # int32 is rejected by the parser because the column contains float
        # values (presumably missing IDs read as NaN -- confirm against data).
        # float16 cannot represent integers above 2048 exactly, so it would
        # silently alter the IDs used as a join key; float64 holds every
        # 32-bit integer exactly.
        'INDIV_ID': np.float64,
    }
    df = pd.read_csv(file, sep='|', usecols=col_list, dtype=dtypes).clean_names()
    return df

In [None]:
def read_individuals(file):
    """Read the comma-delimited individuals table and return the filtered IDs.

    Only rows with ``ah1_res_bus_indc == 'R'``, ``supp1_bus_pander == 'N'``
    and ``email_optin_ind == 'Y'`` are kept. The three flag columns are then
    dropped, leaving the ID column renamed from ``mzb_indiv_id`` to
    ``indiv_id``.

    Args:
        file: Path of the individuals CSV to read.

    Returns:
        A DataFrame with the single column ``indiv_id``.
    """
    flag_dtypes = dict.fromkeys(
        ['EMAIL_OPTIN_IND', 'AH1_RES_BUS_INDC', 'SUPP1_BUS_PANDER'], 'category'
    )
    dtypes = {'MZB_INDIV_ID': np.int32, **flag_dtypes}

    raw = pd.read_csv(file, sep=',', usecols=list(dtypes), dtype=dtypes).clean_names()
    raw = raw.rename(columns={'mzb_indiv_id': 'indiv_id'})

    # All three flag conditions must hold for a row to be kept.
    keep = (
        (raw['ah1_res_bus_indc'] == 'R')
        & (raw['supp1_bus_pander'] == 'N')
        & (raw['email_optin_ind'] == 'Y')
    )

    # The flags have served their purpose once the rows are filtered.
    return raw[keep].drop(
        columns=['ah1_res_bus_indc', 'supp1_bus_pander', 'email_optin_ind']
    )

In [None]:
def read_products(file):
    """Read the pipe-delimited product table.

    ARTICLE_ID is read as int32; every other selected column is read as a
    pandas categorical. Column names are passed through ``clean_names()``.

    Args:
        file: Path of the product CSV to read.

    Returns:
        A DataFrame with the selected product columns.
    """
    categorical_cols = [
        'PROD_GROUP_CODE', 'PROD_GROUP_DESC', 'CATEGORY_CODE', 'CATEGORY_DESC',
        'SEGMENT_CODE', 'SEGMENT_DESC', 'CLASS_CODE', 'CLASS_DESC',
        'DISCOUNT_FLAG', 'CROSS_SECTION', 'ASPECT_RATIO', 'RIM_SIZE',
    ]
    dtypes = {'ARTICLE_ID': np.int32}
    dtypes.update(dict.fromkeys(categorical_cols, 'category'))

    return pd.read_csv(
        file,
        sep='|',
        usecols=['ARTICLE_ID', *categorical_cols],
        dtype=dtypes,
    ).clean_names()

In [None]:
def read_stores(file):
    """Read the pipe-delimited store table, every column as a categorical.

    Args:
        file: Path of the store CSV to read.

    Returns:
        A DataFrame with the store ID, state code, ZIP code and MSA columns,
        with column names passed through ``clean_names()``.
    """
    wanted = ['STORE_ID', 'STATE_CODE', 'ZIP_CODE', 'MSA']
    raw = pd.read_csv(
        file,
        sep='|',
        usecols=wanted,
        dtype={name: 'category' for name in wanted},
    )
    return raw.clean_names()

In [None]:
def read_vehicles(file):
    """Read the pipe-delimited vehicle table.

    VEHICLE_ID is read as int32 and MODEL_YEAR as int16; MAKE, MODEL and
    SUB_MODEL are read as pandas categoricals. Column names are passed
    through ``clean_names()``.

    Args:
        file: Path of the vehicle CSV to read.

    Returns:
        A DataFrame with the selected vehicle columns.
    """
    col_list = ['VEHICLE_ID', 'MAKE', 'MODEL', 'SUB_MODEL', 'MODEL_YEAR']
    vehicles = pd.read_csv(file
                            ,sep='|'
                            ,usecols=col_list
                            ,dtype = {'VEHICLE_ID':np.int32
                                    ,'MAKE':'category'
                                    ,'MODEL':'category'
                                    ,'SUB_MODEL':'category'
                                    ,'MODEL_YEAR':np.int16}
                            ).clean_names()

    # Without this return the function yields None and callers cannot merge
    # against the vehicle table.
    return vehicles

## Process Data

In [None]:
# Load the four non-sales tables that the sales data is joined against.
individual, product, store, vehicle = (
    read_individuals(data_path + '/individual.csv'),
    read_products(data_path + '/product.csv'),
    read_stores(data_path + '/store.csv'),
    read_vehicles(data_path + '/vehicle.csv'),
)

In [None]:
# Join each sales file against the non-sales tables and save one merged
# table per file.

new_file_list = []

for file in sales_file_list:
    # Entries of sales_file_list come from glob(data_path + ...), so they
    # already include the directory; prefixing data_path again would produce
    # a path that does not exist.
    sale = read_sales(file)

    # Inner joins: a sales row survives only if it matches in every table.
    mega_table = (
        sale.merge(product, on='article_id', how='inner')
        .merge(store, on='store_id', how='inner')
        .merge(individual, on='indiv_id', how='inner')
        .merge(vehicle, on='vehicle_id', how='inner')
    )
    # Captured before year/month are added; used later as the column set of
    # the combined frame.
    col_list = list(mega_table.columns)

    # Drop the directory, then the 6-character "sales_" prefix, leaving
    # e.g. "202001.csv". Slicing the full path would cut into the directory.
    new_filename = os.path.basename(file)[6:]
    new_file_list.append(new_filename)
    mega_table["year"] = new_filename[:4]
    # [4:-4] removes the year in front and the ".csv" extension at the end.
    mega_table['month'] = new_filename[4:-4]
    # Write into the absolute /data/merged_data directory, the same one the
    # notebook creates up front and reads the monthly files back from.
    mega_table.to_parquet("/data/merged_data/" + new_filename)


In [None]:
# Stack every saved monthly table into one DataFrame.

frames = []
for file in new_file_list:
    # Names in new_file_list already end in ".csv" (that is the name each
    # monthly table was saved under), so no further suffix is appended;
    # adding one would make the existence check fail for every file.
    merged_path = "/data/merged_data/" + file
    if os.path.isfile(merged_path):
        frames.append(pd.read_parquet(merged_path))

if frames:
    # Concatenate once instead of re-copying the growing frame per file.
    # Later files go first, matching the row order obtained by prepending
    # each file as it is read.
    df = pd.concat(frames[::-1], axis=0, ignore_index=True)
else:
    # Nothing found on disk: fall back to an empty frame with the merged
    # table's columns.
    df = pd.DataFrame(columns=col_list)

In [None]:
# Save the combined dataset into the absolute /data/merged_data directory
# (the one the notebook creates), rather than a path relative to the current
# working directory that may not exist.
df.to_parquet("/data/merged_data/total_dataset.parquet", index = False)