# 12_join_table_accre
> Loading the data on ACCRE

In [None]:
# import packages
import pandas as pd
import numpy as np
import os
import janitor
from sklearn.model_selection import train_test_split
# must install janitor package with the following shell command:
# 'pip install --user pyjanitor'

In [None]:
# It may be helpful to use following command to install janitor on ACCRE:
#!conda install -c conda-forge/label/gcc7 pyjanitor -y

In [None]:
# Directory that holds the source data files.
data_path = '/data/p_dsi/teams2022/bridgestone_data/data'
# All entries in that directory ...
name_list = os.listdir(data_path)
# ... and the subset whose file name begins with 'sales_2'.
sales_name_list = [fname for fname in name_list if fname.startswith('sales_2')]

In [None]:
# Create the output directory for the joined tables. `exist_ok=True` makes this
# a no-op when the directory is already there (no separate existence check that
# could race with another process), and makedirs also creates missing parents.
os.makedirs("/data/p_dsi/teams2022/team_1/new_data", exist_ok=True)

In [None]:
def read_sales(file):
    """Load one sales file into a DataFrame with cleaned column names.

    Args:
        file: Path to a pipe-delimited ('|') sales file.

    Returns:
        pandas.DataFrame holding the columns listed in ``col_list``, with the
        column names normalised by pyjanitor's ``clean_names()`` (the rest of
        this notebook refers to them as ``store_id``, ``indiv_id``, ...).
    """
    col_list = ['STORE_ID', 'TRAN_ID', 'DATE', 'ARTICLE_ID', 'INDIV_ID',
                'VEHICLE_ID', 'UNITS', 'SALES']
    dtypes = {
        'STORE_ID': 'category',
        'TRAN_ID': np.int32,
        'DATE': 'category',
        'ARTICLE_ID': np.int32,
        'VEHICLE_ID': np.int32,
        'UNITS': np.int8,
        # float16 keeps only about 3 significant digits and overflows to inf
        # above 65504, which silently distorts monetary amounts.
        'SALES': np.float32,
        # An integer dtype reportedly fails here (pandas complains about float
        # values, presumably caused by missing IDs). float64 represents every
        # integer up to 2**53 exactly, whereas float16 is exact only up to
        # 2048 and would corrupt the IDs used as the join key on indiv_id.
        'INDIV_ID': np.float64,
    }
    df = pd.read_csv(file, sep='|', usecols=col_list, dtype=dtypes).clean_names()
    return df

In [None]:
def read_individuals(file):
    """Load the individuals table and expose its key as ``indiv_id``.

    Args:
        file: Path to a comma-delimited individuals file.

    Returns:
        pandas.DataFrame with the ID column and the three indicator columns,
        names normalised by pyjanitor's ``clean_names()`` and ``mzb_indiv_id``
        renamed to ``indiv_id``.
    """
    indicator_cols = ['EMAIL_OPTIN_IND', 'AH1_RES_BUS_INDC', 'SUPP1_BUS_PANDER']

    # Integer key, every indicator column as a pandas categorical.
    dtype_map = {'MZB_INDIV_ID': np.int32}
    dtype_map.update(dict.fromkeys(indicator_cols, 'category'))

    raw = pd.read_csv(
        file,
        sep=',',
        usecols=['MZB_INDIV_ID'] + indicator_cols,
        dtype=dtype_map,
    )
    cleaned = raw.clean_names()
    return cleaned.rename(columns={'mzb_indiv_id': 'indiv_id'})

In [None]:
def read_products(file):
    """Load the product table with cleaned column names.

    Args:
        file: Path to a pipe-delimited ('|') product file.

    Returns:
        pandas.DataFrame with the selected product columns; ``ARTICLE_ID`` is
        read as int32 and every other column as a categorical. Column names
        are normalised by pyjanitor's ``clean_names()``.
    """
    wanted = [
        'ARTICLE_ID', 'PROD_GROUP_CODE', 'PROD_GROUP_DESC', 'CATEGORY_CODE',
        'CATEGORY_DESC', 'SEGMENT_CODE', 'SEGMENT_DESC', 'CLASS_CODE',
        'CLASS_DESC', 'DISCOUNT_FLAG', 'CROSS_SECTION', 'ASPECT_RATIO',
        'RIM_SIZE',
    ]

    # Everything is categorical except the integer key.
    dtype_map = {name: 'category' for name in wanted}
    dtype_map['ARTICLE_ID'] = np.int32

    frame = pd.read_csv(file, sep='|', usecols=wanted, dtype=dtype_map)
    return frame.clean_names()

In [None]:
def read_stores(file):
    """Load the store table with cleaned column names.

    Args:
        file: Path to a pipe-delimited ('|') store file.

    Returns:
        pandas.DataFrame with the four selected columns, all read as
        categoricals, names normalised by pyjanitor's ``clean_names()``.
    """
    wanted = ['STORE_ID', 'STATE_CODE', 'ZIP_CODE', 'MSA']
    # Every selected column is read as a pandas categorical.
    dtype_map = dict.fromkeys(wanted, 'category')

    frame = pd.read_csv(file, sep='|', usecols=wanted, dtype=dtype_map)
    return frame.clean_names()

In [None]:
def read_vehicles(file):
    """Load the vehicle table with cleaned column names.

    Args:
        file: Path to a pipe-delimited ('|') vehicle file.

    Returns:
        pandas.DataFrame with the five selected columns, names normalised by
        pyjanitor's ``clean_names()``.
    """
    col_list = ['VEHICLE_ID', 'MAKE', 'MODEL', 'SUB_MODEL', 'MODEL_YEAR']
    vehicles = pd.read_csv(file
                            ,sep='|'
                            ,usecols=col_list
                            ,dtype = {'VEHICLE_ID':np.int32
                                    ,'MAKE':'category'
                                    ,'MODEL':'category'
                                    ,'SUB_MODEL':'category'
                                    ,'MODEL_YEAR':np.int16}
                            ).clean_names()

    # The table must be returned: without this the function yields None and
    # the merge on vehicle_id in join_data fails.
    return vehicles

In [None]:
def join_data(sales_name_list):
    """Join each sales file with the lookup tables and save one CSV per file.

    The individual, product, store and vehicle tables are read once from the
    module-level ``data_path``. Each sales file is then left-joined with them,
    tagged with ``year`` and ``month`` taken from its file name, filtered on
    the three individual indicator columns, and written to the team's
    ``new_data`` directory.

    Args:
        sales_name_list: File names (not full paths) of sales files located
            in ``data_path``.

    Returns:
        Tuple ``(new_list, col_list)``: the names under which the joined
        tables were saved (each input name without its first six characters),
        and the column names of the last table written (``[]`` when
        ``sales_name_list`` is empty).
    """
    out_dir = "/data/p_dsi/teams2022/team_1/new_data/"

    new_list = []
    # Defined up front so an empty input returns ([], []) instead of raising
    # UnboundLocalError at the return statement.
    col_list = []

    individual = read_individuals(data_path + '/individual.csv')
    product = read_products(data_path + '/product.csv')
    store = read_stores(data_path + '/store.csv')
    vehicle = read_vehicles(data_path + '/vehicle.csv')

    for name in sales_name_list:
        # read the sales file (column names are cleaned by the reader)
        sale = read_sales(data_path + "/" + name)

        # left-join every lookup table onto the sales rows
        mega_table = (
            sale.merge(product, on='article_id', how='left')
            .merge(store, on='store_id', how='left')
            .merge(individual, on='indiv_id', how='left')
            .merge(vehicle, on='vehicle_id', how='left')
        )

        # Drop the first six characters of the file name (the 'sales_' prefix
        # of the names selected for this notebook); the rest is the output name.
        new_name = name[6:]
        new_list.append(new_name)
        mega_table["year"] = new_name[:4]
        # Everything between the year and the last four characters
        # (presumably the '.csv' extension - verify against the file names).
        mega_table['month'] = new_name[4:-4]

        # Keep only rows whose individual has these exact indicator values.
        keep = (
            (mega_table['ah1_res_bus_indc'] == 'R')
            & (mega_table['supp1_bus_pander'] == 'N')
            & (mega_table['email_optin_ind'] == 'Y')
        )
        mega_table = mega_table[keep]
        mega_table = mega_table.drop(
            ['ah1_res_bus_indc', 'supp1_bus_pander', 'email_optin_ind'], axis=1
        )
        col_list = list(mega_table.columns)

        # index=False: otherwise the row index is written as an extra unnamed
        # column that is not part of col_list and reappears when the file is
        # read back.
        mega_table.to_csv(out_dir + new_name, index=False)
    return new_list, col_list

In [None]:
def combine_data(sales_list):
    """Build the joined tables and stack them into one DataFrame.

    Calls ``join_data`` (which writes one CSV per sales file), then reads the
    written files back and concatenates them row-wise.

    Args:
        sales_list: File names of the sales files, passed on to ``join_data``.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index. Rows from later files come
        first, as in the original prepend-style concatenation. If no file is
        found, an empty frame with the columns reported by ``join_data``.
    """
    out_dir = "/data/p_dsi/teams2022/team_1/new_data/"
    data_list, col_list = join_data(sales_list)

    frames = []
    for data_name in data_list:
        # join_data saves each table under exactly the name it returns, so
        # only add ".csv" when it is missing. Appending it unconditionally
        # looked for "<name>.csv.csv" and skipped every file.
        file_name = data_name if data_name.endswith(".csv") else data_name + ".csv"
        path = out_dir + file_name
        if os.path.isfile(path):
            frames.append(pd.read_csv(path))

    if not frames:
        return pd.DataFrame(columns=col_list)

    # Concatenate once instead of re-copying the growing frame on every
    # iteration; reversed to keep the "latest file first" row order.
    frames.reverse()
    return pd.concat(frames, axis=0, ignore_index=True)

In [None]:
# Join every selected sales file and stack the results into a single table.
combine_df = combine_data(sales_list=sales_name_list)

In [None]:
# Write the combined table next to the per-file outputs, without the row index.
total_dataset_path = "/data/p_dsi/teams2022/team_1/new_data/total_dataset.csv"
combine_df.to_csv(total_dataset_path, index=False)

ACCRE is likely to crash during the final combination step, so it is best to run this notebook on a 4-GPU (24-core) server.