# 12_join_table_accre
> Loading and joining the data on ACCRE

In [None]:
# import packages
import pandas as pd
import numpy as np
import os
import janitor
from sklearn.model_selection import train_test_split
# must install janitor package with the following shell command:
# 'pip install --user pyjanitor'

In [None]:
# It may be helpful to use the following command to install janitor on ACCRE:
#!conda install -c conda-forge/label/gcc7 pyjanitor -y

In [None]:
# Directory that holds the input tables, and the names of the files in it.
data_path = '/data/p_dsi/teams2022/bridgestone_data/data'
name_list = os.listdir(data_path)
# Keep only the files whose name begins with 'sales_2'.
sales_name_list = [
    file_name for file_name in name_list if file_name.startswith('sales_2')
]

In [None]:
# Create the output directory for the joined tables.
# makedirs(exist_ok=True) avoids the check-then-create race of
# `if not exists: mkdir` and also creates missing parent directories.
os.makedirs("/data/p_dsi/teams2022/team_1/new_data", exist_ok=True)

In [None]:
def join_data(sales_name_list):
    """Join every sales file with the dimension tables and save the result.

    Each file named in ``sales_name_list`` is read from ``data_path``,
    left-joined with the product, store, individual and vehicle tables,
    tagged with ``year`` and ``month`` columns sliced from the file name,
    filtered to rows where ``ah1_res_bus_indc == 'R'``,
    ``supp1_bus_pander == 'N'`` and ``email_optin_ind == 'Y'`` (those three
    columns are then dropped), and written as CSV into
    ``/data/p_dsi/teams2022/team_1/new_data/``.

    Parameters
    ----------
    sales_name_list : list of str
        File names inside ``data_path``. The slicing below assumes names of
        the form ``sales_YYYYMM.csv`` -- TODO confirm against the data folder.

    Returns
    -------
    new_list : list of str
        Names of the files written (input name minus its first 6 characters).
    col_list : list of str
        Column names of the last table written; empty when no file was
        processed.
    """
    new_list = []
    # Initialised up front so an empty input returns cleanly instead of
    # raising UnboundLocalError at the return statement.
    col_list = []
    if not sales_name_list:
        return new_list, col_list

    output_dir = "/data/p_dsi/teams2022/team_1/new_data/"

    # The dimension tables do not depend on the sales file, so read and
    # prepare them once instead of once per loop iteration.
    individual = pd.read_csv(data_path + '/individual.csv', sep=',', skiprows=[1]).clean_names()
    product = pd.read_csv(data_path + '/product.csv', sep='|', skiprows=[1]).clean_names()
    store = pd.read_csv(data_path + '/store.csv', sep='|', skiprows=[1]).clean_names()
    vehicle = pd.read_csv(data_path + '/vehicle.csv', sep='|', skiprows=[1]).clean_names()

    # convert store zip and id to string
    store['store_id'] = store['store_id'].apply(str)
    store['zip_code'] = store['zip_code'].apply(str)

    # align the key name of 'individual' with the one used in the sales table
    individual = individual.rename(columns={'mzb_indiv_id': 'indiv_id'})

    for name in sales_name_list:
        # read the sales file and clean its column names
        sale = pd.read_csv(data_path + "/" + name, sep='|', skiprows=[1]).clean_names()
        # store_id must be a string on both sides of the merge
        sale['store_id'] = sale['store_id'].apply(str)

        # merging the data sets together
        mega_table = (
            sale.merge(product, on='article_id', how='left')
            .merge(store, on='store_id', how='left')
            .merge(individual, on='indiv_id', how='left')
            .merge(vehicle, on='vehicle_id', how='left')
        )

        # strip the leading 6 characters ('sales_') to get the output name
        new_name = name[6:]
        new_list.append(new_name)
        # first 4 characters -> year; the rest minus the 4-char extension -> month
        mega_table["year"] = new_name[:4]
        mega_table['month'] = new_name[4:-4]

        # keep only the rows matching the three indicator values, then
        # drop the indicator columns since they are constant afterwards
        keep_rows = (
            (mega_table['ah1_res_bus_indc'] == 'R')
            & (mega_table['supp1_bus_pander'] == 'N')
            & (mega_table['email_optin_ind'] == 'Y')
        )
        mega_table = mega_table[keep_rows]
        mega_table = mega_table.drop(
            ['ah1_res_bus_indc', 'supp1_bus_pander', 'email_optin_ind'], axis=1
        )

        col_list = list(mega_table.columns)
        # NOTE: the row index is written too (pandas default), so the file
        # gets an extra unnamed first column; kept as-is to leave the
        # on-disk format unchanged.
        mega_table.to_csv(output_dir + new_name)
    return new_list, col_list

In [None]:
def combine_data(sales_list):
    """Join all sales files and stack the saved results into one DataFrame.

    Calls ``join_data(sales_list)`` to produce the per-file joined CSVs, then
    reads each of them back from ``/data/p_dsi/teams2022/team_1/new_data/``
    and concatenates them row-wise.

    Parameters
    ----------
    sales_list : list of str
        Sales file names, passed straight through to ``join_data``.

    Returns
    -------
    pandas.DataFrame
        All joined tables stacked in the order of ``sales_list`` with a fresh
        0..n-1 index. When no file could be read, an empty DataFrame with the
        columns reported by ``join_data``.
    """
    data_list, col_list = join_data(sales_list)
    output_dir = "/data/p_dsi/teams2022/team_1/new_data/"

    frames = []
    for data_name in data_list:
        # join_data returns names that already end in ".csv"; appending the
        # extension unconditionally produced "<name>.csv.csv", which never
        # exists, so nothing was ever loaded.
        file_name = data_name if data_name.endswith(".csv") else data_name + ".csv"
        file_path = output_dir + file_name
        if not os.path.isfile(file_path):
            continue
        # Read the id/zip columns back as strings so they keep the string
        # type join_data gave them.
        monthly = pd.read_csv(file_path, dtype={"store_id": str, "zip_code": str})
        # join_data writes the row index as an unnamed first column, which
        # read_csv names "Unnamed: 0"; it is not data, so drop it if present.
        monthly = monthly.drop(columns="Unnamed: 0", errors="ignore")
        frames.append(monthly)

    if not frames:
        return pd.DataFrame(columns=col_list)
    # One concat at the end avoids re-copying the growing table on every
    # iteration, which matters for peak memory on large inputs.
    return pd.concat(frames, axis=0, ignore_index=True)

In [None]:
# Run the pipeline on every sales file found in data_path and keep the
# combined result in memory.
combine_df = combine_data(sales_name_list)

In [None]:
# Save the combined table as a single CSV, without the row index.
combine_df.to_csv("/data/p_dsi/teams2022/team_1/new_data/total_dataset.csv", index = False)

ACCRE is likely to crash during the final combination step, so when you run this notebook it is best to use a 4-GPU (24-core) server.