# 12_join_table_accre
> Loading the data on accre

In [1]:
# check system specs

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print('Connected to a GPU')

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9

if ram_gb < 20:
  print('Not using a high-RAM runtime: {:.1f} gigabytes of available RAM'.format(ram_gb))
else:
  print('Using a high-RAM runtime: {:.1f} gigabytes of available RAM'.format(ram_gb))

Not connected to a GPU
Using a high-RAM runtime: 27.3 gigabytes of available RAM


In [None]:
!pip install pyjanitor==0.23.1

In [3]:
# import packages
import pandas as pd
import numpy as np
import os
import janitor

In [4]:
# mount google drive
from google.colab import drive
drive.mount('/content/gdrive')
     
# navigate to directory
%cd /content/gdrive/MyDrive/Projects/repeat_customers/data

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/Projects/repeat_customers/data


In [5]:
if not os.path.exists('processed/'):
    os.mkdir('processed/')

In [6]:
# read in non-sales data
individual = pd.read_csv('raw/individual.csv', sep=',').clean_names()
product = pd.read_csv('raw/product.csv', sep='|').clean_names()
store = pd.read_csv('raw/store.csv', sep='|').clean_names()
vehicle = pd.read_csv('raw/vehicle.csv', sep='|').clean_names()

In [7]:
# data formats and column names
store['store_id'] = store['store_id'].apply(str)
store['zip_code'] = store['zip_code'].apply(str)
individual.rename(columns={'mzb_indiv_id':'indiv_id'}, inplace=True)

In [8]:
# create list of sales files
sales_files = [i for i in os.listdir('raw/') if 'sales_' in i]

In [9]:
# load initial sales file
df = pd.read_csv('raw/' + sales_files[0], sep='|').clean_names()
print(f"{df.shape[0]:,d}")

14,804,703


In [None]:
# combine all sales
for file in sales_files[1:]:

  sales_to_append = pd.read_csv('raw/' + file, sep='|').clean_names()
  print(file,': ',f"{sales_to_append.shape[0]:,d}")
  
  df = pd.concat([df,sales_to_append], axis = 0)
  print('df: ',f"{df.shape[0]:,d}")



sales_20181031.csv :  15432276
df:  30236979
sales_20180831.csv :  15630241
df:  45867220
sales_20180731.csv :  15197671
df:  61064891
sales_20180630.csv :  15888468
df:  76953359
sales_20180531.csv :  15942240
df:  92895599
sales_20180430.csv :  14449096
df:  107344695
sales_20180331.csv :  16057725
df:  123402420
sales_20180228.csv :  13247533


In [None]:
df.to_csv('processed/combined_sales.csv')

---
---
---

In [None]:
def join_data(sales_name_list):
    new_list = []
    for name in sales_name_list:
        # read data files and clean names
        sale = pd.read_csv('raw/' + name, sep='|', skiprows=[1]).clean_names()
        
        # convert store id to string
        sale['store_id'] = sale['store_id'].apply(str)
     
        # merging the data sets together
        mega_table = sale.merge(product, on = 'article_id', how = 'left').\
            merge(store, on = 'store_id', how = 'left').\
            merge(individual, on = 'indiv_id', how = 'left').\
            merge(vehicle, on = 'vehicle_id', how = 'left')
        
        # extracting name for storing data sets
        new_name = name[6:]
        new_list.append(new_name)
        mega_table["year"] = new_name[:4]
        mega_table['month'] = new_name[4:-4]
        mega_table = mega_table[(mega_table['ah1_res_bus_indc'] == 'R') & (mega_table['supp1_bus_pander'] == 'N') & (mega_table['email_optin_ind'] == 'Y')]
        mega_table = mega_table.drop(['ah1_res_bus_indc', 'supp1_bus_pander', 'email_optin_ind'], axis=1)
        col_list = list(mega_table.columns)
        mega_table.to_csv("/data/p_dsi/teams2022/team_1/new_data/" + new_name)
    return new_list, col_list

In [None]:
def combine_data(sales_list):
    data_list, col_list = join_data(sales_list)
    df = pd.DataFrame(columns = col_list)
    for data_name in data_list: 
        if os.path.isfile('processed/' + data_name + ".csv"):
            df1 = pd.read_csv('processed/' + data_name + ".csv")
            df = pd.concat([df1, df], axis = 0)
            df = df.reset_index(drop = True)
    return (df)

In [None]:
combine_df = combine_data(sales_name_list)

In [None]:
combine_df.to_csv("/data/p_dsi/teams2022/team_1/new_data/total_dataset.csv", index = False)

There is a high probability for ACCRE to break down during the final combination process. So when you run this notebook, it will be better to use a 4GPU (24 cores) server.