This notebook prepares the `customer_level_data_company_{company}.csv` files for use in the postgres-clv project.

For each company:
 1. A `company_id` column is added as the first column.
 2. The `id` column is renamed to `customer_id`.
 3. A new `customer_level_data_company_{company}.csv` file is generated and stored in `OUTPUT_FOLDER`.

In [None]:
import os
import numpy as np
import pandas as pd
import tqdm
import multiprocessing

In [None]:
# Turn off pandas' chained-assignment warning (the pandas default is 'warn').
pd.set_option('mode.chained_assignment', None)

## Global variables

In [None]:
# IDs of the companies to process. They are kept as strings: each one is
# interpolated into the CSV file names and written to the `company_id` column.
COMPANYS = [
    '10000', '101200010', '101410010', '101600010', '102100020',
    '102700020', '102840020', '103000030', '103338333', '103400030',
    '103600030', '103700030', '103800030', '104300040', '104400040',
    '104470040', '104900040', '105100050', '105150050', '107800070',
]

In [None]:
# Folder the input customer_level_data_company_{company}.csv files are read from.
DATA_FOLDER = './tmp/acquire-valued-shoppers-challenge' # @param { isTemplate: true, type: 'string'}

In [None]:
# Folder the rewritten customer_level_data_company_{company}.csv files are saved to.
OUTPUT_FOLDER = './tmp/acquire-valued-shoppers-challenge/postgres-clv' # @param { isTemplate: true, type: 'string'}

### Postprocess data

In [None]:
def process(company):
    """Rewrite one company's customer-level CSV for the postgres-clv project.

    Reads ``customer_level_data_company_{company}.csv`` from ``DATA_FOLDER``,
    renames the ``id`` column to ``customer_id``, inserts ``company_id`` as
    the first column, and writes the result (without the index) under the
    same file name in ``OUTPUT_FOLDER``.

    Args:
        company: Company identifier; used in the file names and as the value
            of the new ``company_id`` column on every row.

    Raises:
        FileNotFoundError: If the input CSV does not exist.
        ValueError: If the input already contains a ``company_id`` column.
    """
    print("Process company {}".format(company))
    file_name = f"customer_level_data_company_{company}.csv"
    customer_level_df = pd.read_csv(f"{DATA_FOLDER}/{file_name}")

    # Rename 'id' to 'customer_id' (returns a new frame instead of mutating in place)
    customer_level_df = customer_level_df.rename(columns={'id': 'customer_id'})

    # Insert 'company_id' as the first column
    customer_level_df.insert(0, 'company_id', company)

    # to_csv does not create missing directories; exist_ok=True keeps this
    # safe when several worker processes reach this line at the same time.
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # Save the updated DataFrame to CSV without the index
    customer_level_df.to_csv(f"{OUTPUT_FOLDER}/{file_name}", index=False)


In [None]:
# Process every company in parallel with one worker per CPU core. `map` blocks
# until all companies are done; the `with` block then shuts the worker
# processes down (also when `process` raises) instead of leaving them running.
# NOTE(review): under the "spawn" start method, functions defined in a notebook
# cell may not be importable by the workers — confirm on the target platform.
with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
    _ = p.map(process, COMPANYS)