In [1]:
import pandas as pd
import src.utils as utils
import os
from faker import Faker

from sklearn.model_selection import train_test_split

# Load config file

In [2]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw',
 'data_set_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_columns_path': 'data/output/input_columns.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'output_column': 'product_id',
 'seed': 42,
 'test_size': 0.2,
 'standardizer_path': 'data/output/standardizer.pkl',
 'preprocessor_path': 'data/output/preprocessor.pkl',
 'train_clean_path': ['data/output/X_train_clean.pkl',
  'data/output/y_train_clean.pkl'],
 'valid_clean_path': ['data/output/X_valid_clean.pkl',
  'data/output/y_valid_clean.pkl'],
 'test_clean_path': ['data/output/X_test_clean.pkl',
  'data/output/y_test_clean.pkl'],
 'list_of_model_path': 'log/list_of_model.pkl',
 'list_of_param_path': 'log/list_of_param.pkl',
 'list_of_tuned_mod

## Synthetic data for dummy and drop unused column

In [3]:
raw_dataset_path = CONFIG_DATA['raw_dataset_path'] 
raw_dataset_path

'data/raw'

In [4]:
purchase_history = pd.read_csv(raw_dataset_path + '/purchase_history.csv', delimiter=";")
product_detail = pd.read_csv(raw_dataset_path+ '/product_details.csv', delimiter=";")
customer_interactions = pd.read_csv(raw_dataset_path + '/customer_interactions.csv', delimiter=",")

In [5]:
# cek struktur data
purchase_history.head()

Unnamed: 0,customer_id,product_id,purchase_date,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,1,101,2023-01-01,,,,
1,1,105,2023-01-05,,,,
2,2,102,2023-01-02,,,,
3,3,103,2023-01-03,,,,
4,4,104,2023-01-04,,,,


In [6]:
product_detail.head()

Unnamed: 0,product_id,category,price,ratings,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,101,Electronics,500,4.5,,,
1,102,Clothing,50,3.8,,,
2,103,Home & Kitchen,200,4.2,,,
3,104,Beauty,30,4.0,,,
4,105,Electronics,800,4.8,,,


In [7]:
customer_interactions.head()

Unnamed: 0,customer_id,page_views,time_spent
0,1,25,120
1,2,20,90
2,3,30,150
3,4,15,80
4,5,22,110


In [8]:
from faker import Faker
import pandas as pd
import random

fake = Faker()
categories = product_detail['category'].unique()

# Generate purchase_history data
purchase_history_data = []
for _ in range(100):
    purchase_history_data.append({
        'customer_id': random.randint(1, 20),
        'product_id': random.randint(1, 105),
        'purchase_date': fake.date_between(start_date='-1y', end_date='today')
    })

purchase_history = pd.DataFrame(purchase_history_data)


# Generate product_detail data
product_detail_data = []
product_ids = range(1, 106) # generate 100 product_id unik dari 1 hingga 100
for product_id in product_ids:
    product_detail_data.append({
        'product_id': product_id,
        'category': random.choice(categories),
        'price': random.randint(10, 1000),
        'ratings': round(random.uniform(1, 5), 1)
    })
product_detail = pd.DataFrame(product_detail_data)

# Generate customer_interactions data
customer_interactions_data = []
for _ in range(100):
    customer_interactions_data.append({
        'customer_id': random.randint(1, 20),
        'page_views': random.randint(1, 50),
        'time_spent': random.randint(30, 300)
    })

customer_interactions = pd.DataFrame(customer_interactions_data)

In [9]:
purchase_history.to_csv(raw_dataset_path + '/datafaker_purchase_history.csv', index=False)
product_detail.to_csv(raw_dataset_path +'/datafaker_product_details.csv', index=False)
customer_interactions.to_csv(raw_dataset_path +'/datafaker_customer_interactions.csv', index=False)

## Merge data

In [27]:
# cek missing values
missing_values = {
    "customer_interactions": customer_interactions.isnull().sum(),
    "purchase_history": purchase_history.isnull().sum(),
    "product_details": product_detail.isnull().sum()
}
missing_values

{'customer_interactions': customer_id    0
 page_views     0
 time_spent     0
 dtype: int64,
 'purchase_history': customer_id      0
 product_id       0
 purchase_date    0
 dtype: int64,
 'product_details': product_id    0
 category      0
 price         0
 ratings       0
 dtype: int64}

In [28]:
# cek outliers
summary_statistics = {
    "customer_interactions": customer_interactions.describe(),
    "purchase_history": purchase_history.describe(include='all'),
    "product_details": product_detail.describe()
}
summary_statistics

{'customer_interactions':        customer_id  page_views  time_spent
 count   100.000000   100.00000  100.000000
 mean     11.070000    25.73000  157.740000
 std       6.092312    14.74076   78.872051
 min       1.000000     1.00000   30.000000
 25%       6.000000    13.75000   81.750000
 50%      11.000000    25.50000  161.000000
 75%      16.000000    39.00000  217.000000
 max      20.000000    50.00000  296.000000,
 'purchase_history':         customer_id  product_id purchase_date
 count    100.000000  100.000000           100
 unique          NaN         NaN            86
 top             NaN         NaN    2024-02-29
 freq            NaN         NaN             3
 mean      11.200000   56.950000           NaN
 std        5.939085   28.426895           NaN
 min        1.000000    1.000000           NaN
 25%        7.750000   32.500000           NaN
 50%       10.500000   58.000000           NaN
 75%       16.000000   79.000000           NaN
 max       20.000000  105.000000         

In [32]:
product_detail.describe()

Unnamed: 0,product_id,price,ratings
count,105.0,105.0,105.0
mean,53.0,445.342857,2.847619
std,30.454885,288.451063,1.117784
min,1.0,18.0,1.0
25%,27.0,174.0,1.9
50%,53.0,394.0,3.0
75%,79.0,705.0,3.7
max,105.0,982.0,4.9


In [11]:
summary_statistics['product_details']['ratings'].max()

105.0

**customer_interactions**
- Tidak ada missing values. (karna dummy)
- Jumlah page_views dan time_spent bervariasi. page_views maksimum yang tercatat adalah 50 dan waktu_yang dihabiskan maksimum adalah 299 second.

**purchase_history**
- Tidak ada missing values. (karna dummy)

**product_detail**
- Tidak ada missing values. (karna dummy)
- Harga produk berkisar antara $40 hingga $983 dan ratings dari 1 sampai 5 yang menunjukan urutan ordinal

### Cek outliers dan korelasi

In [33]:
# merge purchase_history dengan product_details pada 'product_id'
purchase_details = pd.merge(purchase_history, product_detail, on='product_id', how='left')

# Merging hasil purchase_details with customer_interactions pada 'customer_id'
full_data = pd.merge(purchase_details, customer_interactions, on='customer_id', how='left')

# show data
full_data.head()

Unnamed: 0,customer_id,product_id,purchase_date,category,price,ratings,page_views,time_spent
0,6,39,2023-10-27,Beauty,217,2.0,27,177
1,6,39,2023-10-27,Beauty,217,2.0,39,295
2,6,39,2023-10-27,Beauty,217,2.0,4,251
3,6,39,2023-10-27,Beauty,217,2.0,34,209
4,6,39,2023-10-27,Beauty,217,2.0,17,256


In [34]:
full_data.to_csv(raw_dataset_path + '/data.csv', index=False)

In [35]:
def read_data(return_file=True):
    # Read data
    data = pd.read_csv(raw_dataset_path+ '/data.csv')
    # Print data
    print('data shape   :', data.shape)

    # Dump data
    utils.pickle_dump(data, CONFIG_DATA['data_set_path'])

    # Return data
    if return_file:
        return data

In [15]:
data = read_data()
data.head()

data shape   : (509, 8)


Unnamed: 0,customer_id,product_id,purchase_date,category,price,ratings,page_views,time_spent
0,6,39,2023-10-27,Beauty,217,2.0,27,177
1,6,39,2023-10-27,Beauty,217,2.0,39,295
2,6,39,2023-10-27,Beauty,217,2.0,4,251
3,6,39,2023-10-27,Beauty,217,2.0,34,209
4,6,39,2023-10-27,Beauty,217,2.0,17,256


## Data Splitting

In [17]:
CONFIG_DATA['data_set_path']

'data/output/data.pkl'

In [18]:
def split_input_output(return_file=True):
    # Read data
    data = utils.pickle_load(CONFIG_DATA['data_set_path'])

    # Split input & output
    y = data[CONFIG_DATA['output_column']]
    X = data.drop([CONFIG_DATA['output_column']], axis=1)

    # Print splitting
    print('Input shape  :', X.shape)
    print('Output shape :', y.shape)
    print('Input NAN    :')
    print(X.isnull().sum())
    print('Benchmark    :')
    print(y.value_counts(normalize=True))
    
    # Dump file
    utils.pickle_dump(X, CONFIG_DATA['input_set_path'])
    utils.pickle_dump(y, CONFIG_DATA['output_set_path'])
    utils.pickle_dump(X.columns, CONFIG_DATA['input_columns_path'])     # dump input columns

    if return_file:
        return X, y

In [19]:
X, y = split_input_output()

Input shape  : (509, 7)
Output shape : (509,)
Input NAN    :
customer_id      0
purchase_date    0
category         0
price            0
ratings          0
page_views       0
time_spent       0
dtype: int64
Benchmark    :
product_id
74     0.047151
70     0.037328
103    0.035363
24     0.035363
57     0.035363
         ...   
27     0.005894
61     0.005894
35     0.001965
44     0.001965
34     0.001965
Name: proportion, Length: 63, dtype: float64


In [20]:
CONFIG_DATA['input_set_path']

'data/output/input.pkl'

In [21]:
CONFIG_DATA['test_size']

0.2

In [22]:
def split_train_test(return_file=True):
    # Load data
    X = utils.pickle_load(CONFIG_DATA['input_set_path'])
    y = utils.pickle_load(CONFIG_DATA['output_set_path'])

    # Split test & rest (train & valid)
    X_train, X_test, y_train, y_test = train_test_split( 
                                            X,
                                            y,
                                            test_size = CONFIG_DATA['test_size'],
                                            random_state = CONFIG_DATA['seed']
                                        )
    
    # Split train & valid
    X_train, X_valid, y_train, y_valid = train_test_split(
                                            X_train,
                                            y_train,
                                            test_size = CONFIG_DATA['test_size'],
                                            random_state = CONFIG_DATA['seed']
                                        )
    
    # Print splitting
    print('X_train shape :', X_train.shape)
    print('y_train shape :', y_train.shape)
    print('X_valid shape  :', X_valid.shape)
    print('y_valid shape  :', y_valid.shape)
    print('X_test shape  :', X_test.shape)
    print('y_test shape  :', y_test.shape)

    # Dump file
    utils.pickle_dump(X_train, CONFIG_DATA['train_set_path'][0])
    utils.pickle_dump(y_train, CONFIG_DATA['train_set_path'][1])
    utils.pickle_dump(X_valid, CONFIG_DATA['valid_set_path'][0])
    utils.pickle_dump(y_valid, CONFIG_DATA['valid_set_path'][1])
    utils.pickle_dump(X_test, CONFIG_DATA['test_set_path'][0])
    utils.pickle_dump(y_test, CONFIG_DATA['test_set_path'][1])

    if return_file:
        return X_train, X_valid, X_test, y_train, y_valid, y_test

In [23]:
X_train, X_valid, X_test, y_train, y_valid, y_test = split_train_test()

X_train shape : (325, 7)
y_train shape : (325,)
X_valid shape  : (82, 7)
y_valid shape  : (82,)
X_test shape  : (102, 7)
y_test shape  : (102,)


Get sample testing

In [24]:
import numpy as np

In [25]:
np.random.seed(123)
y_sample_1 = y_test[y_test==101].sample(10)
y_sample_2 = y_test[y_test==102].sample(10)
y_sample_3 = y_test[y_test==103].sample(10)
y_sample_4 = y_test[y_test==104].sample(10)
y_sample_5 = y_test[y_test==105].sample(10)

y_sample = pd.concat((y_sample_1, y_sample_2, y_sample_3, y_sample_4, y_sample_5), axis=0)
y_sample

ValueError: Cannot take a larger sample than population when 'replace=False'

In [None]:
X_sample = X_test.loc[y_sample.index]
X_sample

In [None]:
X_sample.to_csv('data/output/X_sample.csv', index=False)

In [None]:
y_sample.to_csv('data/output/y_sample.csv', index=False)