In [1]:
import pandas as pd
import src.utils as utils
import os
from faker import Faker

from sklearn.model_selection import train_test_split

# Load config file

In [2]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw',
 'data_set_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_columns_path': 'data/output/input_columns.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'output_column': 'category_encoded',
 'seed': 42,
 'test_size': 0.2,
 'standardizer_path': 'data/output/standardizer.pkl',
 'preprocessor_path': 'data/output/preprocessor.pkl',
 'train_clean_path': ['data/output/X_train_clean.pkl',
  'data/output/y_train_clean.pkl'],
 'valid_clean_path': ['data/output/X_valid_clean.pkl',
  'data/output/y_valid_clean.pkl'],
 'test_clean_path': ['data/output/X_test_clean.pkl',
  'data/output/y_test_clean.pkl'],
 'list_of_model_path': 'log/list_of_model.pkl',
 'list_of_param_path': 'log/list_of_param.pkl',
 'list_of_tun

## Synthetic data for dummy and drop unused column

In [3]:
raw_dataset_path = CONFIG_DATA['raw_dataset_path'] 
raw_dataset_path

'data/raw'

In [4]:
purchase_history = pd.read_csv(raw_dataset_path + '/purchase_history.csv', delimiter=";")
product_detail = pd.read_csv(raw_dataset_path+ '/product_details.csv', delimiter=";")
customer_interactions = pd.read_csv(raw_dataset_path + '/customer_interactions.csv', delimiter=",")

In [5]:
# cek struktur data
purchase_history.head()

Unnamed: 0,customer_id,product_id,purchase_date,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,1,101,2023-01-01,,,,
1,1,105,2023-01-05,,,,
2,2,102,2023-01-02,,,,
3,3,103,2023-01-03,,,,
4,4,104,2023-01-04,,,,


In [6]:
product_detail.head()

Unnamed: 0,product_id,category,price,ratings,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,101,Electronics,500,4.5,,,
1,102,Clothing,50,3.8,,,
2,103,Home & Kitchen,200,4.2,,,
3,104,Beauty,30,4.0,,,
4,105,Electronics,800,4.8,,,


In [7]:
customer_interactions.head()

Unnamed: 0,customer_id,page_views,time_spent
0,1,25,120
1,2,20,90
2,3,30,150
3,4,15,80
4,5,22,110


In [8]:
from faker import Faker
import pandas as pd
import random

fake = Faker()
categories = product_detail['category'].unique()

# Generate purchase_history data
purchase_history_data = []
for _ in range(100):
    purchase_history_data.append({
        'customer_id': random.randint(1, 20),
        'product_id': random.randint(1, 105),
        'purchase_date': fake.date_between(start_date='-1y', end_date='today')
    })

purchase_history = pd.DataFrame(purchase_history_data)


# Generate product_detail data
product_detail_data = []
product_ids = range(1, 106) # generate 100 product_id unik dari 1 hingga 100
for product_id in product_ids:
    product_detail_data.append({
        'product_id': product_id,
        'category': random.choice(categories),
        'price': random.randint(10, 1000),
        'ratings': round(random.uniform(1, 5), 1)
    })
product_detail = pd.DataFrame(product_detail_data)

# Generate customer_interactions data
customer_interactions_data = []
for _ in range(100):
    customer_interactions_data.append({
        'customer_id': random.randint(1, 20),
        'page_views': random.randint(1, 50),
        'time_spent': random.randint(30, 300)
    })

customer_interactions = pd.DataFrame(customer_interactions_data)

In [9]:
purchase_history.to_csv(raw_dataset_path + '/datafaker_purchase_history.csv', index=False)
product_detail.to_csv(raw_dataset_path +'/datafaker_product_details.csv', index=False)
customer_interactions.to_csv(raw_dataset_path +'/datafaker_customer_interactions.csv', index=False)

## Merge data

In [10]:
# cek missing values
missing_values = {
    "customer_interactions": customer_interactions.isnull().sum(),
    "purchase_history": purchase_history.isnull().sum(),
    "product_details": product_detail.isnull().sum()
}
missing_values

{'customer_interactions': customer_id    0
 page_views     0
 time_spent     0
 dtype: int64,
 'purchase_history': customer_id      0
 product_id       0
 purchase_date    0
 dtype: int64,
 'product_details': product_id    0
 category      0
 price         0
 ratings       0
 dtype: int64}

In [11]:
# cek outliers
summary_statistics = {
    "customer_interactions": customer_interactions.describe(),
    "purchase_history": purchase_history.describe(include='all'),
    "product_details": product_detail.describe()
}
summary_statistics

{'customer_interactions':        customer_id  page_views  time_spent
 count   100.000000  100.000000  100.000000
 mean     10.620000   25.940000  167.080000
 std       5.839226   14.216096   76.207675
 min       1.000000    1.000000   30.000000
 25%       6.000000   14.750000  109.250000
 50%      11.000000   26.000000  164.000000
 75%      16.000000   39.000000  226.500000
 max      20.000000   49.000000  299.000000,
 'purchase_history':         customer_id  product_id purchase_date
 count    100.000000  100.000000           100
 unique          NaN         NaN            83
 top             NaN         NaN    2023-10-18
 freq            NaN         NaN             3
 mean      11.020000   56.460000           NaN
 std        5.862016   30.639784           NaN
 min        1.000000    2.000000           NaN
 25%        6.000000   30.500000           NaN
 50%       12.000000   56.500000           NaN
 75%       16.000000   86.000000           NaN
 max       20.000000  105.000000         

In [12]:
product_detail.describe()

Unnamed: 0,product_id,price,ratings
count,105.0,105.0,105.0
mean,53.0,464.27619,2.982857
std,30.454885,288.547673,1.134211
min,1.0,23.0,1.0
25%,27.0,199.0,2.1
50%,53.0,435.0,3.0
75%,79.0,677.0,3.8
max,105.0,997.0,5.0


In [13]:
summary_statistics['product_details']['ratings'].max()

105.0

**customer_interactions**
- Tidak ada missing values. (karna dummy)
- Jumlah page_views dan time_spent bervariasi. page_views maksimum yang tercatat adalah 50 dan waktu_yang dihabiskan maksimum adalah 299 second.

**purchase_history**
- Tidak ada missing values. (karna dummy)

**product_detail**
- Tidak ada missing values. (karna dummy)
- Harga produk berkisar antara $40 hingga $983 dan ratings dari 1 sampai 5 yang menunjukan urutan ordinal

### Cek outliers dan korelasi

In [14]:
# merge purchase_history dengan product_details pada 'product_id'
purchase_details = pd.merge(purchase_history, product_detail, on='product_id', how='left')

# Merging hasil purchase_details with customer_interactions pada 'customer_id'
full_data = pd.merge(purchase_details, customer_interactions, on='customer_id', how='left')

# show data
full_data.head()

Unnamed: 0,customer_id,product_id,purchase_date,category,price,ratings,page_views,time_spent
0,16,89,2023-06-30,Beauty,381,3.8,1,242
1,16,89,2023-06-30,Beauty,381,3.8,40,166
2,16,89,2023-06-30,Beauty,381,3.8,43,44
3,16,89,2023-06-30,Beauty,381,3.8,35,140
4,16,89,2023-06-30,Beauty,381,3.8,24,111


encode category

In [15]:
full_data['category'].unique()

array(['Beauty', 'Home & Kitchen', 'Electronics', 'Clothing'],
      dtype=object)

In [16]:
from sklearn.preprocessing import LabelEncoder

# enocde categoru and Fit standardizer
encoder = LabelEncoder()
categories = ['Clothing', 'Home & Kitchen', 'Beauty', 'Electronics']
encoder.fit(categories)

full_data['category_encoded'] = encoder.transform(full_data['category'])
# Check the mapping to confirm it is correct
category_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print("Category Mapping:", category_mapping)

Category Mapping: {'Beauty': 0, 'Clothing': 1, 'Electronics': 2, 'Home & Kitchen': 3}


In [17]:
full_data.head()

Unnamed: 0,customer_id,product_id,purchase_date,category,price,ratings,page_views,time_spent,category_encoded
0,16,89,2023-06-30,Beauty,381,3.8,1,242,0
1,16,89,2023-06-30,Beauty,381,3.8,40,166,0
2,16,89,2023-06-30,Beauty,381,3.8,43,44,0
3,16,89,2023-06-30,Beauty,381,3.8,35,140,0
4,16,89,2023-06-30,Beauty,381,3.8,24,111,0


In [18]:
# full_data.drop(columns=['purchase_date', 'product_id'], inplace=True)

In [19]:
full_data.to_csv(raw_dataset_path + '/data.csv', index=False)

In [20]:
def read_data(return_file=True):
    # Read data
    data = pd.read_csv(raw_dataset_path+ '/data.csv')
    # Print data
    print('data shape   :', data.shape)

    # Dump data
    utils.pickle_dump(data, CONFIG_DATA['data_set_path'])

    # Return data
    if return_file:
        return data

In [21]:
data = read_data()
data.head()

data shape   : (555, 9)


Unnamed: 0,customer_id,product_id,purchase_date,category,price,ratings,page_views,time_spent,category_encoded
0,16,89,2023-06-30,Beauty,381,3.8,1,242,0
1,16,89,2023-06-30,Beauty,381,3.8,40,166,0
2,16,89,2023-06-30,Beauty,381,3.8,43,44,0
3,16,89,2023-06-30,Beauty,381,3.8,35,140,0
4,16,89,2023-06-30,Beauty,381,3.8,24,111,0


## Data Splitting

In [22]:
CONFIG_DATA['data_set_path']

'data/output/data.pkl'

In [23]:
def split_input_output(return_file=True):
    # Read data
    data = utils.pickle_load(CONFIG_DATA['data_set_path'])

    # Split input & output
    y = data[CONFIG_DATA['output_column']]
    X = data.drop([CONFIG_DATA['output_column']], axis=1)

    # Print splitting
    print('Input shape  :', X.shape)
    print('Output shape :', y.shape)
    print('Input NAN    :')
    print(X.isnull().sum())
    print('Benchmark    :')
    print(y.value_counts(normalize=True))
    
    # Dump file
    utils.pickle_dump(X, CONFIG_DATA['input_set_path'])
    utils.pickle_dump(y, CONFIG_DATA['output_set_path'])
    utils.pickle_dump(X.columns, CONFIG_DATA['input_columns_path'])     # dump input columns

    if return_file:
        return X, y

In [24]:
X, y = split_input_output()

Input shape  : (555, 8)
Output shape : (555,)
Input NAN    :
customer_id      0
product_id       0
purchase_date    0
category         0
price            0
ratings          0
page_views       0
time_spent       0
dtype: int64
Benchmark    :
category_encoded
3    0.320721
2    0.290090
1    0.225225
0    0.163964
Name: proportion, dtype: float64


In [25]:
CONFIG_DATA['input_set_path']

'data/output/input.pkl'

In [26]:
CONFIG_DATA['test_size']

0.2

In [27]:
def split_train_test(return_file=True):
    # Load data
    X = utils.pickle_load(CONFIG_DATA['input_set_path'])
    y = utils.pickle_load(CONFIG_DATA['output_set_path'])

    # Split test & rest (train & valid)
    X_train, X_test, y_train, y_test = train_test_split( 
                                            X,
                                            y,
                                            test_size = CONFIG_DATA['test_size'],
                                            random_state = CONFIG_DATA['seed']
                                        )
    
    # Split train & valid
    X_train, X_valid, y_train, y_valid = train_test_split(
                                            X_train,
                                            y_train,
                                            test_size = CONFIG_DATA['test_size'],
                                            random_state = CONFIG_DATA['seed']
                                        )
    
    # Print splitting
    print('X_train shape :', X_train.shape)
    print('y_train shape :', y_train.shape)
    print('X_valid shape  :', X_valid.shape)
    print('y_valid shape  :', y_valid.shape)
    print('X_test shape  :', X_test.shape)
    print('y_test shape  :', y_test.shape)

    # Dump file
    utils.pickle_dump(X_train, CONFIG_DATA['train_set_path'][0])
    utils.pickle_dump(y_train, CONFIG_DATA['train_set_path'][1])
    utils.pickle_dump(X_valid, CONFIG_DATA['valid_set_path'][0])
    utils.pickle_dump(y_valid, CONFIG_DATA['valid_set_path'][1])
    utils.pickle_dump(X_test, CONFIG_DATA['test_set_path'][0])
    utils.pickle_dump(y_test, CONFIG_DATA['test_set_path'][1])

    if return_file:
        return X_train, X_valid, X_test, y_train, y_valid, y_test

In [28]:
X_train, X_valid, X_test, y_train, y_valid, y_test = split_train_test()

X_train shape : (355, 8)
y_train shape : (355,)
X_valid shape  : (89, 8)
y_valid shape  : (89,)
X_test shape  : (111, 8)
y_test shape  : (111,)


Get sample testing

In [29]:
import numpy as np

In [30]:
y_test.unique()

array([0, 3, 1, 2])

In [31]:
np.random.seed(123)
y_sample_0 = y_test[y_test==0].sample(10)
y_sample_1 = y_test[y_test==1].sample(10)
y_sample_2 = y_test[y_test==2].sample(10)
y_sample_3 = y_test[y_test==3].sample(10)

y_sample = pd.concat((y_sample_0, y_sample_1, y_sample_2, y_sample_3), axis=0)
y_sample

148    0
6      0
84     0
316    0
88     0
235    0
82     0
182    0
363    0
0      0
353    1
497    1
192    1
398    1
447    1
55     1
468    1
261    1
257    1
471    1
222    2
30     2
434    2
22     2
436    2
334    2
101    2
137    2
90     2
238    2
374    3
513    3
485    3
548    3
73     3
376    3
172    3
63     3
543    3
68     3
Name: category_encoded, dtype: int64

In [32]:
X_sample = X_test.loc[y_sample.index]
X_sample

Unnamed: 0,customer_id,product_id,purchase_date,category,price,ratings,page_views,time_spent
148,16,71,2024-04-13,Beauty,990,2.2,35,140
6,16,89,2023-06-30,Beauty,381,3.8,9,206
84,16,101,2023-11-03,Beauty,983,3.8,9,206
316,13,82,2023-05-17,Beauty,677,4.9,27,174
88,16,101,2023-11-03,Beauty,983,3.8,1,71
235,17,101,2023-08-11,Beauty,983,3.8,17,290
82,16,101,2023-11-03,Beauty,983,3.8,24,111
182,10,45,2023-07-01,Beauty,23,3.6,15,294
363,5,89,2023-08-28,Beauty,381,3.8,1,60
0,16,89,2023-06-30,Beauty,381,3.8,1,242


In [33]:
X_sample.to_csv('data/output/X_sample.csv', index=False)

In [34]:
y_sample.to_csv('data/output/y_sample.csv', index=False)