In [89]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [90]:
# Absolute, machine-specific Windows path to the retail transactions CSV (raw string, so the backslashes stay literal).
# NOTE(review): the next cell rebinds this same name to the DataFrame returned by pd.read_csv, so the path is lost once that cell has run.
retail_dataset_path = r'C:\Users\Rheza\OneDrive - Telkom University\Machine learning\Minggu 7\Retail_Transactions_Dataset.csv'


In [91]:
# Load the CSV into a DataFrame; the name that held the file path now holds the loaded table.
retail_dataset_path = pd.read_csv(filepath_or_buffer=retail_dataset_path)

In [92]:
# Preview the first five rows as plain text
print(retail_dataset_path.head(n=5))

   Transaction_ID                 Date     Customer_Name  \
0      1000000000  2020-12-21 19:42:52   Cheyenne Newman   
1      1000000001  2020-07-06 07:45:16  Emily Fitzgerald   
2      1000000002  2021-10-02 06:28:44      Michael Webb   
3      1000000003  2022-01-10 05:39:02      Kimberly Lin   
4      1000000004  2021-10-13 07:28:47   Cathy Hernandez   

                                             Product  Total_Items  Total_Cost  \
0                                       ['Hair Gel']            6       12.77   
1         ['Tuna', 'Bread', 'Tissues', 'Trash Bags']            5       13.88   
2                         ['Jam', 'Soap', 'Ketchup']            7       47.02   
3                                      ['BBQ Sauce']            9       83.86   
4  ['Hand Sanitizer', 'Bread', 'Extension Cords',...            4       30.55   

   Payment_Method      City         Store_Type  Discount_Applied  \
0      Debit Card  New York  Convenience Store              True   
1      Debit Car

In [93]:
# Data preprocessing
# Make 'Store_Type', 'Date', 'Customer_Name' and 'Product' numeric while keeping
# their information. Passing these columns straight to
# pd.to_numeric(..., errors='coerce').fillna(0) turns every value into 0,
# because none of them holds numeric text.


def _text_to_codes(column):
    """Return integer codes for a text column: 1..K per distinct value, 0 for missing.

    A column that is already numeric is returned unchanged, so re-running the
    cell does not re-encode (and thereby alter) previously encoded values.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    codes, _ = pd.factorize(column)  # factorize marks missing values with -1
    return pd.Series(codes + 1, index=column.index, name=column.name)


def _timestamp_to_epoch_seconds(column):
    """Return seconds since 1970-01-01 for a timestamp column, 0 where unparseable.

    A column that is already numeric is returned unchanged (safe re-run).
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    parsed = pd.to_datetime(column, errors='coerce')  # unparseable -> NaT
    seconds = (parsed - pd.Timestamp('1970-01-01')).dt.total_seconds()
    return seconds.fillna(0)


retail_dataset_path['Date'] = _timestamp_to_epoch_seconds(retail_dataset_path['Date'])
for _text_column in ['Store_Type', 'Customer_Name', 'Product']:
    retail_dataset_path[_text_column] = _text_to_codes(retail_dataset_path[_text_column])

In [94]:
# One-hot encode the 'Promotion' column, dropping the first level
retail_dataset_path = pd.get_dummies(
    data=retail_dataset_path,
    columns=['Promotion'],
    drop_first=True,
)

In [95]:
# Keep only the four preprocessed columns, then discard any row that has a missing value
cols_to_use_retail = ['Store_Type', 'Date', 'Customer_Name', 'Product']
retail_dataset_path = retail_dataset_path.loc[:, cols_to_use_retail].dropna(how='any')

In [96]:
# Location of the dataset on disk
path_file_dataset = 'C:/Users/Rheza/OneDrive - Telkom University/Machine learning/Minggu 7/Retail_Transactions_Dataset.csv'  # Replace with the correct path

# Read the CSV into a fresh DataFrame
retail_dataset = pd.read_csv(path_file_dataset)

# Target (y) is 'Total_Items'; features (X) are every other column
y_retail = retail_dataset['Total_Items']
X_retail = retail_dataset.drop(columns=['Total_Items'])


In [97]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
(
    X_train_retail,
    X_valid_retail,
    y_train_retail,
    y_valid_retail,
) = train_test_split(
    X_retail,
    y_retail,
    random_state=42,
    test_size=0.2,
)

In [98]:
# Hyperparameter tuning using GridSearchCV
# Base estimator for the grid search: an XGBRegressor constructed with no arguments (library defaults).
model_grid = XGBRegressor()

In [99]:
# Candidate tree counts for the search: 100 to 300 in steps of 50
param_grid = {'n_estimators': list(range(100, 301, 50))}

In [100]:
# 5-fold grid search over param_grid, ranking candidates by negated mean absolute error
grid_search = GridSearchCV(
    model_grid,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
)

In [101]:
from sklearn.pipeline import Pipeline  # the only name used here that the first cell does not already import

# Derive Year/Month/Day from 'Date' and drop the raw column.
# The guard keeps the cell re-runnable (after the first run 'Date' no longer
# exists), and assign()/drop() build a new frame instead of writing columns
# into the object returned by train_test_split, which avoids
# SettingWithCopyWarning.
if 'Date' in X_train_retail.columns:
    train_dates = pd.to_datetime(X_train_retail['Date'])
    X_train_retail = X_train_retail.assign(
        Year=train_dates.dt.year,
        Month=train_dates.dt.month,
        Day=train_dates.dt.day,
    ).drop(columns='Date')

# Columns to one-hot encode
categorical_columns = ['Customer_Name', 'Product', 'Payment_Method', 'City', 'Store_Type', 'Customer_Category', 'Season', 'Promotion']

# One-hot encode the categorical columns; all remaining columns pass through unchanged
column_transformer = ColumnTransformer(
    [("encoder", OneHotEncoder(handle_unknown='ignore'), categorical_columns)],
    remainder='passthrough')

# Preprocessing and model in one pipeline, so the encoder is refit inside each CV fold
pipeline = Pipeline([
    ('transformer', column_transformer),
    ('model', XGBRegressor())
])

# Search space; the 'model__' prefix routes each parameter to the pipeline's 'model' step
param_grid = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.01, 0.1],
    'model__max_depth': [3, 5]
}

# NOTE(review): no `scoring` is passed here, so candidates are ranked by the
# estimator's default score instead of the 'neg_mean_absolute_error' used by
# the earlier GridSearchCV cell - confirm which metric is intended.
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

# Fit the 2 x 2 x 2 = 8 candidates with 5-fold cross-validation
grid_search.fit(X_train_retail, y_train_retail)


In [102]:
# Read the winning learning rate and tree depth from the fitted grid search
best_learning_rate, best_max_depth = (
    grid_search.best_params_[param_name]
    for param_name in ('model__learning_rate', 'model__max_depth')
)

print("Best learning_rate:", best_learning_rate)
print("Best max_depth:", best_max_depth)


Best learning_rate: 0.01
Best max_depth: 3


In [103]:
# Tree counts to try: five values starting at 100, 50 apart
n_estimators_list = [100 + 50 * step for step in range(5)]

In [104]:
# Columns intended to be treated as categorical
categorical_columns = [
    'Date', 'Customer_Name', 'Product', 'Payment_Method',
    'City', 'Store_Type', 'Customer_Category', 'Season', 'Promotion',
    'Year', 'Day', 'Month',
]

# Collect, in list order, every expected column that the training frame lacks
missing_columns = []
for expected_name in categorical_columns:
    if expected_name not in X_train_retail.columns:
        missing_columns.append(expected_name)

if not missing_columns:
    print("Semua kolom tersedia.")
else:
    print("Kolom berikut tidak ada dalam DataFrame: ", missing_columns)


Kolom berikut tidak ada dalam DataFrame:  ['Date']


In [105]:
# Show the column index of the training frame, then of the validation frame
for _label, _frame in (
    ("Columns in X_train_retail:", X_train_retail),
    ("Columns in X_valid_retail:", X_valid_retail),
):
    print(_label, _frame.columns)


Columns in X_train_retail: Index(['Transaction_ID', 'Customer_Name', 'Product', 'Total_Cost',
       'Payment_Method', 'City', 'Store_Type', 'Discount_Applied',
       'Customer_Category', 'Season', 'Promotion', 'Year', 'Month', 'Day'],
      dtype='object')
Columns in X_valid_retail: Index(['Transaction_ID', 'Date', 'Customer_Name', 'Product', 'Total_Cost',
       'Payment_Method', 'City', 'Store_Type', 'Discount_Applied',
       'Customer_Category', 'Season', 'Promotion'],
      dtype='object')


In [106]:
# Derive Year/Month/Day from the validation set's 'Date' column and drop the raw column.
# The guard makes the cell safe to re-run: once 'Date' is gone, nothing happens.
if 'Date' in X_valid_retail.columns:
    valid_dates = pd.to_datetime(X_valid_retail['Date'])
    # assign()/drop() return a new frame instead of writing columns into the
    # object returned by train_test_split, which avoids SettingWithCopyWarning
    # and the inplace=True mutation.
    X_valid_retail = X_valid_retail.assign(
        Year=valid_dates.dt.year,
        Month=valid_dates.dt.month,
        Day=valid_dates.dt.day,
    ).drop(columns='Date')


In [107]:


# Columns to one-hot encode
categorical_columns = [
    'Customer_Name', 'Product', 'Payment_Method', 'City',
    'Store_Type', 'Customer_Category', 'Season', 'Promotion',
]

# Encoder configured with handle_unknown='ignore'
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# Apply the encoder to the categorical columns and pass the remaining columns through
column_transformer = ColumnTransformer(
    transformers=[("encoder", onehot_encoder, categorical_columns)],
    remainder='passthrough',
)

# Fit on the training frame only, then reuse the fitted transformer on the validation frame
X_train_retail_encoded = column_transformer.fit_transform(X_train_retail)
X_valid_retail_encoded = column_transformer.transform(X_valid_retail)


In [108]:
from sklearn.metrics import mean_squared_error  # not imported by the first cell

# Initialize the XGBRegressor model
# NOTE(review): learning_rate=0.1 and the default max_depth are used here,
# while the grid search output above reports learning_rate=0.01 and
# max_depth=3 as best - confirm whether the tuned values should be used.
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1)

# Train on the encoded training data
xgb_model.fit(X_train_retail_encoded, y_train_retail)

# Predict on the encoded validation data
y_pred = xgb_model.predict(X_valid_retail_encoded)

# Root Mean Squared Error, computed as sqrt(MSE). The `squared=False` argument
# of mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly works on every version.
rmse = mean_squared_error(y_valid_retail, y_pred) ** 0.5
print(f"Root Mean Squared Error: {rmse}")


Root Mean Squared Error: 2.8826685205649634
