# **Kelompok 7 - MSE**
- Harianto             [2231110]
- Jefriyanto Chandra   [2231067]
- Bryan Kenedy         [2231141]
- Gary Happydinata     [2231152]
- Randy Heskyel        [2231149]
<br>
<i>Machine Learning project for predicting user's purchase quantity</i>

# **Import Libraries**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from joblib import dump, load
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from datetime import datetime

# **Data Collection**

In [None]:
# load the raw dataset once
data = pd.read_csv('example_data.csv')

# keep an untouched copy so the scaled predictions of y can be reverted to
# their real values later (copying avoids reading the same file a second time)
unscaled_data = data.copy()

data.head()

# **Data Preparation**

## EDA (Exploratory Data Analysis)

In [None]:
data.info()

In [None]:
# check dataset columns
data.columns

In [None]:
# check dataset column data type
data.dtypes

In [None]:
# show dataset statistics
data.describe()

In [None]:
data.isnull().sum()

In [None]:
# check `Harga Penjualan` distribution
sns.histplot(data['Harga Penjualan'], kde=True)
plt.title('Harga Penjualan Distribution ')
plt.show()

In [None]:
# check `Kuantitas` boxplot
sns.boxplot(x=data['Kuantitas'])
plt.title('Kuantitas Boxplot ')
plt.show()

In [None]:
# numeric variables correlations
corr_matrix = data[['Umur', 'Kuantitas', 'Harga Penjualan']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='Blues')
plt.title('Numeric Variables Correlations')
plt.show()

In [None]:
# sales trend by `Tanggal`
data['Tanggal'] = pd.to_datetime(data['Tanggal'])
data.set_index('Tanggal').resample('D')['Harga Penjualan'].sum().plot()
plt.title('Daily Sales Trend')
plt.show()

In [None]:
# `Metode Pembayaran` distribution
sns.countplot(x='Metode Pembayaran', data=data)
plt.title('Metode Pembayaran Distribution')
plt.show()

In [None]:
# Correlation between `Jenis Produk` and `Kuantitas`
sns.boxplot(x='Jenis', y='Kuantitas', data=data)
plt.title('Kuantitas by Jenis Produk')
plt.show()

## Data Cleaning

In [None]:
# concatenate `Nama` and `Jenis` column first
data['Nama'] = data['Jenis'] + ' ' + data['Nama']

# delete `Jenis` column
data = data.drop(['Jenis'], axis=1)

# delete `Nomor` and `Harga Penjualan` columns
data = data.drop(['Nomor', 'Harga Penjualan'], axis=1)

data.info()

In [None]:
# drop null values
data.dropna(inplace=True)

In [None]:
data.info()

In [None]:
# check duplicated data
data.duplicated().sum()

## Data normalization

In [None]:
# parse `Tanggal` into datetime, then replace it with the number of whole days
# elapsed since the earliest date in the dataset so the column becomes numeric
# NOTE(review): the offset is relative to this dataset's minimum date; anything
# that encodes new rows for the trained models needs the same reference date
data['Tanggal'] = pd.to_datetime(data['Tanggal'])
data['Tanggal'] = (data['Tanggal'] - data['Tanggal'].min()).dt.days
data.head()

In [None]:
# categorical columns that each get their own label encoder
categorical_columns = ['Hari', 'Nama', 'Kode', 'Unit', 'Metode Pembayaran']

# one fresh encoder per column, keyed by the column name
le = {column: LabelEncoder() for column in categorical_columns}

# fit every encoder on the string form of its column and keep the integer codes
for column in categorical_columns:
    encoder = le[column]
    data[column] = encoder.fit_transform(data[column].astype(str))

data.head()

In [None]:
# transform the numerical columns with min-max scalers; `Umur` and `Kuantitas`
# get separate scalers so predicted `Kuantitas` values can be inverse-transformed
# on their own later
# NOTE(review): both scalers are fitted on the full dataset, before the
# train/test split further down — confirm that this leakage into the test set
# is acceptable
umur_scaler = MinMaxScaler()
kuantitas_scaler = MinMaxScaler()
data['Umur'] = umur_scaler.fit_transform(data['Umur'].values.reshape(-1, 1))
data['Kuantitas'] = kuantitas_scaler.fit_transform(data['Kuantitas'].values.reshape(-1, 1))

data.head()

In [None]:
# dump encoders and scalers so they can be reloaded at prediction time
import os

# `dump` does not create missing parent folders, so make sure they exist first
os.makedirs('encoders', exist_ok=True)
os.makedirs('scalers', exist_ok=True)

dump(le, 'encoders/le_encoders.pkl')
dump(umur_scaler, 'scalers/umur_scaler.pkl')
dump(kuantitas_scaler, 'scalers/kuantitas_scaler.pkl')

## Split X and Y data

In [None]:
# drop the prediction target `Kuantitas` from X
X = data.drop(['Kuantitas'], axis=1)

# assign `Kuantitas` into y
y = data['Kuantitas']

In [None]:
X

In [None]:
y

In [None]:
# split the train and test data (80:20, as set by test_size=0.2);
# random_state is fixed so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'X_train data length: {len(X_train)}')
print(f'X_test data length: {len(X_test)}')
print(f'y_train data length: {len(y_train)}')
print(f'y_test data length: {len(y_test)}')
# number of rows and number of feature columns in the training split
n_samples_train, n_features_train = X_train.shape
n_samples_train, n_features_train

# **Model Training**

In [None]:
# helper function to compute the adjusted R-squared score
def adjusted_r_squared(y_test, y_pred, n_samples, n_features):
    """Return the adjusted R-squared of `y_pred` against `y_test`.

    The plain R-squared from `r2_score` is penalised for the number of
    features used:

        1 - (1 - R2) * (n_samples - 1) / (n_samples - n_features - 1)

    `n_samples` and `n_features` describe the data set the predictions
    were made on (rows and feature columns respectively).
    """
    plain_r2 = r2_score(y_test, y_pred)
    numerator = (1 - plain_r2) * (n_samples - 1)
    denominator = n_samples - n_features - 1
    return 1 - numerator / denominator

# n_samples and n_features for the train set
n_samples_train, n_features_train = X_train.shape

# n_samples and n_features for the test set
n_samples_test, n_features_test = X_test.shape

## Linear Regression

In [None]:
# initialize
lr = LinearRegression()

# train the model
lr.fit(X_train, y_train)

In [None]:
# Linear Regression predict
lr_y_pred_train = lr.predict(X_train)

# evaluate
lr_y_pred_train_mse = mean_squared_error(y_train, lr_y_pred_train)
lr_y_pred_train_mae = mean_absolute_error(y_train, lr_y_pred_train)
lr_y_pred_train_r2 = r2_score(y_train, lr_y_pred_train)
lr_y_pred_train_adjusted_r2 = adjusted_r_squared(y_train, lr_y_pred_train, n_samples_train, n_features_train)

# print
print("Linear Regression Evaluation Metrics")
print(f"Mean Squared Error (MSE):   {lr_y_pred_train_mse:.3f}")
print(f"Mean Absolute Error (MAE):  {lr_y_pred_train_mae:.3f}")
print(f"R-squared Score (R2):       {lr_y_pred_train_r2:.3f}")
print(f"Adjusted R-squared Score:   {lr_y_pred_train_adjusted_r2:.3f}")

In [None]:
# plot actual vs predicted 
plt.figure(figsize=(10, 6))
plt.scatter(y_train, lr_y_pred_train, c='crimson')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Actual Value')
plt.ylabel('Predicted Value')
plt.title('LR Train Actual vs Predicted')
plt.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'b-')
plt.axis('equal')
plt.show()

## XGB Regression

In [None]:
# initialize
xgb = XGBRegressor()

# train the model
xgb.fit(X_train, y_train)

In [None]:
# XGB Regression predict
xgb_y_pred_train = xgb.predict(X_train)

# evaluate
xgb_y_pred_train_mse = mean_squared_error(y_train, xgb_y_pred_train)
xgb_y_pred_train_mae = mean_absolute_error(y_train, xgb_y_pred_train)
xgb_y_pred_train_r2 = r2_score(y_train, xgb_y_pred_train)
xgb_y_pred_train_adjusted_r2 = adjusted_r_squared(y_train, xgb_y_pred_train, n_samples_train, n_features_train)

# print
print("XGB Regression Evaluation Metrics")
print(f"Mean Squared Error (MSE):   {xgb_y_pred_train_mse:.3f}")
print(f"Mean Absolute Error (MAE):  {xgb_y_pred_train_mae:.3f}")
print(f"R-squared Score (R2):       {xgb_y_pred_train_r2:.3f}")
print(f"Adjusted R-squared Score:   {xgb_y_pred_train_adjusted_r2:.3f}")

In [None]:
# plot actual vs predicted 
plt.figure(figsize=(10, 6))
plt.scatter(y_train, xgb_y_pred_train, c='crimson')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Actual Value')
plt.ylabel('Predicted Value')
plt.title('XGB Train Actual vs Predicted')
plt.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'b-')
plt.axis('equal')
plt.show()

## Random Forest Regression

In [None]:
# initialize
rf = RandomForestRegressor(n_estimators=100, random_state=100)

# train the model
rf.fit(X_train, y_train)

In [None]:
# Random Forest Regression predict
rf_y_pred_train = rf.predict(X_train)

# evaluate
rf_y_pred_train_mse = mean_squared_error(y_train, rf_y_pred_train)
rf_y_pred_train_mae = mean_absolute_error(y_train, rf_y_pred_train)
rf_y_pred_train_r2 = r2_score(y_train, rf_y_pred_train)
rf_y_pred_train_adjusted_r2 = adjusted_r_squared(y_train, rf_y_pred_train, n_samples_train, n_features_train)

# print
print("Random Forest Regression Evaluation Metrics")
print(f"Mean Squared Error (MSE):   {rf_y_pred_train_mse:.3f}")
print(f"Mean Absolute Error (MAE):  {rf_y_pred_train_mae:.3f}")
print(f"R-squared Score (R2):       {rf_y_pred_train_r2:.3f}")
print(f"Adjusted R-squared Score:   {rf_y_pred_train_adjusted_r2:.3f}")

In [None]:
# plot actual vs predicted 
plt.figure(figsize=(10, 6))
plt.scatter(y_train, rf_y_pred_train, c='crimson')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Actual Value')
plt.ylabel('Predicted Value')
plt.title('RF Train Actual vs Predicted')
plt.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'b-')
plt.axis('equal')
plt.show()

## GBR Regression

In [None]:
# initialize
gbr = GradientBoostingRegressor()

# train the model
gbr.fit(X_train, y_train)

In [None]:
# GBR Regression predict
gbr_y_pred_train = gbr.predict(X_train)

# evaluate
gbr_y_pred_train_mse = mean_squared_error(y_train, gbr_y_pred_train)
gbr_y_pred_train_mae = mean_absolute_error(y_train, gbr_y_pred_train)
gbr_y_pred_train_r2 = r2_score(y_train, gbr_y_pred_train)
gbr_y_pred_train_adjusted_r2 = adjusted_r_squared(y_train, gbr_y_pred_train, n_samples_train, n_features_train)

# print
print("GBR Regression Evaluation Metrics")
print(f"Mean Squared Error (MSE):   {gbr_y_pred_train_mse:.3f}")
print(f"Mean Absolute Error (MAE):  {gbr_y_pred_train_mae:.3f}")
print(f"R-squared Score (R2):       {gbr_y_pred_train_r2:.3f}")
print(f"Adjusted R-squared Score:   {gbr_y_pred_train_adjusted_r2:.3f}")

In [None]:
# plot actual vs predicted 
plt.figure(figsize=(10, 6))
plt.scatter(y_train, gbr_y_pred_train, c='crimson')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Actual Value')
plt.ylabel('Predicted Value')
plt.title('GBR Train Actual vs Predicted')
plt.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'b-')
plt.axis('equal')
plt.show()

## SVM Regression

In [None]:
# initialize
svr = SVR()

# train the model
svr.fit(X_train, y_train)

In [None]:
# SVM Regression predict
svm_y_pred_train = svr.predict(X_train)

# evaluate
svm_y_pred_train_mse = mean_squared_error(y_train, svm_y_pred_train)
svm_y_pred_train_mae = mean_absolute_error(y_train, svm_y_pred_train)
svm_y_pred_train_r2 = r2_score(y_train, svm_y_pred_train)
svm_y_pred_train_adjusted_r2 = adjusted_r_squared(y_train, svm_y_pred_train, n_samples_train, n_features_train)

# print
print("SVM Regression Evaluation Metrics")
print(f"Mean Squared Error (MSE):   {svm_y_pred_train_mse:.3f}")
print(f"Mean Absolute Error (MAE):  {svm_y_pred_train_mae:.3f}")
print(f"R-squared Score (R2):       {svm_y_pred_train_r2:.3f}")
print(f"Adjusted R-squared Score:   {svm_y_pred_train_adjusted_r2:.3f}")

In [None]:
# plot actual vs predicted 
plt.figure(figsize=(10, 6))
plt.scatter(y_train, svm_y_pred_train, c='crimson')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Actual Value')
plt.ylabel('Predicted Value')
plt.title('SVM Train Actual vs Predicted')
plt.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'b-')
plt.axis('equal')
plt.show()

# **Save The Trained Model**

In [None]:
# save all the models
import os

# `dump` does not create missing parent folders, so make sure `models/` exists
os.makedirs('models', exist_ok=True)

dump(lr, 'models/LinearRegression.joblib')
dump(xgb, 'models/XGBRegression.joblib')
dump(rf, 'models/RandomForestRegression.joblib')
dump(gbr, 'models/GBRRegression.joblib')
dump(svr, 'models/SVRRegression.joblib')

# **Test The Models with Testing Data**

## Linear Regression

In [None]:
#  load the saved model
lr_loaded = load('models/LinearRegression.joblib')

In [None]:
# predict
lr_y_pred_test = lr_loaded.predict(X_test)

# evaluate
lr_y_pred_test_mse = mean_squared_error(y_test, lr_y_pred_test)
lr_y_pred_test_mae = mean_absolute_error(y_test, lr_y_pred_test)
lr_y_pred_test_r2 = r2_score(y_test, lr_y_pred_test)
lr_y_pred_test_adjusted_r2 = adjusted_r_squared(y_test, lr_y_pred_test, n_samples_test, n_features_test)

# print
print("Linear Regression Test Evaluation Metrics")
print(f"Mean Squared Error (MSE):   {lr_y_pred_test_mse:.3f}")
print(f"Mean Absolute Error (MAE):  {lr_y_pred_test_mae:.3f}")
print(f"R-squared Score (R2):       {lr_y_pred_test_r2:.3f}")
print(f"Adjusted R-squared Score:   {lr_y_pred_test_adjusted_r2:.3f}")

In [None]:
# plot actual vs predicted 
plt.figure(figsize=(10, 6))
plt.scatter(y_test, lr_y_pred_test, c='crimson')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Actual Value')
plt.ylabel('Predicted Value')
plt.title('LR Test Actual vs Predicted')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'b-')
plt.axis('equal')
plt.show()

## XGB Regression

In [None]:
#  load the saved model
xgb_loaded = load('models/XGBRegression.joblib')

In [None]:
# predict
xgb_y_pred_test = xgb_loaded.predict(X_test)

# evaluate
xgb_y_pred_test_mse = mean_squared_error(y_test, xgb_y_pred_test)
xgb_y_pred_test_mae = mean_absolute_error(y_test, xgb_y_pred_test)
xgb_y_pred_test_r2 = r2_score(y_test, xgb_y_pred_test)
xgb_y_pred_test_adjusted_r2 = adjusted_r_squared(y_test, xgb_y_pred_test, n_samples_test, n_features_test)

# print
print("XGB Regression Test Evaluation Metrics")
print(f"Mean Squared Error (MSE):   {xgb_y_pred_test_mse:.3f}")
print(f"Mean Absolute Error (MAE):  {xgb_y_pred_test_mae:.3f}")
print(f"R-squared Score (R2):       {xgb_y_pred_test_r2:.3f}")
print(f"Adjusted R-squared Score:   {xgb_y_pred_test_adjusted_r2:.3f}")

In [None]:
# plot actual vs predicted 
plt.figure(figsize=(10, 6))
plt.scatter(y_test, xgb_y_pred_test, c='crimson')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Actual Value')
plt.ylabel('Predicted Value')
plt.title('XGB Test Actual vs Predicted')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'b-')
plt.axis('equal')
plt.show()

## Random Forest Regression

In [None]:
#  load the saved model
rf_loaded = load('models/RandomForestRegression.joblib')

In [None]:
# predict
rf_y_pred_test = rf_loaded.predict(X_test)

# evaluate
rf_y_pred_test_mse = mean_squared_error(y_test, rf_y_pred_test)
rf_y_pred_test_mae = mean_absolute_error(y_test, rf_y_pred_test)
rf_y_pred_test_r2 = r2_score(y_test, rf_y_pred_test)
rf_y_pred_test_adjusted_r2 = adjusted_r_squared(y_test, rf_y_pred_test, n_samples_test, n_features_test)

# print
print("Random Forest Regression Test Evaluation Metrics")
print(f"Mean Squared Error (MSE):   {rf_y_pred_test_mse:.3f}")
print(f"Mean Absolute Error (MAE):  {rf_y_pred_test_mae:.3f}")
print(f"R-squared Score (R2):       {rf_y_pred_test_r2:.3f}")
print(f"Adjusted R-squared Score:   {rf_y_pred_test_adjusted_r2:.3f}")

In [None]:
# plot actual vs predicted 
plt.figure(figsize=(10, 6))
plt.scatter(y_test, rf_y_pred_test, c='crimson')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Actual Value')
plt.ylabel('Predicted Value')
plt.title('RF Test Actual vs Predicted')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'b-')
plt.axis('equal')
plt.show()

## GBR Regression

In [None]:
#  load the saved model
gbr_loaded = load('models/GBRRegression.joblib')

In [None]:
# predict
gbr_y_pred_test = gbr_loaded.predict(X_test)

# evaluate
gbr_y_pred_test_mse = mean_squared_error(y_test, gbr_y_pred_test)
gbr_y_pred_test_mae = mean_absolute_error(y_test, gbr_y_pred_test)
gbr_y_pred_test_r2 = r2_score(y_test, gbr_y_pred_test)
gbr_y_pred_test_adjusted_r2 = adjusted_r_squared(y_test, gbr_y_pred_test, n_samples_test, n_features_test)

# print
print("GBR Regression Test Evaluation Metrics")
print(f"Mean Squared Error (MSE):   {gbr_y_pred_test_mse:.3f}")
print(f"Mean Absolute Error (MAE):  {gbr_y_pred_test_mae:.3f}")
print(f"R-squared Score (R2):       {gbr_y_pred_test_r2:.3f}")
print(f"Adjusted R-squared Score:   {gbr_y_pred_test_adjusted_r2:.3f}")

In [None]:
# plot actual vs predicted 
plt.figure(figsize=(10, 6))
plt.scatter(y_test, gbr_y_pred_test, c='crimson')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Actual Value')
plt.ylabel('Predicted Value')
plt.title('GBR Test Actual vs Predicted')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'b-')
plt.axis('equal')
plt.show()

## SVR Regression

In [None]:
#  load the saved model
svr_loaded = load('models/SVRRegression.joblib')

In [None]:
# predict
svr_y_pred_test = svr_loaded.predict(X_test)

# evaluate
svr_y_pred_test_mse = mean_squared_error(y_test, svr_y_pred_test)
svr_y_pred_test_mae = mean_absolute_error(y_test, svr_y_pred_test)
svr_y_pred_test_r2 = r2_score(y_test, svr_y_pred_test)
svr_y_pred_test_adjusted_r2 = adjusted_r_squared(y_test, svr_y_pred_test, n_samples_test, n_features_test)

# print
print("SVR Regression Test Evaluation Metrics")
print(f"Mean Squared Error (MSE):   {svr_y_pred_test_mse:.3f}")
print(f"Mean Absolute Error (MAE):  {svr_y_pred_test_mae:.3f}")
print(f"R-squared Score (R2):       {svr_y_pred_test_r2:.3f}")
print(f"Adjusted R-squared Score:   {svr_y_pred_test_adjusted_r2:.3f}")

In [None]:
# plot actual vs predicted 
plt.figure(figsize=(10, 6))
plt.scatter(y_test, svr_y_pred_test, c='crimson')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Actual Value')
plt.ylabel('Predicted Value')
plt.title('SVR Test Actual vs Predicted')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'b-')
plt.axis('equal')
plt.show()

# **Create Function For Predict Using Chosen Models**

In [None]:
def _training_reference_date(csv_path='example_data.csv'):
    """Return the earliest `Tanggal` among the rows kept by the data cleaning."""
    raw = pd.read_csv(csv_path)
    # `Nomor` and `Harga Penjualan` are removed before `dropna` during cleaning,
    # so nulls in those two columns must not discard rows here either
    kept = raw.drop(columns=['Nomor', 'Harga Penjualan'], errors='ignore').dropna()
    return pd.to_datetime(kept['Tanggal']).min()


def predict(nama, umur, kode, metode_pembayaran, tanggal, unit, reference_date=None):
    """Predict the purchase quantity of one transaction with the saved Random Forest.

    The inputs are encoded with the label encoders and scalers dumped during
    training, passed to the model, and the scaled prediction is converted back
    to a real quantity.

    Parameters
    ----------
    nama : str
        Product name, in the combined "<Jenis> <Nama>" form used for training.
    umur : int or float
        Customer age (unscaled).
    kode : str or int
        Product code; cast to `str` before encoding.
    metode_pembayaran : str
        Payment method label.
    tanggal : str
        Transaction date formatted as "%m/%d/%Y".
    unit : str
        Unit label.
    reference_date : str or datetime-like, optional
        Date that corresponds to day 0 of the `Tanggal` feature. When omitted
        it is derived from the training CSV (earliest date among cleaned rows).

    Returns
    -------
    int
        Predicted quantity, rounded to the nearest integer.

    Raises
    ------
    ValueError
        If a categorical value was not seen when the encoders were fitted, or
        if `tanggal` does not match the expected date format.
    """
    # get day by date
    date_object = datetime.strptime(tanggal, "%m/%d/%Y")
    hari = date_object.strftime("%A")

    le_loaded = load('encoders/le_encoders.pkl')
    umur_scaler_loaded = load('scalers/umur_scaler.pkl')
    kuantitas_scaler_loaded = load('scalers/kuantitas_scaler.pkl')

    # check `unseen labels`
    def transform_label(column, value):
        encoder = le_loaded[column]
        # the encoders were fitted on string-cast columns, so compare as str
        label = str(value)
        if label not in encoder.classes_:
            raise ValueError(f"Unseen label '{value}' encountered in column '{column}'")
        return encoder.transform([label])[0]

    # `Tanggal` was trained as "days since the earliest training date".
    # Subtracting the minimum of this one-row frame would always give 0, so the
    # offset is measured from the training reference date instead.
    if reference_date is None:
        reference_date = _training_reference_date()
    days_since_reference = (pd.Timestamp(date_object) - pd.to_datetime(reference_date)).days

    df = pd.DataFrame({
        'Umur': [umur],
        'Tanggal': [days_since_reference],
        'Hari': [transform_label('Hari', hari)],
        'Nama': [transform_label('Nama', nama)],
        'Kode': [transform_label('Kode', kode)],
        'Unit': [transform_label('Unit', unit)],
        'Metode Pembayaran': [transform_label('Metode Pembayaran', metode_pembayaran)],
    })

    df['Umur'] = umur_scaler_loaded.transform(df['Umur'].values.reshape(-1, 1))

    rf_loaded_model = load('models/RandomForestRegression.joblib')

    # present the features in the order the model was fitted with, when the
    # model recorded it, rather than relying on the literal order above
    feature_names = getattr(rf_loaded_model, 'feature_names_in_', None)
    if feature_names is not None:
        df = df[list(feature_names)]

    rf_loaded_pred_scaled = rf_loaded_model.predict(df)
    # undo the min-max scaling applied to `Kuantitas` before training
    rf_loaded_pred = kuantitas_scaler_loaded.inverse_transform(rf_loaded_pred_scaled.reshape(-1, 1))
    return int(round(rf_loaded_pred[0][0], 0))

In [None]:
name = 'Bag Fashion'
age = 47
code = '2930'
payment_method = 'Cashless'
date = '12/3/2023'
unit = 'pcs'

test_qty_pred = predict(name, age, code, payment_method, date, unit)
print(f"Prediction purchase quantity: {test_qty_pred} {unit}")