# Model Inference

In [2]:
import pymupdf
import pickle
import pandas as pd

import os
import sys
sys.path.append(os.path.abspath("../"))
from src.helper import calculate_cmyk_percentage

import time
from PIL import Image
import io

In [18]:

class PrintCost:
    """Predict a per-page print price for a PDF from per-page colour statistics.

    Every page is rasterized, reduced to four values by
    ``calculate_cmyk_percentage`` (unpacked here as ``c, m, y, k``), and the
    derived ``cmy`` / ``k`` / ``cmyk`` columns are fed to a pickled model. The
    labels the model returns are then translated into prices.

    Attributes:
        file_path: Path of the PDF to analyse.
        model_pkl_path: Path of the pickled model used by ``predict``.
        df_dict: Per-instance column -> list accumulator, one entry per page.
        df: DataFrame built by the most recent ``extract_cmyk`` call (only
            set once ``extract_cmyk`` or ``predict`` has run).
    """

    # Names of the columns accumulated for every page.
    _COLUMNS = ('c', 'm', 'y', 'k', 'cmy', 'cmyk')

    # Translation from the label predicted by the model to a price.
    _PRICE_BY_LABEL = {0: 500, 1: 750, 2: 1000, 3: 1500, 4: 2000}

    def __init__(self, file_path: str, model_pkl_path: str) -> None:
        """Store the input paths and start with an empty accumulator.

        Args:
            file_path: Path of the PDF to analyse.
            model_pkl_path: Path of the pickled model.
        """
        self.file_path = file_path
        self.model_pkl_path = model_pkl_path
        # Give each instance its own lists; a dict defined on the class would
        # be shared (and mutated) by every instance.
        self._reset_df_dict()

    def extract_cmyk(self, dpi: int) -> pd.DataFrame:
        """Rasterize every page and collect its colour statistics.

        Args:
            dpi: Resolution passed to ``page.get_pixmap`` for rasterization.

        Returns:
            A DataFrame with one row per page and the columns
            ``c, m, y, k, cmy, cmyk``. It is also stored on ``self.df``.
        """
        self._reset_df_dict()

        # The context manager closes the document even if a page fails.
        with pymupdf.open(self.file_path) as pdf:
            for page in pdf:
                pixmap = page.get_pixmap(dpi=dpi)
                # Hand the rendered page to PIL via its encoded image bytes.
                img = Image.open(io.BytesIO(pixmap.tobytes()))

                cmyk = calculate_cmyk_percentage(img)
                self._df_dict_appender(cmyk)

        self.df = pd.DataFrame(self.df_dict)
        return self.df

    def _reset_df_dict(self) -> None:
        """Replace ``self.df_dict`` with fresh, empty per-column lists."""
        self.df_dict = {column: [] for column in self._COLUMNS}

    def _df_dict_appender(self, cmyk) -> None:
        """Append one page's values to the accumulator.

        Args:
            cmyk: An iterable of exactly four values, unpacked as
                ``c, m, y, k``.
        """
        c, m, y, k = cmyk
        cmy = c + m + y
        self.df_dict['c'].append(c)
        self.df_dict['m'].append(m)
        self.df_dict['y'].append(y)
        self.df_dict['k'].append(k)
        self.df_dict['cmy'].append(cmy)
        self.df_dict['cmyk'].append(cmy + k)

    def predict(self, dpi: int = 7) -> pd.Series:
        """Extract the page features and predict a price for every page.

        Args:
            dpi: Rasterization resolution forwarded to ``extract_cmyk``.
                Defaults to 7; presumably this has to match the resolution
                the model was trained with -- confirm before changing it.

        Returns:
            The ``price`` column of ``self.df``: one price per page.
        """
        self.extract_cmyk(dpi=dpi)

        # SECURITY: unpickling can execute arbitrary code. Only load model
        # files that come from a trusted source.
        with open(self.model_pkl_path, 'rb') as model_file:
            model = pickle.load(model_file)

        y_pred = model.predict(self.df[['cmy', 'k', 'cmyk']])
        self.df['price'] = y_pred
        # Labels without an entry in the table are left unchanged by replace().
        self.df['price'] = self.df['price'].replace(self._PRICE_BY_LABEL)
        return self.df['price']
        
    

In [31]:
# Time the full run: page rasterization, feature extraction and prediction.
start = time.time()
pdf_path = "../datasets/statistik-indonesia-2024-combined.pdf"
model_path = "../models/xgboost_98.64_cmy_k_cmyk_7_dpi.pkl"

pc = PrintCost(pdf_path, model_path)
y_pred = pc.predict()
pdf_length = len(y_pred)  # predict() yields one price per page

end = time.time()

elapsed = end - start
pages_per_second = round(pdf_length / elapsed)
# Format with "," thousands separators, then switch them to "." for display.
total_price = f"{y_pred.sum():,}".replace(',', '.')

# Report labels are Indonesian: page count, prediction time, price.
print(f"Jumlah Halaman\t: {pdf_length}")
print(f"Waktu Prediksi\t: {elapsed:.2f} detik ({pages_per_second} halaman/detik)")
print(f"Harga\t\t: Rp{total_price}")

Jumlah Halaman	: 884
Waktu Prediksi	: 8.13 detik (109 halaman/detik)
Harga		: Rp1.017.250


In [35]:
# Tabulate how many pages were assigned each predicted price.
y_pred.value_counts().to_frame()

Unnamed: 0_level_0,count
price,Unnamed: 1_level_1
1500,436
1000,194
500,180
750,55
2000,19
