In [1]:
import os

In [2]:
%pwd

'/home/rohwid/Pacmann/lazada-id-reviews/notebooks'

In [3]:
# Run everything from the project's main directory so that relative paths
# (such as `.env`) resolve from there.
# Only step up while the kernel is still inside `notebooks/`: re-running this
# cell must not climb one more level each time.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
# Load KEY=VALUE pairs from the project's `.env` file into the environment.
with open('.env') as f:
    os.environ.update(
        # Split on the first '=' only, so a value that itself contains '='
        # (for example a URL with a query string) stays intact.
        line.strip().split('=', 1)
        for line in f
        # Blank lines, comments and anything else without '=' cannot form a
        # (key, value) pair, so they are skipped instead of raising.
        if '=' in line and not line.lstrip().startswith('#')
    )

In [5]:
%pwd

'/home/rohwid/Pacmann/lazada-id-reviews'

### Predict Config

This code will be applied in `src/LazadaIDReviews/entity/config_entity.py`.

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class PredictionConfig:
    """Immutable settings for the prediction step."""

    # Local directory used by the prediction step
    root_dir: Path
    # URI of the MLflow tracking server
    mlflow_tracking_uri: str
    # Name of the registered model in MLflow
    mlflow_model_name: str
    # Alias of the model version to use; it is a plain string, not a path
    mlflow_deploy_model_alias: str
    # Path of the vectorizer inside the MLflow run artifacts
    mlflow_vectorizer_model_path: Path

    # for development (debug)
    input_test_path: Path
    output_test_path: Path

### Predict Config Manager

This code will be applied in `src/LazadaIDReviews/config/configurations.py`.

In [7]:
from LazadaIDReviews.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from LazadaIDReviews.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Read the project YAML files and hand out config entities."""

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        # Keep the parsed YAML contents around for the getter below.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Create the artifacts root directory named in the config file.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_prediction_config(self) -> PredictionConfig:
        """Build the settings used by the prediction step.

        Values come from the `predict` and `dump_data` sections of the
        config file and from the `MLFLOW_TRACKING_URI` and
        `MLFLOW_DEPLOY_MODEL_ALIAS` environment variables. The prediction
        root directory is created along the way.

        Returns:
            config: PredictionConfig type
        """
        predict_section = self.config.predict
        # for development (debug)
        debug_section = self.config.dump_data

        create_directories([predict_section.root_dir])

        return PredictionConfig(
            root_dir=predict_section.root_dir,
            mlflow_tracking_uri=os.environ["MLFLOW_TRACKING_URI"],
            mlflow_model_name=predict_section.mlflow_model_name,
            mlflow_deploy_model_alias=os.environ["MLFLOW_DEPLOY_MODEL_ALIAS"],
            mlflow_vectorizer_model_path=predict_section.mlflow_vectorizer_model_path,
            # for development (debug)
            input_test_path=debug_section.input_test_path,
            output_test_path=debug_section.output_test_path,
        )

In [9]:
from mlflow.artifacts import download_artifacts
from mlflow import MlflowClient
from mlflow import pyfunc

import joblib

---

**Debug**: Walk through the prediction steps in the notebook with MLflow.

In [None]:
# Build the prediction settings from the YAML config and environment variables.
config = ConfigurationManager()
predict_config = config.get_prediction_config()

Select the deployed model from MLflow.

In [None]:
# Look up the registered model version that the deploy alias points to.
client = MlflowClient(tracking_uri=predict_config.mlflow_tracking_uri)
selected_model = client.get_model_version_by_alias(
    predict_config.mlflow_model_name, 
    predict_config.mlflow_deploy_model_alias
)

# Show the source URI of that version; it is used as the model URI when loading.
selected_model.source

In [None]:
# Load the selected version through MLflow's generic pyfunc interface.
loaded_model = pyfunc.load_model(model_uri=selected_model.source)
loaded_model

Get the model `run_id`.

In [None]:
# The run id is needed to download artifacts that belong to the same run.
selected_run_id = selected_model.run_id
selected_run_id

Download the vectorizer (one of the run's MLflow artifacts) from MLflow.

In [None]:
# Fetch the vectorizer artifact of the selected run into the prediction root directory.
download_artifacts(
    run_id=selected_run_id,
    artifact_path=predict_config.mlflow_vectorizer_model_path,
    dst_path=predict_config.root_dir
)

Load the downloaded vectorizer.

In [None]:
# Rebuild the local path of the downloaded artifact
# (<root_dir>/<artifact path>) and load the vectorizer from it.
root_dir = predict_config.root_dir
mlflow_vectorizer_model_path = predict_config.mlflow_vectorizer_model_path
vectorizer_model_path = f"{root_dir}/{mlflow_vectorizer_model_path}"
vectorizer = joblib.load(vectorizer_model_path)
vectorizer

Load the test data, which is used as input for prediction and evaluation.

In [None]:
# Load the test inputs and the test labels from the configured debug paths.
X_test = joblib.load(predict_config.input_test_path)
y_test = joblib.load(predict_config.output_test_path)

In [None]:
X_test.head()

In [None]:
X_test.shape

Build the request body, mimicking the payload of the HTTP request.

In [None]:
# Wrap the test inputs in a dict, converting them to a plain list first.
request_body = {
    "reviewContent": X_test.to_list()
}

In [None]:
request_body['reviewContent'][:10]

Vectorize the test data so it can be used as model input.

In [None]:
# Apply the downloaded vectorizer to the texts taken from the request body.
X_test_vec = vectorizer.transform(request_body['reviewContent'])
X_test_vec

Make prediction.

In [None]:
# Predict on the vectorized input and convert the result to a plain list.
y_predict = loaded_model.predict(X_test_vec).tolist()

In [None]:
len(y_predict)

In [None]:
y_predict[:10]

In [None]:
from sklearn.metrics import classification_report
# Compare the predictions with the test labels; print() keeps the report's
# line breaks readable.
print(classification_report(y_test, y_predict))

---

### Make Prediction

This code goes in `src/LazadaIDReviews/components/predict.py`.

In [10]:
from LazadaIDReviews import logger

class Predict:
    """Run predictions with the model version deployed in MLflow."""

    def __init__(self, config: PredictionConfig):
        # Settings for the MLflow lookup and for local artifact storage.
        self.config = config

    def run(self, data: list) -> list:
        """Predict the input data with the deployed MLflow model.

        The deployed model version is looked up by its alias, the vectorizer
        stored with that run is downloaded and applied to `data`, and the
        vectorized input is passed to the model.

        Args:
            data (list): input data to predict

        Raises:
            Exception: any error raised while querying MLflow, downloading or
                loading the vectorizer, transforming the data, or predicting
                is logged and then re-raised unchanged.

        Returns:
            y_predict: list type (empty when `data` is empty)
        """
        # Nothing to predict: answer right away without contacting MLflow.
        if len(data) == 0:
            return []

        try:
            logger.info("Set MLflow Client.")
            client = MlflowClient(tracking_uri=self.config.mlflow_tracking_uri)

            logger.info("Select the deployed model from MLflow.")
            selected_model = client.get_model_version_by_alias(
                self.config.mlflow_model_name,
                self.config.mlflow_deploy_model_alias
            )

            logger.info("Get the deployed model run id.")
            selected_run_id = selected_model.run_id
        except Exception as client_error:
            logger.error(client_error)
            # Bare `raise` re-raises the active exception as-is.
            raise

        try:
            logger.info("Downloading vectorizer from MLflow's artifacts.")
            # Use the local location reported by MLflow instead of rebuilding
            # "<root_dir>/<artifact_path>" by hand.
            vectorizer_model_path = Path(
                download_artifacts(
                    run_id=selected_run_id,
                    artifact_path=self.config.mlflow_vectorizer_model_path,
                    dst_path=self.config.root_dir
                )
            )
        except Exception as download_error:
            logger.error(download_error)
            raise

        try:
            logger.info("Load the vectorizer model.")
            vectorizer = joblib.load(vectorizer_model_path)

            logger.info("Transform the data.")
            X_test_vec = vectorizer.transform(data)
        except Exception as load_error:
            logger.error(load_error)
            raise

        try:
            logger.info("Predict the data.")
            loaded_model = pyfunc.load_model(model_uri=selected_model.source)
            y_predict = loaded_model.predict(X_test_vec).tolist()
        except Exception as predict_error:
            # Same log-and-re-raise handling as the steps above.
            logger.error(predict_error)
            raise

        return y_predict

### Predict the Data

**Debug**: test the predict object and its method.

In [11]:
# Build the prediction settings; only the test input path is needed here.
config = ConfigurationManager()
predict_config = config.get_prediction_config()

[2024-07-03 22:29:22,516: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-03 22:29:22,518: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-03 22:29:22,519: INFO: common: created directory at: artifacts]
[2024-07-03 22:29:22,520: INFO: common: created directory at: artifacts/predict]


In [12]:
# Load the test inputs from the configured debug path.
X_test = joblib.load(predict_config.input_test_path)

In [13]:
# Wrap the test inputs in a dict, converting them to a plain list first.
request_body = {
    "reviewContents": X_test.to_list()
}

In [14]:
# Pull the list of texts back out of the request body and preview the first ten.
data = request_body["reviewContents"]
data[:10]

['mantul',
 'pengiriman super cepat',
 'flashdisk rusak tdk bs menyimpan...barang baru tp rusak',
 'barang berfungsi normal...semoga awet...',
 'bagus dan cepat',
 'Alhamdulillah sampai dgn selamat tapi y kekurangannya pindahin chanel lambat',
 'lumyan bagus',
 'Bagus...real kapasitasnya...sebelumnya beli yg kapasitas 32 gb....',
 'pengiriman cpt kmrn bayar besok dtng top dah puas semoga awet tv nya',
 'kapasitas tersisa 29.80GB. Berfungsi baik. pengiriman "disediakan oleh lazada" cepat sampai.']

This code goes in `app.py`.

In [15]:
# Build the settings, run the prediction on `data`, and keep the output in `result`.
try:
    config = ConfigurationManager()
    predict_config = config.get_prediction_config()
    predict = Predict(config=predict_config)
    result = predict.run(data)
except Exception as e:
    logger.error(e)
    # Bare `raise` re-raises the active exception as-is.
    raise

[2024-07-03 22:29:58,458: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-03 22:29:58,461: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-03 22:29:58,462: INFO: common: created directory at: artifacts]
[2024-07-03 22:29:58,464: INFO: common: created directory at: artifacts/predict]
[2024-07-03 22:29:58,465: INFO: 539828281: Set MLflow Client.]
[2024-07-03 22:29:58,466: INFO: 539828281: Select the deployed model from MLflow.]
[2024-07-03 22:29:58,731: INFO: 539828281: Get the deployed model run id.]
[2024-07-03 22:29:58,732: INFO: 539828281: Downloading vectorizer from MLflow's artifacts.]


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

[2024-07-03 22:30:00,237: INFO: 539828281: Load the vectorizer model.]
[2024-07-03 22:30:00,584: INFO: 539828281: Transform the data.]
[2024-07-03 22:30:01,347: INFO: 539828281: Predict the data.]


Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

In [16]:
len(result)

85624

In [17]:
result[:10]

[5, 5, 1, 4, 5, 4, 4, 2, 5, 4]