In [49]:
import os
from getpass import getpass

# MLflow reads its tracking server and basic-auth credentials from these
# environment variables.
os.environ['MLFLOW_TRACKING_URI']="https://dagshub.com/rahulsamantcoc2/IMDB-Dataset.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME']="rahulsamantcoc2"

# SECURITY: the access token must never be stored in the notebook. Use the value
# already exported in the environment, otherwise prompt for it without echoing.
# NOTE(review): a token was previously committed in this cell; it should be
# revoked and regenerated.
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass("DagsHub access token: ")

In [1]:
%pwd

'd:\\IMDB-Dataset\\notebook'

In [2]:
import os
os.chdir('../')
%pwd

'd:\\IMDB-Dataset'

In [23]:
from dataclasses import dataclass
from typing import Dict, Any

@dataclass
class ModelEvalConfig:
    """Settings needed to evaluate one model type and log the result to MLflow.

    Instances are built by ``ConfigurationManager.get_model_eval_config`` and
    consumed by ``ModelEval``.
    """
    # Output directory of the evaluation stage (created when the config is built).
    root_dir: str
    # CSV file read as the test set.
    test_data_path: str
    # Path of the JSON file the computed scores are written to.
    metric_file_name: str
    # Per-model-type settings; each entry is looked up for a 'model_path' key.
    models: Dict[str, Dict[str, str]]
    # Parameters of the selected model type, logged to MLflow as run params.
    all_params: dict
    # Name of the label column in the test CSV.
    target_column: str
    # URI passed to mlflow.set_registry_uri.
    mlflow_uri: str
    # Key selecting which entry of `models` is evaluated.
    model_type: str

In [4]:
from src.constants import *
from src.utils.common import read_yaml, create_directories,save_json

In [24]:
class ConfigurationManager:
    """Loads the project's YAML files and builds stage-specific config objects.

    On construction the config, params and schema YAML files are read and the
    artifacts root directory is created.
    """

    # Fallback MLflow server, used only when MLFLOW_TRACKING_URI is not set.
    DEFAULT_MLFLOW_URI = "https://dagshub.com/rahulsamantcoc2/IMDB-Dataset.mlflow"

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: path of the main configuration YAML.
            params_filepath: path of the model-parameters YAML.
            schema_filepath: path of the data-schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_eval_config(self, model_type: str) -> ModelEvalConfig:
        """Build the evaluation config for one model type.

        Creates the evaluation stage's root directory as a side effect.

        Args:
            model_type: key into ``params.model_params`` (and, later, into the
                ``models`` mapping of the returned config).

        Returns:
            A populated ``ModelEvalConfig``.
        """
        eval_section = self.config.model_evaluation
        model_params = self.params.model_params[model_type]
        target_schema = self.schema.TARGET_COLUMN

        create_directories([eval_section.root_dir])

        # Take the server from the same environment variable MLflow itself uses
        # for tracking, so the registry URI cannot drift from the tracking URI.
        # The previous hard-coded address remains the fallback.
        mlflow_uri = os.environ.get("MLFLOW_TRACKING_URI") or self.DEFAULT_MLFLOW_URI

        return ModelEvalConfig(
            root_dir=eval_section.root_dir,
            test_data_path=eval_section.test_data_path,
            metric_file_name=eval_section.metric_file_name,
            models=eval_section.models,
            all_params=model_params,
            target_column=target_schema.name,
            mlflow_uri=mlflow_uri,
            model_type=model_type
        )

In [37]:
import os
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from urllib.parse import urlparse
import mlflow
import mlflow.keras
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from src import logger
from typing import List, Union

In [46]:
class ModelEval:
    """Evaluates one trained model on the test set and logs the run to MLflow.

    Each instance handles the single model type named in its config. After
    ``log_into_mlflow`` has run, ``best_model_metrics`` and ``best_model_type``
    describe that model, so callers can compare several instances.
    """

    def __init__(self, config: "ModelEvalConfig"):
        """Store the config and reset the best-model bookkeeping."""
        self.config = config
        # Token ids at or above this value are clamped to vocab_size - 1.
        # NOTE(review): presumably must match the vocabulary size used at
        # training time — confirm.
        self.vocab_size = 10000
        self.best_model_metrics = None
        self.best_model_type = None

    def preprocess_features(self, data: pd.DataFrame) -> np.ndarray:
        """Turn the first column of ``data`` into a 2-D array of clamped token ids.

        Each cell is either a string such as ``"[1 2 3]"`` / ``"[1, 2, 3]"``
        (whitespace- and/or comma-separated integers) or an array-like of
        numbers. Every value is capped at ``vocab_size - 1``.

        Args:
            data: frame whose first column holds one token sequence per row.

        Returns:
            The sequences stacked row-wise.

        Raises:
            ValueError: if ``data`` has no rows, a token is not an integer, or
                the sequences do not all have the same length.
        """
        max_token_id = self.vocab_size - 1
        features_list = []
        # Iterate over the column directly: cheaper than iterrows() and avoids
        # the per-row dtype coercion iterrows() performs on mixed-type frames.
        for cell in data.iloc[:, 0]:
            if isinstance(cell, str):
                # Accept both numpy-style ("[1 2 3]") and list-style
                # ("[1, 2, 3]") serialisations.
                tokens = cell.replace(',', ' ').strip().strip('[]').split()
                features = np.array([min(int(tok), max_token_id) for tok in tokens])
            else:
                features = np.minimum(np.array(cell), max_token_id)
            features_list.append(features)

        if not features_list:
            raise ValueError("No feature rows found to preprocess")
        return np.vstack(features_list)

    def Eval(self, actual, pred):
        """Compute binary-classification metrics.

        Args:
            actual: true labels, castable to int (0/1).
            pred: predicted scores; values >= 0.5 count as the positive class.
                May be shaped ``(n,)`` or ``(n, 1)``.

        Returns:
            Tuple ``(accuracy, precision, recall, f1, auc, confusion_matrix)``.
        """
        # Flatten so a column vector of scores is treated the same as a 1-D one.
        scores = np.asarray(pred).ravel()
        actual_binary = np.asarray(actual).ravel().astype(int)
        pred_binary = (scores >= 0.5).astype(int)

        accuracy = accuracy_score(actual_binary, pred_binary)
        precision = precision_score(actual_binary, pred_binary)
        recall = recall_score(actual_binary, pred_binary)
        f1 = f1_score(actual_binary, pred_binary)
        # AUC is computed on the raw scores, not the thresholded labels.
        auc = roc_auc_score(actual_binary, scores)
        cm = confusion_matrix(actual_binary, pred_binary)
        return accuracy, precision, recall, f1, auc, cm

    def update_best_model(self, metrics, model_type):
        """Record ``metrics`` as the best seen so far if its combined score is higher.

        The combined score is the mean of accuracy, F1 and AUC.

        Args:
            metrics: tuple in the order returned by ``Eval``.
            model_type: label stored alongside the metrics.
        """
        accuracy, _, _, f1, auc, _ = metrics
        current_score = (accuracy + f1 + auc) / 3  # Combined metric

        previous = self.best_model_metrics
        if previous is not None and current_score <= previous['combined_score']:
            return

        self.best_model_metrics = {
            'model_type': model_type,
            'accuracy': accuracy,
            'f1': f1,
            'auc': auc,
            'combined_score': current_score
        }
        self.best_model_type = model_type

    def log_into_mlflow(self):
        """Evaluate the configured model and log params, metrics and the model to MLflow.

        Reads the test CSV, loads the model, predicts, writes the scores to
        ``config.metric_file_name`` and logs everything inside a new MLflow run.
        The model is registered as "IMDB" unless the tracking store is a local
        file store.

        Raises:
            ValueError: if no ``model_path`` is configured for the model type.
        """
        # Imported here so the method does not rely on `Path` leaking in through
        # a star import elsewhere in the notebook.
        from pathlib import Path

        test_data = pd.read_csv(self.config.test_data_path)
        model_path = self.config.models.get(self.config.model_type, {}).get('model_path')

        if model_path is None:
            raise ValueError(f"Model path not found for model type: {self.config.model_type}")

        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code — only load artifacts produced by this project.
        model = joblib.load(model_path)
        test_x = self.preprocess_features(test_data.drop([self.config.target_column], axis=1))
        test_y = test_data[self.config.target_column].values

        mlflow.set_registry_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        with mlflow.start_run():
            predictions = model.predict(test_x)
            metrics = self.Eval(test_y, predictions)
            self.update_best_model(metrics, self.config.model_type)

            accuracy, precision, recall, f1, auc, cm = metrics
            scores = {
                "accuracy": float(accuracy),
                "precision": float(precision),
                "recall": float(recall),
                "f1": float(f1),
                "auc": float(auc),
                "confusion_matrix": cm.tolist()
            }

            # NOTE(review): configs that share the same metric_file_name
            # overwrite each other's file — confirm this is intended.
            save_json(path=Path(self.config.metric_file_name), data=scores)
            mlflow.log_params(self.config.all_params)

            # The confusion matrix is a nested list, not a scalar metric.
            for metric_name, value in scores.items():
                if metric_name != "confusion_matrix":
                    mlflow.log_metric(metric_name, value)

            # The model registry is not available with a local file store, so
            # only register the model when tracking against a remote server.
            # NOTE(review): the sklearn flavor is used for every model type; if
            # these are Keras models, mlflow.keras.log_model may fit better — confirm.
            log_kwargs = {}
            if tracking_url_type_store != "file":
                log_kwargs["registered_model_name"] = "IMDB"
            mlflow.sklearn.log_model(model, "model", **log_kwargs)

In [50]:
# Model types to evaluate. Each needs an entry in params.model_params and a
# 'model_path' under the evaluation config's `models` mapping.
MODEL_TYPES = ['rnn', 'lstm', 'gru']

best_model_metrics = None
model_evals = []

# Errors are left to propagate: the previous `except Exception as e: raise e`
# wrapper re-raised unchanged and added nothing.
config = ConfigurationManager()

for model_type in MODEL_TYPES:
    model_eval_config = config.get_model_eval_config(model_type=model_type)
    model_eval = ModelEval(config=model_eval_config)
    model_eval.log_into_mlflow()
    model_evals.append(model_eval)

# Find best model: highest mean of accuracy, F1 and AUC
best_model = max(model_evals, key=lambda x: x.best_model_metrics['combined_score'])
best_model_metrics = best_model.best_model_metrics
print(f"\nBest Model: {best_model.best_model_type}")
print(f"Metrics: Accuracy={best_model_metrics['accuracy']:.4f}, "
      f"F1={best_model_metrics['f1']:.4f}, "
      f"AUC={best_model_metrics['auc']:.4f}")

[2024-12-25 23:28:37,206: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-12-25 23:28:37,388: INFO: common: yaml file: params.yaml loaded successfully]
[2024-12-25 23:28:37,410: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-12-25 23:28:37,423: INFO: common: created directory at: artifacts]
[2024-12-25 23:28:37,473: INFO: common: created directory at: artifacts/model_evaluation]
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 125ms/step
[2024-12-25 23:28:57,654: INFO: common: Binary file saved at: artifacts\model_evaluation\metrics.json]


Successfully registered model 'IMDB'.
2024/12/25 23:30:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: IMDB, version 1
Created version '1' of model 'IMDB'.


🏃 View run industrious-auk-17 at: https://dagshub.com/rahulsamantcoc2/IMDB-Dataset.mlflow/#/experiments/0/runs/2560f8b1951e47e191f1012617600d6b
🧪 View experiment at: https://dagshub.com/rahulsamantcoc2/IMDB-Dataset.mlflow/#/experiments/0
[2024-12-25 23:30:38,984: INFO: common: created directory at: artifacts/model_evaluation]
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step
[2024-12-25 23:30:43,453: INFO: common: Binary file saved at: artifacts\model_evaluation\metrics.json]


Registered model 'IMDB' already exists. Creating a new version of this model...
2024/12/25 23:31:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: IMDB, version 2
Created version '2' of model 'IMDB'.


🏃 View run handsome-zebra-157 at: https://dagshub.com/rahulsamantcoc2/IMDB-Dataset.mlflow/#/experiments/0/runs/b2f8b994ef29477a80c1bc503d1aeb8b
🧪 View experiment at: https://dagshub.com/rahulsamantcoc2/IMDB-Dataset.mlflow/#/experiments/0
[2024-12-25 23:31:42,686: INFO: common: created directory at: artifacts/model_evaluation]
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 74ms/step
[2024-12-25 23:31:47,993: INFO: common: Binary file saved at: artifacts\model_evaluation\metrics.json]


Registered model 'IMDB' already exists. Creating a new version of this model...
2024/12/25 23:32:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: IMDB, version 3
Created version '3' of model 'IMDB'.


🏃 View run auspicious-swan-301 at: https://dagshub.com/rahulsamantcoc2/IMDB-Dataset.mlflow/#/experiments/0/runs/b58df84619b344859edf90e13b6a1b2d
🧪 View experiment at: https://dagshub.com/rahulsamantcoc2/IMDB-Dataset.mlflow/#/experiments/0

Best Model: rnn
Metrics: Accuracy=0.5220, F1=0.5230, AUC=0.5116
