In [1]:
1+1

2

In [2]:
import os

%pwd

'/Users/priyaroy/Documents/Projects/nlp-projects/CommentClassifier/research'

In [3]:
# Move from research/ up to the project root so that the relative paths used by
# later cells (the `src` package import, the `artifacts/...` locations in the
# YAML config) resolve. The guard keeps the cell idempotent: re-running it no
# longer climbs one directory higher each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

%pwd

'/Users/priyaroy/Documents/Projects/nlp-projects/CommentClassifier'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable settings for the model-evaluation stage.

    Instances are built by ``ConfigurationManager.get_model_evaluation_config``
    from the ``model_evaluation`` section of the YAML config and consumed by
    ``ModelEvaluation``. Dataclass annotations are not enforced at runtime, so
    they document the expected types only.
    """

    # Stage output directory (created by the configuration manager).
    root_dir: Path
    # Directory that contains the ``train.csv`` file read during evaluation.
    data_path: Path
    # joblib-serialized trained model.
    model_path: Path
    # joblib-serialized TF-IDF vectorizer (despite the "tokenizer" name).
    tokenizer_path: Path
    # File the metrics dictionary is written to (as JSON).
    metric_file_name: Path
    # Metric names from the config; not read by the evaluation code in this notebook.
    metrics: list[str]
    # Fraction used to compute the split index between the two evaluated slices.
    validation_split: float
    # Cross-validation switches from the config; not read by the evaluation code
    # in this notebook.
    cross_validation: bool
    cv_folds: int
    # Name of the DataFrame column holding the comment text.
    text_column: str
    # Name of the DataFrame column holding the target label. This is a column
    # *name* (e.g. "healthy"), hence ``str`` rather than ``bool``.
    label_column: str


In [5]:
from src.CommentClassifier.constants import *
from src.CommentClassifier.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Read the project YAML configuration and build per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load the YAML config and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the YAML configuration passed to
                ``read_yaml``.
            params_filepath: Accepted for interface compatibility; this class
                does not read it.
        """
        self.config = read_yaml(config_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the ``ModelEvaluationConfig`` from the ``model_evaluation`` section.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``ModelEvaluationConfig`` whose path fields are ``pathlib.Path``
            objects, matching the dataclass annotations; the remaining fields
            are passed through from the YAML unchanged.
        """
        section = self.config.model_evaluation

        create_directories([section.root_dir])

        return ModelEvaluationConfig(
            # The YAML yields plain strings; coerce the path-like entries so the
            # values agree with the types declared on the dataclass.
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
            model_path=Path(section.model_path),
            tokenizer_path=Path(section.tokenizer_path),
            metric_file_name=Path(section.metric_file_name),
            metrics=section.metrics,
            validation_split=section.validation_split,
            cross_validation=section.cross_validation,
            cv_folds=section.cv_folds,
            text_column=section.text_column,
            label_column=section.label_column,
        )

In [7]:
import joblib
import json
import pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, classification_report

In [23]:
class ModelEvaluation:
    """Evaluate a persisted classifier and its TF-IDF vectorizer.

    The model and vectorizer are loaded with joblib when the instance is
    created. ``run_evaluation`` scores them on two consecutive slices of
    ``train.csv`` and writes the resulting metrics to
    ``config.metric_file_name`` as JSON.
    """

    def __init__(self, config: "ModelEvaluationConfig"):
        """Store the config and load both artifacts once.

        Raises:
            FileNotFoundError: If the model or vectorizer file does not exist.
        """
        self.config = config
        print("Loading model and vectorizer...")
        self.model = self.load_model()
        self.vectorizer = self.load_vectorizer()

    def load_model(self):
        """Load the trained model from ``config.model_path``.

        Raises:
            FileNotFoundError: If the model file does not exist.
        """
        model_path = Path(self.config.model_path)
        if not model_path.exists():
            raise FileNotFoundError(f"Model file not found: {model_path}")
        # NOTE: joblib.load unpickles the file; only load trusted artifacts.
        return joblib.load(model_path)

    def load_vectorizer(self):
        """Load the saved TF-IDF vectorizer from ``config.tokenizer_path``.

        Raises:
            FileNotFoundError: If the vectorizer file does not exist.
        """
        vectorizer_path = Path(self.config.tokenizer_path)
        if not vectorizer_path.exists():
            raise FileNotFoundError(f"Vectorizer file not found: {vectorizer_path}")
        # NOTE: joblib.load unpickles the file; only load trusted artifacts.
        return joblib.load(vectorizer_path)

    def load_data(self):
        """Read ``train.csv`` and split it into two consecutive slices.

        Missing text values are replaced with an empty string. The rows are
        split in file order (no shuffling) at
        ``int(len(df) * (1 - validation_split))``.

        Returns:
            ``(X_validation, y_validation, X_test, y_test)`` where the
            "validation" pair is the rows before the split index and the
            "test" pair is the rows from the split index onward.

        Raises:
            FileNotFoundError: If ``train.csv`` is not found under
                ``config.data_path``.
        """
        data_path = Path(self.config.data_path, "train.csv")

        if not data_path.exists():
            raise FileNotFoundError(f"Data file not found: {data_path}")

        df = pd.read_csv(data_path)
        # Fill NaN values with an empty string so the vectorizer receives text only
        df[self.config.text_column] = df[self.config.text_column].fillna("")
        X = df[self.config.text_column]
        y = df[self.config.label_column]

        # NOTE(review): with this index the slice named "validation" holds the
        # first (1 - validation_split) share of rows and "test" holds the
        # remaining validation_split share — confirm this naming is intended.
        split_idx = int(len(df) * (1 - self.config.validation_split))
        # .iloc makes the positional slicing explicit.
        X_validation, y_validation = X.iloc[:split_idx], y.iloc[:split_idx]
        X_test, y_test = X.iloc[split_idx:], y.iloc[split_idx:]

        return X_validation, y_validation, X_test, y_test

    def evaluate_model(self, model, vectorizer, X, y):
        """Vectorize ``X``, predict with ``model`` and score against ``y``.

        Returns:
            ``(accuracy, report)`` as produced by ``accuracy_score`` and
            ``classification_report``.
        """
        X_tfidf = vectorizer.transform(X)  # Apply TF-IDF transformation
        predictions = model.predict(X_tfidf)
        accuracy = accuracy_score(y, predictions)
        report = classification_report(y, predictions)
        return accuracy, report

    def save_metrics(self, metrics):
        """Save metrics as a JSON file at ``config.metric_file_name``."""
        metrics_path = self.config.metric_file_name
        # Make sure the target directory exists before opening the file.
        Path(metrics_path).parent.mkdir(parents=True, exist_ok=True)
        with open(metrics_path, "w") as f:
            json.dump(metrics, f, indent=4)
        print(f"Metrics saved at {metrics_path}")

    def run_evaluation(self):
        """Evaluate on both slices, print the results and persist the metrics."""
        # Reuse the artifacts loaded in __init__ instead of reading them from
        # disk a second time.
        model = self.model
        vectorizer = self.vectorizer

        print("Loading data...")
        X_validation, y_validation, X_test, y_test = self.load_data()

        print("Evaluating on Validation Set...")
        val_acc, val_report = self.evaluate_model(model, vectorizer, X_validation, y_validation)
        print(f"Validation Accuracy: {val_acc}")
        print(f"Validation Classification Report:\n{val_report}")

        print("Evaluating on Test Set...")
        test_acc, test_report = self.evaluate_model(model, vectorizer, X_test, y_test)
        print(f"Test Accuracy: {test_acc}")
        print(f"Test Classification Report:\n{test_report}")

        # Save metrics
        metrics = {
            "Validation Accuracy": val_acc,
            "Validation Report": val_report,
            "Test Accuracy": test_acc,
            "Test Report": test_report
        }
        self.save_metrics(metrics)

In [24]:
# Load configuration: ConfigurationManager() reads the YAML config and creates the
# artifacts root; get_model_evaluation_config() also creates the stage's root_dir.
config = ConfigurationManager()
model_evaluation_config = config.get_model_evaluation_config()
# Run evaluation: constructing ModelEvaluation loads the model and vectorizer from
# disk; run_evaluation() prints the accuracy and classification report for both
# slices and writes the metrics file.
model_evaluation = ModelEvaluation(config=model_evaluation_config)
model_evaluation.run_evaluation()


DEBUG: YAML Content Type: <class 'dict'>
DEBUG: YAML Content: {'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/priya-roy/unhealthy-comments-Dataset/raw/refs/heads/main/commentClassification.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}, 'data_transformation': {'text_column': 'comment', 'label_column': 'healthy', 'stopwords': 'english', 'lowercase': True, 'remove_special_characters': True, 'remove_extra_spaces': True, 'root_dir': 'artifacts/data_transformation', 'data_path': 'artifacts/data_ingestion/commentClassification', 'tokenizer_name': 'nltk'}, 'tfidf_vectorizer': {'max_features': 10000, 'ngram_range': [1, 2], 'stop_words': 'english', 'min_df': 2, 'max_df': 0.9}, 'model_trainer': {'root_dir': 'artifacts/model_trainer', 'data_path': 'artifacts/data_transformation/commentClassification', 'tokenizer_name': 'nltk', 'text_column': 'comment', 'label_column