In [1]:
1+1

2

In [2]:
import os   

%pwd

'/Users/priyaroy/Documents/Projects/nlp-projects/CommentClassifier/research'

In [3]:
os.chdir("../")

%pwd

'/Users/priyaroy/Documents/Projects/nlp-projects/CommentClassifier'

In [4]:

from dataclasses import dataclass
from pathlib import Path

@dataclass
class ModelTrainerConfig:
    root_dir: Path
    data_path: Path
    tokenizer_name: str
    text_column: str
    label_column: bool
    penalty: str
    random_state: int
    max_iter: int
    class_weight: str
    max_features: str
    ngram_range: list
    stop_words: str
    min_df: int
    max_df:float

In [5]:
from src.CommentClassifier.constants import *
from src.CommentClassifier.utils.common import read_yaml, create_directories

In [6]:

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config=self.config.model_trainer
        params=self.params.TrainingArguments

        create_directories([config.root_dir])
        

        model_trainer_config=ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            tokenizer_name = config.tokenizer_name,
            text_column=config.text_column,
            label_column=config.label_column,
            penalty = params.penalty,
            random_state = params.random_state,
            max_iter = params.max_iter,
            class_weight = params.class_weight,
            max_features = params.max_features,
            ngram_range= params.ngram_range,
            stop_words = params.stop_words,
            min_df = params.min_df,
            max_df = params.max_df
        )
        return model_trainer_config

### Strat model training


In [7]:
import pandas as pd
import joblib
import yaml
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [10]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config=config
        
    def load_dataset(self):
        """
        Load dataset from the transformed file.
        """
        file_path = os.path.join(self.config.data_path, "train.csv")
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            print(f"Dataset Loaded: {df.shape} rows, {df.columns.tolist()} columns")
            return df
        else:
            raise FileNotFoundError(f"Dataset not found at {file_path}")
        
    def preprocess_data(self, df):
        df = df.dropna(subset=["comment"])  # Drop missing comments
        df["comment"] = df["comment"].astype(str)  # Convert all to strings
        
        X = df[self.config.text_column]
        y = df[self.config.label_column]

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        vectorizer = TfidfVectorizer()
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
        print("✅ Data Preprocessing Complete")
        return X_train_tfidf, X_test_tfidf, y_train, y_test, vectorizer
    
    def train_model(self, X_train_tfidf, y_train):
        model = RandomForestClassifier(random_state=42)
        model.fit(X_train_tfidf, y_train)
        return model
    
    def save_model(self, model, vectorizer):
        """
        Saves the trained model and the TF-IDF vectorizer.
        """
         ## Save model
        model_path = os.path.join(self.config.root_dir, "trained_model","model.pkl")
        ## Save tokenizer
        vectorizer_path = os.path.join(self.config.root_dir,"tokenizer", "tfidf_vectorizer.pkl")

        joblib.dump(model, model_path)
        joblib.dump(vectorizer, vectorizer_path)

        print(f"✅ Model saved at: {model_path}")
        print(f"✅ TF-IDF Vectorizer saved at: {vectorizer_path}")


In [11]:
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(model_trainer_config)
df = model_trainer.load_dataset()
X_train, X_test, y_train, y_test, vectorizer = model_trainer.preprocess_data(df)

model = model_trainer.train_model(X_train, y_train)
model_trainer.save_model(model, vectorizer)

DEBUG: YAML Content Type: <class 'dict'>
DEBUG: YAML Content: {'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/priya-roy/unhealthy-comments-Dataset/raw/refs/heads/main/commentClassification.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}, 'data_transformation': {'text_column': 'comment', 'label_column': 'healthy', 'stopwords': 'english', 'lowercase': True, 'remove_special_characters': True, 'remove_extra_spaces': True, 'root_dir': 'artifacts/data_transformation', 'data_path': 'artifacts/data_ingestion/commentClassification', 'tokenizer_name': 'nltk'}, 'tfidf_vectorizer': {'max_features': 10000, 'ngram_range': [1, 2], 'stop_words': 'english', 'min_df': 2, 'max_df': 0.9}, 'model_trainer': {'root_dir': 'artifacts/model_trainer', 'data_path': 'artifacts/data_transformation/commentClassification', 'tokenizer_name': 'nltk', 'text_column': 'comment', 'label_column

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["comment"] = df["comment"].astype(str)  # Convert all to strings


✅ Data Preprocessing Complete
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/priyaroy/Documents/Projects/nlp-projects/CommentClassifier/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3579, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/pj/lh27tt7s3sd1zql9n09bxk4w0000gn/T/ipykernel_32427/2517108783.py", line 8, in <module>
    model_trainer.save_model(model, vectorizer)
  File "/var/folders/pj/lh27tt7s3sd1zql9n09bxk4w0000gn/T/ipykernel_32427/2697522207.py", line 47, in save_model
    joblib.dump(model, model_path)
  File "/Users/priyaroy/Documents/Projects/nlp-projects/CommentClassifier/venv/lib/python3.10/site-packages/joblib/numpy_pickle.py", line 552, in dump
    with open(filename, 'wb') as f:
FileNotFoundError: [Errno 2] No such file or directory: 'artifacts/model_trainer/trained_model/model.pkl'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/priyaroy/Documents/Projects/