In [1]:
import os
os.chdir('../')
%pwd

'c:\\Users\\Rounak\\Desktop\\OneDrive\\College\\Projects\\Severity-of-Toxic-Commentis-End-to-End'

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    processed_test_data_dir: Path
    processed_train_data_dir: Path
    tokenizer_dir: Path
    epochs: int
    embedding_dim: int
    batch_size: int
    fasttext_model_path: Path
    max_features: int
    maxpadlen: int
    tokenizer_dir: Path

In [3]:
from SeverityOfToxicCommentsEndToEnd.utils.common import read_yaml, create_directories
from SeverityOfToxicCommentsEndToEnd.constants import *

In [4]:
class ConfigurationManager:
    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])
    
    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.TransformationAndTrainingArguments

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir = config.root_dir,
            processed_test_data_dir = config.processed_test_data_dir,
            processed_train_data_dir = config.processed_train_data_dir,
            tokenizer_dir = config.tokenizer_dir,
            epochs = params.epochs,
            embedding_dim = params.embedding_dim,
            batch_size = params.batch_size,
            fasttext_model_path = params.fasttext_model_path,
            max_features = params.max_features,
            maxpadlen = params.maxpadlen
        )
        return model_trainer_config

In [5]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import pandas as pd

In [13]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config
        self.embeddings_index_fasttext = {}
        self.word_index = None
        self.embedding_matrix_fasttext = None
        self.train_padded = []
        self.test_padded = []
        self.train_labels = None
        self.test_labels = None
        self.df = None
        self.df_test = None
    
    def DefineTokenizer(self, path):
        self.df = pd.read_csv(path)
        self.df = self.df[self.df['comment_text'].notnull()]
        processed_train_text = self.df['comment_text'].to_list()
        self.df_test = pd.read_csv(self.config.processed_test_data_dir)
        self.df_test = self.df_test[self.df_test['comment_text'].notnull()]
        processed_test_text = self.df_test['comment_text'].to_list()

        if not os.path.exists(path):
            tokenizer = Tokenizer(num_words=self.config.max_features)
            tokenizer.fit_on_texts(list(processed_train_text))
            with open(self.config.tokenizer_dir, 'wb') as handle:
                pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        else:
            tokenizer = Tokenizer(num_words=self.config.max_features)
            tokenizer.fit_on_texts(list(processed_train_text))
            list_tokenized_train = tokenizer.texts_to_sequences(processed_train_text)
            self.word_index = tokenizer.word_index
            self.train_padded = pad_sequences(list_tokenized_train, maxlen=self.config.maxpadlen, padding='post')
            list_tokenized_test = tokenizer.texts_to_sequences(processed_test_text)
            self.test_padded = pad_sequences(list_tokenized_test, maxlen=self.config.maxpadlen, padding='post')

    def InitializeFastText(self):
        f = open(self.config.fasttext_model_path, encoding='utf8')
        for line in f:
            line.encode('utf-8').strip()
            values = line.split()
            word = values[0]
            self.embeddings_index_fasttext[word] = np.asarray(values[1:], dtype='float32')
        f.close()
        self.embedding_matrix_fasttext = np.random.random((len(self.word_index) + 1, self.config.embedding_dim))
        for word, i in self.word_index.items():
            embedding_vector = self.embeddings_index_fasttext.get(word)
            if embedding_vector is not None:
                self.embedding_matrix_fasttext[i] = embedding_vector
            
    def LSTMTrainer(self):
        self.train_labels = np.array(self.df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']])
        self.test_labels = np.array(self.df_test[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']])
        model = tf.keras.Sequential([
            tf.keras.layers.Embedding(len(self.word_index) + 1,
                                self.config.embedding_dim,
                                weights = [self.embedding_matrix_fasttext],
                                input_length = self.config.maxpadlen,
                                trainable=False,
                                name = 'embeddings'),
        tf.keras.layers.Input(shape=(self.config.maxpadlen, ),dtype='int32'),
        tf.keras.layers.LSTM(40,return_sequences=True, name='lstm_layer'),
        tf.keras.layers.GlobalMaxPooling1D(),
        tf.keras.layers.Dropout(.1),
        tf.keras.layers.Dense(30, activation='relu', kernel_initializer='he_uniform'),
            tf.keras.layers.Dropout(.1),
            tf.keras.layers.Dense(6, activation='sigmoid', kernel_initializer='glorot_uniform')
        ])
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.summary()

        history = model.fit(self.train_padded, self.train_labels, epochs=self.config.epochs, batch_size=self.config.batch_size, validation_data=(self.test_padded, self.test_labels))
        model.save(self.config.root_dir,'/model.h5')

In [14]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config = model_trainer_config)
    model_trainer.DefineTokenizer(model_trainer_config.processed_train_data_dir)
    model_trainer.InitializeFastText()
    model_trainer.LSTMTrainer()
except Exception as e:
    raise e

[2023-07-15 13:17:40,168: INFO: common] Successfully read yaml file from config\config.yaml
[2023-07-15 13:17:40,181: INFO: common] Successfully read yaml file from params.yaml
[2023-07-15 13:17:40,186: INFO: common] Created directory at: artifacts
[2023-07-15 13:17:40,189: INFO: common] Created directory at: artifacts/model_trainer


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embeddings (Embedding)      (None, 200, 300)          38906100  
                                                                 
 input_3 (InputLayer)        multiple                  0         
                                                                 
 lstm_layer (LSTM)           (None, 200, 40)           54560     
                                                                 
 global_max_pooling1d_2 (Gl  (None, 40)                0         
 obalMaxPooling1D)                                               
                                                                 
 dropout_4 (Dropout)         (None, 40)                0         
                                                                 
 dense_4 (Dense)             (None, 30)                1230      
                                                      