In [1]:
import os

# Move the working directory one level up from where the kernel started.
# A bare relative chdir is not idempotent: every re-run of the cell would
# climb one directory further. Remembering the start directory the first time
# the cell runs makes re-execution land in the same place every time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [2]:
# Show the current working directory to confirm the chdir in the first cell took effect.
%pwd

'c:\\Users\\Rounak\\Desktop\\OneDrive\\College\\Projects\\Severity-of-Toxic-Commentis-End-to-End'

In [22]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are built by ConfigurationManager.get_data_transformation_config()
    from the `data_transformation` section of the YAML configuration.
    """
    # Directory for this stage; created by the configuration manager.
    root_dir: Path
    # CSV file loaded with pandas by DataTransformation.perform_data_transformation().
    data_path: Path
    # Populated from the config, but not read by any code visible in this notebook.
    tokenizer_path: Path
    # potential_stopwords: list
    # re_patterns: dict

In [6]:
from SeverityOfToxicCommentsEndToEnd.utils.common import read_yaml, create_directories
from SeverityOfToxicCommentsEndToEnd.constants import *
from DataTransformationParam import *

In [23]:
class ConfigurationManager:
    """Load the project YAML files and hand out typed per-stage configs."""

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        """Read the config and params YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config for the data-transformation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            DataTransformationConfig whose fields are real `Path` objects.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        # The dataclass declares Path fields; convert the raw YAML values so
        # the declared types and the runtime types actually agree.
        return DataTransformationConfig(
            root_dir = Path(section.root_dir),
            data_path = Path(section.data_path),
            tokenizer_path = Path(section.tokenizer_path),
        )

In [32]:
import os
from SeverityOfToxicCommentsEndToEnd.logging import logger
import re
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
import itertools
from string import ascii_lowercase
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rounak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [24]:
# Sanity check: potential_stopwords is not defined in this notebook; presumably it
# comes from `from DataTransformationParam import *` -- confirm.
print(potential_stopwords)

['editor', 'reference', 'thank', 'work', 'find', 'good', 'know', 'like', 'look', 'thing', 'want', 'time', 'list', 'section', 'wikipedia', 'doe', 'add', 'new', 'try', 'think', 'write', 'use', 'user', 'way', 'page']


In [28]:
class DataTransformation:
    """Clean, lemmatize and stopword-filter comment text, then tokenize it.

    The intermediate results of each stage are kept on the instance:
    `train_text` (cleaned), `lemma_train_text` (lemmatized) and
    `processed_train_text` (stopwords removed).

    Relies on the module-level names RE_PATTERNS, potential_stopwords,
    max_features and maxpadlen, which are not defined in this class
    (presumably supplied by `from DataTransformationParam import *` -- confirm).
    """

    # One- and two-letter strings that are real words. Every other string from
    # 'a' to 'zz' is treated as a stopword; these are kept in the text.
    _KEPT_SHORT_WORDS = (
        'i', 'a', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he', 'hi',
        'if', 'is', 'in', 'me', 'my', 'no', 'of', 'on', 'or', 'ok', 'so', 'to',
        'up', 'us', 'we',
    )

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage config and initialise the empty working lists."""
        self.config = config
        self.stopword_list = []
        self.dual_alpha_list = []
        self.train_text = []
        self.lemma_train_text = []
        self.processed_train_text = []

    def clean_text(self, text, remove_repeat_text=True, remove_patterns_text=True, is_lower=True):
        """Normalise one raw comment into lowercase ASCII words separated by single spaces.

        Args:
            text: The raw comment. Non-string values are converted with str().
            remove_repeat_text: Collapse any character repeated three or more
                times in a row into a single occurrence.
            remove_patterns_text: Replace every entry of RE_PATTERNS with its target.
            is_lower: Lowercase the text first.

        Returns:
            The cleaned text.
        """
        # Coerce up front so non-string cells (e.g. NaN floats) cannot crash .lower().
        text = str(text)

        if is_lower:
            text = text.lower()

        if remove_patterns_text:
            # NOTE(review): this is a literal substring replacement. If the
            # entries of RE_PATTERNS are regular expressions they will not
            # match as such -- confirm against the definition of RE_PATTERNS.
            for target, patterns in RE_PATTERNS.items():
                for pat in patterns:
                    text = text.replace(pat, target)

        if remove_repeat_text:
            text = re.sub(r'(.)\1{2,}', r'\1', text)

        text = text.replace("\n", " ")
        text = re.sub(r'[^\w\s]', ' ', text)          # punctuation -> space
        text = re.sub('[0-9]', "", text)              # drop digits
        text = re.sub("([^\x00-\x7F])+", " ", text)   # non-ASCII runs -> space
        # Collapse spaces last: the substitutions above can create new runs of spaces.
        text = re.sub(" +", " ", text)
        return text

    def lemma(self, text, lemmatization=True):
        """Lemmatize every space-separated word of `text`.

        Each word is passed through the WordNet lemmatizer as noun, verb,
        adjective and adverb in turn, each pass feeding the next.

        Args:
            text: Space-separated words.
            lemmatization: When False, the text is only stripped.

        Returns:
            The lemmatized words joined by single spaces, stripped.
        """
        if not lemmatization:
            return str(text.strip())

        lemmatizer = WordNetLemmatizer()
        lemmas = []
        for word in text.split(' '):
            for pos in ("n", "v", "a", "r"):
                word = lemmatizer.lemmatize(word, pos=pos)
            lemmas.append(word)
        return " ".join(lemmas).strip()

    def iter_all_strings(self):
        """Yield 'a'..'z', then 'aa'..'zz', then 'aaa'.. and so on, without end."""
        for size in itertools.count(1):
            for letters in itertools.product(ascii_lowercase, repeat=size):
                yield "".join(letters)

    def dual_alpha(self):
        """Rebuild `dual_alpha_list` with every string from 'a' through 'zz'.

        The list is replaced rather than appended to, so calling this more
        than once does not create duplicates.
        """
        # All one-letter strings plus all two-letter strings; the last one is 'zz'.
        count = len(ascii_lowercase) * (len(ascii_lowercase) + 1)
        self.dual_alpha_list = list(itertools.islice(self.iter_all_strings(), count))

    def alter_dual_alpha(self):
        """Drop the real short words from `dual_alpha_list` and add the rest to the stopwords.

        Filtering, unlike repeated list.remove(), never raises when a word is
        already absent, so this is safe to call more than once.
        """
        kept = set(self._KEPT_SHORT_WORDS)
        self.dual_alpha_list = [s for s in self.dual_alpha_list if s not in kept]
        self.stopword_list.extend(self.dual_alpha_list)

    def alter_stopwords(self):
        """Add the `potential_stopwords` to the stopword list and print its new size."""
        self.stopword_list.extend(potential_stopwords)
        print(len(self.stopword_list))

    def remove_stopwords(self, text, remove_stop=True):
        """Remove every word found in `stopword_list` from `text`.

        Args:
            text: Space-separated words.
            remove_stop: When False, the text is only stripped.

        Returns:
            The remaining words joined by single spaces, stripped.
        """
        if not remove_stop:
            return str(text.strip())

        # A set gives constant-time membership tests instead of scanning the list per word.
        stopwords = set(self.stopword_list)
        return " ".join(word for word in text.split(" ") if word not in stopwords).strip()

    def perform_data_transformation(self):
        """Run cleaning, lemmatization and stopword removal over the configured CSV.

        Reads `config.data_path` with pandas and processes its `comment_text`
        column. All working lists are rebuilt from scratch, so the method can
        be re-run on the same instance without duplicating data or stopwords.

        Returns:
            The list of fully processed comments (also stored in
            `processed_train_text`).
        """
        df = pd.read_csv(self.config.data_path)
        # Missing comments become empty strings rather than the text "nan".
        comments = df['comment_text'].fillna("")

        self.train_text = [self.clean_text(line) for line in comments]
        self.lemma_train_text = [self.lemma(line) for line in self.train_text]

        # Start from an empty stopword list so a re-run cannot accumulate duplicates.
        self.stopword_list = []
        self.dual_alpha()
        self.alter_dual_alpha()
        self.alter_stopwords()

        self.processed_train_text = [self.remove_stopwords(line) for line in self.lemma_train_text]
        return self.processed_train_text

    def tokenizer(self):
        """Fit a Keras Tokenizer on the processed text and pad the resulting sequences.

        Must be called after perform_data_transformation().
        NOTE(review): `config.tokenizer_path` is not used here; the fitted
        tokenizer is returned but not saved.

        Returns:
            A `(tokenizer, training_padded)` tuple: the fitted Tokenizer and the
            post-padded sequences of length `maxpadlen`.
        """
        tokenizer = Tokenizer(num_words=max_features)
        tokenizer.fit_on_texts(list(self.processed_train_text))
        # The original read the non-existent attribute `processed_train_data` here.
        list_tokenized_train = tokenizer.texts_to_sequences(self.processed_train_text)
        training_padded = pad_sequences(list_tokenized_train, maxlen=maxpadlen, padding = 'post')
        return tokenizer, training_padded

In [29]:
# Run the data-transformation stage end to end.
# No try/except wrapper: catching an exception only to re-raise it unchanged
# adds nothing, and any failure should surface in the notebook as-is.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config = data_transformation_config)
result = data_transformation.perform_data_transformation()

[2023-07-11 20:22:22,247: INFO: common] Successfully read yaml file from config\config.yaml
[2023-07-11 20:22:22,249: INFO: common] Successfully read yaml file from params.yaml
[2023-07-11 20:22:22,253: INFO: common] Created directory at: artifacts
[2023-07-11 20:22:22,256: INFO: common] Created directory at: artifacts/data_transformation
700


In [31]:
# Inspect the value returned by perform_data_transformation() in the previous cell.
# NOTE(review): this displays the whole object; consider showing only a small slice.
result

['geez be you forgetful we already discuss why marx not an anarchist i he to a state to mold his socialist man ergo he be a statist the opposite of an anarchist i a guy who say that when he get old and his teeth fall out he quit eat meat would you call him a vegetarian',
 'carioca rfa for your support on my request for adminship the final outcome so i be now an administrator if you have any comment or concern on my action a an administrator please let me you',
 'birthday no worry what i do enjoy day talk',
 'pseudoscience category i assume that this article be in the pseudoscience category because of association with creationism however there be modern scientifically accept variant of catastrophism that have nothing to do with creationism and they even mention in the article i the connection to pseudoscience need to be clarify or the article make more general and creationism specific and the category tag remove entirely',
 'and if such phrase exist would be provide by search engine eve