In [1]:
import os 
# Move the kernel's working directory up one level. The recorded output of
# this cell shows it resolving to the project root folder.
# NOTE(review): presumably needed so that project-relative paths used by the
# later cells resolve from the project root -- confirm.
# Gotcha: the path is relative, so re-running this cell without restarting the
# kernel climbs one more directory each time.
os.chdir('../')
%pwd

'f:\\senthil\\project\\End_to_end_text_classification_using_bert'

In [38]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataPreprocessConfig:
    """Immutable bundle of settings for the data-preprocessing stage.

    Instances are created by ``ConfigurationManager.get_data_preprocess_config``
    from the ``data_preprocess`` section of the main config and consumed by
    ``DataPreprocessor``. ``frozen=True`` makes the fields read-only after
    construction.

    NOTE(review): the values are handed over from the loaded config without
    any conversion, so the fields annotated as ``Path`` may actually hold
    plain strings at runtime -- confirm against ``read_yaml``.
    """
    # Working directory of this stage; passed to create_directories before the
    # config object is built.
    root_dir: Path
    # CSV file that DataPreprocessor.convert loads (read with cp1252 encoding).
    data_path: Path
    # Names of the text columns that are cleaned by feature_text_process.
    indepent_feature: list
    # Names of the columns dropped right after the CSV is loaded.
    drop_feature: list
    # Name of the column that receives the label-encoded target values.
    target_feature: str
    # Location under which train.csv and test.csv are written.
    preprocess_data_path: Path

In [31]:
from textClassification.constants import *
from textClassification.utils.common import read_yaml, create_directories

In [40]:
class ConfigurationManager:
    """Entry point for the project's YAML settings.

    The main config and the params file are both loaded through
    ``read_yaml`` when the manager is constructed, and the artifacts root
    named in the main config is handed to ``create_directories``.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Load both settings files and prepare the artifacts root.

        Args:
            config_filepath: Location of the main configuration file.
                Defaults to ``CONFIG_FILE_PATH``.
            params_filepath: Location of the parameters file.
                Defaults to ``PARAMS_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_preprocess_config(self) -> DataPreprocessConfig:
        """Build the settings object for the data-preprocessing stage.

        Reads the ``data_preprocess`` section of the main config, passes its
        ``root_dir`` to ``create_directories`` and copies the section's
        values, unconverted, into a ``DataPreprocessConfig``.

        Returns:
            DataPreprocessConfig: The populated, frozen settings object.
        """
        section = self.config.data_preprocess

        # The stage directory is requested before the config object is built,
        # so it is set up even if a later field lookup fails.
        create_directories([section.root_dir])

        return DataPreprocessConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            indepent_feature=section.indepent_feature,
            drop_feature=section.drop_feature,
            target_feature=section.target_feature,
            preprocess_data_path=section.preprocess_data_path,
        )


In [None]:
import os
from textClassification.logging import logging
import pandas as pd
import nltk
import re
# Fetch the NLTK resources at import time (the recorded output shows them
# already up to date on the author's machine).
# 'stopwords' backs stopwords.words('english') used during text cleaning.
nltk.download('stopwords')
# NOTE(review): no tokenizer call is visible in this notebook (text is split
# with str.split), so 'punkt' looks unused here -- confirm before removing.
nltk.download('punkt')
# 'wordnet' is the corpus WordNetLemmatizer looks words up in.
nltk.download('wordnet')

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Module-level helpers shared by the DataPreprocessor methods below.
# le: encodes the target labels as integers in DataPreprocessor.convert.
le = LabelEncoder()
# stemmer: not referenced by any code visible in this notebook.
stemmer = PorterStemmer()
# lemat: lemmatizes each token in DataPreprocessor.feature_text_process.
lemat = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [44]:
class DataPreprocessor:
    """Clean the raw text columns, encode the target and write train/test CSVs.

    The class relies on the module-level ``lemat`` (WordNetLemmatizer),
    ``le`` (LabelEncoder) and the NLTK ``stopwords`` corpus being available.
    """

    # Output column names for the first three text features. They are kept
    # verbatim so files produced with the usual three-feature configuration
    # keep the same header; additional features get '<feature>_preprocess'.
    _DEFAULT_PREPROCESS_COLUMNS = (
        'Heading_preprocess',
        'Description_preprocess',
        'Article_preprocess',
    )

    def __init__(self, config: DataPreprocessConfig):
        """Store the stage settings.

        Args:
            config: Settings for the preprocessing stage (input CSV, columns
                to drop/clean, target column and output location).
        """
        self.config = config

    def feature_text_process(self, data_frame, feature):
        """Return the cleaned text of one column as a list of strings.

        For every cell, in row order: markup tags are removed, every
        character that is not an ASCII letter becomes a space, the text is
        lower-cased and split on whitespace, English stop words are dropped
        and the remaining tokens are lemmatized and re-joined with single
        spaces.

        Args:
            data_frame: DataFrame holding the column to clean.
            feature: Name of the text column.

        Returns:
            list[str]: One cleaned string per row. Cells that do not hold a
            string (for example missing values) yield an empty string.
        """
        tags_re = re.compile('<.*?>')
        non_letters_re = re.compile('[^a-zA-Z]')
        # Built once per call; rebuilding the set for every token is very slow
        # on long articles.
        stop_words = set(stopwords.words('english'))

        corpus = []
        # Iterate over the values themselves rather than data_frame[feature][i]:
        # label-based lookup breaks as soon as the index is not 0..n-1.
        for raw_text in data_frame[feature]:
            # Missing cells arrive as float NaN, which re.sub cannot handle.
            text = raw_text if isinstance(raw_text, str) else ''
            # Strip tags first and keep working on the stripped text, so tag
            # names such as 'div' or 'br' do not end up in the corpus.
            # A space (not '') is substituted so words separated only by a
            # tag are not glued together.
            text = tags_re.sub(' ', text)
            text = non_letters_re.sub(' ', text)
            tokens = text.lower().split()
            # The full joined string is kept; slicing off its first and last
            # character would truncate real words.
            review = ' '.join(
                lemat.lemmatize(word) for word in tokens if word not in stop_words
            )
            corpus.append(review)
        return corpus

    def convert(self):
        """Run the whole preprocessing stage and write ``train.csv``/``test.csv``.

        Steps: load the CSV at ``config.data_path`` (cp1252), drop
        ``config.drop_feature``, clean every column in
        ``config.indepent_feature`` into a new ``*_preprocess`` column,
        label-encode the ``'Article_Type'`` column into
        ``config.target_feature``, split 80/20 with ``random_state=42`` and
        write both parts under ``config.preprocess_data_path``.

        Returns:
            None. The results are the two CSV files (written with the
            DataFrame index as the first column).
        """
        df = pd.read_csv(self.config.data_path, encoding='cp1252')

        # Drop unwanted columns; an empty setting means nothing to drop.
        drop_feature = self.config.drop_feature
        if drop_feature:
            df = df.drop(columns=drop_feature)

        # Clean each text feature and store the result in its own new column.
        preprocess_column = []
        for position, feature in enumerate(self.config.indepent_feature):
            if position < len(self._DEFAULT_PREPROCESS_COLUMNS):
                column = self._DEFAULT_PREPROCESS_COLUMNS[position]
            else:
                column = f'{feature}_preprocess'
            df[column] = self.feature_text_process(df, feature)
            preprocess_column.append(column)

        # Convert the categorical target into integers with the label encoder.
        target_feature = self.config.target_feature
        df[target_feature] = le.fit_transform(df['Article_Type'])

        # Train/test split. The target is read back from the column that was
        # just encoded, so the encoded labels are what gets saved.
        # NOTE(review): this is identical to reading 'Article_Type' when
        # target_feature is 'Article_Type'; confirm the configured value.
        X = df[preprocess_column]
        y = df[target_feature]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.20, random_state=42
        )

        train = pd.concat([X_train, y_train], axis=1)
        test = pd.concat([X_test, y_test], axis=1)

        # Join directory and file name properly: plain string concatenation
        # yields '<dir>train.csv' when the configured path has no trailing
        # separator, and fails outright for Path objects.
        preprocess_path = os.fspath(self.config.preprocess_data_path)
        if preprocess_path:
            os.makedirs(preprocess_path, exist_ok=True)

        train.to_csv(os.path.join(preprocess_path, 'train.csv'))
        test.to_csv(os.path.join(preprocess_path, 'test.csv'))

In [45]:
# Run the data-preprocessing stage end to end: load the settings, build the
# stage config, then clean, split and save the data.
# No try/except wrapper: catching an exception only to re-raise it unchanged
# does nothing except add an extra frame to the traceback, so errors are left
# to propagate directly.
config = ConfigurationManager()
data_preprocess_config = config.get_data_preprocess_config()
data_preprocess = DataPreprocessor(config=data_preprocess_config)
data_preprocess.convert()