In [45]:
import pandas as pd
import os
import sys
from pathlib import Path

In [None]:
os.chdir("../")
%pwd

'/Users/hh/MLops/ajjil_technical_task'

In [47]:
from src.datascience.constants import *
from src.datascience.utils.common import (read_yaml, create_directories)
from src.datascience import logger

In [48]:
from dataclasses import dataclass
from pathlib import Path
import pandas as pd
import json
import re

@dataclass
class DataPreprocessingConfig:
    root_dir: Path
    source_file: Path
    csv_name: str
    column_names: list

class DataPreprocessing:
    def init(self,data_preprocessing_config:DataPreprocessingConfig):
        self.data_preprocessing_config = data_preprocessing_config

    def load(self):
        df = pd.read_csv(self.data_preprocessing_config.source_file)
        return df

    def drop_columns(self, df:pd.DataFrame):
        for column in self.data_preprocessing_config.column_names:
            df = df.drop(columns=f'{column}')
        return df
    
    def convert_usd_to_sar(self, df:pd.DataFrame):
        mask = df['Currency Code'] == 'USD'
        df.loc[mask, 'Total Bcy'] = df.loc[mask, 'Total Bcy'] * 3.75
        df.loc[mask, 'Currency Code'] = 'SAR'
        return df

    def fill_missing_item_names(self, df:pd.DataFrame):
        df['Item Name'].fillna("Unknown Product", inplace=True)
        return df
    
    def drop_zero_values_rows(self, df:pd.DataFrame):
        df = df[~((df['Total Bcy'] == 0) & (df['Quantity'] > 0))]
        return df
    
    def feature_engineering(self, df:pd.DataFrame):
        df['Unit Price'] = df['Total Bcy'] / df['Quantity']
        return df
    
    def normalise_whitespace_and_case(self, df: pd.DataFrame) -> pd.DataFrame:
        x = df["Item Name"].astype(str)
        x = x.str.strip()
        x = x.str.replace(r"\s+", " ", regex=True)
        x = x.str.lower()
        df["Item Name"] = x
        return df
    
    def normalise_arabic(self, df: pd.DataFrame) -> pd.DataFrame:
        x = df["Item Name"].astype(str)
        tashkeel = re.compile(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]")
        x = x.apply(lambda s: tashkeel.sub("", s))
        x = x.str.replace("ـ", "", regex=False)
        x = x.str.replace("[\u0623\u0625\u0622\u0671]", "ا", regex=True)
        x = x.str.replace("\u0649", "\u064A", regex=True)
        df["Item Name"] = x
        return df

    def normalise_english(self, df: pd.DataFrame) -> pd.DataFrame:
        x = df["Item Name"].astype(str)
        x = x.str.replace(r"[;,]", " ", regex=True)
        x = x.str.replace(r"[^\w\s\-/x\u0600-\u06FF]", " ", regex=True)
        x = x.str.replace(r"\s+", " ", regex=True).str.strip()
        df["Item Name"] = x
        return df
    
    def initiate_data_preprocessing(self):
        try:
            logger.info("<<<<< Data Preprocessing Initiated >>>>>")
            config = ConfigurationManager()
            data_preprocessing_config = config.get_data_preprocessing_config()
            data_preprocessing = DataPreprocessing()
            data_preprocessing.init(data_preprocessing_config)
            df = data_preprocessing.load()
            df = data_preprocessing.drop_columns(df)
            df = data_preprocessing.convert_usd_to_sar(df)
            df = data_preprocessing.fill_missing_item_names(df)
            df = data_preprocessing.drop_zero_values_rows(df)
            df = data_preprocessing.feature_engineering(df)
            df = data_preprocessing.normalise_whitespace_and_case(df)
            df = data_preprocessing.normalise_arabic(df)
            df = data_preprocessing.normalise_english(df)
            df.to_csv(Path(data_preprocessing_config.root_dir, data_preprocessing_config.csv_name), index=False)
            logger.info(f"<<<<< Data Preprocessing Completed CSV Saved at {Path(data_preprocessing_config.root_dir, data_preprocessing_config.csv_name)}>>>>>")


        except Exception as e:
            logger.exception(e)
            raise e



    

    

    

    
    



        


In [49]:

class ConfigurationManager:
    def __init__(self,
                 config_file_path=CONFIG_FILE_PATH):
        self.config=read_yaml(config_file_path)
        create_directories([self.config.artifacts_root])


    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        config = self.config.data_preprocessing
        create_directories([config.root_dir])
        
        data_preprocessing_config = DataPreprocessingConfig(
            root_dir= config.root_dir,
            source_file= config.source_file,
            csv_name= config.csv_name,
            column_names = config.drop_columns.column_names
        )
        return data_preprocessing_config

In [50]:
obj = DataPreprocessing()
obj.initiate_data_preprocessing()

[2025-08-24 17:05:10,224: INFO: 3356108694: <<<<< Data Preprocessing Initiated >>>>>]
[2025-08-24 17:05:10,227: INFO: common: yaml file config/config.yaml is loaded successfully]
[2025-08-24 17:05:10,228: INFO: common: created directory at artifacts]
[2025-08-24 17:05:10,228: INFO: common: created directory at artifacts/preprocessed_data]
[2025-08-24 17:05:10,267: INFO: 3356108694: <<<<< Data Preprocessing Completed CSV Saved at artifacts/preprocessed_data/purchase-order-items.csv>>>>>]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Item Name'].fillna("Unknown Product", inplace=True)


In [41]:
a = DataPreprocessing()
b = ConfigurationManager()
c = b.get_data_preprocessing_config()
obj = a.init(c)
data = a.load()
dropped_df = a.drop_columns(data)
converted_df = a.convert_usd_to_sar(dropped_df)
items_named = a.fill_missing_item_names(converted_df)
zero_valueless = a.drop_zero_values_rows(items_named)
new_column = a.feature_engineering(zero_valueless)
normalised = a.normalise_whitespace_and_case(new_column)
norma = a.normalise_arabic(normalised)
final_df = a.normalise_english(norma)


[2025-08-24 16:49:38,223: INFO: common: yaml file config/config.yaml is loaded successfully]
[2025-08-24 16:49:38,224: INFO: common: created directory at artifacts]
[2025-08-24 16:49:38,225: INFO: common: created directory at artifacts/preprocessed_data]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Item Name'].fillna("Unknown Product", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Unit Price'] = df['Total Bcy'] / df['Quantity']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a

In [44]:
final_df.head(1234)

Unnamed: 0,Item Name,Quantity,Total Bcy,Currency Code,Unit Price
0,unknown product,75.21,227510.25,SAR,3025.0
1,unknown product,15.00,47250.00,SAR,3150.0
2,unknown product,50.00,160000.00,SAR,3200.0
3,unknown product,12.00,38280.00,SAR,3190.0
4,unknown product,2.00,5900.00,SAR,2950.0
...,...,...,...,...,...
1229,10 0 mm deformed straight bar - cr - 12 m,46.00,105800.00,SAR,2300.0
1230,اعمال قشط و اسفلت,14000.00,476000.00,SAR,34.0
1231,اعمال تركيب الهناجر 50 من تصنيع وتركيب الهيكل ...,1.00,500000.00,SAR,500000.0
1232,حديد تسليح اتفاق 12مم 12متر,10.00,22900.00,SAR,2290.0
