# In [19]:
import pandas as pd

class DataProcessor:
    """
    Read a CSV of extracted properties, pivot selected properties per file, and save the result.

    The input DataFrame is expected to provide the columns 'filePath',
    'property' and 'Name'. After processing, the DataFrame holds one row per
    'filePath' with the columns 'Invoice_Date', 'InvoiceTotalAmount' and
    'Vendor_Name'.
    """

    def __init__(self, input_csv_path, output_csv_path):
        """
        Initialize the DataProcessor class.

        Args:
            input_csv_path (str): The file path to the input CSV file containing data.
            output_csv_path (str): The file path to save the cleaned and merged data.

        """
        self.input_csv_path = input_csv_path
        self.output_csv_path = output_csv_path
        self.df = None  # Populated by read_data(); replaced by clean_and_merge_data()

    def read_data(self):
        """
        Read data from the input CSV file into a DataFrame.
        """
        self.df = pd.read_csv(self.input_csv_path)

    def _collect_property(self, property_name, column_name):
        """
        Build a per-file table holding the joined 'Name' values of one property.

        Args:
            property_name (str): Pattern searched for in the 'property' column.
            column_name (str): Name given to the resulting value column.

        Returns:
            pandas.DataFrame: Columns 'filePath' and `column_name`, one row per file.
        """
        # na=False: rows with a missing 'property' count as "no match" instead of
        # putting NaN into the boolean mask (which makes the indexing raise).
        matches = self.df['property'].str.contains(property_name, na=False)
        joined = (
            self.df.loc[matches]
            .groupby('filePath')['Name']
            .apply(lambda names: ' '.join(names))
            .reset_index()
        )
        # Name the value column up front so the later merges never have to rely
        # on the automatic '_x' / '_y' suffixes.
        return joined.rename(columns={'Name': column_name})

    def clean_and_merge_data(self):
        """
        Clean and merge data in the DataFrame.

        Strips everything before the last 'File' in each 'filePath', gathers the
        'Name' values for the Invoice_Date, InvoiceTotalAmount and Vendor_Name
        properties per file, and keeps only files that have all three. Does
        nothing when no data has been loaded.
        """
        if self.df is None:
            return

        # regex=True is required: pandas >= 2.0 treats the pattern as a literal
        # string by default, which would leave every path untouched.
        self.df['filePath'] = self.df['filePath'].str.replace(r'^.*File', 'File', regex=True)

        invoice_dates = self._collect_property('Invoice_Date', 'Invoice_Date')
        invoice_totals = self._collect_property('InvoiceTotalAmount', 'InvoiceTotalAmount')
        vendor_names = self._collect_property('Vendor_Name', 'Vendor_Name')

        # Inner merges keep only files present in all three tables.
        combined_df = (
            invoice_dates
            .merge(invoice_totals, on='filePath')
            .merge(vendor_names, on='filePath')
        )

        # Drop any row that still has a missing value.
        self.df = combined_df.dropna()

    def save_to_csv(self):
        """
        Save the merged and cleaned data to the specified output CSV file.

        Does nothing when no data has been loaded.
        """
        if self.df is not None:
            self.df.to_csv(self.output_csv_path, index=False)

    def process_data(self):
        """
        Perform the complete data processing, including cleaning, merging, and saving to CSV.
        """
        self.read_data()
        self.clean_and_merge_data()
        self.save_to_csv()

# Sample use case
# Guarded so that importing this module does not read or write any files;
# running it as a script or notebook cell behaves as before.
if __name__ == "__main__":
    input_file_path = '/Users/shairawadhawan/Desktop/GiBots /Excel files /combined_csv_500-1000.clean.csv'
    output_file_path = '/Users/shairawadhawan/Desktop/GiBots /Clean_dataset.csv'

    data_processor = DataProcessor(input_file_path, output_file_path)
    data_processor.process_data()


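# Notebook output residue: echo of the source line flagged when the cell ran
# (presumably the pandas warning about the `regex` default of `str.replace`):
#   self.df['filePath'] = self.df['filePath'].str.replace(r'^.*File', 'File')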
