In [12]:
import os 

In [13]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml\\AI-powered-Bank-Product-Recommender-Chatbot'

In [14]:
os.chdir("../.")

In [15]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml'

In [16]:
project_dir = "C:/Users/RICH-FILES/Desktop/ml/AI-powered-Bank-Product-Recommender-Chatbot"
os.chdir(project_dir)

In [17]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """
    Data Transformation Configuration
    """
    root_dir: Path
    transformed_data_file: Path
    customer_path: Path
    product_path: Path
    train_data_file: Path
    test_data_file: Path

In [18]:
from BankProducts.constants import *
from BankProducts.utils.common import read_yaml, create_directories

In [19]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,    
        params_filepath = PARAMS_FILE_PATH,
        #schema_filepath = SCHEMA_FILE_PATH,
        ):
       
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        #self.schema = read_yaml(schema_filepath)        
    
        create_directories([self.config.artifacts_root])
        
    def get_data_transformation_config(self)-> DataTransformationConfig:
        """
        Returns Data Transformation Configuration
        """
        config = self.config.data_transformation
        
        create_directories([self.config.artifacts_root])
        
        data_transformation_config = DataTransformationConfig(
            root_dir=Path(config.root_dir),
            transformed_data_file= Path(config.transformed_data_file),
            product_path= Path(config.product_path),
            customer_path= Path(config.customer_path),
            train_data_file= Path(config.train_data_file),
            test_data_file= Path(config.test_data_file)
            )
        
        
        return data_transformation_config
    

In [20]:


%pip install seaborn

import os  
from BankProducts import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Note: you may need to restart the kernel to use updated packages.


In [21]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        #self.data = None
        #self.transformed_data = None
        
    def join_datasets(self):
        """
        Join customer and product datasets"""    
        try:
            customer_data = pd.read_csv(self.config.customer_path)
            product_data = pd.read_csv(self.config.product_path)

            # Ensure the directory for saving exists
            output_dir = os.path.dirname(self.config.transformed_data_file)
            os.makedirs(output_dir, exist_ok=True)

            # Join operation
            joined_data = pd.merge(customer_data, product_data, how="left",
                                left_on="existing_products", right_on="product_name")
            
            #drop unnecessary columns
            joined_data = joined_data.drop(columns=['existing_products'])
            
            #check the dataset head
            print(joined_data.head())
            
            
            # Clean data
            joined_data.drop_duplicates(inplace=True)
            joined_data.dropna(inplace=True)
            joined_data.reset_index(drop=True, inplace=True)
            
            #check columns
            print(joined_data.columns)
            
            #plot "product_name" histogram based of gender using seaborn
            plt.figure(figsize=(10,6))
            sns.histplot(joined_data['product_name'], hue='gender', multiple="stack")
            plt.title("Product Name Histogram by Gender")
            plt.show()
            plt.savefig("product name histogram by gender.png")
           
            
            #plot "age" histogram
            plt.figure(figsize=(10,6))
            plt.hist(joined_data["age"], bins=10, edgecolor='black')
            plt.title("Age Frequency Distribution")
            plt.xlabel("Age")
            plt.ylabel("Frequency")
            plt.savefig("age_histogram.png")
            
        
           
            #plot "product_name" vs "age" bar plot
            plt.figure(figsize=(10,6))
            plt.bar(joined_data["product_name"], joined_data["age"])
            plt.title("Product Name vs Age Bar Plot")
            plt.xlabel("Product Name")
            plt.ylabel("Age")
            plt.savefig("product_name_vs_age_bar_plot.png")
            
            

            
             
            
                     
            
             
            
           
            
            
            
           
           
            
            
            
            
                        
                    
            # Save the joined data
            joined_data.to_csv(self.config.transformed_data_file, index=False)

        except Exception as e:
                print(f"Error joining datasets: {e}")
                
    def transform_data(self):
        """
        Transform the data as per the requirements
        """
        try:
            # Load the data
            self.data = pd.read_csv(self.config.transformed_data_file)
            # Perform transformations
            print(self.data.head())
            
            print(":"*100)
            
            self.data.info()
            print(":"*100)
            
            self.data.describe()
            print(":"*100)
            print(self.data.columns)
            print(":"*100)
            
            
            
            # Example transformation: splitting data into train and test sets
            train_data, test_data = train_test_split(self.data, test_size=0.2)
            self.transformed_data = {
                "train": train_data,
                "test": test_data
            }
            
        
        
        
            #save train_data and test_data to csv files
            train_data.to_csv(self.config.train_data_file, index=False)
            test_data.to_csv(self.config.test_data_file, index=False)
            
            logger.info("Data transformed successfully")
        except Exception as e:
            logger.error(f"Error transforming data: {e}")
            raise

            
        
    
        

In [22]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.join_datasets()
    data_transformation.transform_data()
except Exception as e:
    logger.error(f"Error in data transformation: {e}")
    raise  e

[2025-05-24 10:04:55,895: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-24 10:04:55,898: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-24 10:04:55,900: INFO: common: created directory at: artifacts]
[2025-05-24 10:04:55,902: INFO: common: created directory at: artifacts]
                            customer_id             name  age  gender  \
0  ce557e27-5553-4268-9dc2-b61faef71586    Mark Harrison   44    Male   
1  53c315b3-dd28-4a1d-9d50-6807955643c8      Tyrone Ward   18  Female   
2  0f7d81ad-5a41-40ea-8c3d-dd2e26c1d5a0    Jeremy Harris   24  Female   
3  ef9ef273-df0b-4ef1-bc12-227f115011ac   Dr. Andrew Lam   33    Male   
4  bb7e9042-4ffd-4ac6-bdc6-f7c32ab2b218  Diana Nicholson   39  Female   

                            occupation  annual_income marital_status  \
0                   Editor, film/video       73401.40       Divorced   
1                          Optometrist      192818.49         Single   
2  Scientist, research (

<Figure size 1000x600 with 0 Axes>