In [1]:
import os

In [2]:
%pwd

'/Users/ravina/Desktop/CustomerChurnPrediction/research'

In [3]:
# Step up one directory level (from research/ to the project root, per the
# %pwd outputs around this cell). This is relative to the current working
# directory, so re-running the cell moves up one more level each time.
os.chdir(os.pardir)

In [4]:
%pwd

'/Users/ravina/Desktop/CustomerChurnPrediction'

In [5]:
from dataclasses import dataclass 
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    ``frozen=True`` makes instances read-only: assigning to a field after
    construction raises ``dataclasses.FrozenInstanceError``.
    """

    # Directory that receives the stage's output files (train.csv / test.csv).
    root_dir: Path
    # Location of the input CSV that the stage reads.
    data_path: Path

In [6]:
from CustomerChurnPrediction.constants import *
from CustomerChurnPrediction.utils.common import read_yaml,create_directories

In [7]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the config, params and schema YAML files.

        Also creates the directory named by ``artifacts_root`` in the
        main config.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Files are read in this fixed order: config, params, schema.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` filled from the
            ``data_transformation`` section of the main config.
        """
        stage = self.config.data_transformation

        # Make sure the output directory exists before the stage writes to it.
        create_directories([stage.root_dir])

        return DataTransformationConfig(
            root_dir=stage.root_dir,
            data_path=stage.data_path,
        )

In [8]:
import os
from CustomerChurnPrediction import logger
from sklearn.model_selection import train_test_split
import pandas as pd 

In [9]:
class DataTransformation:
    """Encodes and prunes the input CSV, then splits it into train/test files."""

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration.

        Args:
            config: Supplies ``data_path`` (the CSV to read) and ``root_dir``
                (the directory that receives ``train.csv`` and ``test.csv``).
        """
        self.config = config
        # Further transformations (scaling, PCA, ...) could be added to this
        # class; for now it only encodes, drops columns and splits.

    def train_test_splitting(self, test_size=0.25, random_state=42):
        """Transform the data and write ``train.csv`` / ``test.csv``.

        Steps: read the CSV at ``config.data_path``, one-hot encode the
        'Geography', 'Gender' and 'Card Type' columns, drop 'RowNumber',
        'CustomerId' and 'Surname', split the rows, and save both parts
        (without the index) under ``config.root_dir``.

        Args:
            test_size: Passed to ``train_test_split``. The default of 0.25
                equals scikit-learn's own default, so the split proportions
                are the same as when the argument is omitted.
            random_state: Seed passed to ``train_test_split`` so that
                repeated runs write the same partition. Pass ``None`` for
                an unseeded split.

        Returns:
            None. The results are the two CSV files plus log/print output.
        """
        df = pd.read_csv(self.config.data_path)

        # Handle categorical variables using one-hot encoding.
        df = pd.get_dummies(df, columns=['Geography', 'Gender', 'Card Type'], dtype=int)
        logger.info("Categorical columns transformed")

        # Remove irrelevant columns.
        df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
        logger.info("Unneccessary columns removed")

        # A fixed seed keeps the files on disk stable across re-runs.
        train, test = train_test_split(
            df, test_size=test_size, random_state=random_state
        )

        train.to_csv(os.path.join(self.config.root_dir, 'train.csv'), index=False)
        test.to_csv(os.path.join(self.config.root_dir, 'test.csv'), index=False)

        logger.info("splitted data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)
            

In [10]:
# Run the data-transformation stage end to end: load the configuration,
# build the stage config, then transform and split the data.
# There is deliberately no try/except here: a handler that only re-raises
# adds nothing, and letting the exception propagate keeps the traceback
# pointing at the call that actually failed.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.train_test_splitting()

[2024-03-13 10:23:11,612:INFO:yaml file:config/config.yaml loaded successfully]
[2024-03-13 10:23:11,614:INFO:yaml file:params.yaml loaded successfully]
[2024-03-13 10:23:11,616:INFO:yaml file:Schema.yaml loaded successfully]
[2024-03-13 10:23:11,618:INFO:created directory at:artifacts]
[2024-03-13 10:23:11,619:INFO:created directory at:artifacts/data_transformation]
[2024-03-13 10:23:11,659:INFO:Categorical columns transformed]
[2024-03-13 10:23:11,661:INFO:Unneccessary columns removed]
[2024-03-13 10:23:11,716:INFO:splitted data into training and test sets]
[2024-03-13 10:23:11,717:INFO:(7500, 21)]
[2024-03-13 10:23:11,717:INFO:(2500, 21)]
(7500, 21)
(2500, 21)
