In [1]:
import os

In [2]:
%pwd

'/Users/ravina/Desktop/CustomerChurnPrediction/research'

In [3]:
# Move from the research/ folder up to the project root so that the relative
# paths used below (e.g. config/config.yaml, artifacts/) resolve correctly.
# Guarded on the current directory name so that re-running this cell does not
# climb one more level each time it is executed.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [4]:
%pwd

'/Users/ravina/Desktop/CustomerChurnPrediction'

In [5]:
from dataclasses import dataclass 
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Declared as a frozen dataclass: the fields are listed as annotated class
    attributes (no hand-written ``__init__``), and instances cannot be
    modified after construction.
    """

    # Directory the stage writes its outputs into.
    root_dir: Path
    # Location of the input dataset read by the stage.
    data_path: Path

In [6]:
from CustomerChurnPrediction.constants import *
from CustomerChurnPrediction.utils.common import read_yaml,create_directories

In [7]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    On construction the config, params and schema YAML files are read and the
    artifacts root directory named in the config is created.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path to the main config YAML.
            params_filepath: Path to the params YAML.
            schema_filepath: Path to the schema YAML.

        The defaults are presumably supplied by the star import of
        ``CustomerChurnPrediction.constants`` -- verify there.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings and create its output directory.

        Returns:
            A ``DataTransformationConfig`` whose ``root_dir`` and ``data_path``
            are ``Path`` objects, matching the dataclass annotations.
        """
        stage = self.config.data_transformation

        create_directories([stage.root_dir])

        # The YAML values are wrapped in Path so the dataclass really holds
        # the type its annotations promise.
        return DataTransformationConfig(
            root_dir=Path(stage.root_dir),
            data_path=Path(stage.data_path),
        )

In [8]:
import os
from CustomerChurnPrediction import logger
from sklearn.model_selection import train_test_split
import pandas as pd 
from sklearn.preprocessing import StandardScaler,LabelEncoder
import imblearn
from imblearn.over_sampling import SMOTE




In [9]:
class DataTransformation:
    """Encode, split, scale and rebalance the churn dataset, then save it.

    Reads the CSV at ``config.data_path`` and writes ``train.csv`` and
    ``test.csv`` into ``config.root_dir``.
    """

    def __init__(self, config: DataTransformationConfig):
        """Keep the stage settings (input ``data_path`` and output ``root_dir``)."""
        self.config = config

    def train_test_splitting(self, test_size=0.25, random_state=42):
        """Transform the dataset and persist the train and test splits.

        Steps:
            1. Label-encode the categorical columns.
            2. Split into train and test sets.
            3. Standardise the numerical columns with a scaler fitted on the
               training split only, so no test-set statistics leak into the
               training features.
            4. Oversample the minority class of the training split with SMOTE.
            5. Write ``train.csv`` (resampled) and ``test.csv`` to ``root_dir``.

        Args:
            test_size: Fraction of rows held out for the test set.
            random_state: Seed used for both the split and SMOTE so that
                repeated runs produce the same files.

        Returns:
            None. The results are written to disk and summarised in the log.
        """
        target = 'Exited'
        numr = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']
        catg = ['Geography', 'Gender', 'NumOfProducts', 'IsActiveMember', 'sufficient_balance', 'is_CreditScore_low']

        df = pd.read_csv(self.config.data_path)

        # Separate the target variable from the features
        y = df[target]
        X = df.drop([target], axis=1)

        # Replace each categorical column with integer codes. The encoding is
        # only a relabelling of the observed categories, so it is applied
        # before the split to guarantee train and test share the same codes.
        for column in catg:
            X[column] = LabelEncoder().fit_transform(X[column])

        # Numerical columns first, then encoded categoricals, then the target
        df_transformed = pd.concat([X[numr], X[catg], y], axis=1)

        train, test = train_test_split(df_transformed, test_size=test_size, random_state=random_state)
        # Work on copies so the column assignments below do not write into
        # views of df_transformed.
        train = train.copy()
        test = test.copy()

        # Fit the scaler on the training rows only and reuse it for the test rows
        numerical_transformer = StandardScaler()
        train[numr] = numerical_transformer.fit_transform(train[numr])
        test[numr] = numerical_transformer.transform(test[numr])

        # Rebalance the training set only; the test set keeps its natural class ratio.
        # NOTE(review): SMOTE interpolates the label-encoded categoricals as if
        # they were continuous -- consider SMOTENC if that matters downstream.
        smote = SMOTE(sampling_strategy='minority', random_state=random_state)
        feature_columns = train.drop(columns=[target]).columns
        X_train, y_train = smote.fit_resample(train[feature_columns], train[target])

        train_resampled = pd.concat(
            [pd.DataFrame(X_train, columns=feature_columns), pd.DataFrame(y_train, columns=[target])],
            axis=1,
        )

        # Save train and test sets to CSV
        train_resampled.to_csv(os.path.join(self.config.root_dir, 'train.csv'), index=False)
        test.to_csv(os.path.join(self.config.root_dir, 'test.csv'), index=False)

        logger.info("Data has been split into training and test sets.")
        logger.info(f"Training set shape: {train_resampled.shape}")
        logger.info(f"Test set shape: {test.shape}")
        print("columns of X:",X.columns)
        print("columns of df_transformed:",df_transformed.columns)
        print("Shape of X_train and y_train after appliing smote", X_train.shape,y_train.shape)
        print("columns of train_resample:",train_resampled.columns)

In [10]:
# Run the data-transformation stage end to end: load the configuration,
# build the stage settings, then transform and save the train/test CSVs.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_splitting()
except Exception:
    # Record the failure with its traceback, then re-raise the same exception
    # unchanged (a bare ``raise`` keeps the original traceback intact).
    logger.exception("Data transformation stage failed")
    raise


[2024-04-01 09:50:00,231:INFO:yaml file:config/config.yaml loaded successfully]
[2024-04-01 09:50:00,233:INFO:yaml file:params.yaml loaded successfully]
[2024-04-01 09:50:00,235:INFO:yaml file:Schema.yaml loaded successfully]
[2024-04-01 09:50:00,236:INFO:created directory at:artifacts]
[2024-04-01 09:50:00,237:INFO:created directory at:artifacts/data_transformation]
columns of X: Index(['Geography', 'Gender', 'NumOfProducts', 'IsActiveMember',
       'sufficient_balance', 'is_CreditScore_low', 'CreditScore', 'Age',
       'Tenure', 'Balance', 'EstimatedSalary'],
      dtype='object')
[2024-04-01 09:50:00,378:INFO:Data has been split into training and test sets.]
[2024-04-01 09:50:00,378:INFO:Training set shape: (11918, 12)]
[2024-04-01 09:50:00,379:INFO:Test set shape: (2500, 12)]
columns of df_transformed: Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary',
       'Geography', 'Gender', 'NumOfProducts', 'IsActiveMember',
       'sufficient_balance', 'is_CreditScore_l