In [35]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from logger import logging
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

logging.basicConfig(level=logging.DEBUG)  # or logging.INFO

logging.basicConfig(level=logging.INFO)

class GenderTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.gender_dict = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.gender_dict is None:
            self.gender_dict = {'male', 'female'}  

        def Gen(x):
            if x in self.gender_dict:
                return str(x)
            else:
                return 'other'
        
        X['New Gender'] = X["Gender "].apply(Gen)
        gend = pd.get_dummies(X["New Gender"])
        X = pd.concat([X, gend], axis=1)
        
        logging.info("Columns after1:")
        logging.info(X.columns)
        
        return X
    
class Function(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):        
        fen = pd.get_dummies(X["Function"])
        X = pd.concat([X,  fen], axis=1)
        
        logging.info("Columns after1:")
        logging.info(X.columns)
        
        return X
class HiringSource(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):        
        hr = pd.get_dummies(X["Hiring Source"])
        X = pd.concat([X,  hr], axis=1)
        
        logging.info("Columns after1:")
        logging.info(X.columns)
        
        return X
    
class tengrp(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):        
        tengrp = pd.get_dummies(X["Tenure Grp."])
        X = pd.concat([X,  tengrp], axis=1)
        
        logging.info("Columns after1:")
        logging.info(X.columns)
        
        return X
    

class PromotedTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        def Promoted(x):
            if x == 'Promoted':
                return int(1)
            else:
                return int(0)

        X['New Promotion'] = X["Promoted/Non Promoted"].apply(Promoted)
        logging.info("Columns after2:")
        logging.info(X.columns)
        return X

class Location(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.location_dict_new = {
            'Chennai':       7,
            'Noida':         6,
            'Bangalore':     5,
            'Hyderabad':     4,
            'Pune':          3,
            'Madurai':       2,
            'Lucknow':       1,
            'other place':   0,
        }
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        def location(x):                     
            if str(x) in self.location_dict_new.keys():
                return self.location_dict_new[str(x)]
            else:
                return self.location_dict_new['other place']
        
        X['New Location'] = X["Location"].apply(location)
        logging.info("Columns after3:")
        logging.info(X.columns)
        return X  
    
class Marraige(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.Marital_dict = None
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        self.Marital_dict = X["Marital Status"].value_counts().to_dict()
        def Mar(x):
            if str(x) in self.Marital_dict.keys() and self.Marital_dict[str(x)] > 100:
                return str(x)
            else:
                return 'other status'

        X['New Marital'] = X["Marital Status"].apply(Mar)
        Mr = pd.get_dummies(X["New Marital"])
        X = pd.concat([X,  Mr], axis=1)
        logging.info("Columns after4:")
        logging.info(X.columns) 
        return X 
    
class Emp_group(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.Emp_dict_new = {
            'B1': 4,
            'B2': 3,
            'B3': 2,
            'other group': 1,
        }
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        def emp(x):
            if str(x) in self.Emp_dict_new.keys():
                return str(x)
            else:
                return 'other group'
        
        X['New EMP'] = X["Emp. Group"].apply(emp)
        emp = pd.get_dummies(X["New EMP"])
        X = pd.concat([X,  emp], axis=1)
        logging.info("Columns after5:")
        logging.info(X.columns)        
        return X 
   
class Job_Role(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        def Job(x):
            if x == 'Yes':
                return int(1)
            else:
                return int(0)

        X['New Job Role Match'] = X["Job Role Match"].apply(Job)
        logging.info("Columns after6:")
        logging.info(X.columns)     
        return X

class Gender(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.gender_dict = None
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        self.gender_dict = X["Gender "].value_counts().to_dict()
        def Gen(x):
            if str(x) in self.gender_dict.keys() and self.gender_dict[str(x)] > 100:
                return str(x)
            else:
                return 'other status'
        
        X['New Gender'] = X["Gender "].apply(Gen)
        logging.info("Columns after7:")
        logging.info(X.columns)         
        return X 

class Droping(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop=None):
        self.columns_to_drop = columns_to_drop
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if self.columns_to_drop:
            X = X.drop(self.columns_to_drop, axis=1)
            logging.info("Columns after dropping:")
            logging.info(X.columns)
            
        return X
columns_to_drop_names = ["table id", "name", "Marital Status", "Promoted/Non Promoted", 
                         "Function", "Emp. Group", "Job Role Match", "Location", 
                         "Hiring Source", "Gender ", "Tenure", "Tenure Grp.", "phone number",]
drop_columns_transformer = Droping(columns_to_drop=columns_to_drop_names)
    
import pandas as pd
from sklearn.pipeline import Pipeline

# Step 1: Read the CSV file into the DataFrame
attrdata = pd.read_csv(r"C:\Users\pj\Desktop\End to End Project\Attrition_Rate_Django\Notebook\data\Table_1.csv")
logging.info("attrdata ")
        
logging.info("categorical_columns2")

preprocessor = make_column_transformer(
    (GenderTransformer(), ["Gender "]),
    (Function(),["Function"]),
    (PromotedTransformer(), ["Promoted/Non Promoted"]),
    (HiringSource(), ["Hiring Source"]),
    (Location(), ["Location"]),
    (Marraige(), ["Marital Status"]),
    (Emp_group(), ["Emp. Group"]),
    (Job_Role(), ["Job Role Match"]),
    (tengrp(), ["Tenure Grp."]),
    (Gender(), ["Gender "]),
    (drop_columns_transformer, columns_to_drop_names),
)


data_pipeline = Pipeline([
    ('preprocessor', preprocessor),
])

# Apply the preprocessing pipeline to the entire dataset
transformed_data = data_pipeline.fit_transform(attrdata)

In [32]:
transformed_data[0]

array(['Male', 'other', True, 'Operation', True, False, False,
       'Non Promoted', 0, 'Direct', False, True, False, 'Pune', 3,
       'Single', 'Single', False, True, False, 'B2', 'B2', False, True,
       False, False, 'Yes', 1, '< =1', True, False, 'Male', 'Male'],
      dtype=object)

In [None]:
    def initiate_data_transformation(self, train_path, test_path):
        try:
            train_df = pd.read_csv(train_path)
            logging.info("Train DataFrame info:")
            logging.info(f"Column names present in the train_df: {train_df.columns.tolist()}")
            test_df = pd.read_csv(test_path)
            logging.info("Test DataFrame info:")
            logging.info(f"Column names present in the test_df: {test_df.columns.tolist()}")

            logging.info("Read train and test data completed")

            logging.info("Obtaining preprocessing object")

            preprocessing_obj = self.data_transformation_pipeline(train_df)

            target_column_name = "Price"

            input_feature_train_df = train_df.drop(columns=[target_column_name], axis=1)
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name], axis=1)
            target_feature_test_df = test_df[target_column_name]

            logging.info(
                f"Applying preprocessing object on training dataframe and testing dataframe."
            )

            # Use the fitted preprocessing object to transform both train and test data
            input_feature_train_arr = preprocessing_obj.transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            logging.info(f"Saved preprocessing object.")

            save_object(
                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj
            )

            # Log the columns in the DataFrame after transformation
            logging.info("Columns in the transformed train DataFrame:")
            logging.info(pd.DataFrame(input_feature_train_arr, columns=input_feature_train_df.columns))

            logging.info("Columns in the transformed test DataFrame:")
            logging.info(pd.DataFrame(input_feature_test_arr, columns=input_feature_test_df.columns))

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path,
            )
        except Exception as e:
            raise CustomException(e, sys)


In [34]:
class Droping(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        def drop(x):
            X.drop(["table id", "name", "Marital Status","Promoted/Non Promoted","Function","Emp. Group","Job Role Match","Location","Hiring Source","Gender ", 'Tenure', 'New Gender', 'New Marital', 'New EMP'],axis=1,inplace=True)


Unnamed: 0,table id,name,phone number,Location,Emp. Group,Function,Gender,Tenure,Tenure Grp.,Experience (YY.MM),Marital Status,Age in YY.,Hiring Source,Promoted/Non Promoted,Job Role Match,Stay/Left,New Gender,New Promotion,New Location,New Marital
0,1,sid,9876544345,Pune,B2,Operation,Male,0.0,< =1,6.08,Single,27.12,Direct,Non Promoted,Yes,Left,other,0,3,other status
1,2,sid,9876544345,Noida,B7,Support,Male,0.0,< =1,13.0,Marr.,38.08,Direct,Promoted,No,Stay,other,1,6,other status
2,3,sid,9876544345,Bangalore,B3,Operation,Male,0.01,< =1,16.05,Marr.,36.04,Direct,Promoted,Yes,Stay,other,1,5,other status
3,4,sid,9876544345,Noida,B2,Operation,Male,0.01,< =1,6.06,Marr.,32.07,Direct,Promoted,Yes,Stay,other,1,6,other status
4,5,sid,9876544345,Lucknow,B2,Operation,Male,0.0,< =1,7.0,Marr.,32.05,Direct,Non Promoted,Yes,Stay,other,0,1,other status
