# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load Data

In [2]:
raw_train:pd.DataFrame = pd.read_csv(filepath_or_buffer="../data/loan-train.csv")
raw_test:pd.DataFrame = pd.read_csv(filepath_or_buffer="../data/loan-test.csv")
raw_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
raw_train.shape

(614, 13)

In [4]:
raw_train.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [5]:
# copy the raw data for modifications
train_df:pd.DataFrame = raw_train.copy(deep=True)
test_df:pd.DataFrame = raw_test.copy(deep=True)

In [6]:
train_df.info() # only for training

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [7]:
test_df.info() # only for testing (prediction)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             356 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         357 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      344 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     338 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


In [8]:
train_y:pd.DataFrame = train_df[["Loan_Status"]]
train_df.drop(labels="Loan_Status", axis=1, inplace=True)
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


# Drop Unnecessary Columns
* `Loan Id` has all the rows as unique i.e total unique values in Loan Id = Total rows in the data and since all the rows are unique our ml model won't find any patterns in it.

In [9]:
train_df.drop(labels="Loan_ID", axis=1, inplace=True)
test_df.drop(labels="Loan_ID", axis=1, inplace=True)

In [10]:
print(sorted(train_df.columns))
print(sorted(test_df.columns))

['ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed']
['ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed']


# Check Duplicates

In [11]:
train_df[train_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area


In [12]:
train_df.drop_duplicates(inplace=True)

In [13]:
test_df[test_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
195,Male,No,0,Graduate,Yes,5833,0,116.0,360.0,1.0,Urban


Since, the duplicate row is in the **test df**, it will have no impact in `model training`. So, it's upto you whether to drop the duplicated rows from test df or not.

# Check Missing Values

In [14]:
train_df.isna().sum().sort_values(ascending=False)

Credit_History       50
Self_Employed        32
LoanAmount           22
Dependents           15
Loan_Amount_Term     14
Gender               13
Married               3
Education             0
ApplicantIncome       0
CoapplicantIncome     0
Property_Area         0
dtype: int64

# Impute Missing Values

* Impute **Categorical** Feature ➾ **Mean**
* Impute **Numerical** Feature ➾ **Mode**

In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(6)
memory usage: 52.9+ KB


In [16]:
train_df.columns.sort_values()

Index(['ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents',
       'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married',
       'Property_Area', 'Self_Employed'],
      dtype='object')

In [17]:
train_df.nunique().sort_values(ascending=False)

ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Dependents             4
Property_Area          3
Gender                 2
Married                2
Education              2
Self_Employed          2
Credit_History         2
dtype: int64

In [18]:
numerical_cols:list[str] = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
categorical_cols:list[str] = [col for col in train_df.columns if col not in numerical_cols]
print(categorical_cols)

['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']


* `strategy="most_frequent"` ➾ Mode

In [19]:
categorical_imputer = SimpleImputer(strategy="most_frequent") # initialize imputer
categorical_imputer.fit(X=train_df[categorical_cols]) # learn

train_df[categorical_cols] = categorical_imputer.transform(X=train_df[categorical_cols]) # transform
test_df[categorical_cols] = categorical_imputer.transform(X=test_df[categorical_cols]) # transform

train_df[categorical_cols].head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area
0,Male,No,0,Graduate,No,1.0,Urban
1,Male,Yes,1,Graduate,No,1.0,Rural
2,Male,Yes,0,Graduate,Yes,1.0,Urban
3,Male,Yes,0,Not Graduate,No,1.0,Urban
4,Male,No,0,Graduate,No,1.0,Urban


In [20]:
numerical_imputer = SimpleImputer(strategy="most_frequent") # initialize imputer
numerical_imputer.fit(X=train_df[numerical_cols]) # learn

train_df[numerical_cols] = numerical_imputer.transform(X=train_df[numerical_cols]) # transform
test_df[numerical_cols] = numerical_imputer.transform(X=test_df[numerical_cols]) # transform

test_df[numerical_cols].head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term
0,5720.0,0.0,110.0,360.0
1,3076.0,1500.0,126.0,360.0
2,5000.0,1800.0,208.0,360.0
3,2340.0,2546.0,100.0,360.0
4,3276.0,0.0,78.0,360.0


# Re-Check Missing Values

In [21]:
train_df.isna().sum().sort_values(ascending=False)

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

# Feature Engineering

In [22]:
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849.0,0.0,120.0,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban


In [23]:
# combine ApplicantIncome + CoapplicantIncome into one
train_df["ApplicantIncome"] += train_df["CoapplicantIncome"]
test_df["ApplicantIncome"] += test_df["CoapplicantIncome"]

# drop the CoapplicantIncome as it's not needed
train_df.drop(labels=["CoapplicantIncome"],axis=1,inplace=True)
test_df.drop(labels=["CoapplicantIncome"],axis=1,inplace=True)

# remove the CoapplicantIncome from numerical cols also
numerical_cols.remove("CoapplicantIncome")

# Encoding Features with `LabelEncoder`
* Transforming Categorical Data into Numerical

In [24]:
train_df.nunique().sort_values()

Gender                2
Married               2
Education             2
Self_Employed         2
Credit_History        2
Property_Area         3
Dependents            4
Loan_Amount_Term     10
LoanAmount          203
ApplicantIncome     554
dtype: int64

In [25]:
train_df["Property_Area"].unique() # Can be treated as Nominal or Ordinal

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [26]:
train_df["Dependents"].unique() # Ordinal therefore, can apply Label Encoding

array(['0', '1', '2', '3+'], dtype=object)

In [27]:
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])  # Fit and transform the training data
    test_df[col] = le.fit_transform(test_df[col])  # Only transform the test data

train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,5849.0,120.0,360.0,1,2
1,1,1,1,0,0,6091.0,128.0,360.0,1,0
2,1,1,0,0,1,3000.0,66.0,360.0,1,2
3,1,1,0,1,0,4941.0,120.0,360.0,1,2
4,1,0,0,0,0,6000.0,141.0,360.0,1,2


# Log Transormation on Numerical Features
* **`Log transformation`** is a common technique used in machine learning to transform numerical features that have a `highly skewed distributio`n`. The significance of log transformation is that it can help to improve the performance of machine learning models by making the data more normally distributed, reducing the effect of outliers, and stabilizing the variance.

In [28]:
train_df[numerical_cols] = np.log(train_df[numerical_cols])
test_df[numerical_cols] = np.log(test_df[numerical_cols])

train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,8.674026,4.787492,5.886104,1,2
1,1,1,1,0,0,8.714568,4.85203,5.886104,1,0
2,1,1,0,0,1,8.006368,4.189655,5.886104,1,2
3,1,1,0,1,0,8.505323,4.787492,5.886104,1,2
4,1,0,0,0,0,8.699515,4.94876,5.886104,1,2


# Scale the Entire Data

In [29]:
minmax = MinMaxScaler() # initialize minmax scaler

train_df = minmax.fit_transform(X=train_df)
test_df = minmax.fit_transform(X=test_df)

train_df

array([[1.        , 0.        , 0.        , ..., 0.9220137 , 1.        ,
        1.        ],
       [1.        , 1.        , 0.33333333, ..., 0.9220137 , 1.        ,
        0.        ],
       [1.        , 1.        , 0.        , ..., 0.9220137 , 1.        ,
        1.        ],
       ...,
       [1.        , 1.        , 0.33333333, ..., 0.9220137 , 1.        ,
        1.        ],
       [1.        , 1.        , 0.66666667, ..., 0.9220137 , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.9220137 , 0.        ,
        0.5       ]])

# Building the Model

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_df, train_y.squeeze(), test_size=0.3, random_state=0)
X_train:np.ndarray
X_test:np.ndarray
y_train:pd.DataFrame
y_test:pd.DataFrame

In [31]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(429, 10)
(429,)
(185, 10)
(185,)


In [32]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

In [33]:
y_pred = lr.predict(X=X_test)

# Model Evaluation

In [34]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc)

0.827027027027027


# Serialization of ML Model

In [35]:
import joblib
joblib.dump(value=lr, filename="../artifacts/lr_model_v1.pkl")

['../artifacts/lr_model_v1.pkl']

# De-Serialization of ML Model

In [36]:
de_serialized_model = joblib.load(filename="../artifacts/lr_model_v1.pkl")
de_serialized_model

In [37]:
# Check that model before serialization and deserialization is same or not
print("Y-Intercept of De-Serialized Model:", de_serialized_model.intercept_)
print("Y-Intercept of the Original Model", lr.intercept_)

Y-Intercept of De-Serialized Model: [-2.10635808]
Y-Intercept of the Original Model [-2.10635808]


In [38]:
print("Coefficient of De-Serialized Model:\n", de_serialized_model.coef_)
print("\nCoefficient of Original Model:\n",lr.coef_)

Coefficient of De-Serialized Model:
 [[ 0.02309126  0.35215431  0.3931467  -0.4421388  -0.04886011  0.00607904
  -0.73182368  0.24499636  3.18041381  0.22543288]]

Coefficient of Original Model:
 [[ 0.02309126  0.35215431  0.3931467  -0.4421388  -0.04886011  0.00607904
  -0.73182368  0.24499636  3.18041381  0.22543288]]


# Check Path

In [39]:
import notebook

In [40]:
notebook.__file__

'/usr/local/lib/python3.11/site-packages/notebook/__init__.py'

In [41]:
import pathlib
pathlib.Path(notebook.__file__)

PosixPath('/usr/local/lib/python3.11/site-packages/notebook/__init__.py')

In [42]:
pathlib.Path(notebook.__file__).parent

PosixPath('/usr/local/lib/python3.11/site-packages/notebook')

In [43]:
pathlib.Path(notebook.__file__).parent.resolve()

PosixPath('/usr/local/lib/python3.11/site-packages/notebook')

# Create a Custom Data Transformer (`pipeline`) with Sklearn
1. To create a Custom Transformer Pipeline we have to inherit two mandatory classes:
    * **BaseEstimator**
    * **TransformerMixin**
2. Then, implement `fit` and `transform`.
3. Accept input with `__init__` method, if any.

In [44]:
from typing import Self
from sklearn.base import BaseEstimator, TransformerMixin

class DataTransformer(BaseEstimator, TransformerMixin):
    def __init__(self) -> None:
        super().__init__()

    def fit(self, X, y=None) ->Self:
        return self # returning 'self' because we want our 'fit' method to do nothing
    
    def transform(self, X) -> list:
        return X # returning 'X' because we don't want to do anything

## Custom Imputer for Numerical Features

In [45]:
from typing import Self, Union
from sklearn.base import BaseEstimator, TransformerMixin

class MeanImputer(BaseEstimator, TransformerMixin):
    """
    Our own Custom Data Transformer\n
    Numerical Imputation => Mean"""
    def __init__(self, numerical_features) -> None:
        self.numerical_features:list = numerical_features
        self.name = "Subrata"
        super().__init__()

    def fit(self, X, y=None) -> Self:
        self.mean_dict:dict = {}
        for col in self.numerical_features:
            self.mean_dict[col] = X[col].mean()
        return self
    
    def transform(self, X) -> pd.DataFrame:
        X = X.copy()
        for col in self.numerical_features:
            X[col].fillna(self.mean_dict[col], inplace=True)
        return X

In [46]:
import numpy as np
import pandas as pd

np.random.seed(seed=0)
df = pd.DataFrame(data = np.random.randint(low=10,high=20, size=(10,2)), columns=["A","B"])
df.iloc[1,0] = np.nan
df.iloc[2,0] = np.nan
df.iloc[3,1] = np.nan

In [47]:
df

Unnamed: 0,A,B
0,15.0,10.0
1,,13.0
2,,19.0
3,13.0,
4,12.0,14.0
5,17.0,16.0
6,18.0,18.0
7,11.0,16.0
8,17.0,17.0
9,18.0,11.0


* Let's impute the missing `nan` values with mean with our custom  `MeanImputer` class.

In [48]:
mean_imputer = MeanImputer(numerical_features=["A", "B"])

In [49]:
mean_imputer.fit(X=df)

In [50]:
mean_imputer.mean_dict

{'A': 15.125, 'B': 14.88888888888889}

In [51]:
df.mean()

A    15.125000
B    14.888889
dtype: float64

In [52]:
mean_imputer.name

'Subrata'

In [53]:
df = mean_imputer.transform(X=df)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(self.mean_dict[col], inplace=True)


Unnamed: 0,A,B
0,15.0,10.0
1,15.125,13.0
2,15.125,19.0
3,13.0,14.888889
4,12.0,14.0
5,17.0,16.0
6,18.0,18.0
7,11.0,16.0
8,17.0,17.0
9,18.0,11.0


In [54]:
class ModeImputer(BaseEstimator, TransformerMixin):
    """
    Our own Custom Data Transformer\n
    Numerical Imputation => Mode"""
    def __init__(self, categorical_features) -> None:
        self.categorical_features:list = categorical_features
        self.name = "Subrata"
        super().__init__()

    def fit(self, X, y=None) -> Self:
        """No need of learning and hence returning 'self'"""
        self.mode_dict:dict = {}
        for col in self.categorical_features:
            self.mode_dict[col] = X[col].mode()
        return self
    
    def transform(self, X) -> pd.DataFrame:
        """Impute with Mode"""
        X = X.copy()
        for col in self.categorical_features:
            X[col].fillna(self.mode_dict[col], inplace=True)
        return X

In [55]:
class DropColumns(BaseEstimator, TransformerMixin):
    """
    Our own Custom Data Transformer for Dropping Columns\n
    Dropping unnecessary Columns"""
    def __init__(self, columns_to_drop) -> None:
        self.columns_to_drop:list = columns_to_drop
        super().__init__()

    def fit(self, X, y=None) -> Self:
        """No need of learning and hence returning 'self'"""
        return self
    
    def transform(self, X) -> pd.DataFrame:
        """Drop the Columns"""
        X = X.copy()
        X = X.drop(columns = self.columns_to_drop)
        return X

In [56]:
class CombineColumns(BaseEstimator, TransformerMixin):
    """
    Feature Engineering\n
    ColumnA += ColumnB"""
    def __init__(self, columnA, columnB) -> None:
        self.columnA = columnA
        self.columnB = columnB
        super().__init__()

    def fit(self, X, y=None) -> Self:
        """No need of learning and hence returning 'self'"""
        return self
    
    def transform(self, X) -> pd.DataFrame:
        """Combine the Columns"""
        X = X.copy()
        X[self.columnA] += X[self.columnB]
        return X

In [57]:
df

Unnamed: 0,A,B
0,15.0,10.0
1,15.125,13.0
2,15.125,19.0
3,13.0,14.888889
4,12.0,14.0
5,17.0,16.0
6,18.0,18.0
7,11.0,16.0
8,17.0,17.0
9,18.0,11.0


In [58]:
combine_columns = CombineColumns(columnA="A", columnB="B")
combine_columns

In [59]:
combine_columns.fit(X=df)

In [60]:
combine_columns.transform(X=df)

Unnamed: 0,A,B
0,25.0,10.0
1,28.125,13.0
2,34.125,19.0
3,27.888889,14.888889
4,26.0,14.0
5,33.0,16.0
6,36.0,18.0
7,27.0,16.0
8,34.0,17.0
9,29.0,11.0


# Custom LabelEncoder

In [61]:
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    """
    Custom Label Encoder to encode categorical features to numerical features.

    This transformer replaces each unique categorical value with a numerical index,
    based on the sorted order of their frequency in the input data.

    Parameters
    ----------
    categorical_features : list of str
        List of column names in the input data that should be treated as categorical
        features and encoded using this transformer.

    Attributes
    ----------
    label_dict_ : dict of dict
        Mapping of each categorical feature to a dictionary of its unique values and
        their corresponding numerical indices.
"""
    def __init__(self, categorical_features: list[str]) -> None:
        self.categorical_features: list[str] = categorical_features
        super().__init__()

    def fit(self, X, y=None) -> 'CustomLabelEncoder':
        """
        Learn the mapping of each categorical feature to its unique values and their
        corresponding numerical indices.

        Parameters
        ----------
        X : pandas.DataFrame of shape (n_samples, n_features)
            The input data containing the categorical features to be encoded.

        y : None
            Ignored in this transformer.

        Returns
        -------
        self : CustomLabelEncoder
            The fitted transformer object.
        """
        self.label_dict_ = {}
        for col in self.categorical_features:
            t = X[col].value_counts().sort_values(ascending=True).index
            self.label_dict_[col] = {value: index for index, value in enumerate(iterable=t, start=0)}
        return self

    def transform(self, X) -> pd.DataFrame:
        """
        Replace each unique categorical value with its corresponding numerical index.

        Parameters
        ----------
        X : pandas.DataFrame of shape (n_samples, n_features)
            The input data containing the categorical features to be encoded.

        Returns
        -------
        X_encoded : pandas.DataFrame of shape (n_samples, n_features)
            The encoded input data with numerical features.
        """
        X = X.copy()
        for col in self.categorical_features:
            X[col] = X[col].map(self.label_dict_[col])
        return X


In [62]:
import numpy as np
import pandas as pd

np.random.seed(seed=0)
df = pd.DataFrame(data = np.random.randint(low=0,high=20, size=(6,2)), columns=["A","B"])

df.iloc[0,1] = "oK"
df.iloc[1,1] = "oK"
df.iloc[2,1] = "no"
df.iloc[3,1] = "ya"
df.iloc[4,1] = "ya"
df.iloc[5,1] = "ya"

df

  df.iloc[0,1] = "oK"


Unnamed: 0,A,B
0,12,oK
1,0,oK
2,3,no
3,9,ya
4,18,ya
5,6,ya


In [63]:
label_dict = {}
t = df["B"].value_counts().sort_values(ascending=True).index
t

Index(['no', 'oK', 'ya'], dtype='object', name='B')

In [64]:
label_dict["B"] = {value:index for index,value in enumerate(iterable=t, start=0)}
label_dict

{'B': {'no': 0, 'oK': 1, 'ya': 2}}

In [65]:
df["B"].map(label_dict["B"])

0    1
1    1
2    0
3    2
4    2
5    2
Name: B, dtype: int64

In [66]:
df

Unnamed: 0,A,B
0,12,oK
1,0,oK
2,3,no
3,9,ya
4,18,ya
5,6,ya


In [67]:
custom_label_encoder = CustomLabelEncoder(categorical_features=["B"])

In [68]:
custom_label_encoder.fit(X=df)

In [69]:
custom_label_encoder.transform(X=df)

Unnamed: 0,A,B
0,12,1
1,0,1
2,3,0
3,9,2
4,18,2
5,6,2


# `requirements.txt`

In [72]:
import numpy
numpy.__version__

'1.26.4'

In [73]:
import pandas
pandas.__version__

'2.2.0'

In [74]:
import sklearn
sklearn.__version__

'1.4.0'

In [75]:
import setuptools
setuptools.__version__

'69.0.2'

In [76]:
import wheel
wheel.__version__

'0.42.0'

In [77]:
import scipy
scipy.__version__

'1.12.0'

In [78]:
import joblib
joblib.__version__

'1.3.2'