<a href="https://colab.research.google.com/github/sandeeprairai/Feature-Engineering/blob/main/encoding_categorical_feature.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [2]:
# dataset generation
import pandas as pd
import numpy as np
import category_encoders as ce

# Simulating a dataset
data = {
    'Age': np.random.randint(20, 60, size=100).astype(float),  # Random ages between 20 and 60
    'State': np.random.choice(['Karnataka', 'Tamil Nadu', 'Maharashtra', 'Delhi', 'Telangana'], size=100),
    'Education': np.random.choice(['High School', 'UG', 'PG'], size=100),
    'Package': np.random.rand(100) * 100  # Random package values for demonstration
}

# Introducing missing values in 'Age' column (5%)
np.random.seed(0)  # For reproducibility
missing_indices = np.random.choice(data['Age'].shape[0], replace=False, size=int(data['Age'].shape[0] * 0.05))
data['Age'][missing_indices] = np.nan

df = pd.DataFrame(data)

df.head()

Unnamed: 0,Age,State,Education,Package
0,28.0,Karnataka,UG,0.044549
1,28.0,Telangana,PG,1.790736
2,,Karnataka,PG,6.428513
3,48.0,Karnataka,PG,90.143917
4,24.0,Maharashtra,High School,94.873305


In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Package']), df['Package'], test_size=0.2, random_state=42)

In [4]:
X_train.head()

Unnamed: 0,Age,State,Education
55,,Tamil Nadu,High School
88,37.0,Karnataka,High School
26,,Telangana,PG
42,30.0,Telangana,High School
69,39.0,Tamil Nadu,UG


In [5]:
X_train['State'].value_counts()

State
Telangana      22
Karnataka      17
Tamil Nadu     14
Maharashtra    14
Delhi          13
Name: count, dtype: int64

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.base import BaseEstimator,TransformerMixin
import pandas as pd
import sklearn

In [7]:
class CountEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        self.count_map = {}

    def fit(self, X, y=None):
        if self.columns is None:
            self.columns = X.columns
        for col in self.columns:
            self.count_map[col] = X[col].value_counts().to_dict()
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[col] = X[col].map(self.count_map[col]).fillna(0)
        return X

In [8]:
preprocessor=ColumnTransformer(
    transformers=[
        ('age_missing',SimpleImputer(strategy='mean'),['Age']),
        ('cat_state',CountEncoder(),['State']),
        ('education_ordinal',OrdinalEncoder(),['Education'])
    ]
)
sklearn.set_config(transform_output="pandas")

In [9]:
preprocessor.fit_transform(X_train)

Unnamed: 0,age_missing__Age,cat_state__State,education_ordinal__Education
55,38.48,14,0.0
88,37.00,17,0.0
26,38.48,22,1.0
42,30.00,22,0.0
69,39.00,14,2.0
...,...,...,...
60,48.00,14,1.0
71,51.00,17,0.0
14,20.00,13,1.0
92,55.00,17,0.0


In [11]:
# using category encoders
from category_encoders.count import CountEncoder

In [12]:
preprocessor=ColumnTransformer(
    transformers=[
        ('age_missing',SimpleImputer(strategy='mean'),['Age']),
        ('cat_state',CountEncoder(normalize=True),['State']),
        ('education_ordinal',OrdinalEncoder(),['Education'])
    ]
)

sklearn.set_config(transform_output='pandas')

In [13]:
preprocessor.fit_transform(X_train)

Unnamed: 0,age_missing__Age,cat_state__State,education_ordinal__Education
55,38.48,0.1750,0.0
88,37.00,0.2125,0.0
26,38.48,0.2750,1.0
42,30.00,0.2750,0.0
69,39.00,0.1750,2.0
...,...,...,...
60,48.00,0.1750,1.0
71,51.00,0.2125,0.0
14,20.00,0.1625,1.0
92,55.00,0.2125,0.0


In [14]:
# parameters
import pandas as pd
import numpy as np
import category_encoders as ce

# Simulating a dataset
np.random.seed(42)  # For reproducibility
data = {
    'State': np.random.choice(['Karnataka', 'Tamil Nadu', 'Maharashtra', 'Delhi', 'Telangana', np.NaN], size=100),
    'Education': np.random.choice(['High School', 'UG', 'PG', np.NaN], size=100)
}
df = pd.DataFrame(data)

df.head(25)


Unnamed: 0,State,Education
0,Delhi,PG
1,Telangana,High School
2,Maharashtra,High School
3,Telangana,High School
4,Telangana,PG
5,Tamil Nadu,High School
6,Maharashtra,
7,Maharashtra,High School
8,Maharashtra,
9,Telangana,


In [15]:
df.isnull().sum()

State        0
Education    0
dtype: int64

In [18]:
df['State'].value_counts()

State
Delhi          25
Tamil Nadu     19
Telangana      17
nan            17
Maharashtra    11
Karnataka      11
Name: count, dtype: int64

In [16]:
# Initialize the CountEncoder with various parameters
encoder=ce.CountEncoder(
    cols=['State','Education'],
    handle_missing='error',
    handle_unknown='error'
)

In [17]:
encoder.fit_transform(df)

Unnamed: 0,State,Education
0,25,34
1,17,27
2,11,27
3,17,27
4,17,34
...,...,...
95,25,27
96,25,16
97,17,23
98,11,23


In [19]:
encoder.mapping

{'State': State
 Delhi          25
 Tamil Nadu     19
 Telangana      17
 nan            17
 Maharashtra    11
 Karnataka      11
 Name: count, dtype: int64,
 'Education': Education
 PG             34
 High School    27
 nan            23
 UG             16
 Name: count, dtype: int64}

In [20]:
new_data=pd.DataFrame({'State':['Bihar'],'Education':['UG']})
encoder.transform(new_data)

ValueError: Missing data found in column State at transform time.

In [21]:
np.random.seed(0)  # For reproducibility
data = {
    'Category': np.random.choice(['A', 'B', 'C', 'D', 'E', 'F', np.nan], size=100, p=[0.3, 0.25, 0.15, 0.15, 0.05, 0.05, 0.05]),
    'Value': np.random.rand(100)
}

df = pd.DataFrame(data)

df.sample(10)


Unnamed: 0,Category,Value
91,C,0.209844
29,B,0.290078
2,C,0.735194
50,C,0.149448
44,C,0.806194
78,A,0.704414
33,C,0.298282
65,B,0.855803
75,A,0.223925
45,C,0.703889


In [22]:
df['Category'].value_counts()

Category
A      34
B      22
C      21
D      12
nan     5
F       4
E       2
Name: count, dtype: int64

In [23]:
encoder = ce.CountEncoder(
    cols=['Category'],
    min_group_size=10,  # Groups with counts less than 5 will be combined
    min_group_name='salman',  # Use default naming for combined minimum groups
)

# Fit and transform the dataset
encoded_df = encoder.fit_transform(df['Category'])

# Display the original and encoded data for comparison
df['Encoded'] = encoded_df
print(df.head(20))

   Category     Value  Encoded
0         B  0.677817       22
1         D  0.270008       12
2         C  0.735194       21
3         B  0.962189       22
4         B  0.248753       22
5         C  0.576157       21
6         B  0.592042       22
7         E  0.572252       11
8       nan  0.223082       11
9         B  0.952749       22
10        D  0.447125       12
11        B  0.846409       22
12        C  0.699479       21
13        F  0.297437       11
14        A  0.813798       34
15        A  0.396506       34
16        A  0.881103       34
17        D  0.581273       12
18        D  0.881735       12
19        E  0.692532       11


In [24]:
encoder.mapping

{'Category': Category
 A         34
 B         22
 C         21
 D         12
 salman    11
 Name: count, dtype: int64}

### Binary Encoder

In [25]:
import pandas as pd
import category_encoders as ce

# Sample dataset
data = {
    'Item': ['Item1', 'Item2', 'Item3', 'Item4', 'Item5', 'Item6', 'Item7', 'Item8'],
    'Fruit': ['Apple', 'Banana', 'Cherry', 'Date', 'Elderberry', 'Fig', 'Grape', 'Honeydew']
}
df = pd.DataFrame(data)

df


Unnamed: 0,Item,Fruit
0,Item1,Apple
1,Item2,Banana
2,Item3,Cherry
3,Item4,Date
4,Item5,Elderberry
5,Item6,Fig
6,Item7,Grape
7,Item8,Honeydew


In [26]:
# Initialize the Binary Encoder
encoder=ce.BinaryEncoder(cols=['Fruit'],return_df=True)

# Fit and transform the data
df_encoded=encoder.fit_transform(df)

print(df_encoded)

    Item  Fruit_0  Fruit_1  Fruit_2  Fruit_3
0  Item1        0        0        0        1
1  Item2        0        0        1        0
2  Item3        0        0        1        1
3  Item4        0        1        0        0
4  Item5        0        1        0        1
5  Item6        0        1        1        0
6  Item7        0        1        1        1
7  Item8        1        0        0        0


### Target Encoder

In [27]:
# using category_encoder

import pandas as pd
import category_encoders as ce

# Sample data
data = {
    'Feature': ['A', 'B', 'A', 'B', 'C', 'A', 'B', 'C'],
    'Target': [1, 0, 0, 1, 1, 1, 0, 1]
}
df = pd.DataFrame(data)

# Separating the feature and target columns
X = df.drop('Target', axis=1)
y = df['Target']

# Initialize the TargetEncoder
encoder = ce.TargetEncoder(cols=['Feature'])

# Fit the encoder using the feature data and target variable
encoder.fit(X, y)

# Transform the data
encoded = encoder.transform(X)

# Show the original and encoded data
print(pd.concat([df, encoded], axis=1))


   Feature  Target   Feature
0        A       1  0.631436
1        B       0  0.579948
2        A       0  0.631436
3        B       1  0.579948
4        C       1  0.678194
5        A       1  0.631436
6        B       0  0.579948
7        C       1  0.678194


In [28]:
encoder.mapping

{'Feature': Feature
  1    0.631436
  2    0.579948
  3    0.678194
 -1    0.625000
 -2    0.625000
 dtype: float64}

In [29]:
!pip install --upgrade scikit-learn==1.4.0

Collecting scikit-learn==1.4.0
  Downloading scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.4.0


In [1]:
# using sklearn
import pandas as pd
from sklearn.preprocessing import TargetEncoder

# Sample data
data = {
    'Feature': ['A', 'B', 'A', 'B', 'C', 'A', 'B', 'C'],
    'Target': [1, 0, 0, 1, 1, 1, 0, 1]
}
df = pd.DataFrame(data)

# Separating the feature and target columns
X = df.drop('Target', axis=1)
y = df['Target']

# Initialize the TargetEncoder
encoder = TargetEncoder(smooth=0.0)

# Fit the encoder using the feature data and target variable
encoder.fit(X, y)

# Transform the data
encoded = encoder.transform(X)

encoded


array([[0.66666667],
       [0.33333333],
       [0.66666667],
       [0.33333333],
       [1.        ],
       [0.66666667],
       [0.33333333],
       [1.        ]])

### Weight of evidence

In [2]:
!pip install category_encoders



In [3]:
import pandas as pd
import category_encoders as ce

# Example dataset
data = {
    'Feature': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B', 'A', 'C'],
    'Target': [1, 0, 0, 1, 1, 0, 1, 0, 1, 0]
}
df = pd.DataFrame(data)

# Define the features and target
X = df[['Feature']]
y = df['Target']

# Initialize and fit the TargetEncoder
encoder = ce.WOEEncoder(cols=['Feature'])
X_encoded = encoder.fit_transform(X, y)

# Display the original and encoded data
df['Feature_Encoded'] = X_encoded
print(df)


  Feature  Target  Feature_Encoded
0       A       1         0.000000
1       B       0        -0.405465
2       A       0         0.000000
3       C       1         0.405465
4       B       1        -0.405465
5       A       0         0.000000
6       C       1         0.405465
7       B       0        -0.405465
8       A       1         0.000000
9       C       0         0.405465
