In [1]:
import pandas as pd
import numpy as np

In [2]:
path='datasets/housing.csv'
df=pd.read_csv(path)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# Custom Transformer

#### Creating a custom transformer in Scikit-learn is very useful when you want to preprocess data in a custom way but still keep it compatible with Scikit-learn pipelines like:

#### ✅ Best Practices
##### Use __init__() only to define parameters — no logic.
##### Use fit() to compute any statistics if needed.
##### Use transform() to actually change the input X.
##### Always return NumPy arrays or DataFrames, not lists.
##### Make sure it works with Pipeline, ColumnTransformer, GridSearchCV, etc.

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

| Method            | Purpose                                                        |
| ----------------- | -------------------------------------------------------------- |
| `fit()`           | Learns parameters from data (optional)                         |
| `transform()`     | Applies the transformation                                     |
| `fit_transform()` | Comes from TransformerMixin – it calls `fit()` + `transform()` |


In [4]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class attributeAdder(BaseEstimator, TransformerMixin):
    def __init__(self, addBedroom=True):  # optional parameter
        self.addBedroom = addBedroom

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Assume: col 0 = total_rooms, col 1 = bedrooms, col 2 = households
        total_rooms = X[:, 0]
        bedrooms = X[:, 1]
        households = X[:, 1]

        rooms_per_household = (total_rooms / households).reshape(-1, 1)

        if self.addBedroom:
            bedrooms_per_room = (bedrooms / total_rooms).reshape(-1, 1)
            return np.hstack([X, rooms_per_household, bedrooms_per_room])
        else:
            return np.hstack([X, rooms_per_household])


In [5]:
data = np.array([[1000, 200], [850, 170]])  # total_rooms, households
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('room_ratio_adder', attributeAdder())
])

output = pipeline.fit_transform(data)
print(output)

[[1.0e+03 2.0e+02 5.0e+00 2.0e-01]
 [8.5e+02 1.7e+02 5.0e+00 2.0e-01]]


In [6]:
class Scalar(BaseEstimator):
    def __init__(self):
        pass

    def fit(self,X,y=None):
        self.min=X.min(axis=0)
        self.max=X.max(axis=0)
        return self

    def transform(self,X):
        return (X-self.min)/(self.max-self.min)

In [7]:
data = np.array([[10, 200],
                 [20, 300],
                 [30, 400]])

scaler = Scalar()
scaler.fit(data)
scaled_data = scaler.transform(data)
print(scaled_data)

[[0.  0. ]
 [0.5 0.5]
 [1.  1. ]]


#### Custom Transformer for Frequency Encoding

In [8]:
df['ocean_proximity'].value_counts().index

Index(['<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND'], dtype='object', name='ocean_proximity')

In [9]:
class FrequencyEncoder(BaseEstimator,TransformerMixin):
    def __init__(self,feature_index):
        self.feature_index=feature_index
    def fit(self,X,y=None):
        value_counts=X.iloc[:,self.feature_index].value_counts(normalize=True)
        self.freq_map=value_counts.to_dict()
        return self
    def transform(self,X,y=None):
        X=X.copy()
        col=X.columns[self.feature_index]
        X[col]=X[col].map(self.freq_map)
        return X

In [10]:
df = pd.DataFrame({
    'profession': ['Doctor', 'Engineer', 'Doctor', 'Artist', 'Engineer', 'Engineer']
})

encoder = FrequencyEncoder(feature_index=0)
transformed = encoder.fit_transform(df)
print(transformed)


   profession
0    0.333333
1    0.500000
2    0.333333
3    0.166667
4    0.500000
5    0.500000


## Creating a custom transformer that Find missing values

In [11]:
class Finder(BaseEstimator,TransformerMixin):
    def __init__(self,feature_index):
        self.feature_index=feature_index
    def fit(self,X,y=None):
        """ here i am assumig that the missing values are always nan not other"""
        self.filt=pd.isna(X.iloc[:,self.feature_index])
        return self
    def transform(self,X,y=None):
        X=X.copy()
        X=X.loc[self.filt]
        return X

In [12]:
df=pd.DataFrame({
    'Age':np.random.randint(1,100,size=(10)),
    'Passion':np.random.choice(['doctor','engineer','dentist','scientist'],size=(10)),
    'Income':np.random.choice(range(1000,100000),size=(10))
})

In [13]:
# creating missing values
df.iloc[0,0]=np.nan
df.iloc[4,0]=np.nan

In [14]:
df.iloc[:,0]

0     NaN
1    74.0
2    33.0
3    24.0
4     NaN
5    38.0
6    95.0
7    51.0
8    91.0
9    98.0
Name: Age, dtype: float64

In [15]:
pd.isna(df.iloc[:,0])

0     True
1    False
2    False
3    False
4     True
5    False
6    False
7    False
8    False
9    False
Name: Age, dtype: bool

In [16]:
transformer=Finder(0)
df_encode=transformer.fit_transform(df)
df_encode

Unnamed: 0,Age,Passion,Income
0,,dentist,63218
4,,scientist,63532


## 🧠 Challenge 2: Rare Category Grouper

In [17]:
for i,val in enumerate(df['Age']):
    print(i)

0
1
2
3
4
5
6
7
8
9


In [18]:
class CategoryGrouper(BaseEstimator,TransformerMixin):
    def __init__(self,feature_index,top_n):
        self.feature_index=feature_index
        self.top_n=top_n

    def fit(self,X,y=None):
        return self        
    def transform(self,X,y=None):
        X=X.copy()
        top_n_feature=X.iloc[:,self.feature_index].value_counts()[self.top_n]
        for i,row in enumerate(X.iloc[:,self.feature_index]):
            if(row not in top_n_feature):
                X.iloc[i,self.feature_index]="Other"
                
        return X

In [19]:
df=pd.DataFrame({
    'Age':np.random.randint(1,100,size=(1000)),
    'Passion':np.random.choice(['doctor','engineer','dentist','scientist'],size=(1000)),
})

In [20]:
df['Passion'].value_counts()

Passion
engineer     264
scientist    258
dentist      246
doctor       232
Name: count, dtype: int64

In [22]:
transformer=CategoryGrouper(1,2)
df_encoded=transformer.fit_transform(df)

  top_n_feature=X.iloc[:,self.feature_index].value_counts()[self.top_n]


TypeError: argument of type 'numpy.int64' is not iterable

In [None]:
df_encoded