In [3]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
import pandas as pd
from typing import List, Optional
class BaseDataPreprocessor(TransformerMixin, BaseEstimator):
    """Standardize a selection of columns with ``StandardScaler``.

    Missing values are replaced with 0 before the data reaches the scaler,
    both when fitting and when transforming.

    Parameters
    ----------
    needed_columns : list of str, optional
        Columns picked from the input before scaling. When ``None`` the
        whole input is used.
    """

    def __init__(self, needed_columns: Optional[List[str]] = None):
        self.scaler = StandardScaler()
        self.needed_columns = needed_columns

    def _select_features(self, data) -> pd.DataFrame:
        """Return the columns to scale as a DataFrame with NaNs replaced by 0."""
        selected = data if self.needed_columns is None else data[self.needed_columns]
        return pd.DataFrame(selected).fillna(0)

    def fit(self, data, *args):
        """Fit the scaler on the selected columns of ``data``.

        Extra positional arguments (e.g. a target passed by a pipeline)
        are accepted and ignored.

        Returns
        -------
        self
        """
        self.scaler.fit(self._select_features(data))
        self.is_fitted_ = True
        return self

    def transform(self, data: pd.DataFrame) -> np.ndarray:
        """Scale the selected columns of ``data`` with the fitted scaler.

        Returns whatever ``StandardScaler.transform`` produces for the
        selected, NaN-filled columns.
        """
        return self.scaler.transform(self._select_features(data))
            
class OneHotPreprocessor(BaseDataPreprocessor):
    """Scale numeric columns and one-hot encode categorical columns.

    The scaled block (handled by ``BaseDataPreprocessor``) and the encoded
    block are stacked horizontally, scaled columns first.

    Parameters
    ----------
    interesting_columns : list of str, optional
        Columns passed to ``OneHotEncoder``. When ``None`` no encoding is
        done and only the scaled block is returned.
    needed_columns : list of str, optional (keyword-only)
        Columns to scale. When ``None`` and ``interesting_columns`` is
        given, every column except the interesting ones is scaled; when
        both are ``None`` the whole input is scaled.
    """

    def __init__(self, interesting_columns: Optional[List[str]] = None, *,
                 needed_columns: Optional[List[str]] = None):
        # needed_columns is spelled out rather than hidden behind **kwargs:
        # scikit-learn's get_params()/clone() ignore **kwargs, so the value
        # would otherwise be silently dropped whenever the estimator is cloned.
        super().__init__(needed_columns=needed_columns)
        self.interesting_columns = interesting_columns
        self.ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    def fit(self, data, *args):
        """Fit the scaler and, if configured, the one-hot encoder.

        Extra positional arguments are forwarded to the parent ``fit``.

        Returns
        -------
        self
        """
        # Derive the columns to scale only when there is something to
        # exclude; calling drop() with no labels raises in pandas.
        # NOTE: the derived list is stored on ``needed_columns`` so that the
        # parent's fit/transform select the same columns later on.
        if self.needed_columns is None and self.interesting_columns is not None:
            self.needed_columns = (
                data.drop(columns=self.interesting_columns).columns.tolist()
            )

        super().fit(data, *args)
        if self.interesting_columns is not None:
            self.ohe.fit(data[self.interesting_columns])
        self.is_fitted_ = True
        return self

    def transform(self, data):
        """Return the scaled columns followed by the one-hot encoded columns."""
        X_scaled = super().transform(data)
        if self.interesting_columns is None:
            return X_scaled
        X_encoded = self.ohe.transform(data[self.interesting_columns])
        return np.hstack((X_scaled, X_encoded))
    
def grad(X, w, b, y, reg):
    """Return the gradients of the regularized mean squared error.

    Computes ``(2 / n) * X.T @ r + 2 * reg * w`` for the weights and
    ``(2 / n) * sum(r)`` for the bias, where ``r = X @ w + b - y`` and
    ``n = X.shape[0]``.

    Parameters
    ----------
    X : array with samples along the first axis.
    w : weight vector multiplied as ``X @ w``.
    b : bias added to every prediction.
    y : targets, same shape as ``X @ w``.
    reg : regularization strength applied to ``w`` only.

    Returns
    -------
    tuple
        ``(grad_w, grad_b)``.
    """
    # The residual feeds both gradients, so compute it once.
    residual = X @ w + b - y
    scale = 2 / X.shape[0]
    grad_w = scale * (X.T @ residual) + 2 * reg * w
    grad_b = scale * np.sum(residual)
    return grad_w, grad_b

class SGDLinearRegressor(RegressorMixin, BaseEstimator):
    """Linear regressor trained with mini-batch gradient descent.

    Gradients come from the module-level ``grad`` function.

    Parameters
    ----------
    lr : float, default=0.01
        Step size applied to every gradient.
    regularization : float, default=1.
        Regularization strength forwarded to ``grad``.
    delta_converged : float, default=1e-3
        Training stops as soon as the norm of the weight update
        ``lr * grad_w`` drops below this value.
    max_steps : int, default=1000
        Maximum number of parameter updates.
    batch_size : int, default=64
        Samples per mini-batch; capped at the number of training samples
        for the duration of a ``fit`` call.

    Attributes
    ----------
    W_ : weight vector, ``None`` until ``fit`` is called.
    b_ : bias term, ``None`` until ``fit`` is called.
    is_fitted_ : bool, ``True`` after ``fit``.
    """

    def __init__(self,
                 lr=0.01, regularization=1., delta_converged=1e-3, max_steps=1000,
                 batch_size=64):
        # Hyper-parameters are stored exactly as given: scikit-learn's clone()
        # rejects estimators whose constructor alters its arguments, so the
        # integer coercion happens in fit() instead.
        self.lr = lr
        self.regularization = regularization
        self.max_steps = max_steps
        self.delta_converged = delta_converged
        self.batch_size = batch_size
        self.is_fitted_ = False

        # Placeholders until fit() is called.
        self.W_ = None
        self.b_ = None

    def fit(self, X, Y):
        """Fit weights and bias on ``X`` and ``Y``.

        Every epoch visits the samples in a fresh random order (drawn from
        numpy's global random state) and updates the parameters once per
        mini-batch until ``max_steps`` updates were made or the weight
        update becomes smaller than ``delta_converged``.

        Raises
        ------
        ValueError
            If ``X`` has no samples or ``batch_size`` is smaller than 1.

        Returns
        -------
        self
        """
        # Convert first so that plain lists are accepted as well.
        X = np.asarray(X)
        Y = np.asarray(Y).ravel()

        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot fit SGDLinearRegressor on an empty dataset.")

        max_steps = int(self.max_steps)
        batch_size = int(self.batch_size)
        if batch_size < 1:
            # A non-positive step would make the batch loop fail or never advance.
            raise ValueError(
                f"batch_size must be a positive integer, got {self.batch_size!r}."
            )
        # Cap locally; the hyper-parameter itself must survive for later refits.
        batch_size = min(batch_size, n_samples)

        self.W_ = np.ones(X.shape[1])
        self.b_ = 1.0

        step = 0
        converged = False
        while step < max_steps and not converged:
            order = np.random.permutation(n_samples)
            for start in range(0, n_samples, batch_size):
                if step >= max_steps:
                    break
                batch = order[start:start + batch_size]
                grad_w, grad_b = grad(
                    X[batch], self.W_, self.b_, Y[batch], self.regularization
                )
                if np.linalg.norm(self.lr * grad_w) < self.delta_converged:
                    converged = True
                    break
                self.W_ -= self.lr * grad_w
                self.b_ -= self.lr * grad_b
                step += 1

        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Return ``X @ W_ + b_`` for the fitted weights and bias."""
        return np.dot(X, self.W_) + self.b_

def make_ultimate_pipeline(needed_columns: Optional[List[str]]=None,interesting_columns: Optional[List[str]]=None,lr=0.01, regularization=1., delta_converged=1e-3, max_steps=1000,
                     batch_size=64):
    """Build a two-step pipeline: preprocessing followed by SGD regression.

    ``needed_columns`` and ``interesting_columns`` configure the
    ``OneHotPreprocessor`` step (named ``'preprocessor'``); the remaining
    arguments configure the ``SGDLinearRegressor`` step (named
    ``'SGDEstimator'``).
    """
    preprocessing_step = OneHotPreprocessor(
        needed_columns=needed_columns,
        interesting_columns=interesting_columns,
    )
    regression_step = SGDLinearRegressor(
        lr=lr,
        regularization=regularization,
        delta_converged=delta_converged,
        max_steps=max_steps,
        batch_size=batch_size,
    )
    stages = [
        ('preprocessor', preprocessing_step),
        ('SGDEstimator', regression_step),
    ]
    return Pipeline(stages)

ModuleNotFoundError: No module named 'numpy'