<a href="https://colab.research.google.com/github/rtajeong/M2_new/blob/main/lab34a_scikit_learn_object_rev5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scikit-Learn API convention
- to show how scikit-learn objects are made
- all objects share a consistent and simple interface
- The main objects in sklearn are:
  - supervised learning model:
    - model = LinearRegression()
    - model.fit(X_train, y_train)
    - model.score(X_test, y_test)
    - model.predict(X_new), model.predict_proba(X_new), model.decision_function(X_new)
  - unsupervised learning model:
    - model = XXX()
    - model.fit(X)
    - model.predict(X): clustering
    - model.transform(X): transform
    - model.fit_transform(X)


# Example 1 - transformer(unsupervised)

In [None]:
import numpy as np

class my_StandardScaler():
    """Minimal standard scaler that follows the scikit-learn transformer API.

    A single global mean and standard deviation are learned from all values
    of ``X`` (not one pair per column), and all arithmetic is done in
    ``float32``.

    Attributes created by ``fit`` / ``fit_transform``:
        mean_: mean of the training data.
        std_: population standard deviation of the training data.
    """

    def __init__(self):
        # No hyperparameters; the learned parameters (mean_, std_) are only
        # created by fit(), so they do not exist on a fresh instance.
        pass

    @staticmethod
    def _as_float32(X):
        """Return ``X`` as a float32 ndarray (lists and tuples are accepted)."""
        return np.asarray(X, dtype=np.float32)

    def _scale(self):
        """Return the divisor used for scaling (1.0 when the data was constant)."""
        # A zero std would turn every value into nan/inf, so constant data is
        # only centered, not scaled.
        return self.std_ if self.std_ != 0 else np.float32(1.0)

    def fit(self, X, y=None):
        """Learn ``mean_`` and ``std_`` from ``X`` and return ``self``.

        ``y`` is accepted for API compatibility and ignored.
        """
        X = self._as_float32(X)
        self.mean_, self.std_ = X.mean(), X.var() ** 0.5
        return self

    def transform(self, X, y=None):
        """Return a new float32 array ``(X - mean_) / std_``.

        The input is never modified. Raises AttributeError when called
        before ``fit``.
        """
        X = self._as_float32(X)
        return (X - self.mean_) / self._scale()

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the scaled copy of ``X``."""
        return self.fit(X, y).transform(X)

    def inverse_transform(self, X):
        """Undo ``transform``: return a new float32 array ``X * std_ + mean_``."""
        X = self._as_float32(X)
        return X * self._scale() + self.mean_

In [None]:
import numpy as np

X = np.array([1,2,3,4,5,6,7,8,9,10])
sc = my_StandardScaler()

In [None]:
try:
    print(sc.mean_, sc.std_)
except AttributeError:
    # mean_ and std_ are only created inside fit(), so reading them on a
    # fresh scaler raises AttributeError. Any other error should surface.
    print("parameters are available after the fit() method is performed.")

parameters are available after the fit() method is performed.


In [None]:
sc.fit(X)

<__main__.my_StandardScaler at 0x78d7b479bc20>

In [None]:
sc.mean_, sc.std_

(np.float32(5.5), np.float32(2.8722813))

In [None]:
X_sc = sc.transform(X); X_sc

array([-1.5666989 , -1.2185436 , -0.87038827, -0.52223295, -0.17407766,
        0.17407766,  0.52223295,  0.87038827,  1.2185436 ,  1.5666989 ],
      dtype=float32)

In [None]:
sc.inverse_transform(X_sc)

array([ 1.       ,  1.9999998,  3.       ,  4.       ,  5.       ,
        6.       ,  7.       ,  8.       ,  9.       , 10.       ],
      dtype=float32)

In [None]:
sc

<__main__.my_StandardScaler at 0x78d7b479bc20>

- with sklearn library

In [None]:
from sklearn.preprocessing import StandardScaler

model = StandardScaler()

In [None]:
# Call the fit method to calculate the mean and standard deviation
model.fit(X.reshape(-1, 1))

# Now you can access the mean_ attribute
model.mean_

array([5.5])

In [None]:
X_sc = model.fit_transform(X.reshape(-1,1))
model.mean_, model.var_**0.5, X_sc

(array([5.5]),
 array([2.87228132]),
 array([[-1.5666989 ],
        [-1.21854359],
        [-0.87038828],
        [-0.52223297],
        [-0.17407766],
        [ 0.17407766],
        [ 0.52223297],
        [ 0.87038828],
        [ 1.21854359],
        [ 1.5666989 ]]))

In [None]:
model.get_params()

{'copy': True, 'with_mean': True, 'with_std': True}

## More practical one
- Usually, we define the hyperparameters in `__init__()` and the trainable parameters in the `fit()` method.
- (example) we have a hyperparameter, called with_mean, which determines whether to center the data before scaling, and with_std, which determines whether to scale the data to unit variance.

- Why do we need to inherit from BaseEstimator and TransformerMixin?
  - BaseEstimator and TransformerMixin are utility classes provided by scikit-learn that help standardize the implementation of custom estimators and transformers.
  - They ensure that your custom classes integrate seamlessly into the scikit-learn ecosystem, allowing them to be used with pipelines, cross-validation, grid search, and other scikit-learn tools.
- BaseEstimator provides get_params(), set_params(), and a parameter-aware `__repr__()`.
- TransformerMixin() provides fit_transform().

In [None]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class my_StandardScaler2(BaseEstimator, TransformerMixin):
    """Standard scaler with scikit-learn style hyperparameters.

    Hyperparameters are stored unchanged in ``__init__``; the learned
    statistics are computed along axis 0 in ``fit``.

    Parameters:
        with_mean: if True, subtract the mean learned in ``fit``.
        with_std: if True, divide by the standard deviation learned in ``fit``.

    Attributes created by ``fit``:
        mean_: mean along axis 0 (only set when ``with_mean`` is True).
        var_: variance along axis 0 (only set when ``with_std`` is True).
    """

    def __init__(self, with_mean=True, with_std=True):
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        """Compute ``mean_`` and/or ``var_`` from ``X`` and return ``self``.

        ``y`` is accepted for API compatibility and ignored.
        """
        if self.with_mean:
            self.mean_ = np.mean(X, axis=0)
        if self.with_std:
            self.var_ = np.var(X, axis=0)
        return self

    def transform(self, X):
        """Return ``X`` centered and/or scaled according to the hyperparameters.

        Raises AttributeError if a required statistic has not been fitted.
        """
        if self.with_mean:
            X = X - self.mean_
        if self.with_std:
            scale = np.sqrt(self.var_)
            # A zero-variance (constant) feature would divide by zero and
            # produce nan/inf, so such features are left unscaled.
            X = X / np.where(scale == 0, 1.0, scale)
        return X

    # NOTE: fit_transform() is deliberately not defined here; it is inherited
    # from TransformerMixin.

In [None]:
# 1-D toy data: the integers 1..10.
X = np.array([1,2,3,4,5,6,7,8,9,10])

# Both hyperparameters are passed explicitly; they equal the defaults.
scaler = my_StandardScaler2(with_mean=True, with_std=True)
# fit() learns the statistics; transform() then applies them to the same data.
scaler.fit(X)
X_scaled = scaler.transform(X)
X_scaled

array([-1.5666989 , -1.21854359, -0.87038828, -0.52223297, -0.17407766,
        0.17407766,  0.52223297,  0.87038828,  1.21854359,  1.5666989 ])

In [None]:
scaler.fit_transform(X)

array([-1.5666989 , -1.21854359, -0.87038828, -0.52223297, -0.17407766,
        0.17407766,  0.52223297,  0.87038828,  1.21854359,  1.5666989 ])

In [None]:
scaler.get_params()

{'with_mean': True, 'with_std': True}

In [None]:
scaler.set_params(with_mean=False, with_std=False)

In [None]:
scaler.get_params()

{'with_mean': False, 'with_std': False}

In [None]:
scaler

# Example 2 - predictor (supervised)

In [None]:
import numpy as np
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# my own OLS (Ordinary Least Squares)
class my_OLS():
    """Ordinary least squares regressor with a scikit-learn style interface.

    Attributes:
        W_: 1-D weight vector. Empty until ``fit`` is called; afterwards it
            holds one coefficient per feature followed by the bias term.
    """

    def __init__(self):
        self.W_ = np.array([])

    @staticmethod
    def _with_bias(X):
        """Append a column of ones to ``X`` so the last weight is the bias."""
        X = np.asarray(X, dtype=float)
        return np.concatenate([X, np.ones((X.shape[0], 1))], axis=1)

    def fit(self, X, y):
        """Estimate ``W_`` from a 2-D feature matrix ``X`` and targets ``y``.

        Returns ``self`` so calls can be chained.
        """
        X_b = self._with_bias(X)
        # lstsq solves the least-squares problem directly instead of inverting
        # X_b.T @ X_b, so collinear features do not raise or blow up.
        self.W_ = np.linalg.lstsq(X_b, y, rcond=None)[0]
        return self

    def score(self, X, y):
        """Return the R^2 of the predictions for ``X`` against true targets ``y``."""
        pred = self.predict(X)
        # r2_score is not symmetric: its signature is r2_score(y_true, y_pred).
        return r2_score(y, pred)

    def predict(self, X):
        """Return the predicted targets for a 2-D feature matrix ``X``."""
        return self._with_bias(X) @ self.W_

In [None]:
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=2, noise=10, random_state=1)

In [None]:
X[:5], y[:5]

(array([[ 0.0465673 ,  0.80186103],
        [-2.02220122,  0.31563495],
        [-0.38405435, -0.3224172 ],
        [-1.31228341,  0.35054598],
        [-0.88762896, -0.19183555]]),
 array([ 86.25546612, -29.71630024, -51.45616226, -18.11089712,
        -29.63783339]))

In [None]:
sc = StandardScaler()
X_sc = sc.fit_transform(X)

In [None]:
X_sc[:5]

array([[ 0.06345283,  0.66953924],
       [-2.13002995,  0.10357213],
       [-0.39312854, -0.6391205 ],
       [-1.37731524,  0.14420857],
       [-0.9270608 , -0.48712347]])

In [None]:
model = my_OLS()

In [None]:
model.W_

array([], dtype=float64)

In [None]:
model.fit(X_sc, y)

In [None]:
model.W_

array([29.52899512, 71.88859745, 19.28081596])

In [None]:
model.score(X_sc, y)

0.9814885362629435

In [None]:
Xnew_sc = sc.transform([[1.7, 0.7]])
result = model.predict(Xnew_sc)
result

array([112.53073047])

In [None]:
# with sklearn library
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Reference run with scikit-learn's LinearRegression on the same scaled data,
# for comparison with the my_OLS results.
model = LinearRegression()        # create model
model.fit(X_sc,y)                    # train model
print("coefficients and bias: ", model.coef_, model.intercept_)
# score() reports R^2 on the data it is given (here the training data).
print(model.score(X_sc,y))

# A new sample is scaled with the already-fitted scaler before predicting.
Xnew_sc = sc.transform([[1.7, 0.7]])
print(model.predict(Xnew_sc))

coefficients and bias:  [29.52899512 71.88859745] 19.280815955500472
0.9818249824413999
[112.53073047]
