# Importing Basic Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# 4. `add_dummy_feature`
- Adds a constant column of ones to the feature matrix so that the model can learn a bias (intercept) term.

In [2]:
from sklearn.preprocessing import add_dummy_feature

# Small 4x2 feature matrix used to demonstrate the helper.
x = np.array([[7, 1], [1, 8], [2, 0], [9, 6]])

# Prepend a constant column (value 1.0, the default) to every row.
x_new = add_dummy_feature(x, value=1.0)
x_new

array([[1., 7., 1.],
       [1., 1., 8.],
       [1., 2., 0.],
       [1., 9., 6.]])

# 5. Custom Transformers

In [3]:
from sklearn.preprocessing import FunctionTransformer

In [4]:
# Load the red-wine quality table; fields in this CSV are separated by semicolons.
wine_data = pd.read_csv(
    './data/winequality-red.csv',
    delimiter=';',
)

In [11]:
# Summary statistics, transposed so that each feature occupies one row.
wine_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,8.319637,1.741096,4.6,7.1,7.9,9.2,15.9
volatile acidity,1599.0,0.527821,0.17906,0.12,0.39,0.52,0.64,1.58
citric acid,1599.0,0.270976,0.194801,0.0,0.09,0.26,0.42,1.0
residual sugar,1599.0,2.538806,1.409928,0.9,1.9,2.2,2.6,15.5
chlorides,1599.0,0.087467,0.047065,0.012,0.07,0.079,0.09,0.611
free sulfur dioxide,1599.0,15.874922,10.460157,1.0,7.0,14.0,21.0,72.0
total sulfur dioxide,1599.0,46.467792,32.895324,6.0,22.0,38.0,62.0,289.0
density,1599.0,0.996747,0.001887,0.99007,0.9956,0.99675,0.997835,1.00369
pH,1599.0,3.311113,0.154386,2.74,3.21,3.31,3.4,4.01
sulphates,1599.0,0.658149,0.169507,0.33,0.55,0.62,0.73,2.0


In [8]:
# Wrap NumPy's log1p, i.e. log(1 + x), so it can be used through the
# scikit-learn transformer interface.
transformer = FunctionTransformer(func=np.log1p, validate=True)
wine_data_transformed = transformer.fit_transform(np.array(wine_data))

# Re-attach the column labels and summarise, one row per feature.
(
    pd.DataFrame(wine_data_transformed, columns=wine_data.columns)
    .describe()
    .transpose()
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,2.215842,0.1781,1.722767,2.091864,2.186051,2.322388,2.827314
volatile acidity,1599.0,0.417173,0.114926,0.113329,0.329304,0.41871,0.494696,0.947789
citric acid,1599.0,0.228147,0.152423,0.0,0.086178,0.231112,0.350657,0.693147
residual sugar,1599.0,1.218131,0.269969,0.641854,1.064711,1.163151,1.280934,2.80336
chlorides,1599.0,0.083038,0.038991,0.011929,0.067659,0.076035,0.086178,0.476855
free sulfur dioxide,1599.0,2.639013,0.62379,0.693147,2.079442,2.70805,3.091042,4.290459
total sulfur dioxide,1599.0,3.63475,0.682575,1.94591,3.135494,3.663562,4.143135,5.669881
density,1599.0,0.691519,0.000945,0.68817,0.690945,0.691521,0.692064,0.69499
pH,1599.0,1.460557,0.03576,1.319086,1.437463,1.460938,1.481605,1.611436
sulphates,1599.0,0.501073,0.093731,0.285179,0.438255,0.482426,0.548121,1.098612


# 6. Polynomial Features
- Useful when the relationship between the features and the target is not simply linear.

In [12]:
from sklearn.preprocessing import PolynomialFeatures

# Work on a separate frame that no longer contains the 'quality' column.
wine_data_copy = wine_data.drop(columns=['quality'])
print('# features before transformation: ', wine_data_copy.shape)

# Expand the inputs into all polynomial terms up to degree 2 (the default degree).
poly = PolynomialFeatures(degree=2)
poly_wine_data = poly.fit_transform(wine_data_copy)
print('# features after transformation: ', poly_wine_data.shape)

# features before transformation:  (1599, 11)
# features after transformation:  (1599, 78)


In [13]:
# Names of the generated output features.
# They are built from the input column names seen when `poly` was fitted:
# a constant '1' term, the original columns, then squares and pairwise products.
poly.get_feature_names_out()

array(['1', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'fixed acidity^2', 'fixed acidity volatile acidity',
       'fixed acidity citric acid', 'fixed acidity residual sugar',
       'fixed acidity chlorides', 'fixed acidity free sulfur dioxide',
       'fixed acidity total sulfur dioxide', 'fixed acidity density',
       'fixed acidity pH', 'fixed acidity sulphates',
       'fixed acidity alcohol', 'volatile acidity^2',
       'volatile acidity citric acid', 'volatile acidity residual sugar',
       'volatile acidity chlorides',
       'volatile acidity free sulfur dioxide',
       'volatile acidity total sulfur dioxide',
       'volatile acidity density', 'volatile acidity pH',
       'volatile acidity sulphates', 'volatile acidity alcohol',
       'citric acid^2', 'citric acid residual sugar',
       'citric acid chlorides', 'citric aci

# 7. Discretization
- Otherwise known as **quantization** or **binning**.

In [15]:
from sklearn.preprocessing import KBinsDiscretizer

# Read the column straight from the existing frame instead of rebinding
# `wine_data`: overwriting it with the copy that lacks 'quality' would make
# the earlier cell that drops 'quality' fail when it is re-run.
# The double brackets keep the data 2-D, shape (n_samples, 1), which is the
# layout the discretizer is fitted on.
X = wine_data[['chlorides']].to_numpy()

# Split the values into 10 bins and one-hot encode the bin membership.
enc = KBinsDiscretizer(n_bins=10, encode='onehot')
X_binned = enc.fit_transform(X)

In [16]:
# Inspect the encoder output; it is a sparse matrix rather than a dense array.
X_binned

<1599x10 sparse matrix of type '<class 'numpy.float64'>'
	with 1599 stored elements in Compressed Sparse Row format>

In [17]:
# Densify only the first five rows of the sparse result for inspection.
X_binned[:5].toarray()

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]])

# 8. Handling Categorical Features

### `OneHotEncoder`

In [20]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [19]:
# The raw file has no header row, so the column names are supplied explicitly.
cols = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
iris_data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    names=cols,
    header=None,
)
iris_data.head(5)

Unnamed: 0,sepal length,sepal width,petal length,petal width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


`label` is categorical. Let's convert it into **one hot vectors**.

In [21]:
# Distinct class names present in the categorical column.
iris_data.label.unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [22]:
# Encode the single 'label' column; it is reshaped into a column vector
# before being handed to the encoder.
onehotencoder = OneHotEncoder(categories='auto')
print('Shape of y before encoding: ', iris_data['label'].shape)

iris_labels = onehotencoder.fit_transform(
    iris_data['label'].values.reshape(-1, 1)
)
print('Shape of y after OneHotEncoding: ', iris_labels.shape)

# The encoder returns a sparse matrix, so densify the rows we want to print.
print('Output type: ', type(iris_labels))
print(iris_labels[:5].toarray())

Shape of y before encoding:  (150,)
Shape of y after OneHotEncoding:  (150, 3)
Output type:  <class 'scipy.sparse._csr.csr_matrix'>
[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


### `OrdinalEncoder`

In [26]:
# Map each class name to a numeric code; the labels are passed in as a
# single-column 2-D array.
iris_labels = np.array(iris_data.label)
enc = OrdinalEncoder()
iris_labels_transformed = enc.fit_transform(iris_labels[:, np.newaxis])

print('Unique labels: ', np.unique(iris_labels_transformed))
print('First 5 labels: ')
print(iris_labels_transformed[:5])

Unique labels:  [0. 1. 2.]
First 5 labels: 
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


### `LabelEncoder`

In [28]:
from sklearn.preprocessing import LabelEncoder

In [29]:
# Here the labels are passed as a flat 1-D array, without any reshape.
enc = LabelEncoder()
iris_labels = np.array(iris_data.label)
label_integer = enc.fit_transform(iris_labels)
label_integer

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### `MultiLabelBinarizer`

In [30]:
# One set of genre tags per movie; a movie may carry several tags at once.
movie_genres = [
    set(tags)
    for tags in (
        ('action', 'comedy'),
        ('comedy',),
        ('action', 'thriller'),
        ('action', 'thriller', 'science-fiction'),
    )
]

In [31]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the set of genres first, then emit one 0/1 indicator column per genre.
mlb = MultiLabelBinarizer()
mlb.fit(movie_genres).transform(movie_genres)

array([[1, 1, 0, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 1],
       [1, 0, 1, 1]])

### Using Dummy Variables
- Use `get_dummies` to create **one-hot encoding** for each unique categorical value.

In [38]:
# One indicator column per distinct label value, each named with the 'one-hot' prefix.
iris_data_onehot = pd.get_dummies(
    iris_data,
    prefix=['one-hot'],
    columns=['label'],
)
iris_data_onehot

Unnamed: 0,sepal length,sepal width,petal length,petal width,one-hot_Iris-setosa,one-hot_Iris-versicolor,one-hot_Iris-virginica
0,5.1,3.5,1.4,0.2,True,False,False
1,4.9,3.0,1.4,0.2,True,False,False
2,4.7,3.2,1.3,0.2,True,False,False
3,4.6,3.1,1.5,0.2,True,False,False
4,5.0,3.6,1.4,0.2,True,False,False
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,False,False,True
146,6.3,2.5,5.0,1.9,False,False,True
147,6.5,3.0,5.2,2.0,False,False,True
148,6.2,3.4,5.4,2.3,False,False,True


# 9. Composite Transformers

### `ColumnTransformer`

In [39]:
# Toy samples pairing a numeric value with a categorical one, built directly
# as a NumPy array.
x = np.array(
    [
        [20.0, 'male'],
        [47.0, 'male'],
        [2.9, 'female'],
        [20.0, 'female'],
        [12.3, 'male'],
        [33.0, 'male'],
        [10.4, 'female'],
    ]
)

In [40]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler

# Each step is (name, transformer, column indices); the outputs of the steps
# are placed side by side in the order listed.
ct = ColumnTransformer(
    transformers=[
        ('scalar', MaxAbsScaler(), [0]),    # column 0, scaled by its maximum absolute value
        ('pass', 'passthrough', [0]),       # column 0 again, left untouched
        ('encoder', OneHotEncoder(), [1]),  # column 1, one-hot encoded
    ]
)

ct.fit_transform(x)

array([['0.425531914893617', '20.0', '0.0', '1.0'],
       ['1.0', '47.0', '0.0', '1.0'],
       ['0.06170212765957447', '2.9', '1.0', '0.0'],
       ['0.425531914893617', '20.0', '1.0', '0.0'],
       ['0.26170212765957446', '12.3', '0.0', '1.0'],
       ['0.7021276595744681', '33.0', '0.0', '1.0'],
       ['0.22127659574468087', '10.4', '1.0', '0.0']], dtype='<U32')

### `TransformedTargetRegressor`
- Transforms target variable `y` before fitting a regression model.
- The predicted values are mapped back to the original space via inverse
  transform.
- It takes two arguments: the **regressor** to fit and the **transformer** to apply to the target variable.

In [2]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

In [3]:
# Load features and target as plain arrays, then keep only the first 2000 samples.
X, y = fetch_california_housing(return_X_y=True)
X = X[:2000]
y = y[:2000]

In [4]:
# Scaler that will be applied to the target values.
transformer = MaxAbsScaler()

# Plain linear model used as the inner regressor.
regressor = LinearRegression()

# Wrapper that fits `regressor` on the target after `transformer` is applied.
regr = TransformedTargetRegressor(
    transformer=transformer,
    regressor=regressor,
)

In [7]:
# Hold out a test split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Baseline: a linear model fitted directly on the untransformed target.
raw_target_regr = LinearRegression().fit(X_train, y_train)

# `regr` is the TransformedTargetRegressor, so its score is the
# transformed-label result (the original printed the two labels swapped).
regr.fit(X_train, y_train)

print(f'R2 score raw label regression: {raw_target_regr.score(X_test, y_test):.2f}')
print(f'R2 score transformed label regression: {regr.score(X_test, y_test):.2f}')

R2 score raw label regression: 0.59
R2 score transformed label regression: 0.59


# NOTES

In [48]:
# Small 3x2 integer array used for the slicing examples that follow.
a = np.array([[1, 3], [5, 2], [6, 2]])
a

array([[1, 3],
       [5, 2],
       [6, 2]])

In [50]:
# Row slice: selects the first row while keeping the result 2-D.
a[0:1]

array([[1, 3]])

In [51]:
# Slice on both axes: first row and first column, still a 2-D (1x1) array.
a[0:1, 0:1]

array([[1]])