# Scalers at work

In [None]:
import matplotlib as mpl
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, Normalizer, PowerTransformer, \
                                  QuantileTransformer, RobustScaler,  StandardScaler, minmax_scale, FunctionTransformer

## Load data 
For a description of the dataset, see <https://www.kaggle.com/datasets/camnugent/california-housing-prices?resource=download>.

In [None]:
# Read the housing CSV and append a derived column holding the
# population-per-household ratio as the last column of the frame.
df = pd.read_csv('data/housing.csv')
df.insert(
    loc=df.shape[1],
    column="avg_occup",
    value=df["population"].div(df["households"]),
)
# Last expression of the cell: the notebook renders the summary statistics.
df.describe()

In [None]:
# Bare expression: the notebook renders the DataFrame, now including the
# derived "avg_occup" column.
df

## Plot Histograms 

In [None]:
# Draw one 50-bin histogram per remaining column, each in its own figure.
# "ocean_proximity" is dropped first; the rest is treated as numeric.
df_numeric = df.drop(columns=["ocean_proximity"])
for column_name, column_values in df_numeric.items():
    plt.figure()
    plt.title(f'{column_name}')
    plt.hist(column_values, bins=50)
    plt.show()

## Select median_income and avg_occup as features

In [None]:
# Feature matrix: the two columns every scaler below is applied to.
features = df.loc[:, ["median_income", "avg_occup"]]
# Target: the raw house value is kept in y_orig; y is the same series
# rescaled with minmax_scale (default settings).
y_orig = df.loc[:, "median_house_value"]
y = minmax_scale(y_orig)

In [None]:
# Plot helpers and the public entry point used by every scaler cell.
def _padded_upper(values: pd.Series, pad_fraction: float) -> float:
    """Return the maximum of *values* plus headroom relative to their range."""
    lowest, highest = values.min(), values.max()
    spread = highest - lowest
    if spread > 0:
        return highest + pad_fraction * spread
    # Constant column: there is no range to scale by, so fall back to a
    # fixed offset to keep the axis limits from collapsing to one point.
    return highest + 1.0


def _scatter_panel(title: str, X: pd.DataFrame, extent: pd.DataFrame,
                   pad_fraction: float) -> None:
    """Scatter column 1 of *X* against column 0, with axes limited to *extent*.

    All rows of *X* are drawn; *extent* only determines the visible window.
    """
    plt.figure(figsize=(6, 6))
    plt.title(title)
    plt.scatter(X.iloc[:, 0], X.iloc[:, 1], marker='o', edgecolor='black', s=20)
    # Label the axes from the frame itself instead of hard-coded names.
    plt.xlabel(str(X.columns[0]))
    plt.ylabel(str(X.columns[1]))
    plt.grid()
    x_extent, y_extent = extent.iloc[:, 0], extent.iloc[:, 1]
    plt.xlim([x_extent.min(), _padded_upper(x_extent, pad_fraction)])
    plt.ylim([y_extent.min(), _padded_upper(y_extent, pad_fraction)])
    plt.show()


def make_plot(titel: str, X: pd.DataFrame, *, pad_fraction: float = 0.05) -> None:
    """Show scatter plots and histograms for a two-feature DataFrame.

    Three groups of figures are produced, in this order:

    1. A scatter plot of the second column against the first over the full
       data range.
    2. The same scatter plot with the axes limited to the rows whose values
       lie at or below the 99th percentile in both columns ("zoomed").
    3. One 50-bin histogram per column of ``X``.

    Args:
        titel: Title of the scatter plots; the zoomed plot appends
            ``" (zoomed)"``.
        X: Frame whose first two columns are plotted against each other.
            The column names are used as axis labels.
        pad_fraction: Headroom added above the largest value on each axis,
            as a fraction of that axis' data range.

    Returns:
        None. The figures are displayed via ``plt.show()``.
    """
    # Headroom is proportional to the data range rather than a fixed +1: on
    # features rescaled to about unit range a fixed +1 dwarfs the data and
    # cancels out the zoom entirely.
    _scatter_panel(titel, X, X, pad_fraction)

    # zoom-in: keep rows between the 0th and 99th percentile of both columns.
    # Columns are addressed by position so the function does not depend on
    # specific column names.
    first, second = X.iloc[:, 0], X.iloc[:, 1]
    first_low, first_high = np.percentile(first, [0, 99])
    second_low, second_high = np.percentile(second, [0, 99])
    in_window = (first.between(first_low, first_high)
                 & second.between(second_low, second_high))
    trimmed = X.loc[in_window]
    _scatter_panel(titel + " (zoomed)", X, trimmed, pad_fraction)

    # Marginal distribution of every column.
    for column_name in X.columns:
        plt.figure()
        plt.title(f'{column_name}')
        plt.hist(X[column_name], bins=50)
        plt.show()

## Original Data

In [None]:
# Baseline: plot the selected features before any scaler is applied.
make_plot("Original data", features)

## Standard Scaler

In [None]:
# Fit a StandardScaler on both features and plot the result.
# set_output configures the scaler in place so that fit_transform returns a
# DataFrame, which make_plot needs for its column access.
stsc = StandardScaler()
stsc.set_output(transform="pandas")
features_stsc = stsc.fit_transform(features)
make_plot("standard", features_stsc)

## Min Max Scaler

In [None]:
# Fit a MinMaxScaler (DataFrame output) and show the scaled frame.
mmsc = MinMaxScaler()
mmsc.set_output(transform="pandas")
features_mmsc = mmsc.fit_transform(features)
display(features_mmsc)
# Sanity check: print the maximum, then the minimum, of each scaled column.
for column_name in ("median_income", "avg_occup"):
    scaled_column = features_mmsc[column_name]
    print(scaled_column.max(), scaled_column.min())
make_plot("min max", features_mmsc)

## Power Transformer

In [None]:
# Yeo-Johnson variant: fit, print the fitted lambdas, show and plot the result.
pt = PowerTransformer(method="yeo-johnson")
pt.set_output(transform="pandas")
features_pt = pt.fit_transform(features)
print(pt.lambdas_)
display(features_pt)
make_plot("Power Transformer", features_pt)

# Box-Cox variant: this method only accepts strictly positive input values.
pt2 = PowerTransformer(method="box-cox")
pt2.set_output(transform="pandas")
features_pt2 = pt2.fit_transform(features)
display(features_pt2)
print(pt2.lambdas_)
make_plot("Power Transformer Box-Cox", features_pt2)

## Log Transformer

In [None]:
# Wrap np.log1p, i.e. log(1 + x) element-wise, in a transformer.
# transform is called directly, without a preceding fit.
lt = FunctionTransformer(func=np.log1p)
features_lt = lt.transform(features)
display(features_lt)
make_plot("Log Transformer", features_lt)


## Quantile Transformer

In [None]:
# Map both features onto a normal output distribution, then show and plot.
qt = QuantileTransformer(output_distribution="normal")
qt.set_output(transform="pandas")
features_qt = qt.fit_transform(features)
display(features_qt)
make_plot("Quantile Transformer", features_qt)

# Same transformer with a uniform output distribution.
qt_uni = QuantileTransformer(output_distribution="uniform")
qt_uni.set_output(transform="pandas")
features_qt_uni = qt_uni.fit_transform(features)
display(features_qt_uni)
make_plot("Quantile Transformer Uniform", features_qt_uni)