# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
# Read the raw table from disk; it must contain a 'target' column.
df = pd.read_csv('hypothetical_dataset.csv')

# Split the label column from the predictor columns
y = df['target']
X = df.drop(columns='target')

# Column labels grouped by dtype: 64-bit numeric vs. string/categorical
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

# 1. Standardization (Z-score normalization)
def standardize(X):
    """Return a z-scored copy of the DataFrame ``X``.

    A fresh ``StandardScaler`` is fitted on ``X`` itself, and the scaled
    values are wrapped back into a DataFrame that carries the same column
    labels and row index as the input.
    """
    z_scores = StandardScaler().fit_transform(X)
    return pd.DataFrame(z_scores, index=X.index, columns=X.columns)

# Z-score only the numeric columns, then show per-column summary statistics.
X_standardized = standardize(X.loc[:, numeric_features])
print("Standardized data summary:\n", X_standardized.describe())

# 2. Min-Max Scaling
def min_max_scale(X):
    """Return a min-max scaled copy of the DataFrame ``X``.

    A default ``MinMaxScaler`` is fitted on ``X`` itself; the result keeps
    the column labels and row index of the input.
    """
    rescaled = MinMaxScaler().fit_transform(X)
    return pd.DataFrame(rescaled, index=X.index, columns=X.columns)

# Min-max scale the numeric columns and report their summary statistics.
X_minmax = min_max_scale(X.loc[:, numeric_features])
print("Min-Max scaled data summary:\n", X_minmax.describe())

# 3. Robust Scaling (using quartiles)
def robust_scale(X):
    """Return a robustly scaled copy of the DataFrame ``X``.

    A default ``RobustScaler`` (quartile-based scaling) is fitted on ``X``
    itself; the result keeps the column labels and row index of the input.
    """
    rescaled = RobustScaler().fit_transform(X)
    return pd.DataFrame(rescaled, index=X.index, columns=X.columns)

# Robust-scale the numeric columns and report their summary statistics.
X_robust = robust_scale(X.loc[:, numeric_features])
print("Robust scaled data summary:\n", X_robust.describe())

# 4. Normalization (scaling to unit norm)
def normalize(X):
    """Return a copy of the DataFrame ``X`` scaled to unit norm.

    Uses a default scikit-learn ``Normalizer``, which rescales each row
    (sample) rather than each column. The result keeps the column labels
    and row index of the input.
    """
    unit_norm_values = Normalizer().fit_transform(X)
    return pd.DataFrame(unit_norm_values, index=X.index, columns=X.columns)

# Scale the numeric part of the data to unit norm; the summary printed
# below is still computed per column.
X_normalized = normalize(X.loc[:, numeric_features])
print("Normalized data summary:\n", X_normalized.describe())

# 5. Log Transformation
def log_transform(X):
    """Return ``log(1 + x)`` computed element-wise over ``X``.

    The call is delegated to ``numpy.log1p``, so the result has the same
    container type and labels as ``X``.
    """
    log_values = np.log1p(X)
    return log_values

# log(1 + x) on the numeric columns, followed by their summary statistics.
X_log = log_transform(X.loc[:, numeric_features])
print("Log-transformed data summary:\n", X_log.describe())

# 6. Box-Cox Transformation
from scipy.stats import boxcox
def box_cox_transform(X):
    """Box-Cox transform every column of the DataFrame ``X`` independently.

    Each column is shifted by +1 and passed to ``scipy.stats.boxcox``
    without an explicit lambda, so scipy fits one lambda per column. Only
    the transformed values are kept; the fitted lambdas are discarded.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric columns. Because of the +1 shift every value must be
        strictly greater than -1; scipy raises ``ValueError`` otherwise.

    Returns
    -------
    pandas.DataFrame
        Transformed values with the same column labels and row index as
        ``X``.
    """
    # Transform column by column on plain arrays and attach the labels
    # afterwards. Going through ``X.apply`` with an ndarray-returning
    # function produces a positionally indexed frame, and re-wrapping that
    # with ``index=X.index`` reindexes by label, which turns the result into
    # NaN whenever ``X`` does not have a default 0..n-1 index.
    transformed_columns = [
        boxcox(X.iloc[:, position].to_numpy(dtype=float) + 1)[0]
        for position in range(X.shape[1])
    ]
    if transformed_columns:
        values = np.column_stack(transformed_columns)
    else:
        # No columns to transform: keep the row count, emit zero columns.
        values = np.empty((len(X), 0))
    return pd.DataFrame(values, columns=X.columns, index=X.index)

# Box-Cox transform the numeric columns and report their summary statistics.
X_boxcox = box_cox_transform(X.loc[:, numeric_features])
print("Box-Cox transformed data summary:\n", X_boxcox.describe())

# 7. Yeo-Johnson Transformation
from sklearn.preprocessing import PowerTransformer
def yeo_johnson_transform(X):
    """Return a Yeo-Johnson power-transformed copy of the DataFrame ``X``.

    A ``PowerTransformer`` with ``method='yeo-johnson'`` (all other options
    left at their defaults) is fitted on ``X`` itself; the result keeps the
    column labels and row index of the input.
    """
    power_transformer = PowerTransformer(method='yeo-johnson')
    transformed = power_transformer.fit_transform(X)
    return pd.DataFrame(transformed, index=X.index, columns=X.columns)

# Yeo-Johnson transform the numeric columns and report summary statistics.
X_yeojohnson = yeo_johnson_transform(X.loc[:, numeric_features])
print("Yeo-Johnson transformed data summary:\n", X_yeojohnson.describe())

# 8. Combining different scaling methods in a pipeline
# Numeric columns go through two chained steps: StandardScaler first,
# then Normalizer on its output.
numeric_transformer = Pipeline([
    ('scaler', StandardScaler()),
    ('normalizer', Normalizer()),
])

# Route numeric columns through the pipeline above; categorical columns
# are forwarded unchanged.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', 'passthrough', categorical_features),
])

# Fit on X and transform it in a single call
X_preprocessed = preprocessor.fit_transform(X)

# 9. Handling skewed data
from scipy.stats import skew

def identify_skewed_features(X, threshold=0.5):
    """Return the skewness of every column of ``X`` that is strongly skewed.

    Skewness is computed per column with ``scipy.stats.skew`` after missing
    values are dropped. A column is kept when the absolute value of its
    skewness is strictly greater than ``threshold``, so both left- and
    right-skewed columns are reported.

    Returns a Series indexed by column label holding the skewness values.
    """
    skewness = X.apply(lambda column: skew(column.dropna()))
    is_strongly_skewed = skewness.abs() > threshold
    return skewness[is_strongly_skewed]

skewed_features = identify_skewed_features(X[numeric_features])
print("Skewed features:\n", skewed_features)

# Apply log transformation to skewed features (X is modified in place, so
# every later section sees the transformed columns).
# log1p(x) is NaN for x < -1 and -inf at x == -1; writing such values into X
# would silently corrupt the input of every scaler applied afterwards, so
# only columns whose minimum lies strictly above -1 are transformed.
log_safe_columns = [
    column for column in skewed_features.index if X[column].min() > -1
]
if log_safe_columns:
    X[log_safe_columns] = np.log1p(X[log_safe_columns])

# 10. Scaling for specific algorithms
# Example: Scaling for SVM or Neural Networks
# SVMs / neural networks: z-score the numeric columns. The fitted scaler
# stays available as `svm_scaler`.
svm_scaler = StandardScaler()
svm_scaler.fit(X[numeric_features])
X_svm = svm_scaler.transform(X[numeric_features])

# Tree-based models are invariant to monotonic transformations of individual
# features, so they receive an unscaled, independent copy of X.
X_tree = X.copy(deep=True)

# 11. Scaling time series data
def scale_time_series(X):
    """Return a z-scored copy of the DataFrame ``X``, preserving row order.

    A fresh ``StandardScaler`` is fitted on all rows of ``X`` at once. The
    result keeps the column labels and row index of the input, so whatever
    ordering the caller established is retained.
    """
    scaled_values = StandardScaler().fit_transform(X)
    return pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

# Assumes X has a 'date' column holding the timestamps: order the rows by
# it, then scale the numeric columns of the ordered frame.
X_sorted = X.sort_values(by='date')
X_ts_scaled = scale_time_series(X_sorted.loc[:, numeric_features])

# 12. Feature-wise scaling vs. global scaling
def feature_wise_scale(X):
    """Standardize each column of the DataFrame ``X`` with its own statistics.

    Every column has its mean subtracted and is divided by its standard
    deviation as computed by pandas (sample standard deviation, ``ddof=1``,
    missing values skipped).

    A constant column has a standard deviation of 0, which would turn the
    whole column into NaN through 0/0. Such columns are divided by 1
    instead, so they come out as all zeros.

    Returns a DataFrame with the same column labels and row index as ``X``.
    """
    centered = X - X.mean()
    spread = X.std()
    # Swap exact-zero deviations for 1; NaN deviations (e.g. a single-row
    # frame) are left untouched and still propagate as NaN.
    safe_spread = spread.where(spread != 0, 1.0)
    return centered / safe_spread

# Per-column standardization of the numeric columns.
X_feature_scaled = feature_wise_scale(X.loc[:, numeric_features])

def global_scale(X):
    """Standardize the DataFrame ``X`` with one mean and one std for all cells.

    The mean and the population standard deviation (``ddof=0``) are computed
    over every non-missing cell of ``X`` taken together, and each cell is
    then shifted and scaled by those two numbers. Relative differences
    between columns are therefore preserved.

    Missing cells are ignored when computing the statistics and stay missing
    in the output; without that, a single NaN would turn the whole result
    into NaN. If all cells are equal (std 0) the divisor falls back to 1, so
    the result is all zeros rather than NaN.

    Returns a DataFrame with the same column labels and row index as ``X``.
    """
    cell_values = X.to_numpy(dtype=float, na_value=np.nan)
    global_mean = np.nanmean(cell_values)
    global_std = np.nanstd(cell_values)
    if global_std == 0:
        # Degenerate case: every cell is identical; avoid 0/0.
        global_std = 1.0
    return (X - global_mean) / global_std

# Standardize the numeric columns with a single shared mean and std.
X_global_scaled = global_scale(X.loc[:, numeric_features])

# Final status message for the walkthrough
print("Scaling and Normalization completed.")