# Standardization

Standardization transforms data to have a mean of 0 and a standard deviation of 1.

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.preprocessing import StandardScaler

# Raw data
data = np.array([[1.2, 3.5, 5.8], [2.3, 3.1, 4.6], [3.2, 2.8, 1.5]])

# Standardization
scaler = StandardScaler()
standardized_data = scaler.fit_transform(data)
print(standardized_data)

[[-1.26346568  1.27872403  1.01194625]
 [ 0.08151391 -0.11624764  0.34958143]
 [ 1.18195176 -1.16247639 -1.36152768]]


# Normalization

Normalization scales data to fit within a specific range, typically [0, 1].

In [4]:
from sklearn.preprocessing import MinMaxScaler

# Normalization
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(data)
print(normalized_data)

[[0.         1.         1.        ]
 [0.55       0.42857143 0.72093023]
 [1.         0.         0.        ]]


# Simple Imputer

It is a data imputation technique that fills missing values with a defined constant, mean, or median.

In [12]:
from sklearn.impute import SimpleImputer
import numpy as np

# Data with missing values
data_with_nan = np.array([[1.2, 3.5, np.nan], [np.nan, 2.1, 4.6], [2.3, np.nan, 1.5]])

# Simple Imputer (mean strategy)
imputer = SimpleImputer(strategy='mean')
imputed_data = imputer.fit_transform(data_with_nan)
print(imputed_data)

[[1.2  3.5  3.05]
 [1.75 2.1  4.6 ]
 [2.3  2.8  1.5 ]]


# KNN Imputer

It is a data imputation technique that fills missing values based on the nearest neighbors in the dataset.

In [14]:
from sklearn.impute import KNNImputer

# KNN Imputation
knn_imputer = KNNImputer(n_neighbors=2)
imputed_knn_data = knn_imputer.fit_transform(data_with_nan)
print(imputed_knn_data)

[[1.2  3.5  3.05]
 [1.75 2.1  4.6 ]
 [2.3  2.8  1.5 ]]


# Encoding

Converts categorical variables into numerical form (Label Encoding, Ordinal Encoding or One-Hot Encoding).

In [27]:
from sklearn.preprocessing import LabelEncoder

# Example categorical labels
labels = ['cat', 'dog', 'fish', 'dog']

# Label encoding
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
print(encoded_labels)

[0 1 2 1]


In [28]:
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

# Example ordinal data
categories = np.array([['low'], ['medium'], ['high'], ['medium']])

# Ordinal encoding (Assumes 'low' < 'medium' < 'high')
ordinal_encoder = OrdinalEncoder(categories=[['low', 'medium', 'high']])
encoded_data = ordinal_encoder.fit_transform(categories)
print(encoded_data)

[[0.]
 [1.]
 [2.]
 [1.]]


In [10]:
from sklearn.preprocessing import OneHotEncoder

# Categorical data
categories = np.array([['red'], ['blue'], ['green'], ['blue']])

# One-hot encoding
encoder = OneHotEncoder(sparse=False)
encoded_data = encoder.fit_transform(categories)
print(encoded_data)

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]




# Transformers

Transformers apply functions to convert data during preprocessing (e.g., log transformation).

Function Transformer: A FunctionTransformer applies any custom function (like log transformation, square root, etc.) to the data. 

Column Transformer: A ColumnTransformer allows different preprocessing steps (like scaling, encoding, etc.) to be applied to specific columns in a dataset.

In [29]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# Raw data
data = np.array([[1.2, 3.5], [2.3, 3.1], [3.2, 2.8]])

# Applying log transformation using FunctionTransformer
log_transformer = FunctionTransformer(np.log1p)  # log1p is log(1+x) to avoid log(0)
transformed_data = log_transformer.fit_transform(data)
print(transformed_data)

[[0.78845736 1.5040774 ]
 [1.19392247 1.41098697]
 [1.43508453 1.33500107]]


In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

# Example dataset (2 numerical columns and 1 categorical column)
data = np.array([[1.2, 3.5, 'red'], [2.3, 3.1, 'blue'], [3.2, 2.8, 'green']])

# ColumnTransformer: apply standardization to numerical columns and OneHotEncoder to the categorical column
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), [0, 1]),  # Standardize columns 0 and 1
        ('cat', OneHotEncoder(), [2])       # One-hot encode column 2
    ]
)

transformed_data = preprocessor.fit_transform(data)
print(transformed_data)

[[-1.26346568  1.27872403  0.          0.          1.        ]
 [ 0.08151391 -0.11624764  1.          0.          0.        ]
 [ 1.18195176 -1.16247639  0.          1.          0.        ]]


# Pipelines

Pipelines streamline multiple data preprocessing steps (e.g., imputation, transformation).

In [40]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# Example dataset with numerical and categorical columns
data = np.array([[1.2, 3.5, 'red'], 
                 [2.3, np.nan, 'blue'], 
                 [3.2, 2.8, 'green']])
labels = np.array([0, 1, 0])  # Target labels for classification

# Define a ColumnTransformer for mixed data types
preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Impute missing numerical values
        ('scaler', StandardScaler())  # Standardize numerical features
    ]), [0, 1]),  # Apply to first two columns (numerical)
    
    ('cat', OneHotEncoder(), [2])  # One-hot encode the categorical column (third column)
])

# Create a full pipeline with preprocessing and a classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Preprocessing (imputation, scaling, encoding)
    ('classifier', RandomForestClassifier())  # Classifier
])

# Fit the pipeline to the data
pipeline.fit(data, labels)

# To transform the data using the preprocessor part of the pipeline
pipeline_data = pipeline.named_steps['preprocessor'].transform(data)
print(pipeline_data)

# If you want to make predictions, you can use:
predictions = pipeline.predict(data)
print(predictions)

[[-1.26346568  1.22474487  0.          0.          1.        ]
 [ 0.08151391  0.          1.          0.          0.        ]
 [ 1.18195176 -1.22474487  0.          1.          0.        ]]
[0 1 0]


# Overfitting and Underfitting

Overfitting: Model performs well on training data but poorly on unseen data because it captures noise.

Underfitting: Model is too simple, fails to capture patterns in training data and performs poorly on both training and test data.

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np

# Example of overfitting using Linear Regression on small dataset
X = np.random.rand(100, 1) * 10  # Feature
y = 3 * X.squeeze() + np.random.randn(100) * 2  # Target with noise

# Overfitting model with too many features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model = LinearRegression()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print("Train Score:", train_score)
print("Test Score:", test_score)  # Expect large difference if overfitting

Train Score: 0.9595085352689678
Test Score: 0.9534093919138027


In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import numpy as np

# Generate a simple dataset
X = np.random.rand(100, 1) * 10
y = 2 * X.squeeze() + np.random.randn(100) * 0.5  # Simple linear relationship with noise

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Create a very simple model (underfitting)
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate performance
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Training Score: {train_score}")
print(f"Test Score: {test_score}")  # Expect both to be low due to underfitting

Training Score: 0.992575959005177
Test Score: 0.9904702646213497


# L1 Regularization (Lasso)

It is a regualization technique that shrinks less important feature coefficients to 0, effectively selecting features.

In [18]:
from sklearn.linear_model import Lasso

# Lasso (L1 regularization)
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
print("Lasso Coefficients:", lasso.coef_)

Lasso Coefficients: [2.94780947]


# L2 Regularization (Ridge)

It is a regualization technique that reduces coefficients to prevent overfitting but keeps all features.

In [20]:
from sklearn.linear_model import Ridge

# Ridge (L2 regularization)
ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)
print("Ridge Coefficients:", ridge.coef_)

Ridge Coefficients: [2.95792088]


# Elastic Net Regularization

Elastic Net combines both L1 (Lasso) and L2 (Ridge) regularization techniques. It encourages both feature selection (like L1) and coefficient shrinkage (like L2). This is useful when dealing with data that has high multicollinearity.

In [25]:
from sklearn.linear_model import ElasticNet

# Elastic Net model (combining L1 and L2 regularization)
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)  # l1_ratio balances L1 and L2
elastic_net.fit(X_train, y_train)

print("Elastic Net Coefficients:", elastic_net.coef_)

Elastic Net Coefficients: [2.93758142]


# Handling Outliers

IQR (Interquartile Range) Method: Identifies outliers as data points falling below Q1 - 1.5IQR or above Q3 + 1.5IQR.
    
Z-Score Method: Outliers are data points more than 3 standard deviations from the mean.

In [22]:
import numpy as np

# Outlier detection using IQR
data = np.array([10, 12, 14, 15, 16, 17, 19, 22, 100])  # 100 is an outlier
Q1, Q3 = np.percentile(data, [25, 75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = data[(data < lower_bound) | (data > upper_bound)]
print("Outliers:", outliers)

Outliers: [100]


In [23]:
from scipy.stats import zscore

# Z-score method for outlier detection
z_scores = zscore(data)
outliers_z = data[np.abs(z_scores) > 3]
print("Outliers based on Z-scores:", outliers_z)

Outliers based on Z-scores: []
