<a href="https://colab.research.google.com/github/sdgroeve/Machine_Learning_course_UGent_D012554_2025/blob/main/notebooks/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing for Machine Learning

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Global plot appearance: apply the whitegrid style sheet first, then the seaborn
# "deep" palette (set afterwards so the palette is the one left in effect).
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette(palette="deep")

## Loading a Dataset

The breast cancer dataset is a classic dataset in machine learning. It contains features computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. These features describe characteristics of the cell nuclei present in the image.

The dataset includes 569 instances with 30 features each. The target variable indicates whether the cancer is malignant (0) or benign (1).

In [None]:
# Load the breast cancer data shipped with scikit-learn and report its layout
cancer = datasets.load_breast_cancer()
print(f"Features: {cancer.feature_names}")
print(f"Target: {cancer.target_names}")
print(f"Dataset shape: {cancer.data.shape}")

# One DataFrame holding every feature column plus the class label as 'target'
df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names).assign(target=cancer.target)

In [None]:
# Preview the first five rows
df.head(n=5)

## Exploratory Data Analysis

In [None]:
# Summary statistics, transposed so that each column of df becomes one row
df.describe().transpose()

In [None]:
# Horizontal boxplots of all features on one shared axis, to show how widely the
# raw feature scales differ. The class label is excluded: it is not a feature.
ax = df.drop(columns='target').boxplot(vert=False, figsize=(12, 6))
ax.set_title('Feature value ranges (unscaled)')
ax.set_xlabel('Value')
plt.show()

In [None]:
def plot_feature_distribution(feature_name, data=None):
    """Plot one feature's per-class histograms and per-class boxplot side by side.

    Parameters
    ----------
    feature_name : str
        Name of the column to visualise.
    data : pandas.DataFrame, optional
        Frame containing ``feature_name`` and a ``'target'`` column coded
        0 (drawn as "Malignant") and 1 (drawn as "Benign"). When omitted, the
        notebook-level ``df`` is used.

    Raises
    ------
    KeyError
        If ``feature_name`` or ``'target'`` is not a column of ``data``.
    """
    if data is None:
        data = df  # fall back to the notebook-level frame

    missing = [col for col in (feature_name, 'target') if col not in data.columns]
    if missing:
        raise KeyError(f"Column(s) not found in data: {missing}")

    values = data[feature_name]
    malignant_values = values[data['target'] == 0]
    benign_values = values[data['target'] == 1]

    # Both classes share one set of bin edges; with independent binning the
    # bar widths differ and the overlaid counts cannot be compared directly.
    bin_edges = np.histogram_bin_edges(values.dropna(), bins='auto')

    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 6))

    sns.histplot(malignant_values, bins=bin_edges, color='red', label='Malignant', kde=True, ax=ax_hist)
    sns.histplot(benign_values, bins=bin_edges, color='blue', label='Benign', kde=True, ax=ax_hist)
    ax_hist.set_title(f'Distribution of {feature_name} by Target Class')
    ax_hist.set_xlabel(feature_name)
    ax_hist.set_ylabel('Count')
    ax_hist.legend()

    sns.boxplot(x='target', y=feature_name, data=data, ax=ax_box)
    ax_box.set_title(f'Boxplot of {feature_name} by Target Class')
    # Replace the numeric class codes on the x axis with readable names
    ax_box.set_xticks([0, 1])
    ax_box.set_xticklabels(['Malignant', 'Benign'])

    fig.tight_layout()
    plt.show()

# Example: class-wise distribution of the 'mean radius' feature
plot_feature_distribution(feature_name='mean radius')

## Data Splitting

In [None]:
# Separate features and target.
# The feature columns are selected by name rather than by dropping 'target', so
# this cell yields the same purely numeric X even when it is re-run after the
# synthetic 'category' column has been added to df further down.
X = df[list(cancer.feature_names)]
y = df['target']

# Split the data into training and testing sets (70% / 30%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Sanity check: every row ends up in exactly one of the two sets
assert len(X_train) + len(X_test) == len(X)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

## Feature Scaling

### StandardScaler (Z-score normalization)

In [None]:
print("Before scaling (first 5 rows, first 5 features):")
print(X_train.iloc[:5, :5])

# Standardization (z-score normalization): fit on the training data only and
# reuse the fitted statistics for the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame for easier viewing. The original row index is kept
# so the scaled rows can be matched with the "before" rows and with y_train.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
print("\nAfter StandardScaler (first 5 rows, first 5 features):")
print(X_train_scaled_df.iloc[:5, :5])

print("\nMean and standard deviation of 'mean radius' after scaling:")
print(f"Mean: {X_train_scaled_df['mean radius'].mean():.6f}")
# StandardScaler divides by the population standard deviation (ddof=0), while
# pandas' .std() defaults to the sample estimate (ddof=1); use ddof=0 here so
# the printed value is 1 rather than sqrt(n / (n - 1)).
print(f"Std: {X_train_scaled_df['mean radius'].std(ddof=0):.6f}")

In [None]:
# Side-by-side histograms of one feature: raw values vs. standardized values
feature = 'mean radius'
fig, (ax_raw, ax_scaled) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: the feature as it appears in the training data
sns.histplot(X_train[feature], kde=True, ax=ax_raw)
ax_raw.set_title(f'Distribution of {feature} (Before Scaling)')
ax_raw.set_xlabel(feature)

# Right panel: the same feature after StandardScaler
sns.histplot(X_train_scaled_df[feature], kde=True, ax=ax_scaled)
ax_scaled.set_title(f'Distribution of {feature} (After StandardScaler)')
ax_scaled.set_xlabel(f'{feature} (scaled)')

fig.tight_layout()
plt.show()

### MinMaxScaler (0-1 normalization)

In [None]:
# Min-Max scaling: fit the per-feature min/max on the training data only and
# reuse them for the test data.
min_max_scaler = MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_test_minmax = min_max_scaler.transform(X_test)

# Convert back to DataFrame, keeping X_train's row index so the scaled rows
# stay aligned with the unscaled rows and with y_train.
X_train_minmax_df = pd.DataFrame(X_train_minmax, columns=X_train.columns, index=X_train.index)
print("After MinMaxScaler (first 5 rows, first 5 features):")
print(X_train_minmax_df.iloc[:5, :5])

print("\nMin and max of 'mean radius' after scaling:")
print(f"Min: {X_train_minmax_df['mean radius'].min():.6f}")
print(f"Max: {X_train_minmax_df['mean radius'].max():.6f}")

In [None]:
# Side-by-side histograms of `feature`: raw values vs. min-max scaled values
fig, (ax_original, ax_minmax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: unscaled training values
sns.histplot(X_train[feature], kde=True, ax=ax_original)
ax_original.set_title(f'Distribution of {feature} (Before Scaling)')
ax_original.set_xlabel(feature)

# Right panel: the same values after MinMaxScaler
sns.histplot(X_train_minmax_df[feature], kde=True, ax=ax_minmax)
ax_minmax.set_title(f'Distribution of {feature} (After MinMaxScaler)')
ax_minmax.set_xlabel(f'{feature} (scaled)')

fig.tight_layout()
plt.show()

## Working with Categorical Features

In [None]:
# Attach a synthetic categorical column to df for the encoding demonstrations:
# one label per row, drawn at random from `categories`.
np.random.seed(42)  # fixed seed so the same labels are drawn on every run
categories = ['A+', 'A-', 'B+', 'B-', 'O+', 'O-', 'AB+', 'AB-']
df['category'] = np.random.choice(categories, size=len(df))

print("\nAdded a synthetic categorical feature:")
print(df.loc[:, ['category', 'target']].head(10))

### One-Hot Encoding

In [None]:
# Fit a dense one-hot encoder on the 'category' column, then encode that column
encoder = OneHotEncoder(sparse_output=False).fit(df[['category']])
encoded = encoder.transform(df[['category']])

# Names of the generated indicator columns
encoded_feature_names = encoder.get_feature_names_out(['category'])
print(f"Encoded feature names: {encoded_feature_names}")

# Wrap the encoded array in a DataFrame labelled with those names
encoded_df = pd.DataFrame(data=encoded, columns=encoded_feature_names)
print("\nOne-hot encoded features (first 10 rows):")
print(encoded_df.head(10))

# Show the original label next to its indicator columns
print("\nOriginal categorical data vs. encoded data:")
comparison = pd.concat(
    [df[['category']].reset_index(drop=True), encoded_df.reset_index(drop=True)],
    axis='columns',
)
print(comparison.head(10))

## Creating a Preprocessing Pipeline

In [None]:
# Define which columns are numerical and which are categorical.
# .tolist() turns the NumPy string array into plain Python strings, so the
# names print cleanly later instead of as NumPy scalar reprs.
numerical_features = cancer.feature_names.tolist()
categorical_features = ['category']

# Create transformers for each type of feature
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# drop='first' removes one indicator column per categorical feature
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first'))
])

# Combine transformers using ColumnTransformer: each transformer is applied
# only to the columns listed next to it.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create the full dataset with the categorical feature
X_with_cat = df.drop('target', axis=1)
y = df['target']

# Split the data (same test_size and random_state as the earlier split)
X_train_with_cat, X_test_with_cat, y_train, y_test = train_test_split(
    X_with_cat, y, test_size=0.3, random_state=42
)

# Apply the preprocessing pipeline: fit on the training split only, then
# reuse the fitted transformers on the test split.
X_train_processed = preprocessor.fit_transform(X_train_with_cat)
X_test_processed = preprocessor.transform(X_test_with_cat)

print(f"Shape before preprocessing: {X_train_with_cat.shape}")
print(f"Shape after preprocessing: {X_train_processed.shape}")

# Get the feature names after transformation: numerical names are unchanged,
# categorical names come from the fitted one-hot encoder.
numerical_feature_names = numerical_features
categorical_feature_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features)
all_feature_names = list(numerical_feature_names) + list(categorical_feature_names)

# The hand-built name list must describe exactly the columns that were produced
assert len(all_feature_names) == X_train_processed.shape[1], "feature names do not match processed columns"

print(f"\nFeature names after preprocessing: {all_feature_names[:5]} ... {all_feature_names[-2:]} (total: {len(all_feature_names)})")