# In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# --- 1. Sample Data Creation (mimicking a typical ML dataset) ---
# Synthetic 100-row dataset: two numeric features, one nominal feature, one
# ordinal feature, one numeric feature with 20 missing values, and a binary target.
# NOTE: NumPy's global RNG is not seeded here, so the data differs on every run.
data = {
    'feature1_num': np.random.rand(100) * 10,
    'feature2_num': np.random.rand(100) * 5,
    'feature3_cat': np.random.choice(['A', 'B', 'C'], size=100),
    'feature4_ord': np.random.choice(['Low', 'Medium', 'High'], size=100, p=[0.2, 0.5, 0.3]),
    'feature5_nan_num': np.concatenate([np.random.rand(80) * 20, np.full(20, np.nan)]),  # last 20 are NaN
    'target': np.random.choice([0, 1], size=100),
}
df = pd.DataFrame(data)
# Spread the NaNs across rows by assigning a shuffled copy back to the column.
# Shuffling `df[...].values` in place is unreliable: with pandas Copy-on-Write
# the exposed array is read-only (np.random.shuffle raises ValueError), and
# otherwise the change only reaches the DataFrame if the array happens to be a view.
df['feature5_nan_num'] = np.random.permutation(df['feature5_nan_num'].to_numpy())

print("Sample DataFrame head:")
print(df.head())
print("\nMissing values before preprocessing:")
print(df.isnull().sum())

# Separate the label column (y) from the predictor columns (X)
y = df['target']
X = df.drop(columns='target')

# --- 2. Train-Test Split ---
# Hold out 20% of the rows; stratifying on y keeps the class proportions
# comparable between the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"\nShape of X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}, y_test: {y_test.shape}")


# --- 3. Preprocessing with Scikit-learn ---
# Group the columns by the kind of preprocessing they need
categorical_features = ['feature3_cat']
numerical_features = ['feature1_num', 'feature2_num', 'feature5_nan_num']
# 'feature4_ord' has a natural order and would ideally go through
# OrdinalEncoder; to keep the example short it is one-hot encoded instead.
# With OrdinalEncoder it would look like:
# ordinal_features = ['feature4_ord']
# ordinal_categories = [['Low', 'Medium', 'High']] # Define order for OrdinalEncoder

# One preprocessing pipeline per column kind

# Numerical columns: fill gaps with the column median, then standardize
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # covers the NaNs in 'feature5_nan_num'
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value (none are missing
# in this sample, but new data might contain some), then one-hot encode
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),  # categories unseen during fit are ignored
])

# --- 4. Combine Preprocessing Steps using ColumnTransformer ---
# Route each group of columns through its own pipeline. The ordinal column is
# sent through the categorical (one-hot) pipeline for now.
preprocessor = ColumnTransformer(
    remainder='passthrough',  # keep any column not listed below ('drop' is the other option)
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, [*categorical_features, 'feature4_ord']),
    ],
)
# Note: If 'feature4_ord' was handled by OrdinalEncoder:
# ('ord', OrdinalEncoder(categories=ordinal_categories), ordinal_features)

# --- 5. Create the Full ML Pipeline (Preprocessing + Model) ---
# Logistic Regression serves as the example estimator; chaining it after the
# preprocessor means both are fitted together on the training data only.
ml_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, solver='liblinear')),
])

# --- 6. Train the Model using the Pipeline ---
print("\nTraining the ML pipeline...")
ml_pipeline.fit(X_train, y_train)
print("Training complete.")

# --- 7. Make Predictions on the Test Set ---
# Hard class labels for the held-out rows
y_pred = ml_pipeline.predict(X_test)
# Column 1 of predict_proba holds the probability of the positive class
y_pred_proba = ml_pipeline.predict_proba(X_test)[:, 1]

# --- 8. Evaluate the Model ---
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

# Optional: inspect the fitted steps of the pipeline (uncomment to run)
# print("\nFitted preprocessor:")
# print(ml_pipeline.named_steps['preprocessor'])
# print("\nClassifier coefficients (if applicable and preprocessed features are accessible):")
# Mapping coefficients back to input columns is not straightforward after
# ColumnTransformer; it requires the preprocessor's get_feature_names_out().

# --- Example of how preprocessed data looks (optional to show) ---
# Uses the already-fitted preprocessor step, so run it only after the fit above.
# X_train_transformed = ml_pipeline.named_steps['preprocessor'].transform(X_train)
# print(f"\nShape of transformed X_train: {X_train_transformed.shape}")
# print("First 5 rows of transformed X_train (can be a sparse matrix or numpy array):")
# print(X_train_transformed[:5])

# Final marker so the end of the demo is visible in the output
print("\nScikit-learn pipeline demonstration complete.")

# Further points to discuss:
# - GridSearch/RandomizedSearch for hyperparameter tuning with pipelines.
# - Saving and loading pipelines (using joblib or pickle).
# - Custom transformers.
# - More complex feature engineering within pipelines.