In [2]:
# pipeline 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
import pickle

# Model inputs, grouped by the preprocessing branch that handles them.
# Categorical columns are imputed and one-hot encoded.
categorical_features = [
    'iag_business_unit_ug',
    'iag_age_band_auto',
    'iag_tenure_band_enum',
    'iag_site_ug',
    'iag_product_type_auto',
    'iag_region_ug',
]
# Numeric columns are imputed, scaled and binned.
numeric_features = [
    'iag_trust_confidence_scale11',
    'iag_value_price_of_policy_reflects_scale11',
]

# Load the survey export and keep only the columns the model uses
df = pd.read_excel('IAG.xlsx')
X = df[categorical_features + numeric_features]
y = df['Likely to recommend']

# Keep only rows with a known, non-detractor label (label strings are
# case-sensitive). Series.isin() returns False for NaN, so without the
# notna() check a row with a missing label would pass the filter and then be
# silently coded as 0 (Passive) by the comparison below.
mask = y.notna() & ~y.isin(['Detract', 'Super Detract'])
X = X[mask]
# Binary target: 1 for Promote, 0 for every other remaining label (Passive)
y = (y[mask] == 'Promote').astype(int)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define the preprocessing for each column group. Nothing is fitted here;
# the transformers only learn their statistics when fit() is called on them.

# Numeric branch: fill gaps with the median, standardise, then cut each
# column into two quantile bins encoded as ordinal values.
# NOTE(review): quantile binning depends only on the ordering of the values,
# so the scaler ahead of it probably does not change the resulting bins -
# confirm before removing it.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('binning', KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='quantile')),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array. handle_unknown='ignore' keeps transform() from
# raising on categories that were not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Chain preprocessing and the classifier so both are always fitted and
# applied together as one estimator
classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    C=1.0,
    random_state=42,
)
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

# Fit preprocessing and classifier on the training split only
full_pipeline.fit(X_train, y_train)

# Accuracy on the data the model was fitted on
train_accuracy = full_pipeline.score(X_train, y_train)
print(f'Training accuracy: {train_accuracy:.3f}')

# Accuracy on the held-out split
test_accuracy = full_pipeline.score(X_test, y_test)
print(f'Test accuracy: {test_accuracy:.3f}')

# Persist the fitted pipeline (preprocessing + classifier) as a single object.
# NOTE: pickle files can execute arbitrary code when loaded - only unpickle
# this file from a trusted source.
with open('iag_full_pipeline.pkl', 'wb') as model_file:
    pickle.dump(full_pipeline, model_file)

Training accuracy: 0.810
Test accuracy: 0.807


*******

The key principle is: ANY transformation that learns from data (means, medians, modes, standard deviations, categories, bins, etc.) must learn only from the training data and then apply that learned transformation, unchanged, to the test data.

Let me explain this with a clear example of what data leakage through scaling looks like:

Let's say you have a dataset with a column for "salary" with these values:
```python
all_data = [30000, 45000, 1000000, 55000, 48000]
```

When scaling (like StandardScaler), we typically:
1. Calculate the mean (μ)
2. Calculate the standard deviation (σ)
3. Transform each value using: (x - μ) / σ

Here's the crucial difference:

BAD (Data Leakage):
```python
# Wrong way - the statistics are computed BEFORE the split, so they include
# the values that later end up in the test set
all_data = [30000, 45000, 1000000, 55000, 48000]
mean = np.mean(all_data)    # Influenced by test data!
std = np.std(all_data)      # Influenced by test data! (np.std defaults to ddof=0)

# Then splitting into train/test (first three values train, last two test)
train_data = [30000, 45000, 1000000]
test_data = [55000, 48000]

# Scaling both splits with statistics from ALL data - this is the leak
scaled_train = [(x - mean) / std for x in train_data]
scaled_test = [(x - mean) / std for x in test_data]
```

GOOD (No Leakage):
```python
# First split the data (first three values train, last two test)
train_data = [30000, 45000, 1000000]
test_data = [55000, 48000]

# Calculate statistics ONLY from training data
train_mean = np.mean(train_data)    # No test data influence
train_std = np.std(train_data)      # No test data influence (np.std defaults to ddof=0)

# Scale both splits using ONLY the training statistics; the test values are
# transformed with numbers they had no part in producing
scaled_train = [(x - train_mean) / train_std for x in train_data]
scaled_test = [(x - train_mean) / train_std for x in test_data]
```

The key difference:
- In the BAD example, we're "cheating" by letting our scaling know about the test data
- In the GOOD example, we're being realistic - in the real world, we won't know the statistics of future (test) data

This matters because:
1. In the BAD example, our model gets an unrealistic advantage by "seeing" the distribution of ALL data
2. The mean and standard deviation used to scale the test set were computed partly from the test set itself, which isn't possible in a real-world scenario where future data has not been seen yet
3. This can lead to overly optimistic model performance metrics that won't hold up in production

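This is why we use sklearn's Pipeline: calling `fit` on the training split fits every preprocessing step (scaler, imputer, encoder, ...) on that split only, and `predict`/`score` then reuse those fitted statistics on the test data. The protection comes from splitting first and passing only the training data to `fit` - a Pipeline fitted on the full dataset would still leak: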
```python
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression())
])

# fit() learns the scaler's mean/std from whatever data it is given, so pass
# it the training split only
pipeline.fit(X_train, y_train)
pipeline.predict(X_test)  # Test data scaled using training statistics
```