1. Setup and Imports

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.base import BaseEstimator, TransformerMixin
import joblib
import sys
import os

In [3]:
# Add src to path
sys.path.append('src/')

2. Load and Clean Data

In [4]:
# Load the cleaned data from Task 2
df = pd.read_csv('data.csv')
print(f"Original shape: {df.shape}")

# Drop useless columns with no variance
columns_to_drop = ['BatchID', 'CountryCode', 'CurrencyCode']
existing_drops = [col for col in columns_to_drop if col in df.columns]
if existing_drops:
    df = df.drop(columns=existing_drops)
    print(f"Dropped columns: {existing_drops}")

print(f"Final shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

Original shape: (95662, 16)
Dropped columns: ['CountryCode', 'CurrencyCode']
Final shape: (95662, 14)
Columns: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy', 'FraudResult']


3. Custom Transformers

In [5]:
class AggregateFeatureTransformer(BaseEstimator, TransformerMixin):
    """Creates customer-level aggregate features"""

    def __init__(self):
        self.customer_stats = {}

    def fit(self, X, y=None):
        df = X.copy()

        # Customer transaction aggregates
        self.customer_stats = df.groupby('CustomerId').agg({
            'Amount': ['sum', 'mean', 'std', 'count', 'min', 'max'],
            'Value': ['sum', 'mean', 'std'],
            'FraudResult': ['sum', 'mean']  # fraud count and rate
        }).reset_index()

        # Flatten column names
        self.customer_stats.columns = ['CustomerId'] + [
            f'customer_{col[0]}_{col[1]}' for col in self.customer_stats.columns[1:]
        ]

        return self

    def transform(self, X):
        df = X.copy()

        # Merge aggregate features
        df_merged = df.merge(self.customer_stats, on='CustomerId', how='left')

        # Fill NaN for new customers
        agg_cols = [col for col in df_merged.columns if col.startswith('customer_')]
        df_merged[agg_cols] = df_merged[agg_cols].fillna(0)

        return df_merged

class TimeFeatureExtractor(BaseEstimator, TransformerMixin):
    """Extracts time-based features from TransactionStartTime"""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()

        # Convert to datetime
        df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

        # Extract time features
        df['transaction_hour'] = df['TransactionStartTime'].dt.hour
        df['transaction_day'] = df['TransactionStartTime'].dt.day
        df['transaction_month'] = df['TransactionStartTime'].dt.month
        df['transaction_dayofweek'] = df['TransactionStartTime'].dt.dayofweek

        # Business hours (9-17)
        df['is_business_hours'] = ((df['transaction_hour'] >= 9) &
                                 (df['transaction_hour'] <= 17)).astype(int)

        # Weekend
        df['is_weekend'] = (df['transaction_dayofweek'].isin([5, 6])).astype(int)

        return df

class AmountTransformer(BaseEstimator, TransformerMixin):
    """Handles Amount and Value transformation"""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()

        # Handle negative amounts (refunds/chargebacks)
        df['is_refund'] = (df['Amount'] < 0).astype(int)
        df['abs_amount'] = df['Amount'].abs()

        # Log transform for skewness (add 1 to handle zeros)
        df['amount_log'] = np.log1p(df['abs_amount'])
        df['value_log'] = np.log1p(df['Value'])

        # Amount bins
        df['amount_category'] = pd.cut(df['abs_amount'],
                                     bins=[0, 100, 1000, 5000, float('inf')],
                                     labels=['small', 'medium', 'large', 'xlarge'])

        return df

4. Main Feature Engineering Pipeline

In [6]:
def create_feature_pipeline():
    """Creates the complete feature engineering pipeline"""

    # Define column groups
    categorical_cols = ['ProductCategory', 'ChannelId', 'PricingStrategy']
    high_cardinality_cols = ['ProviderId', 'ProductId']
    numerical_cols = ['Amount', 'Value']

    # Custom transformers pipeline
    custom_pipeline = Pipeline([
        ('aggregate', AggregateFeatureTransformer()),
        ('time_features', TimeFeatureExtractor()),
        ('amount_transform', AmountTransformer())
    ])

    # Preprocessing for different column types
    preprocessor = ColumnTransformer([
        # Categorical - One Hot Encoding
        ('cat_onehot', OneHotEncoder(drop='first', sparse_output=False),
         categorical_cols),

        # High cardinality - Imputation + Ordinal Encoding
        ('high_card', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ordinal_enc', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)) # Use OrdinalEncoder here
        ]), high_cardinality_cols),

        # Numerical - Imputation + Scaling
        ('numerical', Pipeline([
            ('imputer', KNNImputer(n_neighbors=5)),
            ('scaler', StandardScaler())
        ]), numerical_cols + ['amount_log', 'value_log', 'abs_amount']),

        # Time features - Min-Max scaling
        ('time', MinMaxScaler(),
         ['transaction_hour', 'transaction_day', 'transaction_month'])
    ], remainder='passthrough')

    # Complete pipeline
    feature_pipeline = Pipeline([
        ('custom_features', custom_pipeline),
        ('preprocessing', preprocessor)
    ])

    return feature_pipeline

5. Execute Feature Engineering

In [7]:
# Create and fit the pipeline
print("Creating feature engineering pipeline...")
pipeline = create_feature_pipeline()

# Fit and transform the data
print("Fitting pipeline on training data...")
X_transformed = pipeline.fit_transform(df)

print(f"Original shape: {df.shape}")
print(f"Transformed shape: {X_transformed.shape}")

Creating feature engineering pipeline...
Fitting pipeline on training data...
Original shape: (95662, 14)
Transformed shape: (95662, 47)


6. Feature Analysis


In [8]:
# Get feature names after transformation
def get_feature_names(pipeline, df):
    """Extract feature names from the fitted pipeline"""

    # Apply custom transformations first
    df_custom = pipeline.named_steps['custom_features'].transform(df)

    # Get preprocessor
    preprocessor = pipeline.named_steps['preprocessing']
    feature_names = []

    for name, transformer, columns in preprocessor.transformers_:
        if name == 'cat_onehot':
            if hasattr(transformer, 'get_feature_names_out'):
                names = transformer.get_feature_names_out(columns).tolist()
            else:
                names = [f"{col}_{val}" for col in columns for val in transformer.categories_[columns.index(col)][1:]]
        elif name == 'high_card':
            names = [f"{col}_encoded" for col in columns]
        elif name == 'numerical':
            names = columns
        elif name == 'time':
            names = columns
        elif name == 'remainder':
            # Handle remainder columns
            remainder_cols = [col for col in df_custom.columns
                            if col not in sum([list(cols) for _, _, cols in preprocessor.transformers_[:-1]], [])]
            names = remainder_cols
        else:
            names = columns if isinstance(columns, list) else [columns]

        feature_names.extend(names)

    return feature_names

# Get feature names and create DataFrame
try:
    feature_names = get_feature_names(pipeline, df)
    X_transformed_df = pd.DataFrame(X_transformed, columns=feature_names)

    print("Feature engineering completed!")
    print(f"Total features created: {len(feature_names)}")

    # Count feature types
    cat_count = sum(1 for name in feature_names if any(cat in name for cat in ['ProductCategory', 'ChannelId', 'PricingStrategy']))
    high_card_count = sum(1 for name in feature_names if 'encoded' in name)
    numerical_count = sum(1 for name in feature_names if name in ['Amount', 'Value', 'amount_log', 'value_log', 'abs_amount'])
    time_count = sum(1 for name in feature_names if 'transaction_' in name or name in ['is_business_hours', 'is_weekend'])
    customer_count = sum(1 for name in feature_names if 'customer_' in name)

    print("\nFeature types:")
    print(f"- categorical_onehot: {cat_count}")
    print(f"- high_cardinality_encoded: {high_card_count}")
    print(f"- numerical: {numerical_count}")
    print(f"- time_feature: {time_count}")
    print(f"- remainder: {len(feature_names) - cat_count - high_card_count - numerical_count - time_count}")

except Exception as e:
    print(f"Error getting feature names: {e}")
    feature_names = [f"feature_{i}" for i in range(X_transformed.shape[1])]
    X_transformed_df = pd.DataFrame(X_transformed, columns=feature_names)



Feature engineering completed!
Total features created: 47

Feature types:
- categorical_onehot: 14
- high_cardinality_encoded: 2
- numerical: 5
- time_feature: 6
- remainder: 20


7. Class Imbalance Analysis


In [9]:
# Check class imbalance for fraud detection
print("CLASS IMBALANCE ANALYSIS")
print("="*40)

fraud_counts = df['FraudResult'].value_counts()
fraud_percentages = df['FraudResult'].value_counts(normalize=True) * 100

print("Fraud Distribution:")
for class_val, count in fraud_counts.items():
    percentage = fraud_percentages[class_val]
    print(f"Class {class_val}: {count:,} samples ({percentage:.2f}%)")

# Calculate imbalance ratio
fraud_ratio = fraud_counts[0] / fraud_counts[1] if len(fraud_counts) > 1 else 0
print(f"Imbalance Ratio: {fraud_ratio:.1f}:1 (non-fraud:fraud)")

if fraud_ratio > 10:
    print("⚠️  SEVERE CLASS IMBALANCE DETECTED!")
    print("Recommendations:")
    print("- Use SMOTE or ADASYN for oversampling")
    print("- Apply class weights in model")
    print("- Consider ensemble methods (Random Forest, XGBoost)")
    print("- Use stratified sampling")
    print("- Focus on Precision, Recall, F1-score over Accuracy")


CLASS IMBALANCE ANALYSIS
Fraud Distribution:
Class 0: 95,469 samples (99.80%)
Class 1: 193 samples (0.20%)
Imbalance Ratio: 494.7:1 (non-fraud:fraud)
⚠️  SEVERE CLASS IMBALANCE DETECTED!
Recommendations:
- Use SMOTE or ADASYN for oversampling
- Apply class weights in model
- Consider ensemble methods (Random Forest, XGBoost)
- Use stratified sampling
- Focus on Precision, Recall, F1-score over Accuracy


8. Skewness Analysis

In [10]:
# Analyze skewness before and after transformation
from scipy import stats

print("SKEWNESS ANALYSIS")
print("="*40)

# Original skewness
original_amount_skew = stats.skew(df['Amount'])
original_value_skew = stats.skew(df['Value'])

print("Original Skewness:")
print(f"Amount: {original_amount_skew:.2f}")
print(f"Value: {original_value_skew:.2f}")

# After log transformation
df_with_logs = pipeline.named_steps['custom_features'].transform(df)
log_amount_skew = stats.skew(df_with_logs['amount_log'])
log_value_skew = stats.skew(df_with_logs['value_log'])

print("After Log Transform:")
print(f"Amount (log): {log_amount_skew:.2f}")
print(f"Value (log): {log_value_skew:.2f}")

print("Skewness Reduction:")
print(f"Amount: {original_amount_skew:.2f} → {log_amount_skew:.2f} ({abs(original_amount_skew - log_amount_skew):.1f} improvement)")
print(f"Value: {original_value_skew:.2f} → {log_value_skew:.2f} ({abs(original_value_skew - log_value_skew):.1f} improvement)")

if abs(log_amount_skew) < 2 and abs(log_value_skew) < 2:
    print("✅ Log transformation successfully reduced skewness!")
else:
    print("⚠️  Consider additional transformations (Box-Cox, Yeo-Johnson)")


SKEWNESS ANALYSIS
Original Skewness:
Amount: 51.10
Value: 51.29
After Log Transform:
Amount (log): -0.21
Value (log): -0.20
Skewness Reduction:
Amount: 51.10 → -0.21 (51.3 improvement)
Value: 51.29 → -0.20 (51.5 improvement)
✅ Log transformation successfully reduced skewness!




9. Missing Values Analysis


In [11]:
print("MISSING VALUES ANALYSIS")
print("="*40)

print("Original missing values:")
original_missing = df.isnull().sum()
print(original_missing[original_missing > 0] if original_missing.sum() > 0 else "No missing values")

print("Missing values in transformed data:")
# Select only numeric columns from the transformed DataFrame for NaN check
numeric_cols_transformed = X_transformed_df.select_dtypes(include=np.number)

if numeric_cols_transformed.isnull().values.any():
    missing_counts = numeric_cols_transformed.isnull().sum()
    print(missing_counts[missing_counts > 0])
else:
    print("✅ No missing values in final transformed data!")

print("Imputation Summary:")
print("- KNN Imputer used for numerical features")
print("- Most frequent strategy for categorical features")
print("- Customer aggregates filled with 0 for new customers")

MISSING VALUES ANALYSIS
Original missing values:
No missing values
Missing values in transformed data:
✅ No missing values in final transformed data!
Imputation Summary:
- KNN Imputer used for numerical features
- Most frequent strategy for categorical features
- Customer aggregates filled with 0 for new customers


10. Save Pipeline and Data

In [12]:
# Create output directories
os.makedirs('models/', exist_ok=True)
os.makedirs('data/processed/', exist_ok=True)

# Save the fitted pipeline
joblib.dump(pipeline, 'models/feature_pipeline.pkl')

# Save transformed data
X_transformed_df.to_csv('data/processed/features_engineered.csv', index=False)

# Save feature names
with open('data/processed/feature_names.txt', 'w') as f:
    for name in feature_names:
        f.write(f"{name}\n")

print("FEATURE ENGINEERING PIPELINE SUMMARY")
print("="*60)
print(f"✓ Data processed: {df.shape[0]:,} rows")
print(f"✓ Features created: {X_transformed.shape[1]} total")
print(f"✓ Custom transformers: 3 (Aggregates, Time, Amount)")
print(f"✓ Encoding: One-hot + Label encoding")
print(f"✓ Imputation: KNN for numerical, Mode for categorical")
print(f"✓ Scaling: StandardScaler + MinMaxScaler")
print(f"✓ Pipeline saved: models/feature_pipeline.pkl")
print(f"✓ Data saved: data/processed/features_engineered.csv")
print("="*60)


FEATURE ENGINEERING PIPELINE SUMMARY
✓ Data processed: 95,662 rows
✓ Features created: 47 total
✓ Custom transformers: 3 (Aggregates, Time, Amount)
✓ Encoding: One-hot + Label encoding
✓ Imputation: KNN for numerical, Mode for categorical
✓ Scaling: StandardScaler + MinMaxScaler
✓ Pipeline saved: models/feature_pipeline.pkl
✓ Data saved: data/processed/features_engineered.csv
