# 04 - Feature Engineering

**Objective**: Create domain-informed derived features for churn prediction

**Features to Create**:
- RevenuePerMinute, CallFailureRate
- CustomerCareIntensity, EquipmentAgeRatio
- Usage pattern ratios (InboundOutboundRatio, PeakOffPeakRatio, OverageIntensity, RoamingIntensity)

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print('Libraries loaded!')

In [None]:
# Input (preprocessed) and output (engineered) data directories
PRIMARY_PATH = Path('../data/03_primary')
FEATURE_PATH = Path('../data/04_feature')
# Create the output directory up front so the save step cannot fail on a missing folder
FEATURE_PATH.mkdir(parents=True, exist_ok=True)

# Read the preprocessed train and holdout splits
train_csv = PRIMARY_PATH / 'preprocessed_train.csv'
holdout_csv = PRIMARY_PATH / 'preprocessed_holdout.csv'
df_train = pd.read_csv(train_csv)
df_holdout = pd.read_csv(holdout_csv)

# Report (rows, columns) of each split
print(f"Train: {df_train.shape}")
print(f"Holdout: {df_holdout.shape}")

In [None]:
# Name of the label column; used later to separate features from the target
TARGET = 'Churn'

# Preview the first 20 column names of the training frame
first_columns = list(df_train.columns)[:20]
print(f"Columns: {first_columns}...")

## 1. Revenue Efficiency Features

In [None]:
def create_revenue_features(df):
    """Return a copy of ``df`` with revenue-efficiency features appended.

    A feature is only added when every column it is derived from is present;
    one confirmation line is printed per feature created. The input frame is
    not modified.

    Features:
        RevenuePerMinute: MonthlyRevenue / (MonthlyMinutes + 1)
        RevenueChangeVelocity: PercChangeRevenues * PercChangeMinutes
        RevenueToChargeRatio: MonthlyRevenue / (TotalRecurringCharge + 1)
    """
    out = df.copy()

    def has(*names):
        # True when every named source column exists in the working frame
        return all(name in out.columns for name in names)

    # The +1 in the denominators keeps them non-zero when the raw value is 0.
    if has('MonthlyRevenue', 'MonthlyMinutes'):
        minutes_plus_one = out['MonthlyMinutes'] + 1
        out['RevenuePerMinute'] = out['MonthlyRevenue'] / minutes_plus_one
        print(" Created: RevenuePerMinute")

    if has('PercChangeRevenues', 'PercChangeMinutes'):
        revenue_change = out['PercChangeRevenues']
        minutes_change = out['PercChangeMinutes']
        out['RevenueChangeVelocity'] = revenue_change * minutes_change
        print(" Created: RevenueChangeVelocity")

    if has('MonthlyRevenue', 'TotalRecurringCharge'):
        charge_plus_one = out['TotalRecurringCharge'] + 1
        out['RevenueToChargeRatio'] = out['MonthlyRevenue'] / charge_plus_one
        print(" Created: RevenueToChargeRatio")

    return out

# Run the same transformation on both splits (train first, then holdout)
df_train, df_holdout = (
    create_revenue_features(df_train),
    create_revenue_features(df_holdout),
)

## 2. Service Quality Features

In [None]:
def create_service_quality_features(df):
    """Return a copy of ``df`` with service-quality features appended.

    The input frame is not modified; one confirmation line is printed per
    feature created.

    Features:
        CallFailureRate: row-wise sum of whichever of DroppedCalls,
            BlockedCalls and UnansweredCalls exist, divided by the row-wise
            sum of whichever of PeakCallsInOut and OffPeakCallsInOut exist
            plus 1. Created when at least one column of each group exists.
        DroppedToBlockedRatio: DroppedCalls / (BlockedCalls + 1). Created
            only when both columns exist.
    """
    out = df.copy()

    # Keep only the failure / volume columns that are actually available
    failure_cols = [
        col for col in ('DroppedCalls', 'BlockedCalls', 'UnansweredCalls')
        if col in out.columns
    ]
    volume_cols = [
        col for col in ('PeakCallsInOut', 'OffPeakCallsInOut')
        if col in out.columns
    ]

    if failure_cols and volume_cols:
        failed_calls = out[failure_cols].sum(axis=1)
        total_calls = out[volume_cols].sum(axis=1)
        # +1 keeps the denominator non-zero for rows with no calls
        out['CallFailureRate'] = failed_calls / (total_calls + 1)
        print(" Created: CallFailureRate")

    if all(col in out.columns for col in ('DroppedCalls', 'BlockedCalls')):
        blocked_plus_one = out['BlockedCalls'] + 1
        out['DroppedToBlockedRatio'] = out['DroppedCalls'] / blocked_plus_one
        print(" Created: DroppedToBlockedRatio")

    return out

# Run the same transformation on both splits (train first, then holdout)
df_train, df_holdout = (
    create_service_quality_features(df_train),
    create_service_quality_features(df_holdout),
)

## 3. Customer Engagement Features

In [None]:
def create_engagement_features(df):
    """Return a copy of ``df`` with customer-engagement features appended.

    A feature is only added when both of its source columns are present; one
    confirmation line is printed per feature created. The input frame is not
    modified.

    Features:
        CustomerCareIntensity: CustomerCareCalls / (MonthsInService + 1)
        CareCallsPerRevenue: CustomerCareCalls / (MonthlyRevenue + 1)
    """
    out = df.copy()

    def has(*names):
        # True when every named source column exists in the working frame
        return all(name in out.columns for name in names)

    # Support calls normalised by tenure (+1 keeps the denominator non-zero at 0 months)
    if has('CustomerCareCalls', 'MonthsInService'):
        tenure_plus_one = out['MonthsInService'] + 1
        out['CustomerCareIntensity'] = out['CustomerCareCalls'] / tenure_plus_one
        print(" Created: CustomerCareIntensity")

    # Support calls normalised by monthly revenue
    if has('CustomerCareCalls', 'MonthlyRevenue'):
        revenue_plus_one = out['MonthlyRevenue'] + 1
        out['CareCallsPerRevenue'] = out['CustomerCareCalls'] / revenue_plus_one
        print(" Created: CareCallsPerRevenue")

    return out

# Run the same transformation on both splits (train first, then holdout)
df_train, df_holdout = (
    create_engagement_features(df_train),
    create_engagement_features(df_holdout),
)

## 4. Equipment & Account Features

In [None]:
def create_equipment_features(df):
    """Return a copy of ``df`` with equipment and household features appended.

    A feature is only added when both of its source columns are present; one
    confirmation line is printed per feature created. The input frame is not
    modified.

    Features:
        EquipmentAgeRatio: CurrentEquipmentDays / (MonthsInService * 30 + 1)
        AvgHouseholdAge: (AgeHH1 + AgeHH2) / 2
    """
    out = df.copy()

    def has(*names):
        # True when every named source column exists in the working frame
        return all(name in out.columns for name in names)

    if has('CurrentEquipmentDays', 'MonthsInService'):
        # Tenure is scaled by 30 to put it on a day basis; +1 keeps the
        # denominator non-zero at 0 months.
        tenure_days_plus_one = out['MonthsInService'] * 30 + 1
        out['EquipmentAgeRatio'] = out['CurrentEquipmentDays'] / tenure_days_plus_one
        print(" Created: EquipmentAgeRatio")

    if has('AgeHH1', 'AgeHH2'):
        combined_age = out['AgeHH1'] + out['AgeHH2']
        out['AvgHouseholdAge'] = combined_age / 2
        print(" Created: AvgHouseholdAge")

    return out

# Run the same transformation on both splits (train first, then holdout)
df_train, df_holdout = (
    create_equipment_features(df_train),
    create_equipment_features(df_holdout),
)

## 5. Usage Pattern Features

In [None]:
def create_usage_pattern_features(df):
    """Return a copy of ``df`` with usage-pattern ratio features appended.

    A feature is only added when both of its source columns are present; one
    confirmation line is printed per feature created. The input frame is not
    modified. Every denominator has 1 added so it stays non-zero when the raw
    value is 0.

    Features:
        InboundOutboundRatio: InboundCalls / (OutboundCalls + 1)
        PeakOffPeakRatio: PeakCallsInOut / (OffPeakCallsInOut + 1)
        OverageIntensity: OverageMinutes / (MonthlyMinutes + 1)
        RoamingIntensity: RoamingCalls / (MonthlyMinutes + 1)
    """
    out = df.copy()

    def has(*names):
        # True when every named source column exists in the working frame
        return all(name in out.columns for name in names)

    if has('InboundCalls', 'OutboundCalls'):
        outbound_plus_one = out['OutboundCalls'] + 1
        out['InboundOutboundRatio'] = out['InboundCalls'] / outbound_plus_one
        print(" Created: InboundOutboundRatio")

    if has('PeakCallsInOut', 'OffPeakCallsInOut'):
        off_peak_plus_one = out['OffPeakCallsInOut'] + 1
        out['PeakOffPeakRatio'] = out['PeakCallsInOut'] / off_peak_plus_one
        print(" Created: PeakOffPeakRatio")

    if has('OverageMinutes', 'MonthlyMinutes'):
        minutes_plus_one = out['MonthlyMinutes'] + 1
        out['OverageIntensity'] = out['OverageMinutes'] / minutes_plus_one
        print(" Created: OverageIntensity")

    if has('RoamingCalls', 'MonthlyMinutes'):
        minutes_plus_one = out['MonthlyMinutes'] + 1
        out['RoamingIntensity'] = out['RoamingCalls'] / minutes_plus_one
        print(" Created: RoamingIntensity")

    return out

# Run the same transformation on both splits (train first, then holdout)
df_train, df_holdout = (
    create_usage_pattern_features(df_train),
    create_usage_pattern_features(df_holdout),
)

## 6. Validate Engineered Features

In [None]:
# Engineered columns = training columns that are neither listed in the saved
# preprocessed feature list nor the target column.
feature_list_csv = PRIMARY_PATH / 'feature_list.csv'
original_cols = pd.read_csv(feature_list_csv)['feature'].tolist() + [TARGET]

new_features = [col for col in df_train.columns if col not in original_cols]

# Summary header followed by one bullet per engineered feature
print(f"\n NEW ENGINEERED FEATURES ({len(new_features)}):")
for f in new_features:
    print(f"   • {f}")

In [None]:
# Validate for inf/NaN
print("\n VALIDATION:")

for feat in new_features:
    # Inspect BOTH splits: a feature can be clean in train yet contain inf/NaN
    # in holdout (or vice versa), and both frames are written to disk later.
    inf_count = np.isinf(df_train[feat]).sum()
    nan_count = df_train[feat].isnull().sum()
    holdout_inf_count = np.isinf(df_holdout[feat]).sum()
    holdout_nan_count = df_holdout[feat].isnull().sum()

    has_issue = (
        inf_count > 0 or nan_count > 0
        or holdout_inf_count > 0 or holdout_nan_count > 0
    )

    if has_issue:
        print(
            f" {feat}: inf={inf_count}, nan={nan_count} "
            f"(holdout: inf={holdout_inf_count}, nan={holdout_nan_count})"
        )
        # Replace +/-inf with 0, then fill NaN with 0, identically in both
        # splits so train and holdout receive the same treatment.
        df_train[feat] = df_train[feat].replace([np.inf, -np.inf], 0).fillna(0)
        df_holdout[feat] = df_holdout[feat].replace([np.inf, -np.inf], 0).fillna(0)
    else:
        print(f" {feat}: OK")

In [None]:
# Feature correlation with target
if TARGET in df_train.columns:
    print("\n CORRELATION WITH CHURN:")
    for feat in new_features:
        corr = df_train[feat].corr(df_train[TARGET])
        if pd.isna(corr):
            # Correlation is undefined (NaN) when the feature or the target has
            # zero variance; `NaN > 0` is False, so without this guard such a
            # feature would be reported with a misleading "↓" direction.
            print(f"   {feat}: undefined (constant feature or target)")
            continue
        direction = "↑" if corr > 0 else "↓"
        print(f"   {feat}: {corr:+.4f} {direction}")

## 7. Save Engineered Features

In [None]:
# Destination files for the engineered splits
train_out = FEATURE_PATH / 'engineered_train.csv'
holdout_out = FEATURE_PATH / 'engineered_holdout.csv'

# Write both splits without the pandas index column
df_train.to_csv(train_out, index=False)
df_holdout.to_csv(holdout_out, index=False)

# Persist the list of every non-target column of the training frame
all_features = [col for col in df_train.columns if col != TARGET]
feature_table = pd.DataFrame({'feature': all_features})
feature_table.to_csv(FEATURE_PATH / 'all_features.csv', index=False)

# Summary of what was written
print(" Saved:")
print(f"   - {train_out}")
print(f"   - {holdout_out}")
print(f"\n Total features: {len(all_features)}")
print(f" New engineered: {len(new_features)}")

In [None]:
# Closing banner and pointer to the next notebook in the pipeline
banner = "=" * 60
print("\n" + banner)
print(" FEATURE ENGINEERING COMPLETE")
print(banner)
print("\n NEXT: Proceed to 05_Feature_Selection.ipynb")