# Feature Engineering - Customer Churn Prediction

This notebook focuses on feature engineering and preprocessing for the customer churn prediction model.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Read the raw churn export into the working frame and preview the first rows
raw_csv_path = '../data/raw/customer_churn.csv'
df = pd.read_csv(raw_csv_path)
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Data quality check: report null counts, duplicate rows and column dtypes
print("=== DATA QUALITY ASSESSMENT ===")

# Compute the summaries first, then print them in a fixed order
missing_per_column = df.isnull().sum()
duplicate_count = df.duplicated().sum()

print("Missing values per column:")
print(missing_per_column)
print(f"\nDuplicate rows: {duplicate_count}")
print("\nData types:")
print(df.dtypes)

In [None]:
# Handle missing values
print("=== HANDLING MISSING VALUES ===")

# total_charges may be stored as text, so coerce it to numeric first
if 'total_charges' in df.columns:
    # Entries that cannot be parsed as numbers become NaN (errors='coerce')
    df['total_charges'] = pd.to_numeric(df['total_charges'], errors='coerce')

    # Count the gaps BEFORE filling; counting afterwards would always report 0
    n_missing = int(df['total_charges'].isnull().sum())
    if n_missing > 0:
        median_charges = df['total_charges'].median()
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on the column selection: the chained in-place form is deprecated and
        # does not update the frame when pandas Copy-on-Write is active.
        df['total_charges'] = df['total_charges'].fillna(median_charges)
        print(f"Filled {n_missing} missing total_charges with median: {median_charges}")

# Remove exact duplicate rows, if any, and report how many were dropped
initial_shape = df.shape[0]
df = df.drop_duplicates()
print(f"Removed {initial_shape - df.shape[0]} duplicate rows")

print(f"\nFinal dataset shape: {df.shape}")

In [None]:
# Feature Engineering
print("=== FEATURE ENGINEERING ===")

# Charge-based features need all three source columns: tenure is the
# denominator of avg_monthly_charges, so it must be present as well.
if {'total_charges', 'monthly_charges', 'tenure'}.issubset(df.columns):
    # Average monthly charges over tenure
    df['avg_monthly_charges'] = df['total_charges'] / (df['tenure'] + 1)  # +1 to avoid division by zero

    # Current monthly charge relative to the historical average.
    # When avg_monthly_charges is 0 the quotient would be inf (or NaN for 0/0);
    # infinite values break the scaling step downstream, so those rows get the
    # neutral ratio 1.0. Rows where an input is genuinely NaN stay NaN.
    df['charges_ratio'] = (df['monthly_charges'] / df['avg_monthly_charges']).where(
        df['avg_monthly_charges'] != 0, 1.0
    )

    # Monthly charge projected over twelve months
    df['annual_charges'] = df['monthly_charges'] * 12

# Tenure categories
if 'tenure' in df.columns:
    # include_lowest=True keeps tenure == 0 in the first bucket (the default
    # right-closed bins exclude the left edge and would yield NaN), and the open
    # upper edge matches the '4+ years' label for tenures above 72.
    df['tenure_group'] = pd.cut(
        df['tenure'],
        bins=[0, 12, 24, 48, float('inf')],
        labels=['0-1 year', '1-2 years', '2-4 years', '4+ years'],
        include_lowest=True,
    )

# Monthly charges categories
if 'monthly_charges' in df.columns:
    # include_lowest=True so a charge of exactly 0 falls into 'Low' rather than NaN
    df['charges_group'] = pd.cut(
        df['monthly_charges'],
        bins=[0, 35, 65, 95, float('inf')],
        labels=['Low', 'Medium', 'High', 'Very High'],
        include_lowest=True,
    )

# Report only the engineered columns that were actually created
print("New features created:")
new_features = ['avg_monthly_charges', 'charges_ratio', 'annual_charges', 'tenure_group', 'charges_group']
for feature in new_features:
    if feature in df.columns:
        print(f"- {feature}")

df.head()

In [None]:
# Encode categorical variables
print("=== CATEGORICAL ENCODING ===")

# Separate numerical and categorical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Keep the target and the row identifier out of the encoding. A text
# customer_id would otherwise expand into one dummy column per customer, and
# the later "drop customer_id" step could no longer find the column to remove.
categorical_cols = [col for col in categorical_cols if col not in ('churn', 'customer_id')]

print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols}")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")

# One-hot encode all categorical columns in a single call. get_dummies returns
# a new frame (df itself is untouched), keeps the non-encoded columns first and
# appends the '<column>_<level>' dummies in column order; drop_first=True drops
# the first level of each column to avoid a redundant indicator.
if categorical_cols:
    df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
else:
    df_encoded = df.copy()

print(f"\nAfter encoding - Dataset shape: {df_encoded.shape}")
print(f"New columns: {df_encoded.shape[1] - df.shape[1]}")

In [None]:
# Feature scaling
print("=== FEATURE SCALING ===")

# Split the encoded frame into feature matrix X and target y (None when the
# 'churn' column is absent)
has_target = 'churn' in df_encoded.columns
X = df_encoded.drop('churn', axis=1) if has_target else df_encoded
y = df_encoded['churn'] if has_target else None

# The identifier carries no predictive signal, so drop it when present
if 'customer_id' in X.columns:
    X = X.drop('customer_id', axis=1)

print(f"Features shape: {X.shape}")
if y is not None:
    print(f"Target shape: {y.shape}")
    target_counts = y.value_counts().to_dict()
    print(f"Target distribution: {target_counts}")

# Standardise the numeric columns on a copy so X keeps its raw values.
# Note: the scaler is fitted on every row of X, i.e. before any train/test split.
numerical_features = X.select_dtypes(include=[np.number]).columns
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X[numerical_features])
X_scaled = X.copy()
X_scaled[numerical_features] = scaled_values

print(f"\nScaled {len(numerical_features)} numerical features")
print(f"Numerical features: {list(numerical_features)}")

In [None]:
# Feature importance analysis
print("=== FEATURE IMPORTANCE ANALYSIS ===")

# Both rankings need a target, so the whole cell is skipped when y is None
if y is not None:
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import mutual_info_classif

    # Random Forest feature importance (fixed seed for a reproducible ranking)
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_scaled, y)

    # Pair each column with its impurity-based importance, highest first
    feature_importance = pd.DataFrame({
        'feature': X_scaled.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    print("Top 10 most important features:")
    print(feature_importance.head(10))

    # Horizontal bar chart of the 15 strongest features, most important on top
    top_features = feature_importance.head(15)
    bar_positions = range(len(top_features))
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(bar_positions, top_features['importance'])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel('Feature Importance')
    ax.set_title('Top 15 Feature Importance (Random Forest)')
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()

    # Mutual information. Columns that were not standardised as numeric (the
    # one-hot indicators) are flagged as discrete; with the default 'auto'
    # setting a dense input has every column treated as continuous.
    is_discrete = ~X_scaled.columns.isin(numerical_features)
    mi_scores = mutual_info_classif(
        X_scaled, y, discrete_features=is_discrete, random_state=42
    )
    mi_df = pd.DataFrame({
        'feature': X_scaled.columns,
        'mutual_info': mi_scores
    }).sort_values('mutual_info', ascending=False)

    print("\nTop 10 features by Mutual Information:")
    print(mi_df.head(10))

In [None]:
# Train-test split
print("=== TRAIN-TEST SPLIT ===")

# A stratified split needs the target, so nothing is split or saved without it
if y is not None:
    # Split the UNSCALED features. Cutting the sets out of X_scaled would leak
    # test-set statistics into training, because that frame was standardised
    # with a mean/std computed over all rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit the scaler on the training rows only, then apply the same transform
    # to the held-out rows. Work on copies so X itself is left untouched.
    X_train = X_train.copy()
    X_test = X_test.copy()
    if len(numerical_features) > 0:
        split_scaler = StandardScaler()
        X_train[numerical_features] = split_scaler.fit_transform(X_train[numerical_features])
        X_test[numerical_features] = split_scaler.transform(X_test[numerical_features])

    print(f"Training set: {X_train.shape}")
    print(f"Test set: {X_test.shape}")
    print(f"\nTraining set churn distribution:")
    print(y_train.value_counts(normalize=True))
    print(f"\nTest set churn distribution:")
    print(y_test.value_counts(normalize=True))

    # Save processed data
    import os
    os.makedirs('../data/processed', exist_ok=True)

    # Save training and test sets (row index is not written)
    X_train.to_csv('../data/processed/X_train.csv', index=False)
    X_test.to_csv('../data/processed/X_test.csv', index=False)
    y_train.to_csv('../data/processed/y_train.csv', index=False)
    y_test.to_csv('../data/processed/y_test.csv', index=False)

    # Save full processed dataset. X_scaled is standardised with statistics
    # from all rows, so this file should not be used for model evaluation.
    processed_data = pd.concat([X_scaled, y], axis=1)
    processed_data.to_csv('../data/processed/customer_churn_processed.csv', index=False)

    print("\n✅ Processed data saved to ../data/processed/")
    print("Files saved:")
    print("- X_train.csv, X_test.csv")
    print("- y_train.csv, y_test.csv")
    print("- customer_churn_processed.csv")

In [None]:
# Feature engineering summary
print("=== FEATURE ENGINEERING SUMMARY ===")

# df was extended in place with the engineered columns and still holds the
# target / identifier columns, so df.shape[1] is not the original feature
# count. Subtract both groups to recover the number of raw input features.
engineered_cols = [name for name in new_features if name in df.columns]
non_feature_cols = [name for name in ('churn', 'customer_id') if name in df.columns]
original_feature_count = df.shape[1] - len(engineered_cols) - len(non_feature_cols)
final_feature_count = X_scaled.shape[1]

print(f"Original features: {original_feature_count}")
print(f"Final features: {final_feature_count}")
print(f"Features added: {final_feature_count - original_feature_count}")
print(f"\nFeature types:")
print(f"- Numerical: {len(numerical_features)}")
# Everything that was not standardised as numeric is a one-hot indicator column
print(f"- Categorical (encoded): {final_feature_count - len(numerical_features)}")
print(f"\nData preprocessing completed successfully! 🎉")