# Churn Prediction Model

This notebook scores customers with a simple demonstration churn model (a hand-written heuristic, not a trained model) and saves the predictions for the input data.

## Parameters
- `input_file`: Path to input CSV file
- `output_file`: Path to save predictions JSON
- `model_version`: Model version identifier

In [None]:
# Default parameters (will be overridden by papermill)
input_file = "../data/input/sample_customers.csv"  # path to the input CSV file
output_file = "../data/output/predictions.json"  # path where the predictions JSON is saved
model_version = "v0.0.1"  # model version identifier, echoed in the logs and the summary

In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Echo the effective run parameters, one per line, so each executed
# notebook records which inputs it was run with.
print(
    f"Model Version: {model_version}",
    f"Input File: {input_file}",
    f"Output File: {output_file}",
    sep="\n",
)

## Load Data

In [None]:
# Read the customer records to score from the configured CSV path
df = pd.read_csv(input_file)

record_count = df.shape[0]
column_names = list(df.columns)
print(f"Loaded {record_count} records")
print(f"Columns: {column_names}")

# Kept as the final expression so the notebook renders a preview of the first rows
df.head()

## Feature Engineering

In [None]:
# Prepare features for prediction
# Approach: coerce TotalCharges to a numeric column, then label-encode every
# text-valued feature column so the scoring step only sees numbers.

# Work on a copy so the raw input frame `df` is left untouched
df_processed = df.copy()

# Select features for the model
feature_columns = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
    'MonthlyCharges', 'TotalCharges'
]

# Handle TotalCharges (convert to numeric); values that cannot be parsed become NaN
df_processed['TotalCharges'] = pd.to_numeric(df_processed['TotalCharges'], errors='coerce')
# Fill those NaNs with the same row's MonthlyCharges. The result is assigned
# back rather than using fillna(inplace=True) on the column selection: that
# chained form is deprecated and, with pandas Copy-on-Write enabled, only
# modifies a temporary object and leaves df_processed unchanged.
df_processed['TotalCharges'] = df_processed['TotalCharges'].fillna(df_processed['MonthlyCharges'])

# Encode categorical variables
# NOTE(review): every encoder is fitted on this input file, so the integer
# given to a category depends on which categories occur in the file. The
# scoring heuristic compares Contract to 0 and PaymentMethod to 2 - confirm
# those codes still hold when a category is absent from the input.
label_encoders = {}
categorical_columns = df_processed[feature_columns].select_dtypes(include=['object']).columns

for col in categorical_columns:
    le = LabelEncoder()
    # Cast to str first so missing values are encoded as their own category
    df_processed[col] = le.fit_transform(df_processed[col].astype(str))
    # Keep the fitted encoder so the integer codes can be mapped back to labels
    label_encoders[col] = le

print(f"Processed {len(categorical_columns)} categorical features")
print(f"Feature shape: {df_processed[feature_columns].shape}")

## Train Simple Model

For demonstration purposes, we define a simple heuristic scoring function instead of training a model; no training data is used.
In production, you would load a pre-trained model or train on actual historical data.

In [None]:
# Stand-in "model" for demonstration: a fixed scoring rule, no training step
# In production, you would load a pre-trained model

def predict_churn_probability(row):
    """
    Score a single customer record with a hand-written churn heuristic.

    The score starts at a base of 0.3 and is adjusted by four signals:
    - Contract code 0: +0.3
    - tenure below 12: +0.2, tenure above 48: -0.2
    - MonthlyCharges above 80: +0.15
    - PaymentMethod code 2: +0.1

    NOTE(review): the original comments describe Contract code 0 as
    "month-to-month (typically)" and PaymentMethod code 2 as "electronic
    check"; this assumes label-encoded inputs - confirm against the
    encoders that produced the row.

    Args:
        row: Record indexable by column name (for example a DataFrame row
            or a dict) providing 'Contract', 'tenure', 'MonthlyCharges'
            and 'PaymentMethod'.

    Returns:
        The adjusted score as a float clamped to the range [0.0, 1.0].
    """
    # Each signal contributes one delta; they are collected in a fixed order
    # so the floating-point accumulation below is always the same.
    contract_delta = 0.3 if row['Contract'] == 0 else 0.0

    tenure = row['tenure']
    if tenure < 12:
        tenure_delta = 0.2
    elif tenure > 48:
        tenure_delta = -0.2
    else:
        tenure_delta = 0.0

    charges_delta = 0.15 if row['MonthlyCharges'] > 80 else 0.0
    payment_delta = 0.1 if row['PaymentMethod'] == 2 else 0.0

    probability = 0.3  # Base probability
    for delta in (contract_delta, tenure_delta, charges_delta, payment_delta):
        probability += delta

    # Clamp so the result is always a valid probability
    return min(max(probability, 0.0), 1.0)

print("Model ready for predictions")

## Make Predictions

In [None]:
# Score every customer: apply the heuristic row by row over the model
# features, then convert the resulting Series to a plain Python list.
feature_frame = df_processed[feature_columns]
churn_scores = feature_frame.apply(predict_churn_probability, axis=1)
predictions = churn_scores.tolist()

prediction_count = len(predictions)
print(f"Generated {prediction_count} predictions")
print(f"Sample predictions: {predictions[:5]}")
print(f"Average churn probability: {np.mean(predictions):.3f}")

## Save Predictions

In [None]:
# Save predictions to JSON file
import os

# Create the destination directory only when the path has a directory part:
# os.path.dirname() returns "" for a bare filename such as "predictions.json",
# and os.makedirs("") raises FileNotFoundError.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Explicit encoding keeps the written file independent of the platform default
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(predictions, f, indent=2)

print(f"Predictions saved to: {output_file}")

## Summary

In [None]:
# Create summary statistics
# Risk buckets: high is p > 0.7, medium is 0.4 <= p <= 0.7, low is p < 0.4.
# An empty input file yields no predictions; np.max / np.min raise ValueError
# on an empty sequence and np.mean returns NaN, so the aggregate fields are
# reported as None in that case instead of crashing the run.
has_predictions = len(predictions) > 0

summary = {
    'model_version': model_version,
    'total_predictions': len(predictions),
    'high_risk_customers': sum(1 for p in predictions if p > 0.7),
    'medium_risk_customers': sum(1 for p in predictions if 0.4 <= p <= 0.7),
    'low_risk_customers': sum(1 for p in predictions if p < 0.4),
    'average_churn_probability': float(np.mean(predictions)) if has_predictions else None,
    'max_churn_probability': float(np.max(predictions)) if has_predictions else None,
    'min_churn_probability': float(np.min(predictions)) if has_predictions else None
}

print("\n=== Prediction Summary ===")
for key, value in summary.items():
    print(f"{key}: {value}")

# The check mark was stored as mis-decoded UTF-8 bytes; restored to the intended character
print("\n✓ Prediction completed successfully!")