## Plan: list of candidate features and transformations

## Imports, then load the processed EDA snapshot if it exists (otherwise the raw file)

In [3]:
import os, json
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Display every column when a DataFrame is rendered in the notebook.
pd.set_option("display.max_columns", None)

# Locations of the processed artefacts and of the raw fallback file.
PROCESSED_DIR = "../data/processed"
RAW_PATH = "../data/raw/telco_secret.csv"
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Prefer the snapshot written after EDA; fall back to the raw CSV when that
# snapshot has not been produced yet.
source_e = os.path.join(PROCESSED_DIR, "01_after_eda.csv")
have_snapshot = os.path.exists(source_e)
load_path = source_e if have_snapshot else RAW_PATH

df = pd.read_csv(load_path)
print("Loaded from" if have_snapshot else "Loaded raw from", load_path)

df.shape

Loaded from ../data/processed/01_after_eda.csv


(3000, 21)

## Recreate previous engineered features to be safe

In [4]:
# Rebuild the derived columns so this notebook does not depend on them being
# present in whichever file was loaded above.

# Tenure buckets are right-closed: (-1, 6], (6, 24], (24, 48], (48, inf).
# The top edge is open-ended to match the '49+' label; a finite upper edge
# would turn any larger tenure into NaN, which the drop_first one-hot encoding
# later in this notebook cannot tell apart from the '0-6' baseline.
df['tenure_bucket'] = pd.cut(
    df['tenure'],
    bins=[-1, 6, 24, 48, float('inf')],
    labels=['0-6', '7-24', '25-48', '49+'],
)

# Quartile bins of the monthly charge, computed on the rows currently in df.
df['monthly_charges_bin'] = pd.qcut(
    df['monthly_charges'],
    q=4,
    labels=['Low', 'Med-Low', 'Med-High', 'High'],
)

# Binary target: 1 where churn is exactly the string 'Yes', 0 for anything else
# (including missing values).
df['churn_flag'] = (df['churn'] == 'Yes').astype(int)

## Optional: create numeric transformations and outlier handling

In [5]:
# Limit the upper tail of monthly_charges: values above the 99th percentile
# are replaced by that percentile. The threshold is computed from all rows
# currently in df.
monthly = df['monthly_charges']
cap = monthly.quantile(q=0.99)
df['monthly_charges_cap'] = monthly.clip(upper=cap)

# Compare the raw and capped columns side by side; only the top of the
# distribution should differ.
summary_cols = ['monthly_charges', 'monthly_charges_cap']
df[summary_cols].describe()

Unnamed: 0,monthly_charges,monthly_charges_cap
count,3000.0,3000.0
mean,80.55857,80.550955
std,40.54296,40.530117
min,10.03,10.03
25%,45.55,45.55
50%,81.25,81.25
75%,115.455,115.455
max,149.99,148.3505


## Persona (clustering hybrid): k-means clusters ranked by churn rate, recomputed on every run

In [6]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Behavioural columns used to group customers. Missing values are replaced
# with 0 before scaling.
behav_cols = ['tenure','monthly_charges_cap','app_usage_min_per_week','complaint_count_6m','engagement_score']
X = df[behav_cols].fillna(0).values
scaler = StandardScaler()
Xs = scaler.fit_transform(X)

# Fixed random_state and n_init keep the cluster assignment repeatable.
k = 3
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(Xs)
df['cluster'] = clusters

# Persona labels, listed from the highest-churn cluster to the lowest.
persona_names = [
    "Price-Sensitive Paula",
    "Tech-Indifferent Drifters",
    "At-Risk Premium Users",
]
if len(persona_names) != k:
    # Fail loudly instead of leaving clusters unlabelled (NaN persona) or
    # hitting an IndexError when k is changed without updating the names.
    raise ValueError(
        f"k={k} clusters requested but {len(persona_names)} persona names defined"
    )

# Map by churn rate order (more churn -> price sensitive etc.).
# NOTE: the labels are derived from churn_flag, so 'persona' carries target
# information and should not be fed to a churn model as a feature.
cluster_summary = df.groupby('cluster').agg(churn_rate=('churn_flag','mean')).reset_index()
order = cluster_summary.sort_values('churn_rate', ascending=False)['cluster'].tolist()
# zip pairs each observed cluster with its persona and tolerates a run in
# which fewer than k clusters actually appear in the labels.
mapping = dict(zip(order, persona_names))
df['persona'] = df['cluster'].map(mapping)
df[['cluster','persona']].value_counts()


cluster  persona                  
1        Tech-Indifferent Drifters    1271
0        At-Risk Premium Users        1240
2        Price-Sensitive Paula         489
Name: count, dtype: int64

## Encoding: choose features and do get_dummies (simple, reproducible)

In [7]:
# Columns offered to the model, in the order they should appear in the output.
feature_cols = [
    'tenure',
    'monthly_charges_cap',
    'app_usage_min_per_week',
    'complaint_count_6m',
    'engagement_score',
    'contract',
    'internet_service',
    'payment_method',
    'tenure_bucket',
    'monthly_charges_bin',
]
df_model = df.loc[:, feature_cols + ['churn_flag']].copy()

# Every object/category column is one-hot encoded; the first level of each is
# dropped (drop_first) to keep the matrix smaller. The raw 'churn' label is
# excluded defensively should it ever be added to the selection above.
cat_cols = [
    col
    for col in df_model.select_dtypes(include=['object', 'category']).columns
    if col != 'churn'
]
df_encoded = pd.get_dummies(df_model, columns=cat_cols, drop_first=True)

print("Encoded shape:", df_encoded.shape)

Encoded shape: (3000, 19)


## Save engineered dataset and metadata

In [8]:
# Write the encoded feature table (features plus the churn_flag target).
engineered_path = os.path.join(PROCESSED_DIR, "02_feature_engineered.csv")
df_encoded.to_csv(engineered_path, index=False)
print("✅ Saved engineered features to", engineered_path)

# Record the exact feature column order next to the data so that a later
# modelling step can select the same inputs; the target is stored separately.
features_list = [name for name in df_encoded.columns if name != 'churn_flag']
meta = dict(features=features_list, target="churn_flag")
meta_path = os.path.join(PROCESSED_DIR, "features_meta.json")
with open(meta_path, "w") as meta_file:
    meta_file.write(json.dumps(meta))
print("✅ Saved features_meta.json")

✅ Saved engineered features to ../data/processed/02_feature_engineered.csv
✅ Saved features_meta.json


## Save a sample input for the app/demo


In [9]:
# Draw up to 200 rows (fewer if df is smaller) with a fixed seed and store them
# for the app/demo. The sample is taken from df, i.e. before one-hot encoding.
n_sample = min(200, len(df))
sample = df.sample(n=n_sample, random_state=42)
sample_path = os.path.join(PROCESSED_DIR, "model_input_sample.csv")
sample.to_csv(sample_path, index=False)
print("✅ Saved model_input_sample.csv")

✅ Saved model_input_sample.csv


## Quick peek

In [10]:
# Show the first five encoded rows as a final sanity check of the saved table.
df_encoded.head(n=5)

Unnamed: 0,tenure,monthly_charges_cap,app_usage_min_per_week,complaint_count_6m,engagement_score,churn_flag,contract_One year,contract_Two year,internet_service_Fiber optic,internet_service_No,payment_method_Credit card (automatic),payment_method_Electronic check,payment_method_Mailed check,tenure_bucket_7-24,tenure_bucket_25-48,tenure_bucket_49+,monthly_charges_bin_Med-Low,monthly_charges_bin_Med-High,monthly_charges_bin_High
0,52,114.0,492,0,0.551,1,True,False,True,False,True,False,False,False,False,True,False,True,False
1,46,137.98,497,0,0.733,0,False,False,False,False,False,False,True,False,True,False,False,False,True
2,53,51.29,358,2,0.546,0,False,False,False,False,False,False,False,False,False,True,True,False,False
3,18,112.47,486,0,0.686,0,True,False,True,False,False,False,True,True,False,False,False,True,False
4,42,63.84,486,4,0.451,1,False,False,True,False,False,False,True,False,True,False,True,False,False
