# Data Preprocessing and Feature Engineering
## Notebook 02: Data Cleaning and Feature Creation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
# Install a blanket "ignore" filter so warnings raised by any library are
# not shown in cell output for the rest of the session.
warnings.simplefilter('ignore')

print("Libraries imported successfully!")

In [None]:
# Read the raw master dataset into `df`, report its (rows, columns) shape,
# and leave a preview of the first rows as the cell's output.
raw_dataset_path = '../data/raw/master_dataset.csv'
df = pd.read_csv(raw_dataset_path)
print(f"Dataset loaded: {df.shape}")
df.head()

In [None]:
# Create RFM features
# Recency: bucket `tenure_months` into a 5..1 score. Conditions are checked
# in order and the first match wins; anything above 36 (or missing, since
# NaN comparisons are False) falls through to the default of 1.
tenure = df['tenure_months']
df['recency_score'] = np.select(
    [tenure <= 6, tenure <= 12, tenure <= 24, tenure <= 36],
    [5, 4, 3, 2],
    default=1,
)

# Frequency: quintile (1-5) of web + app sessions plus 12-month support tickets.
quintile_labels = [1, 2, 3, 4, 5]
total_sessions = df['monthly_web_sessions'] + df['monthly_app_sessions']
interaction_volume = total_sessions + df['support_tickets_12m']
frequency_bins = pd.qcut(interaction_volume, q=5, labels=quintile_labels)
df['frequency_score'] = frequency_bins.astype(int)

# Monetary: quintile (1-5) of the `arpu` column.
monetary_bins = pd.qcut(df['arpu'], q=5, labels=quintile_labels)
df['monetary_score'] = monetary_bins.astype(int)

# Combined score is the plain sum of the three components (range 3-15).
df['rfm_score'] = df['recency_score'] + df['frequency_score'] + df['monetary_score']

In [None]:
# Usage pattern features
# A tiny epsilon is added to each allowance so a zero allowance does not
# cause a division by zero.
allowance_eps = 1e-6
data_used = df['monthly_data_gb']
data_allowance = df['data_allowance_gb']

# Share of the data allowance that was consumed.
df['data_usage_ratio'] = data_used / (data_allowance + allowance_eps)

# Mean of the data-usage ratio and the minutes-usage ratio.
minutes_usage_ratio = df['monthly_minutes'] / (df['minutes_allowance'] + allowance_eps)
df['usage_efficiency'] = (df['data_usage_ratio'] + minutes_usage_ratio) / 2

# 1 when data usage strictly exceeds the allowance, else 0.
df['data_overage'] = (data_used > data_allowance).astype(int)

# Risk features
# Numeric weight for each satisfaction-risk label.
risk_mapping = {'High': 3, 'Medium': 2, 'Low': 1}

# Label by satisfaction score: <= 5 is 'High', <= 7 is 'Medium', otherwise 'Low'.
# NOTE(review): a missing score fails both comparisons and is labelled 'Low'.
satisfaction = df['satisfaction_score']
df['satisfaction_risk'] = np.select(
    [satisfaction <= 5, satisfaction <= 7],
    ['High', 'Medium'],
    default='Low',
)
df['composite_risk_score'] = df['satisfaction_risk'].map(risk_mapping)

In [None]:
# Customer segmentation
def assign_value_segment(row):
    """Return the value-segment label for one customer record.

    Tiers are checked from highest to lowest and the first tier whose
    minimums are both met wins:

    * ``arpu >= 80`` and ``satisfaction_score >= 8`` -> ``'Champions'``
    * ``arpu >= 60`` and ``satisfaction_score >= 7`` -> ``'Potential Loyalists'``
    * ``arpu >= 40`` and ``satisfaction_score >= 6`` -> ``'New Customers'``

    Anything else (including values that fail every comparison, such as
    NaN) is labelled ``'Need Attention'``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing ``'arpu'``
            and ``'satisfaction_score'``.

    Returns:
        The segment label as a string.
    """
    # (minimum arpu, minimum satisfaction score, label), best tier first.
    tiers = (
        (80, 8, 'Champions'),
        (60, 7, 'Potential Loyalists'),
        (40, 6, 'New Customers'),
    )
    for min_arpu, min_satisfaction, label in tiers:
        if row['arpu'] >= min_arpu and row['satisfaction_score'] >= min_satisfaction:
            return label
    return 'Need Attention'

# Label every row with its value segment, then print how many customers
# fall into each segment.
segment_labels = df.apply(assign_value_segment, axis=1)
df['customer_segment'] = segment_labels
print(df['customer_segment'].value_counts())

In [None]:
# Prepare for modeling
# Work on a copy so the feature-engineered `df` stays untouched.
df_model = df.copy()

# One-hot encode the categorical features that are actually present.
# - drop_first=True drops one level per feature, so each feature with k
#   levels yields k-1 indicator columns.
# - dtype=int forces 0/1 integer indicators. pandas >= 2.0 defaults to bool,
#   and bool columns are skipped by select_dtypes(include=[np.number]), so
#   the encoded features would silently drop out of numeric-only analysis.
# Encoded columns are appended after the remaining columns, in list order.
categorical_cols = ['plan_type', 'gender', 'city', 'customer_segment', 'satisfaction_risk']
present_categoricals = [col for col in categorical_cols if col in df_model.columns]
if present_categoricals:
    df_model = pd.get_dummies(
        df_model,
        columns=present_categoricals,
        drop_first=True,
        dtype=int,
    )

# Remove columns that should not be model inputs; errors='ignore' tolerates
# any that are absent from this dataset.
cols_to_remove = ['customer_id', 'join_date', 'churn_date', 'churn_probability']
df_model = df_model.drop(columns=cols_to_remove, errors='ignore')

print(f"Final dataset shape: {df_model.shape}")

In [None]:
# Save processed data
# Write the modeling frame to the processed-data folder without the index.
processed_dataset_path = '../data/processed/processed_dataset.csv'
df_model.to_csv(processed_dataset_path, index=False)
print("Processed dataset saved!")

# Feature importance analysis
# Rank numeric features by the absolute value of their correlation with the
# `churned` column. The target itself is excluded from the ranking: it
# correlates perfectly with itself (1.0) and would otherwise always occupy
# the first of the ten slots printed below.
numeric_features = df_model.select_dtypes(include=[np.number]).columns.tolist()
candidate_features = [col for col in numeric_features if col != 'churned']
churn_correlations = (
    df_model[candidate_features]
    .corrwith(df_model['churned'])
    .abs()
    .sort_values(ascending=False)
)
print("\nTop 10 features correlated with churn:")
print(churn_correlations.head(10))