# Telecom Customer Data Generation
## Notebook 01: Synthetic Dataset Creation

This notebook generates synthetic telecom customer data for the revenue optimization project.
The dataset includes:
- Customer demographics
- Usage patterns and billing information
- CRM data (satisfaction, complaints)
- Campaign exposure and digital engagement
- Churn labels

**Privacy Note**: All customer data is synthetically generated with hashed IDs to demonstrate privacy-first analytics aligned with Dentsu's identity solutions strategy.

In [None]:
# Import required libraries
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from data_generation import TelecomDataGenerator

# Set plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*";
# older releases only ship the un-prefixed "seaborn" name. Pick whichever this
# installation provides instead of failing with OSError on older versions.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")

# Display settings: do not truncate the column list when showing DataFrames
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print("Libraries imported successfully!")

## 1. Generate Synthetic Dataset

We'll generate a realistic dataset of 10,000 telecom customers with various attributes that drive churn and revenue patterns.

In [None]:
# Seed the global NumPy RNG so that Restart Kernel -> Run All regenerates the
# same synthetic customers.
# NOTE(review): TelecomDataGenerator's own RNG handling is not visible from this
# notebook. If it seeds internally or accepts a seed argument, prefer that and
# drop this line -- confirm against src/data_generation.py.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Initialize data generator
print("Initializing Telecom Data Generator...")
generator = TelecomDataGenerator(n_customers=10000)

# Generate complete dataset (a mapping of dataset name -> table; see the save cell)
print("Generating synthetic customer data...")
datasets = generator.generate_complete_dataset()

# Extract master dataset.
# `df` is a reference to the same object stored in `datasets`, not a copy, so
# any in-place change to `df` also changes what is later written to disk.
df = datasets['master_dataset']
print(f"\nGenerated dataset with {len(df)} customers and {len(df.columns)} features")

## 2. Dataset Overview

In [None]:
# Quick look at the generated frame: dimensions, column names, first rows
n_rows_by_cols = df.shape
print("Dataset Shape:", n_rows_by_cols)
print("\nColumn Names:", df.columns.tolist(), sep="\n")

# The bare last expression is rendered by the notebook as a rich table
print("\nFirst 5 rows:")
df.head(5)

In [None]:
# Per-column summary: dtype, number of missing cells, and missing share in percent
print("Data Types and Missing Values:")
missing_per_column = df.isna().sum()
missing_share_pct = (missing_per_column / len(df) * 100).round(2)
info_df = pd.DataFrame({
    'Data Type': df.dtypes,
    'Missing Values': missing_per_column,
    'Missing %': missing_share_pct,
})
print(info_df)

## 3. Key Business Metrics Summary

In [None]:
# Headline KPIs for the whole customer base
n_customers = len(df)
arpu = df['arpu']

print("=== KEY BUSINESS METRICS ===")
print(f"Total Customers: {n_customers:,}")
print(f"Overall Churn Rate: {df['churned'].mean():.2%}")
print(f"Average ARPU: ${arpu.mean():.2f}")
print(f"Median ARPU: ${arpu.median():.2f}")
print(f"Total Monthly Revenue: ${arpu.sum():,.2f}")
print(f"Average Customer Satisfaction: {df['satisfaction_score'].mean():.1f}/10")
print(f"Average Tenure: {df['tenure_months'].mean():.1f} months")

# Per-plan breakdown: head count, mean ARPU and churn share, rounded to 2 decimals
print("\n=== PLAN DISTRIBUTION ===")
plan_stats = (
    df.groupby('plan_type')
    .agg({'customer_id': 'count', 'arpu': 'mean', 'churned': 'mean'})
    .round(2)
    .rename(columns={
        'customer_id': 'Customer Count',
        'arpu': 'Avg ARPU',
        'churned': 'Churn Rate',
    })
)
plan_share_pct = plan_stats['Customer Count'] / n_customers * 100
plan_stats['% of Customers'] = plan_share_pct.round(1)
print(plan_stats)

## 4. Data Quality Visualizations

In [None]:
# Six-panel overview figure: three histograms on top, plan/churn/scatter below
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Telecom Customer Dataset - Key Metrics Overview', fontsize=16, y=1.02)

# Top row (panels 1-3): histograms with a dashed red line marking the mean.
# Each spec is (column, bins, bar color, title, x-label, mean-label format).
histogram_specs = [
    ('arpu', 50, 'skyblue', 'ARPU Distribution', 'ARPU ($)', 'Mean: ${:.2f}'),
    ('age', 30, 'lightgreen', 'Age Distribution', 'Age (years)', 'Mean: {:.1f}'),
    ('satisfaction_score', 20, 'orange', 'Customer Satisfaction Distribution',
     'Satisfaction Score (1-10)', 'Mean: {:.1f}'),
]
for ax, (column, n_bins, bar_color, title, x_label, mean_fmt) in zip(axes[0], histogram_specs):
    values = df[column]
    mean_value = values.mean()
    ax.hist(values, bins=n_bins, alpha=0.7, color=bar_color, edgecolor='black')
    ax.axvline(mean_value, color='red', linestyle='--', label=mean_fmt.format(mean_value))
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.legend()

plan_ax, churn_ax, scatter_ax = axes[1]

# Panel 4: number of customers per plan type
plan_counts = df['plan_type'].value_counts()
plan_ax.bar(plan_counts.index, plan_counts.values, color='lightcoral')
plan_ax.set_title('Plan Type Distribution')
plan_ax.set_xlabel('Plan Type')
plan_ax.set_ylabel('Number of Customers')
plan_ax.tick_params(axis='x', rotation=45)

# Panel 5: churn rate per plan type, shown in percent
churn_by_plan = df.groupby('plan_type')['churned'].mean()
churn_ax.bar(churn_by_plan.index, churn_by_plan.values * 100, color='salmon')
churn_ax.set_title('Churn Rate by Plan Type')
churn_ax.set_xlabel('Plan Type')
churn_ax.set_ylabel('Churn Rate (%)')
churn_ax.tick_params(axis='x', rotation=45)

# Panel 6: ARPU against satisfaction, annotated with their correlation coefficient
satisfaction = df['satisfaction_score']
scatter_ax.scatter(satisfaction, df['arpu'], alpha=0.5, color='purple')
scatter_ax.set_title('ARPU vs Customer Satisfaction')
scatter_ax.set_xlabel('Satisfaction Score')
scatter_ax.set_ylabel('ARPU ($)')

corr = satisfaction.corr(df['arpu'])
annotation_box = {'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.5}
scatter_ax.text(
    0.05, 0.95, f'Correlation: {corr:.3f}',
    transform=scatter_ax.transAxes,  # position in axes-fraction coordinates
    fontsize=10,
    bbox=annotation_box,
)

plt.tight_layout()
plt.show()

## 5. Geographic and Demographic Analysis

In [None]:
# Geographic view: customers per city (left) and mean ARPU per city (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

city_counts = df['city'].value_counts()
arpu_by_city = df.groupby('city')['arpu'].mean().sort_values(ascending=False)

# Each panel: (axis, per-city series, bar color, title, y-label)
city_panels = [
    (axes[0], city_counts, 'lightblue', 'Customer Distribution by City', 'Number of Customers'),
    (axes[1], arpu_by_city, 'lightgreen', 'Average ARPU by City', 'Average ARPU ($)'),
]
for ax, per_city, bar_color, title, y_label in city_panels:
    ax.bar(per_city.index, per_city.values, color=bar_color)
    ax.set_title(title)
    ax.set_xlabel('City')
    ax.set_ylabel(y_label)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Per-gender summary: head count plus mean ARPU, churn and satisfaction
print("\n=== DEMOGRAPHIC ANALYSIS ===")
print("Gender Distribution:")
gender_stats = (
    df.groupby('gender')
    .agg({
        'customer_id': 'count',
        'arpu': 'mean',
        'churned': 'mean',
        'satisfaction_score': 'mean',
    })
    .round(2)
    .rename(columns={
        'customer_id': 'Count',
        'arpu': 'Avg ARPU',
        'churned': 'Churn Rate',
        'satisfaction_score': 'Avg Satisfaction',
    })
)
print(gender_stats)

## 6. Usage Patterns Analysis

In [None]:
# Usage patterns visualization: data, voice, OTT-vs-age, and engagement by age band
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Customer Usage Patterns Analysis', fontsize=16)

# Data usage distribution
axes[0, 0].hist(df['monthly_data_gb'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
axes[0, 0].set_title('Monthly Data Usage Distribution')
axes[0, 0].set_xlabel('Data Usage (GB)')
axes[0, 0].set_ylabel('Frequency')

# Voice usage distribution
axes[0, 1].hist(df['monthly_minutes'], bins=50, alpha=0.7, color='lightgreen', edgecolor='black')
axes[0, 1].set_title('Monthly Voice Usage Distribution')
axes[0, 1].set_xlabel('Voice Usage (Minutes)')
axes[0, 1].set_ylabel('Frequency')

# OTT usage vs Age
axes[1, 0].scatter(df['age'], df['ott_usage_hours'], alpha=0.5, color='orange')
axes[1, 0].set_title('OTT Usage vs Age')
axes[1, 0].set_xlabel('Age (years)')
axes[1, 0].set_ylabel('OTT Usage (hours/month)')

# Digital engagement by age group.
# The age bands are kept in a standalone Series instead of being written into
# `df`: `df` is the very object stored as datasets['master_dataset'], so an added
# column would end up in the saved raw CSV and in the reported feature count.
age_group = pd.cut(
    df['age'],
    bins=[0, 30, 45, 60, 100],  # right-closed bins: (0, 30], (30, 45], (45, 60], (60, 100]
    labels=['18-30', '31-45', '46-60', '60+'],
).rename('age_group')
# observed=False keeps all four bands in the result even if one is empty, and
# makes the categorical-grouping behaviour explicit on recent pandas versions.
engagement_by_age = (
    df.groupby(age_group, observed=False)[['monthly_web_sessions', 'monthly_app_sessions']]
    .mean()
)

# Side-by-side bars: web sessions shifted left, app sessions shifted right of each tick
x = range(len(engagement_by_age.index))
width = 0.35
axes[1, 1].bar([i - width/2 for i in x], engagement_by_age['monthly_web_sessions'],
               width, label='Web Sessions', color='lightblue')
axes[1, 1].bar([i + width/2 for i in x], engagement_by_age['monthly_app_sessions'],
               width, label='App Sessions', color='lightcoral')
axes[1, 1].set_title('Digital Engagement by Age Group')
axes[1, 1].set_xlabel('Age Group')
axes[1, 1].set_ylabel('Monthly Sessions')
axes[1, 1].set_xticks(list(x))
axes[1, 1].set_xticklabels(engagement_by_age.index)
axes[1, 1].legend()

plt.tight_layout()
plt.show()

## 7. Save Generated Datasets

In [None]:
# Save all datasets to CSV files
data_dir = "../data/raw"
os.makedirs(data_dir, exist_ok=True)

print("Saving datasets to CSV files...")

# Remember which files this run wrote so the summary below reports only those,
# not CSVs that were already in the directory from earlier runs.
saved_paths = []
for name, dataset in datasets.items():
    filepath = os.path.join(data_dir, f"{name}.csv")
    dataset.to_csv(filepath, index=False)
    saved_paths.append(filepath)
    print(f"✓ Saved {name}: {dataset.shape} -> {filepath}")

print(f"\nAll datasets saved to {data_dir}/")
print("Files created:")
for filepath in saved_paths:
    size_mb = os.path.getsize(filepath) / (1024 * 1024)  # bytes -> MB
    print(f"  {os.path.basename(filepath)} ({size_mb:.2f} MB)")

## 8. Dataset Quality Report

In [None]:
# Generate comprehensive dataset quality report (plain-text summary of `df`)
print("=" * 60)
print("TELECOM DATASET QUALITY REPORT")
print("=" * 60)

print("\n📊 DATASET OVERVIEW")
print(f"Total Records: {len(df):,}")
print(f"Total Features: {len(df.columns)}")
print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
# Completeness = share of non-null cells. The note in parentheses is derived
# from the actual missing-cell count instead of always claiming there are none.
completeness_pct = (df.count().sum() / (len(df) * len(df.columns))) * 100
missing_cells = int(df.isnull().sum().sum())
missing_note = "no missing values" if missing_cells == 0 else f"{missing_cells:,} missing values"
print(f"Data Quality Score: {completeness_pct:.1f}% ({missing_note})")

print("\n💼 BUSINESS METRICS")
print(f"Average ARPU: ${df['arpu'].mean():.2f} (${df['arpu'].std():.2f} std)")
print(f"Churn Rate: {df['churned'].mean():.2%}")
print(f"Average Satisfaction: {df['satisfaction_score'].mean():.1f}/10")
print(f"Customer Lifetime (avg): {df['tenure_months'].mean():.1f} months")

print("\n📱 USAGE PATTERNS")
print(f"Avg Data Usage: {df['monthly_data_gb'].mean():.1f} GB/month")
print(f"Avg Voice Usage: {df['monthly_minutes'].mean():.0f} minutes/month")
print(f"Avg OTT Usage: {df['ott_usage_hours'].mean():.1f} hours/month")
print(f"Digital Engagement: {(df['monthly_web_sessions'] + df['monthly_app_sessions']).mean():.1f} sessions/month")

print("\n🎯 MARKETING INSIGHTS")
print(f"Avg Campaign Exposure: {df['campaigns_exposed'].mean():.1f} campaigns/customer")
# "Overall" rate = all conversions over all clicks. Averaging per-customer ratios
# (with zero-click customers forced to 0%) would weight every customer equally
# regardless of click volume and drag the figure down.
total_clicks = df['total_clicks'].sum()
overall_conversion_rate = df['total_conversions'].sum() / total_clicks if total_clicks > 0 else 0.0
print(f"Overall Conversion Rate: {overall_conversion_rate:.2%}")
print(f"Customer Support Load: {df['support_tickets_12m'].mean():.1f} tickets/customer/year")

print("\n💰 REVENUE INSIGHTS")
total_revenue = df['arpu'].sum()
churned_revenue_loss = df.loc[df['churned'] == 1, 'arpu'].sum()
print(f"Total Monthly Revenue: ${total_revenue:,.2f}")
print(f"Monthly Revenue at Risk (from churned): ${churned_revenue_loss:,.2f} ({churned_revenue_loss/total_revenue:.1%})")
print(f"High-Value Customers (ARPU > $75): {(df['arpu'] > 75).sum():,} ({(df['arpu'] > 75).mean():.1%})")

# NOTE(review): the statements below are printed unconditionally; nothing in
# this notebook verifies them against the generated data.
print("\n🔒 PRIVACY COMPLIANCE")
print("✓ All customer IDs are hashed (privacy-first approach)")
print("✓ Synthetic data generation ensures no real customer information")
print("✓ Data anonymization aligned with Dentsu identity solutions strategy")

print("\n✅ DATASET READY FOR ANALYSIS")
print("The synthetic telecom dataset has been successfully generated and saved.")
print("Next steps: Data preprocessing and feature engineering (Notebook 02)")
print("=" * 60)