In [2]:
import pandas as pd
import numpy as np
from scipy import stats

def generate_customer_data(n_records=5000, seed=42, n_missing=100, n_outliers=50):
    """Generate a synthetic telecom customer dataset with injected data-quality issues.

    The function seeds NumPy's global legacy RNG with ``seed`` and then draws
    every column from it, so repeated calls with the same arguments return
    identical frames. Note that this reseeds ``np.random`` for the caller too.

    Args:
        n_records: Number of customer rows to generate.
        seed: Seed passed to ``np.random.seed``.
        n_missing: How many rows get ``NaN`` written into ``SupportPlan`` and,
            drawn independently, into ``NetworkType``. Clamped to the range
            ``[0, n_records]`` so small datasets do not fail.
        n_outliers: How many rows get their ``MonthlyFee`` multiplied by 5.
            Clamped to the range ``[0, n_records]``.

    Returns:
        A ``pandas.DataFrame`` with one row per customer and the columns
        ``ClientNumber``, ``YearsOld``, ``Sex``, ``SubscriptionPlan``,
        ``MonthlyFee``, ``MonthsActive``, ``SupportPlan``, ``NetworkType``,
        ``DigitalInvoice``, ``BillingMethod``, ``TotalBilled``,
        ``AccountClosed``, ``AverageBillAmount`` and ``ClientValue``.
    """
    np.random.seed(seed)

    def create_feature(distribution, **kwargs):
        # Resolve the sampler by name on the global RNG module and always draw
        # exactly one value per record.
        return getattr(np.random, distribution)(size=n_records, **kwargs)

    # NOTE: the order of the entries below is the order in which random draws
    # are consumed; reordering them changes the generated data for a given seed.
    data = {
        'ClientNumber': range(1, n_records + 1),
        # Normal(40, 15) ages, clipped to [18, 80] and truncated to integers.
        'YearsOld': np.clip(create_feature('normal', loc=40, scale=15), 18, 80).astype(int),
        'Sex': create_feature('choice', a=['M', 'F']),
        'SubscriptionPlan': create_feature('choice', a=['Monthly', 'Annual', 'Biennial'], p=[0.6, 0.3, 0.1]),
        'MonthlyFee': np.round(create_feature('lognormal', mean=4, sigma=0.3), 2),
        # randint's upper bound is exclusive, so tenure ranges over 1..72.
        'MonthsActive': create_feature('randint', low=1, high=73),
        # 'None' here is a literal category string, distinct from the NaN
        # values injected further down.
        # NOTE(review): pandas.read_csv may parse the string 'None' as missing
        # by default when the saved CSV is loaded back -- confirm downstream.
        'SupportPlan': create_feature('choice', a=['Included', 'None']),
        'NetworkType': create_feature('choice', a=['Copper', 'Optical', 'None'], p=[0.3, 0.5, 0.2]),
        'DigitalInvoice': create_feature('choice', a=['Yes', 'No']),
        'BillingMethod': create_feature('choice', a=['E-payment', 'Mail', 'AutoBank', 'AutoCard'])
    }

    df = pd.DataFrame(data)

    df['TotalBilled'] = (df['MonthlyFee'] * df['MonthsActive']).round(2)

    # Per-customer closure probability: a 0.1 baseline plus additive bumps for
    # each risk factor (boolean masks act as 0/1 multipliers). Maximum is 0.4.
    churn_likelihood = (
        0.1 +
        0.1 * (df['SubscriptionPlan'] == 'Monthly') +
        0.05 * (df['NetworkType'] == 'Optical') +
        0.05 * (df['SupportPlan'] == 'None') +
        0.05 * (df['MonthsActive'] < 12) +
        0.05 * (df['MonthlyFee'] > df['MonthlyFee'].quantile(0.75))
    )
    # One Bernoulli trial per customer using its own probability.
    df['AccountClosed'] = np.random.binomial(n=1, p=churn_likelihood).astype(bool)

    df['AverageBillAmount'] = df['TotalBilled'] / df['MonthsActive']
    # Closed accounts contribute zero value; open accounts keep TotalBilled.
    df['ClientValue'] = df['TotalBilled'] * (~df['AccountClosed'])

    # Introduce data quality issues.
    # Sampling without replacement cannot draw more rows than exist, so clamp
    # the requested counts; otherwise small n_records raises ValueError.
    missing_count = max(0, min(n_missing, n_records))
    outlier_count = max(0, min(n_outliers, n_records))

    df.loc[np.random.choice(df.index, missing_count, replace=False), 'SupportPlan'] = np.nan
    df.loc[np.random.choice(df.index, missing_count, replace=False), 'NetworkType'] = np.nan
    # The fee inflation happens after TotalBilled / AverageBillAmount were
    # derived, so those columns do not reflect the inflated fee.
    df.loc[np.random.choice(df.index, outlier_count, replace=False), 'MonthlyFee'] *= 5

    return df

# Build the dataset with the default size and seed, then write it to CSV
# without the DataFrame index column.
customer_data = generate_customer_data()
customer_data.to_csv('telco_customer_data.csv', index=False)

# Summary report: first rows, frame shape, and the share of closed accounts
# (the mean of the boolean AccountClosed column).
closure_rate = customer_data['AccountClosed'].mean()

print(customer_data.head())
print(f"\nDataset dimensions: {customer_data.shape}")
print(f"\nAccount closure rate: {closure_rate:.2%}")

   ClientNumber  YearsOld Sex SubscriptionPlan  MonthlyFee  MonthsActive  \
0             1        47   F          Monthly       90.71            47   
1             2        37   M          Monthly       50.51            69   
2             3        49   F         Biennial       55.18            24   
3             4        62   F          Monthly       39.41            71   
4             5        36   F          Monthly       43.32            34   

  SupportPlan NetworkType DigitalInvoice BillingMethod  TotalBilled  \
0        None     Optical            Yes     E-payment      4263.37   
1        None        None             No     E-payment      3485.19   
2        None      Copper            Yes      AutoBank      1324.32   
3    Included        None            Yes      AutoCard      2798.11   
4    Included     Optical            Yes     E-payment      1472.88   

   AccountClosed  AverageBillAmount  ClientValue  
0           True              90.71         0.00  
1           Tr