In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

# Seed NumPy's global random state once so the synthetic data below is
# regenerated identically on every run of this cell.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Generate synthetic customer churn dataset with realistic messiness
def create_messy_churn_dataset(n_customers=5000, seed=None):
    """Create a realistic messy customer churn dataset for cleaning practice.

    Every synthetic customer is generated with deliberate data-quality
    problems: missing IDs, ages, tenures, totals and churn labels;
    inconsistently formatted names and categorical values; implausible numeric
    outliers; and total charges that do not always match
    ``tenure_months * monthly_charges``. After the base rows are built,
    ``int(n_customers * 0.02)`` exact duplicate rows and
    ``int(n_customers * 0.01)`` near-duplicate rows (``monthly_charges``
    shifted by a uniform value in [-1, 1)) are appended and all rows are
    shuffled.

    Parameters
    ----------
    n_customers : int, default 5000
        Number of base customers to generate, before duplicates are added.
        ``0`` returns an empty frame that still carries every column.
    seed : int or None, default None
        With ``None`` all draws come from NumPy's global random state, so the
        result is governed by an earlier ``np.random.seed(...)`` call. With an
        integer, a private ``np.random.RandomState(seed)`` is used for every
        draw (including the final shuffle), which makes the call reproducible
        on its own and leaves the global state untouched.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame with a fresh ``RangeIndex`` and the columns
        ``customer_id, first_name, last_name, age, tenure_months,
        monthly_charges, total_charges, contract, phone_service,
        internet_service, churn``.

    Raises
    ------
    ValueError
        If ``n_customers`` is negative.
    """
    if n_customers < 0:
        raise ValueError(f"n_customers must be non-negative, got {n_customers}")

    columns = [
        'customer_id', 'first_name', 'last_name', 'age', 'tenure_months',
        'monthly_charges', 'total_charges', 'contract', 'phone_service',
        'internet_service', 'churn',
    ]
    if n_customers == 0:
        # A frame built from zero records has no columns, which would make the
        # near-duplicate step below fail; return the empty schema instead.
        return pd.DataFrame(columns=columns)

    # The np.random module and a RandomState expose the same sampling methods,
    # so either one can serve as the source of randomness.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Value pools are loop-invariant, so build them once.
    # Names with inconsistent formatting
    first_names = ['John', 'Jane', 'Mike', 'Sarah', 'David', 'Lisa', ' Emma ', 'ALEX', 'maria', '']
    last_names = ['Smith', 'Johnson', 'Brown', 'Davis', 'Wilson', '', 'MILLER', 'garcia', ' Lee ']
    # Unrealistic numeric outliers
    age_outliers = [200, -5, 999]
    charge_outliers = [5000, -100, 0]
    # Categorical values with inconsistent spellings
    contract_types = ['Month-to-month', 'One year', 'Two year', 'monthly', 'ANNUAL', '1-year', '2-YEAR', '']
    phone_values = ['Yes', 'No', 'YES', 'no', '1', '0', 'True', 'False', '']
    internet_types = ['DSL', 'Fiber optic', 'No', 'dsl', 'FIBER', 'None', 'Cable', '']
    churn_values = ['Yes', 'No', 'YES', 'no', '1', '0', 'True', 'False']

    # NOTE: the order of the random draws below is significant. Reordering
    # them changes the generated data for a given seed.
    customers = []
    for i in range(n_customers):
        customer_id = f"CUST_{i:05d}" if rng.random() > 0.02 else None  # 2% missing IDs

        first_name = rng.choice(first_names)
        last_name = rng.choice(last_names)

        # Ages with outliers and missing values
        if rng.random() > 0.05:  # 5% missing ages
            age = rng.normal(40, 15)
            if rng.random() > 0.98:
                age = rng.choice(age_outliers)
        else:
            age = None

        # Monthly charges with rare extreme outliers
        monthly_charge = rng.normal(75, 25)
        if rng.random() > 0.995:
            monthly_charge = rng.choice(charge_outliers)

        # Tenure is missing for 3% of customers
        tenure_months = rng.randint(1, 73) if rng.random() > 0.03 else None

        # Total charges: usually tenure * monthly with +/-20% noise. Otherwise
        # an exact product (assuming 12 months when tenure is missing), or
        # missing altogether.
        if tenure_months is not None and rng.random() > 0.1:
            total_charges = monthly_charge * tenure_months * rng.uniform(0.8, 1.2)
        elif rng.random() > 0.08:
            total_charges = monthly_charge * (12 if tenure_months is None else tenure_months)
        else:
            total_charges = None

        contract = rng.choice(contract_types)
        phone_service = rng.choice(phone_values)
        internet_service = rng.choice(internet_types)

        # Churn (target variable) - 2% missing values
        churn = rng.choice(churn_values) if rng.random() > 0.02 else None

        customers.append({
            'customer_id': customer_id,
            'first_name': first_name,
            'last_name': last_name,
            'age': age,
            'tenure_months': tenure_months,
            'monthly_charges': monthly_charge,
            'total_charges': total_charges,
            'contract': contract,
            'phone_service': phone_service,
            'internet_service': internet_service,
            'churn': churn
        })

    df = pd.DataFrame(customers)

    # Add some duplicate rows (exact duplicates)
    duplicate_indices = rng.choice(df.index, size=int(n_customers * 0.02), replace=False)
    duplicates = df.loc[duplicate_indices].copy()
    df = pd.concat([df, duplicates], ignore_index=True)

    # Add some near-duplicates (copied rows whose monthly charge is nudged)
    near_duplicate_indices = rng.choice(df.index, size=int(n_customers * 0.01), replace=False)
    near_duplicates = df.loc[near_duplicate_indices].copy()
    near_duplicates['monthly_charges'] = (
        near_duplicates['monthly_charges'] + rng.uniform(-1, 1, len(near_duplicates))
    )
    df = pd.concat([df, near_duplicates], ignore_index=True)

    # Shuffle the data. random_state=None makes pandas draw from the global
    # NumPy state, matching the unseeded behaviour.
    shuffle_state = None if seed is None else rng
    return df.sample(frac=1, random_state=shuffle_state).reset_index(drop=True)

# Build the messy dataset that the rest of the notebook works on
print("üîÑ Generating messy customer churn dataset...")
df_raw = create_messy_churn_dataset()

print(f"‚úÖ Created dataset with {len(df_raw)} rows and {len(df_raw.columns)} columns")
print("\nüìä First look at the data:")
display(df_raw.head(10))

print("\nüìà Basic info about the dataset:")
df_raw.info()

# Missing values: absolute count and share of rows, shown only for columns
# that actually have gaps.
print("\nüîç Let's examine data quality issues...")
print("Missing values per column:")
missing_summary = df_raw.isna().sum()
missing_pct = missing_summary.div(len(df_raw)).mul(100).round(2)
missing_df = pd.concat(
    [missing_summary.rename('Missing_Count'), missing_pct.rename('Missing_Percentage')],
    axis=1,
)
display(missing_df.loc[missing_df['Missing_Count'].gt(0)])

# Frequency table (NaN included) for each categorical column that is present.
print("\nüîç Unique values in categorical columns:")
for col in ['contract', 'phone_service', 'internet_service', 'churn']:
    if col not in df_raw.columns:
        continue
    print(f"\n{col}:")
    display(df_raw[col].value_counts(dropna=False))

print("\nüìä Numeric columns summary:")
numeric_cols = df_raw.select_dtypes(include=[np.number]).columns
display(df_raw.loc[:, numeric_cols].describe())

# Flag values lying more than 1.5 * IQR outside the quartiles. Missing values
# compare as False on both sides, so they are never counted as outliers.
print("\n‚ö†Ô∏è  Potential outliers in numeric data:")
for col in numeric_cols:
    q1 = df_raw[col].quantile(0.25)
    q3 = df_raw[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    outliers = df_raw.loc[df_raw[col].lt(lower_bound) | df_raw[col].gt(upper_bound)]
    if outliers.empty:
        continue
    print(f"\n{col}: {len(outliers)} potential outliers")
    print(f"  Range: [{df_raw[col].min():.2f}, {df_raw[col].max():.2f}]")
    print(f"  Expected range: [{lower_bound:.2f}, {upper_bound:.2f}]")

print("\nüîÑ Checking for duplicates...")
exact_duplicates = df_raw.duplicated().sum()
print(f"Exact duplicates: {exact_duplicates}")

# Customer IDs that occur on more than one row (value_counts skips missing IDs)
customer_id_duplicates = df_raw['customer_id'].value_counts()
near_duplicates = customer_id_duplicates.loc[customer_id_duplicates.gt(1)]
if not near_duplicates.empty:
    print(f"Customers with multiple records: {len(near_duplicates)}")
    print("Sample of duplicate customer IDs:")
    display(near_duplicates.head())

üîÑ Generating messy customer churn dataset...
‚úÖ Created dataset with 5150 rows and 11 columns

üìä First look at the data:


Unnamed: 0,customer_id,first_name,last_name,age,tenure_months,monthly_charges,total_charges,contract,phone_service,internet_service,churn
0,CUST_04069,Jane,Wilson,28.383528,28.0,82.265524,2076.619072,Two year,YES,dsl,1
1,CUST_01842,John,Smith,17.26659,15.0,110.725018,1531.464689,Month-to-month,,FIBER,Yes
2,CUST_01165,ALEX,MILLER,51.243016,51.0,70.51479,3596.254298,1-year,False,No,1
3,CUST_02844,Mike,Brown,27.303178,29.0,55.968944,1378.104977,2-YEAR,0,,True
4,CUST_02690,,MILLER,71.080809,1.0,61.371716,61.232039,2-YEAR,,Cable,True
5,CUST_00096,David,Lee,58.347852,13.0,92.355835,1169.689723,Two year,Yes,,True
6,CUST_03104,John,Lee,23.437586,38.0,85.160785,2767.226351,Month-to-month,YES,,False
7,CUST_03514,,Johnson,68.097003,69.0,90.822188,6266.730977,ANNUAL,True,DSL,YES
8,CUST_00474,John,,59.550053,56.0,77.758034,4741.775628,,0,DSL,Yes
9,CUST_00699,,garcia,44.635599,29.0,76.557915,2444.393781,One year,1,FIBER,1



üìà Basic info about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5150 entries, 0 to 5149
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       5045 non-null   object 
 1   first_name        5150 non-null   object 
 2   last_name         5150 non-null   object 
 3   age               4908 non-null   float64
 4   tenure_months     4994 non-null   float64
 5   monthly_charges   5150 non-null   float64
 6   total_charges     5098 non-null   float64
 7   contract          5150 non-null   object 
 8   phone_service     5150 non-null   object 
 9   internet_service  5150 non-null   object 
 10  churn             5055 non-null   object 
dtypes: float64(4), object(7)
memory usage: 442.7+ KB

üîç Let's examine data quality issues...
Missing values per column:


Unnamed: 0,Missing_Count,Missing_Percentage
customer_id,105,2.04
age,242,4.7
tenure_months,156,3.03
total_charges,52,1.01
churn,95,1.84



üîç Unique values in categorical columns:

contract:


contract
2-YEAR            674
                  674
1-year            664
monthly           650
One year          642
Two year          638
Month-to-month    607
ANNUAL            601
Name: count, dtype: int64


phone_service:


phone_service
True     622
         574
Yes      573
0        571
1        571
False    567
YES      566
no       566
No       540
Name: count, dtype: int64


internet_service:


internet_service
Fiber optic    678
Cable          671
No             661
None           660
FIBER          635
DSL            626
               619
dsl            600
Name: count, dtype: int64


churn:


churn
True     664
YES      647
No       644
1        635
0        635
Yes      617
False    608
no       605
None      95
Name: count, dtype: int64


üìä Numeric columns summary:


Unnamed: 0,age,tenure_months,monthly_charges,total_charges
count,4908.0,4994.0,5150.0,5098.0
mean,46.86064,36.497197,78.053381,2776.72912
std,81.753009,20.780275,121.752956,4930.936735
min,-18.836004,1.0,-100.0,-5431.928991
25%,29.81346,18.0,58.494581,1098.11921
50%,39.708134,36.0,75.303103,2395.977922
75%,49.903292,54.0,92.75018,3940.025488
max,999.0,72.0,5000.0,306462.175383



‚ö†Ô∏è  Potential outliers in numeric data:

age: 137 potential outliers
  Range: [-18.84, 999.00]
  Expected range: [-0.32, 80.04]

monthly_charges: 54 potential outliers
  Range: [-100.00, 5000.00]
  Expected range: [7.11, 144.13]

total_charges: 51 potential outliers
  Range: [-5431.93, 306462.18]
  Expected range: [-3164.74, 8202.88]

üîÑ Checking for duplicates...
Exact duplicates: 100
Customers with multiple records: 146
Sample of duplicate customer IDs:


customer_id
CUST_04703    3
CUST_04920    2
CUST_01734    2
CUST_01228    2
CUST_02150    2
Name: count, dtype: int64

In [14]:
# Smallest value in the age column; a negative result exposes invalid ages
df_raw.loc[:, 'age'].min()

np.float64(-18.836003774275135)

In [35]:
# Tally ages outside the plausible range (0, MAX_PLAUSIBLE_AGE] years.
# Missing ages fail every comparison, so they land in neither bucket.
MAX_PLAUSIBLE_AGE = 120
ages = df_raw['age']

below0 = ages[ages <= 0].count()                    # zero or negative ages
above120 = ages[ages > MAX_PLAUSIBLE_AGE].count()   # implausibly old
beyondBounds = below0 + above120
withinBounds = ages[(ages > 0) & (ages <= MAX_PLAUSIBLE_AGE)].count()

# The out-of-range share would be
# (beyondBounds / (beyondBounds + withinBounds)) * 100; it is not computed here.

# Non-null count per column is the value this cell displays.
df_raw.count()

customer_id         5045
first_name          5150
last_name           5150
age                 4908
tenure_months       4994
monthly_charges     5150
total_charges       5098
contract            5150
phone_service       5150
internet_service    5150
churn               5055
dtype: int64

In [36]:
# Number of rows with a missing age, derived from the data instead of
# hand-copied totals so it stays correct if the dataset changes.
# int(...) keeps the displayed value a plain integer.
int(df_raw['age'].isna().sum())

242