In [1]:
import pandas as pd
import numpy as np
import random

# Define number of customers
num_customers = 1000

# Generate synthetic customer data
data = {
    "Customer_ID": range(1, num_customers + 1),
    "Last_Order_Date": pd.date_range(start="2023-01-01", periods=num_customers, freq="D").astype(str),
    "Order_Frequency": np.random.randint(1, 15, num_customers),  # Number of orders per month
    "Spend": np.random.uniform(50, 500, num_customers).round(2),  # Amount spent in dollars
    "Discount_Used": np.random.choice(["Yes", "No"], num_customers),  # Whether a discount was used
    "App_Usage": np.random.choice(["High", "Medium", "Low"], num_customers),  # Engagement level
    "Delivery_Issues": np.random.choice(["Yes", "No"], num_customers, p=[0.2, 0.8]),  # Delivery problems
    "Churn": np.random.choice(["Yes", "No"], num_customers, p=[0.3, 0.7])  # Whether the customer churned
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Save dataset to CSV file
df.to_csv("customer_churn_data.csv", index=False)

# Display first 5 rows
print("Dataset created successfully!")
df.head()


Dataset created successfully!


Unnamed: 0,Customer_ID,Last_Order_Date,Order_Frequency,Spend,Discount_Used,App_Usage,Delivery_Issues,Churn
0,1,2023-01-01,6,102.8,Yes,High,No,No
1,2,2023-01-02,3,422.74,Yes,Medium,No,Yes
2,3,2023-01-03,4,404.5,No,Medium,Yes,Yes
3,4,2023-01-04,3,294.69,Yes,Medium,No,No
4,5,2023-01-05,4,257.29,No,High,No,No


In [2]:
df=pd.read_csv("customer_churn_data.csv")

In [3]:
df.head()

Unnamed: 0,Customer_ID,Last_Order_Date,Order_Frequency,Spend,Discount_Used,App_Usage,Delivery_Issues,Churn
0,1,2023-01-01,6,102.8,Yes,High,No,No
1,2,2023-01-02,3,422.74,Yes,Medium,No,Yes
2,3,2023-01-03,4,404.5,No,Medium,Yes,Yes
3,4,2023-01-04,3,294.69,Yes,Medium,No,No
4,5,2023-01-05,4,257.29,No,High,No,No


In [4]:
print(df.isnull().sum())

Customer_ID        0
Last_Order_Date    0
Order_Frequency    0
Spend              0
Discount_Used      0
App_Usage          0
Delivery_Issues    0
Churn              0
dtype: int64


In [6]:
# Fill missing values ONLY for numerical columns
df.fillna(df.select_dtypes(include=['number']).mean(), inplace=True)


In [8]:
# Fill missing values for numeric columns
df.fillna(df.select_dtypes(include=['number']).mean(), inplace=True)

# Fill missing values for categorical columns (fixed method)
df = df.assign(
    Discount_Used=df["Discount_Used"].fillna(df["Discount_Used"].mode()[0]),
    App_Usage=df["App_Usage"].fillna(df["App_Usage"].mode()[0]),
    Delivery_Issues=df["Delivery_Issues"].fillna(df["Delivery_Issues"].mode()[0])
)

# Convert Last_Order_Date to datetime format
df["Last_Order_Date"] = pd.to_datetime(df["Last_Order_Date"])

# Confirm changes
print(df.isnull().sum())

Customer_ID        0
Last_Order_Date    0
Order_Frequency    0
Spend              0
Discount_Used      0
App_Usage          0
Delivery_Issues    0
Churn              0
dtype: int64


In [9]:
# Calculate churn rate
churn_rate = df["Churn"].value_counts(normalize=True) * 100
print("Churn Rate:\n", churn_rate)


Churn Rate:
 Churn
No     69.6
Yes    30.4
Name: proportion, dtype: float64


In [10]:
# Filter high-risk customers (low order frequency & low spending)
high_risk_customers = df[(df["Order_Frequency"] < 2) & (df["Spend"] < 100)]
print("High-Risk Customers:\n", high_risk_customers.head())


High-Risk Customers:
      Customer_ID Last_Order_Date  Order_Frequency  Spend Discount_Used  \
453          454      2024-03-29                1  59.93           Yes   
506          507      2024-05-21                1  63.89            No   
784          785      2025-02-23                1  59.64            No   
958          959      2025-08-16                1  94.77            No   
993          994      2025-09-20                1  50.62            No   

    App_Usage Delivery_Issues Churn  
453       Low             Yes    No  
506      High              No    No  
784      High              No    No  
958      High              No   Yes  
993    Medium             Yes   Yes  


In [11]:
avg_spend = df.groupby("Churn")["Spend"].mean()
print("Average Spend by Churn Status:\n", avg_spend)


Average Spend by Churn Status:
 Churn
No     274.781480
Yes    282.591546
Name: Spend, dtype: float64


In [12]:
order_analysis = df.groupby("Churn")["Order_Frequency"].mean()
print("Average Order Frequency by Churn Status:\n", order_analysis)


Average Order Frequency by Churn Status:
 Churn
No     7.466954
Yes    7.611842
Name: Order_Frequency, dtype: float64


In [13]:
discount_analysis = df.groupby("Churn")["Discount_Used"].value_counts(normalize=True) * 100
print("Discount Usage by Churn Status:\n", discount_analysis)


Discount Usage by Churn Status:
 Churn  Discount_Used
No     No               53.017241
       Yes              46.982759
Yes    No               51.973684
       Yes              48.026316
Name: proportion, dtype: float64


In [14]:
delivery_issues_analysis = df.groupby("Churn")["Delivery_Issues"].value_counts(normalize=True) * 100
print("Delivery Issues by Churn Status:\n", delivery_issues_analysis)


Delivery Issues by Churn Status:
 Churn  Delivery_Issues
No     No                 79.166667
       Yes                20.833333
Yes    No                 79.934211
       Yes                20.065789
Name: proportion, dtype: float64
