# **Project Title: "Customer Churn Prediction for Telecom"**

## **1. PROBLEM STATEMENT:**
Subscription-based businesses (telecom, streaming, SaaS) face a key challenge: customer churn (users canceling their service). Reducing churn is critical because retaining a customer is cheaper than acquiring a new one.


## **OBJECTIVE:**
Build a predictive model to identify customers at risk of churn and recommend retention strategies.

## **2. IMPORT REQUIRED LIBRARIES**

In [None]:
# Data handling (pandas/numpy) and plotting (matplotlib/seaborn) used by the
# cells below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silences every warning for the rest of the session, including deprecation
# notices from the libraries above; narrow or remove this filter when debugging.
warnings.filterwarnings("ignore")

## **3. DATASET LOAD AND OVERVIEW**

In [None]:
# Read the raw churn export from the working directory and preview the first rows.
churn = pd.read_csv(filepath_or_buffer="Telco_Customer_Churn.csv")
churn.head(n=5)

In [None]:
# Work on an independent copy so the raw frame `churn` stays untouched.
df = churn.copy(deep=True)
df.head(n=5)

In [None]:
# Frame dimensions as (rows, columns).
df.shape

In [None]:
# Number of rows in the frame.
df.shape[0]

In [None]:
# Column names, non-null counts and dtypes in one summary.
df.info()

In [None]:
# dtype of every column.
df.dtypes

In [None]:
# Count of rows that exactly repeat an earlier row.
df.duplicated().sum()

## **BASIC STATISTICS SUMMARY**

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Summary (count, unique, top, freq) for the object-dtype columns.
df.describe(include="object")

## **4. DATA CLEANING**


- Convert TotalCharges to numeric
- Handle missing values
- Create target variable (Churn_flag)


In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN.
df["TotalCharges"] = pd.to_numeric(
    df["TotalCharges"],
    errors="coerce",
)

In [None]:
# Impute missing TotalCharges with the column median.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the in-place form mutates an intermediate object, is
# deprecated in pandas 2.x, and leaves `df` unchanged under copy-on-write.
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())
df.dtypes

In [None]:
# Numeric target: 1 where Churn is "Yes", 0 where it is "No".
df["Churn_flag"] = df["Churn"].map({"No": 0, "Yes": 1})

In [None]:
# Persist the cleaned frame. index=False keeps the positional row index out of
# the file, so reloading it does not produce a spurious "Unnamed: 0" column.
df.to_csv("clean_churn.csv", index=False)

In [None]:
# Snapshot of the cleaned frame (customerID still present) taken before the
# EDA cells below modify `df`; the modelling sections read from df1.
df1=df.copy()

## **5. Exploratory Data Analysis (EDA)**
Using Seaborn plots to analyze categorical and numerical features.


### *Remove unwanted column:*

In [None]:
# Column labels before the identifier column is removed.
df.columns

In [None]:
# The customer identifier is not analysed in the EDA, so remove it from df.
df = df.drop(columns="customerID")

In [None]:
# Column labels after customerID has been dropped.
df.columns

### *Total churn vs non-churn:*

In [None]:
# Absolute number of rows per Churn value.
df['Churn'].value_counts()

### *Total % churn vs non-churn:*

In [None]:
# Share of rows per Churn value, expressed as a percentage.
df['Churn'].value_counts(normalize=True)*100

In [None]:
# Global plot appearance for every chart that follows: matplotlib style sheet
# first, then the seaborn grid style and colour palette on top of it.
plt.style.use("fivethirtyeight")
sns.set_style("whitegrid")
sns.set_palette("muted")

In [None]:
# Pie chart of the churn split.
# plt.figure (lowercase) creates the figure and makes it current, so the
# requested size applies; plt.Figure only builds a detached Figure object and
# the pie was drawn on a default-sized figure instead.
plt.figure(figsize=(10, 5))
df["Churn"].value_counts().plot.pie(autopct="%1.1f%%")
plt.title("Churn %")
plt.ylabel("")
plt.show()

In [None]:
# Column labels split by dtype; these two lists drive every EDA loop below.
# NOTE(review): num_cols is selected purely by dtype, so integer flag columns
# (SeniorCitizen, and Churn_flag if it mapped cleanly to ints) presumably end
# up in it alongside the continuous measures - confirm before reusing the list.
cat_cols=df.select_dtypes(include="object").columns
num_cols=df.select_dtypes(include=["int64","float64"]).columns

## **UNIVARIATE ANALYSIS**

In [None]:
# One count plot per categorical column.
for col in cat_cols:
    # plt.figure registers a new current figure so figsize takes effect;
    # plt.Figure built a detached object and the size was ignored.
    plt.figure(figsize=(10, 5))
    sns.countplot(data=df, x=col)
    plt.title(f"Distribution of {col}")
    plt.xlabel(f"{col}")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# One 30-bin histogram with a KDE overlay per numeric column.
for col in num_cols:
    # plt.figure registers a new current figure so figsize takes effect;
    # plt.Figure built a detached object and the size was ignored.
    plt.figure(figsize=(10, 5))
    sns.histplot(data=df, x=col, bins=30, kde=True)
    plt.title(f"Distribution of {col}")
    plt.xlabel(f"{col}")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# One vertical box plot per numeric column to expose outliers.
for col in num_cols:
    # plt.figure registers a new current figure so figsize takes effect;
    # plt.Figure built a detached object and the size was ignored.
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=df, y=col)
    plt.title(f"Outliers in {col}")
    # The box is drawn along the y axis, so that axis carries the column's
    # values (not a count) and the x axis has no variable to label.
    plt.xlabel("")
    plt.ylabel(f"{col}")
    plt.show()

## **BIVARIATE ANALYSIS BASED ON TARGET="CHURN"**

In [None]:
# Count plot of each categorical column, bars split by churn outcome.
for col in cat_cols:
    # plt.figure registers a new current figure so figsize takes effect;
    # plt.Figure built a detached object and the size was ignored.
    plt.figure(figsize=(10, 5))
    sns.countplot(data=df, x=col, hue="Churn")
    plt.title(f"Churn vs {col}")
    plt.xlabel(f"{col}")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Histogram of each numeric column, coloured by churn outcome.
for col in num_cols:
    # plt.figure registers a new current figure so figsize takes effect;
    # plt.Figure built a detached object and the size was ignored.
    plt.figure(figsize=(10, 5))
    sns.histplot(data=df, x=col, hue="Churn", kde=True, bins=30)
    plt.title(f"Churn vs {col}")
    plt.xlabel(f"{col}")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Box plot of each numeric column, one box per churn outcome.
for col in num_cols:
    # plt.figure registers a new current figure so figsize takes effect;
    # plt.Figure built a detached object and the size was ignored.
    plt.figure(figsize=(10, 5))
    # Churn goes on the x axis rather than only in `hue`: older seaborn
    # releases reject `hue` without both x and y, and this way each box has
    # its own labelled tick.
    sns.boxplot(data=df, x="Churn", y=col)
    plt.title(f"Churn vs {col}")
    # The y axis shows the column's values, not a count.
    plt.xlabel("Churn")
    plt.ylabel(f"{col}")
    plt.show()

In [None]:
# Drop rows outside the 1.5 * IQR fences, one numeric column at a time
# (each column's quartiles are computed on the rows that survived so far).
for col in num_cols:
    # Skip indicator-style columns. For a 0/1 flag whose minority class is
    # under 25% of rows, Q1 == Q3, so both fences collapse onto the majority
    # value and every minority row (e.g. SeniorCitizen == 1) would be removed.
    if df[col].nunique() <= 2:
        continue
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower) & (df[col] <= upper)]

In [None]:
# Box plots per churn outcome again, after the outlier filter, for comparison.
for col in num_cols:
    # plt.figure registers a new current figure so figsize takes effect;
    # plt.Figure built a detached object and the size was ignored.
    plt.figure(figsize=(10, 5))
    # Churn is placed on x (older seaborn rejects `hue` without x and y) and
    # also kept as hue so the palette is applied per outcome; dodge=False
    # keeps each box centred on its tick.
    sns.boxplot(
        data=df,
        x="Churn",
        y=col,
        hue="Churn",
        palette="viridis",
        dodge=False,
    )
    plt.title(f"Churn vs {col}")
    # The y axis shows the column's values, not a count.
    plt.xlabel("Churn")
    plt.ylabel(f"{col}")
    plt.show()

In [None]:
# Bucket tenure into four labelled ranges and compare churn counts per bucket.
# `df` is the result of boolean filtering at this point, so take an explicit
# copy before adding a column; this avoids writing into a frame pandas may
# treat as a view of the unfiltered data.
df = df.copy()
df["TenureGroup"] = pd.cut(
    df["tenure"],
    bins=[-1, 12, 24, 48, 72],
    labels=["0-12", "13-24", "25-48", "49-72"],
)
sns.countplot(x="TenureGroup", hue="Churn", data=df)
plt.title("Churn count by tenure group")
plt.show()

## **MULTIVARIATE ANALYSIS**

In [None]:
    # Annotated correlation matrix of the numeric columns.
    # plt.figure registers a new current figure so figsize takes effect;
    # plt.Figure built a detached object and the size was ignored.
    plt.figure(figsize=(20, 10))
    sns.heatmap(df[num_cols].corr(), annot=True)
    plt.title("Correlation Heatmap of Features")
    plt.show()

In [None]:
# Scatter of MonthlyCharges against every other numeric column, coloured by churn.
for col in num_cols:
    # SeniorCitizen was already excluded; MonthlyCharges against itself would
    # only draw the diagonal, so it is skipped as well.
    if col in ("SeniorCitizen", "MonthlyCharges"):
        continue
    # plt.figure registers a new current figure so figsize takes effect;
    # plt.Figure built a detached object and the size was ignored.
    plt.figure(figsize=(10, 5))
    sns.scatterplot(data=df, x=col, y="MonthlyCharges", hue="Churn", alpha=0.6)
    plt.title(f"Churn in Monthlycharges vs {col}")
    plt.xlabel(f"{col}")
    # The y axis plots MonthlyCharges values, not a count.
    plt.ylabel("MonthlyCharges")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Pairwise plots of the numeric columns (SeniorCitizen left out), coloured by
# churn, with KDE curves on the diagonal.
num_cols2 = [name for name in num_cols if name != "SeniorCitizen"]
sns.pairplot(
    data=df[num_cols2 + ["Churn"]],
    hue="Churn",
    diag_kind="kde",
)
plt.show()

## **6. Feature Engineering**
Drop unnecessary columns and separate categorical and numerical features.


In [None]:
# Print the first ten values of every categorical column.
for col in cat_cols:
    print(f"{col}:{df[col].head(10).to_list()}")

In [None]:
# Print the first ten values of every numeric column.
for col in num_cols:
    print(f"{col}:{df[col].iloc[:10].to_list()}")

In [None]:
# Column labels of df1 split by dtype. Both names are reassigned in the next
# cell from the feature frame X, so these values are not what the
# preprocessing step ends up using.
cat_cols1=df1.select_dtypes(include="object").columns
num_cols1=df1.select_dtypes(include=["int64","float64"]).columns

In [None]:
# Separate predictors from the target.
# customerID is dropped together with the two target columns: it is an
# object-dtype identifier, so leaving it in X would put it in cat_cols1 and
# have it one-hot encoded as if it were a categorical feature. The IDs stay
# available in df1 for joining predictions back to customers.
X = df1.drop(columns=["customerID", "Churn", "Churn_flag"])
y = df1["Churn_flag"]

# Column lists consumed by the preprocessing ColumnTransformer.
cat_cols1 = X.select_dtypes(include="object").columns.tolist()
num_cols1 = X.select_dtypes(exclude="object").columns.tolist()


print("Categorical Columns:", cat_cols1)
print("Numerical Columns:", num_cols1)

## **7. Preprocessing & Train-Test Split**
- OneHotEncoding for categorical features
- Scaling for numerical features


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [None]:
# 70/30 split, stratified on the target so both parts keep the same churn
# ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Report the shapes of the feature frames first, then of the target series.
for _train_part, _test_part in ((X_train, X_test), (y_train, y_test)):
    print("Training Data:", _train_part.shape)
    print("Test Data:", _test_part.shape)
    if _train_part is X_train:
        print()

In [None]:
# One-hot encode the categorical columns (categories unseen at fit time are
# ignored at transform time) and standardise the numeric columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols1),
        ("num", StandardScaler(), num_cols1),
    ]
)

In [None]:
# Fit the encoder/scaler on the training split only, then apply the already
# fitted transformer to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [None]:
# Class proportions in the training target.
y_train.value_counts(normalize=True)

In [None]:
# Class proportions in the test target.
y_test.value_counts(normalize=True)

In [None]:
# Distinct values among the per-class row counts of the training target.
# NOTE(review): if the intent was to list the class labels, y_train.unique()
# would do that - confirm.
y_train.value_counts().unique()

## **8. Model Training and Evaluation**
Logistic Regression and Random Forest.


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


In [None]:

# Logistic regression: fit on the processed training data, then collect hard
# predictions and the positive-class probability for the test set.
log_model = LogisticRegression(max_iter=1000).fit(X_train_processed, y_train)
log_pred = log_model.predict(X_test_processed)
log_prob = log_model.predict_proba(X_test_processed)[:, 1]


In [None]:

# Random forest with a fixed seed: fit on the processed training data, then
# collect hard predictions and the positive-class probability for the test set.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_processed, y_train)
rf_pred = rf_model.predict(X_test_processed)
rf_prob = rf_model.predict_proba(X_test_processed)[:, 1]

In [None]:


# Per-class precision/recall/F1 on the test set for both models.
print(
    "LogisticRegression:\n",
    classification_report(y_true=y_test, y_pred=log_pred),
)
print(
    "RandomForestClassifier:\n",
    classification_report(y_true=y_test, y_pred=rf_pred),
)


In [None]:
# ROC AUC on the test set from each model's positive-class probabilities.
# Each line names its model; previously both printed the same heading and the
# two scores could not be told apart in the output.
print("Logistic Regression ROC AUC Score:\n", roc_auc_score(y_test, log_prob))
print("Random Forest ROC AUC Score:\n", roc_auc_score(y_test, rf_prob))


In [None]:
# Confusion matrix for logistic regression on the test set.
sns.heatmap(confusion_matrix(y_test, log_pred), annot=True, fmt="d", cmap="Blues")
plt.title("Logistic Regression Confusion Matrix")
# confusion_matrix puts true labels on rows and predictions on columns; label
# the axes so the cells can be read without knowing that convention.
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Confusion matrix for the random forest on the test set.
sns.heatmap(confusion_matrix(y_test, rf_pred), annot=True, fmt="d", cmap="Greens")
plt.title("Random Forest Confusion Matrix")
# confusion_matrix puts true labels on rows and predictions on columns; label
# the axes so the cells can be read without knowing that convention.
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# **10. Business Insights from Data**

### **HIGH CHURN RISK**:
- ### PaymentMethod: Electronic check
- ### Contract: Month-to-month
- ### InternetService: Fiber optic
- ### SeniorCitizen: 1 (senior customers)
- ### tenure: under 12 months




In [None]:
# Columns available in the EDA frame for the churn-rate breakdown below.
df.columns

In [None]:
# Churn rate (percentage of rows with Churn_flag == 1) per category of three
# features, computed on the EDA frame `df`.
for col in ("Contract", "InternetService", "PaymentMethod"):
    churn_rate = 100 * df.groupby(col)["Churn_flag"].mean()
    print(f"\nChurn Rate by {col}:\n", churn_rate)


- Month-to-Month contracts have the highest churn (43%).

- Customers paying via Electronic Check churn more often (46%).

- Fiber Optic internet users churn more (42%) than DSL users.

# **11. Retention Strategy**

In [None]:
# Positive-class (churn) probability for every test customer, from the
# logistic regression model.
log_probs = log_model.predict_proba(X_test_processed)[:, 1]

# Pair each test customer's ID (looked up in df1 by row index) with its score.
action_list = pd.DataFrame(
    {
        "CustomerID": df1.loc[X_test.index, "customerID"],
        "Churn_Probability": log_probs,
    }
)

# Map each probability to a retention action: >= 0.7 call, >= 0.4 coupon,
# anything lower is only monitored.
actions = []
for p in log_probs:
    if p >= 0.7:
        actions.append("Retention Call")
    elif p >= 0.4:
        actions.append("Email Coupon")
    else:
        actions.append("Monitor Only")
action_list["Action"] = actions

action_list.head(10)


In [None]:
# Positive-class (churn) probability for every test customer, from the
# random forest model.
rf_probs = rf_model.predict_proba(X_test_processed)[:, 1]

# Pair each test customer's ID (looked up in df1 by row index) with its score.
# This rebinds action_list, replacing the logistic-regression version.
action_list = pd.DataFrame(
    {
        "CustomerID": df1.loc[X_test.index, "customerID"],
        "Churn_Probability": rf_probs,
    }
)

# Map each probability to a retention action: >= 0.7 call, >= 0.4 coupon,
# anything lower is only monitored.
actions = []
for p in rf_probs:
    if p >= 0.7:
        actions.append("Retention Call")
    elif p >= 0.4:
        actions.append("Email Coupon")
    else:
        actions.append("Monitor Only")
action_list["Action"] = actions

action_list.head(10)


# **12. ROI ESTIMATION**

In [None]:
# High-risk customers = test customers with churn probability >= 0.7
# (logistic regression). log_probs follows the row order of the test split,
# which is also the order of y_test.index.
high_risk_mask = log_probs >= 0.7
high_risk_count = int(high_risk_mask.sum())

# Average monthly charges of the high-risk customers only. The mean was
# previously taken over every test customer, which does not match the
# per-high-risk-customer revenue the formula below multiplies it by.
test_charges = df1.loc[y_test.index, "MonthlyCharges"].to_numpy()
# Guard the empty case: the mean of zero rows is NaN.
avg_monthly = test_charges[high_risk_mask].mean() if high_risk_count else 0.0

# Assume retention strategy saves 5% of high-risk customers
retention_lift = 0.05
saved_revenue_monthly = avg_monthly * high_risk_count * retention_lift
saved_revenue_yearly = saved_revenue_monthly * 12

print("Estimated Monthly Revenue Saved:", round(saved_revenue_monthly, 2))
print("Estimated Yearly Revenue Saved:", round(saved_revenue_yearly, 2))


In [None]:


# Step 1: Flag the high-risk test customers (logistic regression p >= 0.7).
# log_probs follows the row order of the test split / y_test.index.
high_risk_mask = log_probs >= 0.7
high_risk_count = int(high_risk_mask.sum())

# Step 2: Average monthly charges of those high-risk customers only (the mean
# used to be taken over every test customer). Empty selection -> 0.0, not NaN.
test_charges = df1.loc[y_test.index, "MonthlyCharges"].to_numpy()
avg_monthly = test_charges[high_risk_mask].mean() if high_risk_count else 0.0

# Step 3: Retention improvement scenarios (1% to 10%)
retention_rates = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]

# Step 4: Yearly savings for each rate
yearly_savings = [avg_monthly * high_risk_count * r * 12 for r in retention_rates]

# Step 5: Put results into a DataFrame (easy for seaborn).
# round() keeps the tick labels clean; 0.07 * 100 is 7.000000000000001 in
# floating point.
roi_df = pd.DataFrame({
    "Retention (%)": [round(r * 100) for r in retention_rates],
    "Revenue Saved ($)": yearly_savings,
})

# Step 6: Plot with seaborn
plt.figure(figsize=(8, 5))
ax = sns.barplot(
    x="Retention (%)",
    y="Revenue Saved ($)",
    data=roi_df,
    color="skyblue",
    edgecolor="black",
)

# Step 7: Add labels on top of bars. The bar plot's x axis is categorical, so
# bars sit at positions 0..n-1; using the percentage value as the x coordinate
# put every label one bar to the right of the bar it described.
for position, saved in enumerate(yearly_savings):
    ax.text(position, saved, "$" + str(int(saved)), ha="center", va="bottom")

# Step 8: Titles
plt.title("Yearly Revenue Saved by Retention", fontsize=14)
plt.xlabel("Retention Improvement (%)", fontsize=12)
plt.xticks(rotation=50)
plt.ylabel("Revenue Saved ($)", fontsize=12)

plt.show()


In [None]:
# High-risk customers = test customers with churn probability >= 0.7
# (random forest). rf_probs follows the row order of the test split, which is
# also the order of y_test.index.
high_risk_mask = rf_probs >= 0.7
high_risk_count = int(high_risk_mask.sum())

# Average monthly charges of the high-risk customers only. The mean was
# previously taken over every test customer, which does not match the
# per-high-risk-customer revenue the formula below multiplies it by.
test_charges = df1.loc[y_test.index, "MonthlyCharges"].to_numpy()
# Guard the empty case: the mean of zero rows is NaN.
avg_monthly = test_charges[high_risk_mask].mean() if high_risk_count else 0.0

# Assume we save 5% of high-risk customers
retention_lift = 0.05

saved_revenue_monthly = avg_monthly * high_risk_count * retention_lift
saved_revenue_yearly = saved_revenue_monthly * 12

print("Estimated Monthly Revenue Saved:", round(saved_revenue_monthly,2))
print("Estimated Yearly Revenue Saved:", round(saved_revenue_yearly,2))


In [None]:


# Step 1: Get probabilities from Random Forest
rf_probs = rf_model.predict_proba(X_test_processed)[:, 1]

# Step 2: Flag the high-risk test customers (p >= 0.7). rf_probs follows the
# row order of the test split / y_test.index.
high_risk_mask_rf = rf_probs >= 0.7
high_risk_count_rf = int(high_risk_mask_rf.sum())

# Step 3: Average monthly charges of those high-risk customers only (the mean
# used to be taken over every test customer). Empty selection -> 0.0, not NaN.
test_charges_rf = df1.loc[y_test.index, "MonthlyCharges"].to_numpy()
avg_monthly_rf = (
    test_charges_rf[high_risk_mask_rf].mean() if high_risk_count_rf else 0.0
)

# Step 4: Retention improvement scenarios (1% to 10%)
retention_rates = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]

# Step 5: Calculate yearly savings
yearly_savings_rf = [
    avg_monthly_rf * high_risk_count_rf * r * 12 for r in retention_rates
]

# Step 6: Put results in DataFrame.
# round() keeps the tick labels clean; 0.07 * 100 is 7.000000000000001 in
# floating point.
roi_rf = pd.DataFrame({
    "Retention (%)": [round(r * 100) for r in retention_rates],
    "Revenue Saved ($)": yearly_savings_rf,
})

# Step 7: Plot with seaborn
plt.figure(figsize=(8, 5))
ax = sns.barplot(
    x="Retention (%)",
    y="Revenue Saved ($)",
    data=roi_rf,
    color="lightgreen",
    edgecolor="black",
)

# Step 8: Show values on bars. The bar plot's x axis is categorical, so bars
# sit at positions 0..n-1; using the percentage value as the x coordinate put
# every label one bar to the right of the bar it described.
for position, saved in enumerate(yearly_savings_rf):
    ax.text(position, saved, "$" + str(int(saved)), ha="center", va="bottom")

# Step 9: Titles
plt.title("Yearly Revenue Saved by Retention (Random Forest)", fontsize=14)
plt.xlabel("Retention Improvement (%)", fontsize=12)
plt.xticks(rotation=50)
plt.ylabel("Revenue Saved ($)", fontsize=12)

plt.show()


# **13. Final Results & Conclusion**

### Final Results
- Dataset: 7043 customers, churn rate approx 26%
- Best model: Logistic regression (ROC AUC 0.84)
- High-risk customers identified: 25%
- Top churn drivers: Month-to-Month contracts, Electronic Check payments, Fiber optic internet
- ROI: Retaining just 5% of high-risk customers ≈ $5,441 in revenue saved per year

### Conclusion
This project demonstrates how predictive modeling + business insights can reduce churn and improve revenue for subscription services.
