In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import shap
from textblob import TextBlob
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go
import joblib
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Load dataset


In [7]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')


#  Data Preprocessing

In [8]:
df.replace('No internet service', 'No', inplace=True)
df.replace('No phone service', 'No', inplace=True)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
df.drop('customerID', axis=1, inplace=True)


# Encode categorical variables

In [9]:
le = LabelEncoder()
for column in df.select_dtypes(include='object').columns:
    df[column] = le.fit_transform(df[column])

#  Feature Engineering 

In [10]:
df['TenureGroup'] = pd.cut(df['tenure'], bins=[0, 12, 24, 48, 60, np.inf], 
                          labels=['0-12', '13-24', '25-48', '49-60', '60+'])
df['TenureGroup'] = le.fit_transform(df['TenureGroup'])
df['MonthlyToTotalRatio'] = df['MonthlyCharges'] / (df['TotalCharges'] + 1e-5)  # Avoid division by zero


Synthetic NLP feature: Customer feedback sentiment

In [11]:
np.random.seed(42)
feedback = ['Great service, very satisfied' if np.random.rand() > 0.4 else 
            'Poor support, slow response' if np.random.rand() > 0.2 else 
            'Average experience' for _ in range(len(df))]
df['FeedbackSentiment'] = [TextBlob(text).sentiment.polarity for text in feedback]


 Define features and target

In [12]:
X = df.drop('Churn', axis=1)
y = df['Churn']

Scale numerical features

In [13]:
scaler = StandardScaler()
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 'FeedbackSentiment', 'MonthlyToTotalRatio']
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

#  Handle Class Imbalance 

In [14]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


 Model Training with Hyperparameter Tuning 

In [16]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1]
}
model = XGBClassifier(random_state=42, eval_metric='logloss')
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

Model Evaluation

In [17]:
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"AUC-ROC: {roc_auc_score(y_test, y_pred_proba):.2f}")

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.82      0.84      1021
           1       0.83      0.87      0.85      1049

    accuracy                           0.84      2070
   macro avg       0.84      0.84      0.84      2070
weighted avg       0.84      0.84      0.84      2070

AUC-ROC: 0.92


In [18]:
import matplotlib.pyplot as plt

# Visualizations 


1. Churn Distribution

In [21]:
plt.figure(figsize=(8, 5))
sns.countplot(x='Churn', data=df)
plt.title('Churn Distribution', fontsize=14, pad=10)
plt.xlabel('Churn', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.savefig('churn_distribution.png', bbox_inches='tight', dpi=300)
plt.close()


 2. Correlation Heatmap

In [25]:
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(), annot=False, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap', fontsize=14, pad=10)
plt.savefig('correlation_heatmap.png', bbox_inches='tight', dpi=300)
plt.close()



3. Pair Plot for Key Numerical Features

In [24]:
sns.pairplot(df[numerical_cols + ['Churn']], hue='Churn', palette='deep')
plt.suptitle('Pair Plot of Numerical Features by Churn', y=1.02, fontsize=14)
plt.savefig('pair_plot.png', bbox_inches='tight', dpi=300)
plt.close()

4. Box Plots for Numerical Features

In [26]:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for i, col in enumerate(['tenure', 'MonthlyCharges', 'TotalCharges']):
    sns.boxplot(x='Churn', y=col, data=df, ax=axes[i])
    axes[i].set_title(f'{col} by Churn', fontsize=12)
plt.tight_layout()
plt.savefig('box_plots.png', bbox_inches='tight', dpi=300)
plt.close()

 5. Violin Plot for Tenure

In [27]:
plt.figure(figsize=(8, 5))
sns.violinplot(x='Churn', y='tenure', data=df, palette='deep')
plt.title('Tenure Distribution by Churn', fontsize=14, pad=10)
plt.xlabel('Churn', fontsize=12)
plt.ylabel('Tenure', fontsize=12)
plt.savefig('violin_plot.png', bbox_inches='tight', dpi=300)
plt.close()

6. Confusion Matrix

In [28]:
plt.figure(figsize=(8, 5))
sns.violinplot(x='Churn', y='tenure', data=df, palette='deep')
plt.title('Tenure Distribution by Churn', fontsize=14, pad=10)
plt.xlabel('Churn', fontsize=12)
plt.ylabel('Tenure', fontsize=12)
plt.savefig('violin_plot.png', bbox_inches='tight', dpi=300)
plt.close()

7. ROC Curve

In [29]:
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc_score(y_test, y_pred_proba):.2f})', linewidth=2)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curve', fontsize=14, pad=10)
plt.legend(fontsize=10)
plt.savefig('roc_curve.png', bbox_inches='tight', dpi=300)
plt.close()

8. Precision-Recall Curve

In [30]:
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 5))
plt.plot(recall, precision, label='Precision-Recall Curve', linewidth=2)
plt.xlabel('Recall', fontsize=12)
plt.ylabel('Precision', fontsize=12)
plt.title('Precision-Recall Curve', fontsize=14, pad=10)
plt.legend(fontsize=10)
plt.savefig('precision_recall_curve.png', bbox_inches='tight', dpi=300)
plt.close()


9. Feature Importance from XGBoost

In [31]:
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_model.feature_importances_
}).sort_values(by='Importance', ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance (XGBoost)', fontsize=14, pad=10)
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.savefig('feature_importance.png', bbox_inches='tight', dpi=300)
plt.close()

10. SHAP Summary Plot

In [32]:
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values, X_test, show=False)
plt.title('SHAP Feature Importance', fontsize=14, pad=10)
plt.savefig('shap_summary.png', bbox_inches='tight', dpi=300)
plt.close()

11. SHAP Dependence Plot for Top Feature

In [33]:
plt.figure(figsize=(8, 5))
shap.dependence_plot(feature_importance['Feature'].iloc[0], shap_values, X_test, show=False)
plt.title(f'SHAP Dependence Plot for {feature_importance["Feature"].iloc[0]}', fontsize=14, pad=10)
plt.savefig('shap_dependence.png', bbox_inches='tight', dpi=300)
plt.close()

<Figure size 800x500 with 0 Axes>

Save test data with predictions

In [None]:
joblib.dump(best_model, 'churn_model.pkl')
X_test_df = pd.DataFrame(X_test, columns=X.columns)
X_test_df['Churn_Probability'] = y_pred_proba
X_test_df.to_csv('test_data_with_predictions.csv', index=False)