In [None]:
# Building Hierarchical Clustering Heatmap of Feature Correlations for Telecom Company data

import pandas as pd
import sqlite3
import statsmodels.api as sm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Download the Telco customer churn CSV straight into a DataFrame.
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(url)
print("Data imported successfully.")

# Round-trip the data through a local SQLite database: write the raw frame to
# the Telco_customers table (replacing any previous copy), then read it back.
conn = sqlite3.connect("customer_data.db")
try:
    df.to_sql("Telco_customers", conn, if_exists="replace", index=False)
    full_df = pd.read_sql_query("SELECT * FROM Telco_customers", conn)
finally:
    # Release the database handle as soon as the data is in memory, even if
    # the write or read fails. Nothing after this point queries the database,
    # and calling close() again on an already-closed sqlite3 connection is a
    # no-op, so a later conn.close() remains safe.
    conn.close()


# Encode the target as 1 (churned) / 0 (stayed); any other value becomes NaN.
full_df['Churn'] = full_df['Churn'].map({'Yes': 1, 'No': 0})

# Force TotalCharges to numeric; entries that cannot be parsed become NaN
# so they are removed together with other missing values below.
full_df['TotalCharges'] = pd.to_numeric(full_df['TotalCharges'], errors='coerce')

# Drop every row that has at least one missing value.
full_df = full_df.dropna()

# customerID is a row identifier, not a feature. Left in place it is an object
# column, so get_dummies would expand it into one dummy column per distinct ID,
# which makes the correlation matrix and heatmap downstream unmanageably large.
# errors='ignore' keeps this step safe if the column is already absent.
full_df = full_df.drop(columns=['customerID'], errors='ignore')

# One-hot encode the remaining text columns, dropping the first level of each
# to avoid perfectly collinear dummy columns.
categorical_cols = full_df.select_dtypes(include=['object']).columns.tolist()
full_df = pd.get_dummies(full_df, columns=categorical_cols, drop_first=True)


# Predictors for the churn model: three continuous measures followed by the
# two contract-type dummy columns.
selected_features = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
    'Contract_One year',
    'Contract_Two year',
]

# Only the leading three (continuous) predictors are standardised; the
# contract dummies are left untouched.
numeric_features = selected_features[:3]
scaler = StandardScaler()
full_df[numeric_features] = scaler.fit_transform(full_df[numeric_features])

# Response and design matrix (with an intercept column), both cast to float.
y = full_df['Churn'].astype(float)
X = sm.add_constant(full_df[selected_features]).astype(float)

# Fit Logistic Regression Model
logit_model = sm.Logit(y, X).fit()

print("\nLogistic Regression Model Summary:")
print(logit_model.summary())

# Compute log-odds and churn probability
# Build the linear predictor directly from the fitted coefficients instead of
# repeating the feature names by hand, so it cannot drift out of sync with the
# columns the model was actually fitted on. Terms are accumulated in the
# model's own parameter order, starting from the intercept.
fitted_params = logit_model.params
log_odds = fitted_params['const']
for feature_name, coefficient in fitted_params.drop('const').items():
    log_odds = log_odds + coefficient * full_df[feature_name]
full_df['log_odds'] = log_odds

# Logistic (sigmoid) transform written as 1 / (1 + exp(-x)). The equivalent
# exp(x) / (1 + exp(x)) evaluates to inf / inf = NaN once exp(x) overflows for
# large positive log-odds; this form saturates cleanly at 1 (and at 0 for
# large negative log-odds) instead.
full_df['Churn_Probability'] = 1.0 / (1.0 + np.exp(-full_df['log_odds']))

print(full_df.head())

# Hierarchical Clustering Heatmap
# sns.clustermap always creates its own figure, so the size must be passed to
# it directly; a preceding plt.figure() call would only leave an empty extra
# figure behind and its figsize would not apply to the heatmap.
cluster_grid = sns.clustermap(
    full_df.corr(),
    cmap="coolwarm",
    annot=True,
    fmt=".2f",
    linewidths=0.5,
    method="ward",
    figsize=(12, 8),
)

# Title the whole figure rather than whichever axes happens to be current
# (plt.title would attach to one of the grid's sub-axes). y is nudged above 1
# so the title clears the column dendrogram.
cluster_grid.fig.suptitle("Hierarchical Clustering Heatmap of Feature Correlations", y=1.02)
plt.show()

# Close database connection
conn.close()


Data imported successfully.
Optimization terminated successfully.
         Current function value: 0.436126
         Iterations 8

Logistic Regression Model Summary:
                           Logit Regression Results                           
Dep. Variable:                  Churn   No. Observations:                 7032
Model:                          Logit   Df Residuals:                     7026
Method:                           MLE   Df Model:                            5
Date:                Fri, 16 May 2025   Pseudo R-squ.:                  0.2468
Time:                        08:44:59   Log-Likelihood:                -3066.8
converged:                       True   LL-Null:                       -4071.7
Covariance Type:            nonrobust   LLR p-value:                     0.000
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                -0.9672  