In [3]:
# Predicts which customers in the Telco churn dataset are at high risk of cancellation, using a logistic regression model

import pandas as pd
import sqlite3
import statsmodels.api as sm
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the raw Telco churn CSV directly from its public GitHub URL.
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(url)
print("Data imported successfully.")

# Round-trip the data through a local SQLite database: write the raw frame to
# the "Telco_customers" table (replacing any previous copy) and read it back.
# The connection is only needed for these two calls, so it is released in a
# `finally` block; otherwise a failure here or in any later step would leave
# customer_data.db open. Closing an already-closed sqlite3 connection is a
# no-op, so a later conn.close() remains harmless.
conn = sqlite3.connect("customer_data.db")
try:
    df.to_sql("Telco_customers", conn, if_exists="replace", index=False)
    full_df = pd.read_sql_query("SELECT * FROM Telco_customers", conn)
finally:
    conn.close()

# Data preprocessing
# Encode the target as 1 ('Yes') / 0 ('No').
full_df['Churn'] = full_df['Churn'].map({'Yes': 1, 'No': 0})
# Coerce TotalCharges to numeric; entries that cannot be parsed become NaN and
# the affected rows are removed by the dropna() below.
full_df['TotalCharges'] = pd.to_numeric(full_df['TotalCharges'], errors='coerce')
full_df = full_df.dropna()

# Snapshot of the customer IDs for the rows that survived dropna().
customer_ids = full_df['customerID'].copy()

# One-hot encode every remaining object-typed column except the identifier.
# customerID is excluded explicitly so it is never expanded into dummy columns.
# get_dummies only transforms the columns it is given, so customerID passes
# through unchanged and does not need to be re-attached afterwards.
categorical_cols = [
    col
    for col in full_df.select_dtypes(include=['object']).columns
    if col != 'customerID'
]
full_df = pd.get_dummies(full_df, columns=categorical_cols, drop_first=True)

# Model inputs: three continuous measures followed by the two contract dummies.
selected_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Contract_One year', 'Contract_Two year']

# Standardize the continuous inputs. The slice relies on them being the first
# three entries of selected_features.
# NOTE: the standardized values are written back into full_df, replacing the
# raw ones, so everything downstream of this point sees scaled columns.
numerical_features = selected_features[:3]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(full_df[numerical_features])
full_df[numerical_features] = scaled_values

# Target and design matrix (intercept column added), both cast to float.
y = full_df['Churn'].astype(float)
X = sm.add_constant(full_df[selected_features]).astype(float)

# Fit the logistic regression by maximum likelihood and report it.
logit_model = sm.Logit(y, X).fit()

print("\nLogistic Regression Model Summary:")
print(logit_model.summary())

# Compute log-odds and churn probability
# The linear predictor is the design matrix times the fitted coefficients.
# Using X (the exact frame the model was fitted on, including the 'const'
# column) keeps this in step with selected_features instead of repeating each
# term by hand; DataFrame.dot aligns X's columns with the params index.
full_df['log_odds'] = X.dot(logit_model.params)

# Logistic function written as 1 / (1 + exp(-z)). The exp(z) / (1 + exp(z))
# form evaluates to inf / inf = NaN once exp(z) overflows, whereas this form
# saturates cleanly at 0 or 1.
full_df['Churn_Probability'] = 1.0 / (1.0 + np.exp(-full_df['log_odds']))

print("\nFirst few rows with churn probability:")
print(full_df[['customerID', 'Churn', 'Churn_Probability']].head())

# Flag customers whose predicted churn probability meets the cut-off.
churn_probability_threshold = 0.7  # You can adjust this threshold
is_high_risk = full_df['Churn_Probability'].ge(churn_probability_threshold)
high_potential_churn = full_df.loc[is_high_risk]

# Write the flagged rows (all columns of full_df, without the index) to Excel.
output_excel_file = "high_potential_churn_customers.xlsx"
high_potential_churn.to_excel(output_excel_file, index=False)

print(f"\nHigh potential churn customers (probability >= {churn_probability_threshold}) saved to '{output_excel_file}'")

# Release the SQLite connection.
conn.close()

Data imported successfully.
Optimization terminated successfully.
         Current function value: 0.436126
         Iterations 8

Logistic Regression Model Summary:
                           Logit Regression Results                           
Dep. Variable:                  Churn   No. Observations:                 7032
Model:                          Logit   Df Residuals:                     7026
Method:                           MLE   Df Model:                            5
Date:                Thu, 22 May 2025   Pseudo R-squ.:                  0.2468
Time:                        09:42:56   Log-Likelihood:                -3066.8
converged:                       True   LL-Null:                       -4071.7
Covariance Type:            nonrobust   LLR p-value:                     0.000
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                -0.9672  