In [5]:
# Predict the probability of customer churn in Telco data using logistic regression

import pandas as pd
import sqlite3
import statsmodels.api as sm
import numpy as np

# Load dataset
# The CSV is fetched over HTTP on every run, so this step needs network access.
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(url)
print("Data imported successfully.")

# Store dataset in SQLite and read it straight back.
# The connection is needed only for these two statements, so it is released in
# a `finally` clause: a failure while writing or reading (or in any later step)
# no longer leaves an open handle on customer_data.db. Calling close() again on
# an already-closed sqlite3 connection does nothing, so a later `conn.close()`
# remains valid.
conn = sqlite3.connect("customer_data.db")
try:
    # if_exists="replace" recreates the table, so re-running does not append
    # duplicate rows; index=False keeps the DataFrame index out of the table.
    df.to_sql("Telco_customers", conn, if_exists="replace", index=False)

    # Retrieve full dataset
    full_df = pd.read_sql_query("SELECT * FROM Telco_customers", conn)
finally:
    conn.close()

# Convert Churn column to binary (1 = Yes, 0 = No).
# Any value other than 'Yes'/'No' becomes NaN and is removed by dropna() below.
full_df['Churn'] = full_df['Churn'].map({'Yes': 1, 'No': 0})

# Convert TotalCharges to numeric; errors='coerce' turns entries that cannot be
# parsed as numbers into NaN instead of raising.
full_df['TotalCharges'] = pd.to_numeric(full_df['TotalCharges'], errors='coerce')

# Drop every row that has a NaN in any column (including the NaNs created by
# the two conversions above).
full_df = full_df.dropna()

# Identify categorical columns to one-hot encode.
# Row identifiers are object-typed too, but encoding them would create one
# dummy column per customer (thousands of useless columns) and discard the
# identifier itself, so they are excluded and kept as ordinary columns.
# NOTE(review): assumes the identifier column is named 'customerID', as in the
# public IBM Telco churn CSV -- confirm against the loaded data.
id_cols = {'customerID'}
categorical_cols = [
    col
    for col in full_df.select_dtypes(include=['object']).columns
    if col not in id_cols
]

# drop_first=True omits the first level of each categorical so the dummy
# columns are not perfectly collinear with an intercept term.
full_df = pd.get_dummies(full_df, columns=categorical_cols, drop_first=True)


# Hand-picked predictors: three numeric columns plus the two Contract dummies.
selected_features = ('tenure', 'MonthlyCharges', 'TotalCharges', 'Contract_One year', 'Contract_Two year')

# Design matrix: the chosen columns with an intercept column prepended.
X = sm.add_constant(full_df.loc[:, list(selected_features)])

# Print selected features
print("\nManually Selected Features for X:")
print(selected_features)

# Dependent variable (Y) and design matrix, both cast to float for the fit.
y = full_df['Churn'].astype(float)
X = X.astype(float)

# Fit the logistic regression by maximum likelihood.
logit_model = sm.Logit(endog=y, exog=X).fit()

# Coefficient table and fit diagnostics.
print("\nLogistic Regression Model Summary:")
print(logit_model.summary())


# Linear predictor (log-odds) for every row: intercept + sum(coef_i * x_i).
# The feature list is taken from the fitted model's own parameter index rather
# than re-typed by hand, so it cannot drift from the columns the model was
# actually fitted on.
coefficients = logit_model.params
slopes = coefficients.drop('const')
full_df['log_odds'] = (
    coefficients['const']
    # astype(float) so boolean dummy columns take part in the dot product;
    # DataFrame.dot aligns the frame's columns with the Series index by label.
    + full_df[slopes.index.tolist()].astype(float).dot(slopes)
)

# Logistic transform: p = 1 / (1 + exp(-z)).
# Written as exp(-logaddexp(0, -z)) because the textbook form
# exp(z) / (1 + exp(z)) overflows to inf / inf = NaN for large positive z.
# This form stays finite, giving 1.0 and 0.0 in the two extremes.
full_df['Churn_Probability'] = np.exp(-np.logaddexp(0.0, -full_df['log_odds']))

print(full_df.head())
conn.close()


Data imported successfully.

Manually Selected Features for X:
('tenure', 'MonthlyCharges', 'TotalCharges', 'Contract_One year', 'Contract_Two year')
Optimization terminated successfully.
         Current function value: 0.436126
         Iterations 8

Logistic Regression Model Summary:
                           Logit Regression Results                           
Dep. Variable:                  Churn   No. Observations:                 7032
Model:                          Logit   Df Residuals:                     7026
Method:                           MLE   Df Model:                            5
Date:                Mon, 10 Mar 2025   Pseudo R-squ.:                  0.2468
Time:                        10:28:57   Log-Likelihood:                -3066.8
converged:                       True   LL-Null:                       -4071.7
Covariance Type:            nonrobust   LLR p-value:                     0.000
                        coef    std err          z      P>|z|      [0.025      0