In [9]:
#Code to build regression model to predict Churn Rate based on Monthly Charges, Tenor and Contract type

import pandas as pd
import sqlite3
import statsmodels.api as sm

# Load dataset
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(url)
print("Data imported successfully.")

# Store dataset in SQLite
conn = sqlite3.connect("customer_data.db")
df.to_sql("Telco_customers", conn, if_exists="replace", index=False)

# Retrieve full dataset
full_df = pd.read_sql_query("SELECT * FROM Telco_customers", conn)

# Convert categorical churn to binary
full_df['Churn'] = full_df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)
full_df['TotalCharges'] = pd.to_numeric(full_df['TotalCharges'], errors='coerce')  # Convert TotalCharges to numeric
full_df['tenure'] = pd.to_numeric(full_df['tenure'], errors='coerce')  # Ensure tenure is numeric

# **Remove NaN values**
full_df = full_df.dropna()

# **One-hot encode categorical variables**
full_df = pd.get_dummies(full_df, columns=['Contract'], drop_first=True)  # Drops one category to avoid multicollinearity

# **Select Independent (X) and Dependent (y) Variables**
X = full_df[['MonthlyCharges', 'Contract_One year', 'Contract_Two year']]
y = full_df['Churn']

# **Check Summary Statistics Before Regression**
print("\n--- Summary Statistics of Independent Variables (X) ---")
print(X.describe())

print("\n--- Summary Statistics of Dependent Variable (y: Churn) ---")
print(y.describe())

# **Ensure Numeric Data Types**
X = sm.add_constant(X)  # Add intercept
X = X.astype(float)  # Convert all to float
y = y.astype(float)  # Convert target to float

# **Fit Model on Individual Customer Data**
model = sm.OLS(y, X).fit()

# Print Regression Equation
coefficients = model.params
equation_terms = [f"({coeff:.4f} * {col})" for col, coeff in coefficients.items()]
regression_equation = "Churn = " + " + ".join(equation_terms)

print("\nRegression Equation:")
print(regression_equation)

# Print model summary
print("\nModel Summary:")
print(model.summary())

conn.close()


Data imported successfully.

--- Summary Statistics of Independent Variables (X) ---
       MonthlyCharges
count     7032.000000
mean        64.798208
std         30.085974
min         18.250000
25%         35.587500
50%         70.350000
75%         89.862500
max        118.750000

--- Summary Statistics of Dependent Variable (y: Churn) ---
count    7032.000000
mean        0.265785
std         0.441782
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: Churn, dtype: float64

Regression Equation:
Churn = (0.2643 * const) + (0.0025 * MonthlyCharges) + (-0.3111 * Contract_One year) + (-0.3851 * Contract_Two year)

Model Summary:
                            OLS Regression Results                            
Dep. Variable:                  Churn   R-squared:                       0.195
Model:                            OLS   Adj. R-squared:                  0.195
Method:                 Least Squares   F-statistic:                