In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Read the churn export from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="Telco_customer_churn.csv")
df.head(n=5)

In [None]:
# (rows, columns) of the frame as loaded.
df.shape


In [None]:
# List the column names to decide which ones to keep.
df.columns

In [None]:
# Remove the location-related columns; rebinding `df` avoids in-place mutation.
location_columns = ["Country", "State", "City", "Zip Code", "Latitude", "Longitude"]
df = df.drop(columns=location_columns)

In [None]:
# Remove the "Count" column (presumably a constant per-row counter — TODO confirm).
df = df.drop(columns=["Count"])

In [None]:
# Shape after the column drops in the preceding cells.
df.shape

In [None]:
# Remaining column names after the drops.
df.columns

In [None]:
# Row count per value of "Churn Value" (the column later used as the target y).
df["Churn Value"].value_counts()

In [None]:
# Remove the other churn columns so only "Churn Value" remains as the outcome.
# NOTE(review): these presumably encode or follow from the churn outcome and would
# leak the target if kept as features — confirm against the data dictionary.
churn_side_columns = ["Churn Label", "Churn Score", "Churn Reason"]
df = df.drop(columns=churn_side_columns)

In [None]:
# Shape after removing the extra churn columns.
df.shape

In [None]:
# Separate the target from the candidate features and preview the features.
y = df["Churn Value"]
X = df.drop(columns=["Churn Value"])

X.head()

In [None]:
# One-hot encode the non-numeric columns of X, dropping the first level of each.
# NOTE(review): X still contains CustomerID here (only "Churn Value" was removed),
# so this encoding is presumably very wide; a later cell rebuilds X without it.
X_encoded = pd.get_dummies(X, drop_first=True)

X_encoded.head()

In [None]:
# Width of the first-pass encoding (number of generated columns).
X_encoded.shape

In [None]:
# Rebuild the feature frame without the target and without the customer identifier.
non_feature_columns = ["Churn Value", "CustomerID"]
X = df.drop(columns=non_feature_columns)

In [None]:
# Re-encode now that CustomerID is gone and check the resulting width.
X_encoded = pd.get_dummies(X, drop_first=True)
X_encoded.shape

In [None]:
# Ten columns with the most distinct values (the ones that widen the one-hot encoding most).
X.nunique().sort_values(ascending=False).head(10)

In [None]:
# Column dtypes before the numeric coercion below.
X.dtypes

In [None]:
# Coerce columns that should be numeric; values that cannot be parsed become NaN.
# NOTE(review): "Lat Long" looks like a combined "lat, long" text field (Latitude and
# Longitude were separate columns), in which case every value here turns into NaN — confirm.
for numeric_column in ("Total Charges", "Monthly Charges", "CLTV", "Lat Long"):
    X[numeric_column] = pd.to_numeric(X[numeric_column], errors="coerce")

In [None]:
# Column dtypes after the numeric coercion.
X.dtypes

In [None]:
# Re-encode after the coercion and check the resulting width.
X_encoded = pd.get_dummies(X, drop_first=True)
X_encoded.shape

In [None]:
# First-pass 80/20 hold-out split with a fixed seed (train_test_split is imported in the first cell).
# NOTE: this runs before missing values are filled; a later cell repeats the split on the cleaned frame.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [None]:
# Total number of missing cells across the whole encoded frame.
X_encoded.isnull().sum().sum()

In [None]:
# Replace missing values in the encoded frame with 0.
# NOTE: fillna returns a new frame, so the X_train / X_test produced by the earlier split are not changed by this.
X_encoded = X_encoded.fillna(0)

In [None]:
# Start again from df: take the target, then drop it together with the identifier
# and CLTV to form the final feature frame.
y = df["Churn Value"]
excluded_columns = ["Churn Value", "CustomerID", "CLTV"]
X = df.drop(columns=excluded_columns)

In [None]:
# Coerce the charge columns to numeric; entries that cannot be parsed become NaN
# (errors="coerce") and are filled in the cell that follows.
for charge_column in ("Total Charges", "Monthly Charges"):
    X[charge_column] = pd.to_numeric(X[charge_column], errors="coerce")

# "Lat Long" is dropped instead of coerced.
# NOTE(review): it appears to be a combined "lat, long" text field (Latitude and
# Longitude existed as separate columns), so pd.to_numeric(..., errors="coerce")
# would turn every value into NaN and the subsequent fill would leave a constant,
# information-free feature — confirm against the CSV.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
X = X.drop(columns=["Lat Long"], errors="ignore")

In [None]:
# Replace the NaN values (including those produced by the coercion above) with 0 before encoding.
X = X.fillna(0)

In [None]:
# Final one-hot encoding of the cleaned feature frame (first level of each category dropped).
X_encoded = pd.get_dummies(X, drop_first=True)

In [None]:
# Shape of the final encoded feature matrix.
X_encoded.shape

In [None]:
# 80/20 hold-out split of the cleaned, encoded features with a fixed seed
# (train_test_split is imported in the first cell).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit logistic regression on the scaled training data (up to 2000 solver iterations)
# and predict labels for the scaled test rows.
model = LogisticRegression(max_iter=2000).fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

In [None]:
# Evaluate the test-set predictions (metric functions are imported in the first cell).
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(test_confusion)
print(test_report)

In [None]:
# Pair each encoded feature with its fitted coefficient. The model was trained on
# standardized inputs, so each coefficient is per standard deviation of the feature.
feature_importance = pd.DataFrame({
    "Feature": X_encoded.columns,
    "Coefficient": model.coef_[0],
})

# Rank by magnitude rather than by signed value: a large negative coefficient
# (lowers the predicted churn odds) matters as much as a large positive one, and
# a signed sort would push those features to the bottom of the table.
# The sign is kept in the "Coefficient" column so the direction stays visible.
feature_importance = feature_importance.sort_values(
    by="Coefficient",
    key=lambda coefficients: coefficients.abs(),
    ascending=False,
)

feature_importance.head(10)

## Key Business Insights

* Customers with month-to-month contracts show higher churn probability.

* Higher monthly charges are associated with increased churn.

* Customers with fewer tenure months are more likely to churn.

* Lack of tech support increases churn risk.

* Long-term contract customers show lower churn probability.

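## Conclusion

The Logistic Regression model achieved an accuracy of about 80% on the held-out test set.
The model predicts customer churn reasonably well and identifies the major factors influencing churn behavior.
Accuracy alone can overstate performance when churners are the minority class, so it should be read together with the precision and recall for the churn class in the classification report.

Key drivers include contract type, tenure, and monthly charges.
Business strategies such as promoting long-term contracts and improving support services can reduce churn.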

In [None]:
# Probability of the second class in model.classes_ for every test row
# (presumably churn = 1 — confirm via model.classes_).
churn_prob = model.predict_proba(X_test_scaled)[:, 1]

# Build the segmentation frame from the unscaled test features so the columns stay readable.
segmentation_df = X_test.copy()
segmentation_df["Churn Probability"] = churn_prob
segmentation_df["Actual Churn"] = y_test.values

# Bucket the probabilities into three bands: (0, 0.3], (0.3, 0.7], (0.7, 1].
# include_lowest=True closes the first band on the left so a probability of
# exactly 0 is labeled "Loyal" instead of becoming NaN.
segmentation_df["Customer Segment"] = pd.cut(
    segmentation_df["Churn Probability"],
    bins=[0, 0.3, 0.7, 1],
    labels=["Loyal", "Dormant", "At Risk"],
    include_lowest=True,
)

segmentation_df["Customer Segment"].value_counts()

In [None]:
!pip install shap

In [None]:
# Rebuild the split, the scaler and the model with the same settings as the
# earlier cells (all three sklearn names are already imported above).
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Scaling statistics come from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier; the trailing bare name shows the fitted estimator as the cell output.
model = LogisticRegression(max_iter=2000).fit(X_train_scaled, y_train)
model

In [None]:
import shap

# Explainer for the fitted linear model, given the scaled training matrix
# (presumably used by SHAP as the background/reference data — see the SHAP docs).
explainer = shap.LinearExplainer(model, X_train_scaled)
# SHAP values for the scaled test rows.
shap_values = explainer.shap_values(X_test_scaled)

# Summary plot over the test rows; feature names are supplied from the encoded frame's columns.
shap.summary_plot(
    shap_values,
    X_test_scaled,
    feature_names=X_encoded.columns
)

In [None]:
# Probability of the second class in model.classes_ for each test row
# (presumably churn = 1 — confirm via model.classes_).
y_prob = model.predict_proba(X_test_scaled)[:, 1]

# Copy of the unscaled test features with the score and the true label appended.
segmentation_df = X_test.assign(
    Churn_Probability=y_prob,
    Actual_Churn=y_test.values,
)

segmentation_df.head()

In [None]:
def assign_segment(row, risk_threshold=0.6, loyal_tenure_months=24):
    """Return the segment label for one customer row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide "Churn_Probability"; "Tenure Months" is read only when
        the probability does not exceed ``risk_threshold``.
    risk_threshold : float, default 0.6
        Rows with a churn probability strictly above this are "At Risk".
    loyal_tenure_months : int, default 24
        Remaining rows with tenure strictly above this are "Loyal".

    Returns
    -------
    str
        "At Risk", "Loyal" or "Dormant".
    """
    # Risk takes precedence over tenure: a long-tenured but high-risk row is "At Risk".
    if row["Churn_Probability"] > risk_threshold:
        return "At Risk"
    if row["Tenure Months"] > loyal_tenure_months:
        return "Loyal"
    return "Dormant"

# Label every row with assign_segment, store the labels, then count rows per label.
segment_labels = segmentation_df.apply(assign_segment, axis=1)
segmentation_df["Customer_Segment"] = segment_labels

segmentation_df["Customer_Segment"].value_counts()

In [None]:
# Bar chart of how many test customers fall in each segment
# (seaborn and matplotlib are imported in the first cell).
ax = sns.countplot(data=segmentation_df, x="Customer_Segment")
ax.set_title("Customer Segmentation Based on Churn Risk")
plt.show()

In [None]:
# Export the segmented test rows. CustomerID was removed from the features before
# encoding and index=False discards the row labels, so without re-attaching the
# identifier the file could not be tied back to individual customers.
# segmentation_df was derived from X_test, so it still carries df's row labels,
# which lets the lookup below align on the index.
export_df = segmentation_df.copy()
export_df.insert(0, "CustomerID", df.loc[export_df.index, "CustomerID"])
export_df.to_csv("Customer_Segmentation_Output.csv", index=False)

In [None]:
import sqlite3

# Throwaway in-memory SQLite database; its contents live only as long as this connection.
conn = sqlite3.connect(":memory:")

# Load the current state of df (after the column drops above) as table "customers",
# replacing the table if the cell is re-run.
df.to_sql(name="customers", con=conn, if_exists="replace", index=False)

print("Table created successfully!")

In [None]:
# Customers and churned customers per contract type
# (SUM over "Churn Value" counts churners assuming it is 0/1 — TODO confirm).
query1 = """
SELECT Contract,
       COUNT(*) AS Total_Customers,
       SUM("Churn Value") AS Total_Churned
FROM customers
GROUP BY Contract
"""

pd.read_sql(query1, conn)

In [None]:
# Average monthly charge for each value of "Churn Value".
query2 = """
SELECT "Churn Value",
       AVG("Monthly Charges") AS Avg_Monthly_Charges
FROM customers
GROUP BY "Churn Value"
"""

pd.read_sql(query2, conn)

In [None]:
# Customers and churned customers per internet service type
# (SUM over "Churn Value" counts churners assuming it is 0/1 — TODO confirm).
query3 = """
SELECT "Internet Service",
       COUNT(*) AS Total_Customers,
       SUM("Churn Value") AS Churned_Customers
FROM customers
GROUP BY "Internet Service"
"""

pd.read_sql(query3, conn)

In [None]:
# Ten customers with the highest total charges.
# "Total Charges" was converted to numeric only on the feature frame X, never on df,
# so the SQL table likely stores it as text (NOTE(review): confirm with df.dtypes).
# Text sorts lexicographically ("999.9" above "8684.8"), hence the explicit CAST;
# the CAST is harmless if the column is already numeric.
query4 = """
SELECT CustomerID,
       "Total Charges"
FROM customers
ORDER BY CAST("Total Charges" AS REAL) DESC
LIMIT 10
"""

pd.read_sql(query4, conn)