In [15]:
import numpy as np
import pandas as pd

# Read the raw Telco churn CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="Telco-Customer-Churn.csv")

# Display the first five rows as a quick sanity check
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [16]:
# List the column labels of the loaded frame
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [17]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced with 0.
# NOTE(review): the 0 fill presumes the unparseable entries belong to brand-new
# customers with no billing history yet — confirm against `tenure`.
df['TotalCharges'] = (
    pd.to_numeric(df['TotalCharges'], errors='coerce')
    .fillna(0)
)


 **Convert target variable to binary**

In [18]:
# Encode the target as 0/1.
# Series.map returns NaN for any value that is not a key of the mapping, so a
# stray label (different casing, whitespace, missing value) would otherwise
# slip through silently and only fail later in the split / model fit.
# Fail here with a message that names the offending values instead.
churn_flag = df['Churn'].map({'Yes': 1, 'No': 0})
unmapped = churn_flag.isna()
if unmapped.any():
    bad_labels = sorted(df.loc[unmapped, 'Churn'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'Churn' column: {bad_labels}")

# Every row mapped, so the flag can safely be stored as an integer column
df['ChurnFlag'] = churn_flag.astype('int64')


**Select Features for Modeling**

In [19]:
# Model inputs, listed as the numeric columns followed by the categorical ones
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_features = ['Contract', 'InternetService', 'PaymentMethod']
features = numeric_features + categorical_features

# Feature matrix and binary target
X, y = df[features], df['ChurnFlag']


**Encode Categorical Variables**

**Use One-Hot Encoding (standard & Power BI-friendly)**

In [20]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each encoded column, leaving it as the implicit baseline.
X = X.pipe(pd.get_dummies, drop_first=True)


**Train-Test Split**

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Stratifying on y keeps the churn rate
# the same in both partitions; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


**Train Logistic Regression Model**

In [22]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the solver's iteration cap raised to 1000
model = LogisticRegression(max_iter=1000)

# Fit on the training partition only
model.fit(X=X_train, y=y_train)


**Model Evaluation**

In [23]:
from sklearn.metrics import classification_report, roc_auc_score

# Score the hold-out set: column 1 of predict_proba as the churn probability,
# and the model's own hard class labels
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 from the hard labels,
# ROC AUC from the probabilities
report = classification_report(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(report)
print("ROC AUC Score:", auc)


              precision    recall  f1-score   support

           0       0.84      0.89      0.87      1552
           1       0.64      0.53      0.58       561

    accuracy                           0.80      2113
   macro avg       0.74      0.71      0.72      2113
weighted avg       0.79      0.80      0.79      2113

ROC AUC Score: 0.8379998437988129


**Generate Predictions for ALL Customers**

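**Threshold:** a customer is flagged as a predicted churner when their churn probability is at or above the chosen cutoff.

- 0.7 → stricter high-risk definition
- 0.4 → aggressive retention strategy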

In [24]:
# Probability cutoff at or above which a customer is labelled a predicted churner.
# 0.7 -> stricter high-risk definition; 0.4 -> aggressive retention strategy.
# Kept as a named constant so the cutoff can be tuned in one place.
CHURN_THRESHOLD = 0.7

# Predict churn probability for all customers (column 1 of predict_proba).
# Note: X is the full feature matrix, so it includes the rows the model was
# trained on — scores for those customers are not out-of-sample.
df['ChurnProbability'] = model.predict_proba(X)[:, 1]

# Predict churn class by applying the cutoff to the probability
df['ChurnPredicted'] = np.where(
    df['ChurnProbability'] >= CHURN_THRESHOLD, 'Yes', 'No'
)


**Prepare Output File for Power BI**

Create a clean prediction file

In [25]:
# Keep only the customer identifier and the two prediction columns
export_columns = ['customerID', 'ChurnProbability', 'ChurnPredicted']
prediction_output = df[export_columns]

# Preview the first rows of the export
prediction_output.head()


Unnamed: 0,customerID,ChurnProbability,ChurnPredicted
0,7590-VHVEG,0.528073,No
1,5575-GNVDE,0.072879,No
2,3668-QPYBK,0.38788,No
3,7795-CFOCW,0.040846,No
4,9237-HQITU,0.718075,Yes


**Export CSV for Power BI**

In [26]:
# Write the predictions to disk, leaving the DataFrame index out of the file
prediction_output.to_csv(
    path_or_buf="Telco_Churn_Predictions.csv",
    index=False,
)
