In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

In [6]:
d=pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv',encoding='cp1252')

In [7]:
df=d.head(50)

In [8]:
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
8,7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,...,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        50 non-null     object 
 1   gender            50 non-null     object 
 2   SeniorCitizen     50 non-null     int64  
 3   Partner           50 non-null     object 
 4   Dependents        50 non-null     object 
 5   tenure            50 non-null     int64  
 6   PhoneService      50 non-null     object 
 7   MultipleLines     50 non-null     object 
 8   InternetService   50 non-null     object 
 9   OnlineSecurity    50 non-null     object 
 10  OnlineBackup      50 non-null     object 
 11  DeviceProtection  50 non-null     object 
 12  TechSupport       50 non-null     object 
 13  StreamingTV       50 non-null     object 
 14  StreamingMovies   50 non-null     object 
 15  Contract          50 non-null     object 
 16  PaperlessBilling  50 non-null     object 
 17 

In [10]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,50.0,50.0,50.0
mean,0.08,30.06,67.244
std,0.274048,25.282413,28.522323
min,0.0,1.0,18.95
25%,0.0,8.5,49.325
50%,0.0,23.5,67.675
75%,0.0,51.25,94.1875
max,1.0,72.0,113.25


In [12]:
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [13]:
df.isna().sum()*100/len(df)

customerID          0.0
gender              0.0
SeniorCitizen       0.0
Partner             0.0
Dependents          0.0
tenure              0.0
PhoneService        0.0
MultipleLines       0.0
InternetService     0.0
OnlineSecurity      0.0
OnlineBackup        0.0
DeviceProtection    0.0
TechSupport         0.0
StreamingTV         0.0
StreamingMovies     0.0
Contract            0.0
PaperlessBilling    0.0
PaymentMethod       0.0
MonthlyCharges      0.0
TotalCharges        0.0
Churn               0.0
dtype: float64

In [18]:
if 'TotalCharges' in d.columns:
    d['TotalCharges'] = pd.to_numeric(d['TotalCharges'], errors='coerce')

# Drop rows with missing target or key numeric fields
d = d.dropna(subset=['Churn'])

In [19]:
y = df['Churn'].map({'Yes': 1, 'No': 0})   # convert to 0/1
X = df.drop(columns=['Churn'])

In [20]:

# 4) Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'bool']).columns.tolist()


In [21]:
# 5) Preprocessing: scale numeric, one‑hot encode categorical
from sklearn.preprocessing import OneHotEncoder

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [22]:

# 6) Build model pipeline (Logistic Regression for churn)
clf = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', LogisticRegression(max_iter=1000))
])

In [23]:

# 7) Train–test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [24]:
# 8) Train the model
clf.fit(X_train, y_train)

# 9) Predict on test data
y_pred = clf.predict(X_test)

In [25]:
# 10) Evaluate with classification metrics
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Churn Prediction Performance:")
print(f"Accuracy : {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall   : {rec:.3f}")
print(f"F1-score : {f1:.3f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Churn Prediction Performance:
Accuracy : 0.800
Precision: 0.600
Recall   : 1.000
F1-score : 0.750

Classification Report:
              precision    recall  f1-score   support

    No Churn       1.00      0.71      0.83         7
       Churn       0.60      1.00      0.75         3

    accuracy                           0.80        10
   macro avg       0.80      0.86      0.79        10
weighted avg       0.88      0.80      0.81        10

Confusion Matrix:
[[5 2]
 [0 3]]
