In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
df = pd.read_csv("data/ml_ready_churn.csv")
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,EstimatedTotalSpend
0,0,0,1,0,1,0,1,29.85,29.85,0,...,False,False,False,False,False,False,False,True,False,29.85
1,1,0,0,0,34,1,0,56.95,1889.5,0,...,False,False,False,False,True,False,False,False,True,1936.3
2,1,0,0,0,2,1,1,53.85,108.15,1,...,False,False,False,False,False,False,False,False,True,107.7
3,1,0,0,0,45,0,0,42.3,1840.75,0,...,False,False,False,False,True,False,False,False,False,1903.5
4,0,0,0,0,2,1,1,70.7,151.65,1,...,False,False,False,False,False,False,False,True,False,141.4


In [3]:
#Split Features and Target
X = df.drop('Churn', axis=1)
y = df['Churn']

In [4]:
print(X.isnull().sum().sort_values(ascending=False))

EstimatedTotalSpend                   0
OnlineSecurity_Yes                    0
OnlineSecurity_No internet service    0
InternetService_No                    0
InternetService_Fiber optic           0
                                     ..
PhoneService                          0
tenure                                0
Dependents                            0
Partner                               0
SeniorCitizen                         0
Length: 7073, dtype: int64


In [5]:
X = X.fillna(X.median(numeric_only=True))

In [6]:
# Check for NaNs
print("Total NaNs in X:", X.isnull().sum().sum())
print("NaNs per column:\n", X.isnull().sum().sort_values(ascending=False).head())

Total NaNs in X: 0
NaNs per column:
 EstimatedTotalSpend                   0
OnlineSecurity_Yes                    0
OnlineSecurity_No internet service    0
InternetService_No                    0
InternetService_Fiber optic           0
dtype: int64


In [7]:
#split test and train set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
#training logestic regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
#predict  & evaluate
y_pred = model.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Confusion matrix
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7892122072391767

Confusion Matrix:
 [[911 124]
 [173 201]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86      1035
           1       0.62      0.54      0.58       374

    accuracy                           0.79      1409
   macro avg       0.73      0.71      0.72      1409
weighted avg       0.78      0.79      0.78      1409



In [12]:
import joblib
joblib.dump(model, 'model/churn_logistic_model.pkl')

['model/churn_logistic_model.pkl']

### Day 8 Summary

- Loaded clean ML-ready churn dataset
- Split into training and test sets (80/20)
- Trained a logistic regression model
- Evaluated performance with accuracy and confusion matrix
- Saved the model for future prediction use