In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Dataset locations. Both paths are relative, so the CSV files are expected
# to sit in the notebook's current working directory.
train_file = "churn-bigml-80.csv"
test_file = "churn-bigml-20.csv"

# Read each CSV into its own DataFrame.
df_train = pd.read_csv(train_file)
df_test = pd.read_csv(test_file)

# First rows of the training frame, followed by its column/dtype summary.
print("--- Training Data Snapshot ---", df_train.head(), sep="\n")
print()
print("--- Training Data Info ---")
df_train.info()

--- Training Data Snapshot ---
  State  Account length  Area code International plan Voice mail plan  \
0    KS             128        415                 No             Yes   
1    OH             107        415                 No             Yes   
2    NJ             137        415                 No              No   
3    OH              84        408                Yes              No   
4    OK              75        415                Yes              No   

   Number vmail messages  Total day minutes  Total day calls  \
0                     25              265.1              110   
1                     26              161.6              123   
2                      0              243.4              114   
3                      0              299.4               71   
4                      0              166.7              113   

   Total day charge  Total eve minutes  Total eve calls  Total eve charge  \
0             45.07              197.4               99             

In [5]:
# 1. Drop non-predictive features
# 'State' and 'Area code' are typically dropped in a basic model.
df_train = df_train.drop(columns=['State', 'Area code'])
df_test = df_test.drop(columns=['State', 'Area code'])

# 2. Encode Binary Categorical Features (Yes/No and True/False)
# Create a mapping dictionary to convert text labels to 0 and 1.
mapping = {'Yes': 1, 'No': 0, True: 1, False: 0}

# Series.map() silently turns every value that is absent from `mapping` into
# NaN. Validate each column right after mapping so an unexpected label
# (different casing, stray whitespace, ...) fails here with a clear message
# instead of surfacing later as NaN in the features or the target.
# Values that were already missing in the input are left untouched.
for split_name, frame in (('train', df_train), ('test', df_test)):
    for col in ['International plan', 'Voice mail plan', 'Churn']:
        encoded = frame[col].map(mapping)
        unmapped = frame.loc[frame[col].notna() & encoded.isna(), col].unique()
        if len(unmapped) > 0:
            raise ValueError(
                f"Unexpected value(s) in {split_name} column '{col}': "
                f"{list(unmapped)}; expected one of {list(mapping)}"
            )
        frame[col] = encoded

# 3. Define Features (X) and Target (y)
# 'Churn' is the label; every remaining column is used as a feature.
X_train = df_train.drop(columns='Churn')
y_train = df_train['Churn']

X_test = df_test.drop(columns='Churn')
y_test = df_test['Churn']

print("\n--- Processed Training Features Head (Numerical) ---")
print(X_train.head())



--- Processed Training Features Head (Numerical) ---
   Account length  International plan  Voice mail plan  Number vmail messages  \
0             128                   0                1                     25   
1             107                   0                1                     26   
2             137                   0                0                      0   
3              84                   1                0                      0   
4              75                   1                0                      0   

   Total day minutes  Total day calls  Total day charge  Total eve minutes  \
0              265.1              110             45.07              197.4   
1              161.6              123             27.47              195.5   
2              243.4              114             41.38              121.2   
3              299.4               71             50.90               61.9   
4              166.7              113             28.34              

In [6]:
# Feature scaling.
# The scaler learns its statistics from the TRAINING features only; the test
# features are then transformed with those same statistics. Fitting on the
# test data would leak information from it into preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline classifier: logistic regression, a reasonable first model for a
# binary target. Fitted on the scaled training features.
model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

print("\nLogistic Regression Model Training Complete!")


Logistic Regression Model Training Complete!


In [8]:
# Predict labels for the scaled test features.
y_pred = model.predict(X_test_scaled)

# Score the predictions against the true test labels.
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print()
print("--- Model Evaluation Results ---")
print("Accuracy on Test Set: {:.4f}".format(accuracy), end="\n\n")
print("Classification Report:\n " + str(report))
print("Confusion Matrix:\n " + str(conf_matrix))

# How to read the confusion matrix (positive class: Churn=1).
# Rows are the true labels, columns are the predicted labels:
#   [[TN, FP],   TN: non-churners predicted as non-churn; FP: non-churners predicted as churn
#    [FN, TP]]   FN: churners predicted as non-churn;     TP: churners predicted as churn


--- Model Evaluation Results ---
Accuracy on Test Set: 0.8546

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.97      0.92       572
           1       0.47      0.19      0.27        95

    accuracy                           0.85       667
   macro avg       0.68      0.58      0.59       667
weighted avg       0.82      0.85      0.83       667

Confusion Matrix:
 [[552  20]
 [ 77  18]]
