In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries imported!")


✓ Libraries imported!


In [3]:
df = pd.read_csv('data/processed_data.csv')
print(f"Dataset shape: {df.shape}")

# Prepare features and target
X = df.drop('Churn', axis=1)
y = df['Churn']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")

Dataset shape: (7043, 21)
Training samples: 5634
Test samples: 1409


In [4]:
with open('data/best_model.txt', 'r') as f:
    best_model_name = f.read().strip()

print(f"\n{'='*60}")
print(f"TRAINING BEST MODEL: {best_model_name}")
print(f"{'='*60}")



TRAINING BEST MODEL: Logistic Regression


In [5]:
if 'XGBoost' in best_model_name:
    model = XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        eval_metric='logloss',
        random_state=42
    )
elif 'Random Forest' in best_model_name:
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42
    )
else:
    model = LogisticRegression(max_iter=1000, random_state=42)

print(f"Model initialized: {type(model).__name__}")

Model initialized: LogisticRegression


In [6]:
print(f"\nTraining {best_model_name}...")
model.fit(X_train, y_train)
print("✓ Model training complete!")


Training Logistic Regression...
✓ Model training complete!


In [7]:
print(f"\n{'='*60}")
print("CROSS-VALIDATION (5-Fold)")
print(f"{'='*60}")

cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

print(f"CV Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")



CROSS-VALIDATION (5-Fold)
CV Scores: [0.82431233 0.80834073 0.80390417 0.77994676 0.77797513]
Mean CV Accuracy: 0.7989 (+/- 0.0176)


In [8]:
y_train_pred = model.predict(X_train)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

print(f"\n{'='*60}")
print("TRAINING SET PERFORMANCE")
print(f"{'='*60}")
print(f"Accuracy:  {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall:    {train_recall:.4f}")
print(f"F1-Score:  {train_f1:.4f}")


TRAINING SET PERFORMANCE
Accuracy:  0.8012
Precision: 0.6521
Recall:    0.5378
F1-Score:  0.5894


In [9]:
y_test_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"\n{'='*60}")
print("TEST SET PERFORMANCE")
print(f"{'='*60}")
print(f"Accuracy:  {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall:    {test_recall:.4f}")
print(f"F1-Score:  {test_f1:.4f}")



TEST SET PERFORMANCE
Accuracy:  0.7935
Precision: 0.6309
Recall:    0.5348
F1-Score:  0.5789


In [10]:
print(f"\n{'='*60}")
print("TOP 10 IMPORTANT FEATURES")
print(f"{'='*60}")

if hasattr(model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(feature_importance.head(10).to_string(index=False))
    
    # Save feature importance
    feature_importance.to_csv('data/feature_importance.csv', index=False)
    print("\n✓ Feature importance saved")
    
    # Visualize
    import matplotlib.pyplot as plt
    
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance.head(10)['feature'], 
             feature_importance.head(10)['importance'])
    plt.xlabel('Importance', fontweight='bold')
    plt.title('Top 10 Feature Importance', fontweight='bold')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig('data/feature_importance.png')
    print("✓ Feature importance plot saved")
    plt.show()




TOP 10 IMPORTANT FEATURES


In [12]:

import joblib

model_path = 'models/trained_model.pkl'
joblib.dump(model, model_path)
print(f"\n✓ Model saved to '{model_path}'")


✓ Model saved to 'models/trained_model.pkl'


In [13]:
training_summary = {
    'model_name': best_model_name,
    'train_samples': X_train.shape[0],
    'test_samples': X_test.shape[0],
    'features': X.shape[1],
    'cv_mean_accuracy': cv_scores.mean(),
    'cv_std_accuracy': cv_scores.std(),
    'train_accuracy': train_accuracy,
    'train_precision': train_precision,
    'train_recall': train_recall,
    'train_f1': train_f1,
    'test_accuracy': test_accuracy,
    'test_precision': test_precision,
    'test_recall': test_recall,
    'test_f1': test_f1
}

pd.DataFrame([training_summary]).to_csv('data/training_summary.csv', index=False)
print("✓ Training summary saved")

✓ Training summary saved


In [14]:
print(f"\n{'='*60}")
print("TRAINING COMPLETE - SUMMARY")
print(f"{'='*60}")
print(f"Model: {best_model_name}")
print(f"Training Samples: {X_train.shape[0]}")
print(f"Features Used: {X.shape[1]}")
print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1-Score: {test_f1:.4f}")
print(f"\n✓ Model ready for evaluation and tuning!")
print(f"{'='*60}")


TRAINING COMPLETE - SUMMARY
Model: Logistic Regression
Training Samples: 5634
Features Used: 20
Cross-Validation Accuracy: 0.7989
Test Accuracy: 0.7935
Test F1-Score: 0.5789

✓ Model ready for evaluation and tuning!
