In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Location of the churn CSV. The default is the original author's local path;
# set the CHURN_DATA_PATH environment variable to run the notebook on another
# machine without editing this cell.
DATA_PATH = os.environ.get(
    "CHURN_DATA_PATH",
    "C:/Users/ACER/OneDrive/Desktop/customer_churn_data.csv",
)

# NOTE(review): the preview below shows blank `internetservice` cells. If the
# raw file stores the literal text "None" there, read_csv's default NA parsing
# would turn it into NaN -- confirm against the raw CSV.
df = pd.read_csv(DATA_PATH)

In [13]:
# Preview the first 10 rows.
# NOTE(review): this cell carries execution count 13, i.e. it was run after the
# column-cleaning cell further down, which is why the displayed column names
# are already lowercased. On a fresh Restart & Run All the preview would show
# the original CSV headers instead.
df.head(10)

Unnamed: 0,customerid,age,gender,tenure,monthlycharges,contracttype,internetservice,totalcharges,techsupport,churn
0,1,49,Male,4,88.35,Month-to-Month,Fiber Optic,353.4,Yes,Yes
1,2,43,Male,0,36.67,Month-to-Month,Fiber Optic,0.0,Yes,Yes
2,3,51,Female,2,63.79,Month-to-Month,Fiber Optic,127.58,No,Yes
3,4,60,Female,8,102.34,One-Year,DSL,818.72,Yes,Yes
4,5,42,Male,32,69.01,Month-to-Month,,2208.32,No,Yes
5,6,42,Female,16,119.75,Two-Year,DSL,1916.0,Yes,Yes
6,7,60,Male,14,80.32,One-Year,,1124.48,No,Yes
7,8,52,Female,6,58.9,One-Year,,353.4,No,Yes
8,9,40,Female,53,49.81,Two-Year,Fiber Optic,2639.93,Yes,No
9,10,50,Female,10,61.55,Month-to-Month,Fiber Optic,615.5,Yes,Yes


In [4]:
# Report the shape and the column names exactly as loaded, before cleaning.
# NOTE(review): the messages say "E-Commerce", but the columns (tenure,
# monthly charges, contract type, internet service) look like a subscription /
# telecom dataset -- confirm the intended wording.
print(f"‚úÖ E-Commerce dataset: {df.shape}")
print(f"üìã Your exact columns: {list(df.columns)}")

# Safe column cleaning
def safe_clean_columns(df):
    """Normalise the column labels of ``df`` in place and return ``df``.

    Each label is converted to ``str``, stripped of surrounding whitespace,
    lowercased, has spaces replaced by underscores and has dots removed.

    Note that the frame passed in is modified (its ``columns`` attribute is
    reassigned); the returned object is that same frame, not a copy.
    """
    def _normalise(label):
        text = str(label).strip().lower()
        return text.replace(" ", "_").replace(".", "")

    df.columns = list(map(_normalise, df.columns))
    return df

# safe_clean_columns renames the columns on `df` itself and returns that same
# object, so `ecommerce_df` and `df` are two names for one DataFrame from here
# on. Later cells rely on `df.columns` already holding the cleaned names.
ecommerce_df = safe_clean_columns(df)
print(f"\n‚úÖ Cleaned columns: {list(df.columns)}")

‚úÖ E-Commerce dataset: (1000, 10)
üìã Your exact columns: ['CustomerID', 'Age', 'Gender', 'Tenure', 'MonthlyCharges', 'ContractType', 'InternetService', 'TotalCharges', 'TechSupport', 'Churn']

‚úÖ Cleaned columns: ['customerid', 'age', 'gender', 'tenure', 'monthlycharges', 'contracttype', 'internetservice', 'totalcharges', 'techsupport', 'churn']


In [6]:
# 2. Target setup (Churn = last column)
# The target is chosen by position, not by name: whatever column comes last in
# the (already cleaned) frame is treated as the label. If the CSV layout ever
# changes, this silently picks a different column.
target_col = df.columns[-1]  # 'churn'
print(f"üéØ Target: '{target_col}'")

def safe_churn_target(series):
    """Convert a churn label column into a 0/1 integer Series.

    Labels are compared case-insensitively after stripping whitespace. A row
    is marked as churn (1) when its label is one of ``'1'``, ``'yes'``,
    ``'true'``, ``'y'`` or when it parses as the number 1 (for example the
    float ``1.0``). Everything else, including missing values and
    unrecognised labels, is marked as 0.

    Parameters
    ----------
    series : pandas.Series
        Raw target column (strings, booleans, ints or floats).

    Returns
    -------
    pandas.Series
        Integer Series of 0/1 values sharing the index of ``series``.
    """
    normalized = series.astype(str).str.strip().str.lower()
    is_positive = normalized.isin(['1', 'yes', 'true', 'y'])

    # A float-typed column stringifies 1.0 as '1.0', which the token list above
    # does not contain; without this numeric check every churner in such a
    # column would be encoded as 0.
    as_number = pd.to_numeric(normalized, errors='coerce')
    is_positive = is_positive | as_number.eq(1).fillna(False).astype(bool)

    churn_clean = pd.Series(0, index=series.index)
    churn_clean[is_positive] = 1
    return churn_clean.astype(int)

# Encode the target column as 0/1 integers.
y = safe_churn_target(df[target_col])
# np.bincount(y) prints the per-class counts (index 0 = no churn, index 1 =
# churn); y.mean() is the share of rows encoded as 1.
print(f"‚úÖ Churn distribution: {np.bincount(y)} ({y.mean():.1%} churn rate)")

üéØ Target: 'churn'
‚úÖ Churn distribution: [117 883] (88.3% churn rate)


In [7]:
# 3. E-COMMERCE SPECIFIC features (your exact columns)
# Identifier and target columns must never be used as model inputs.
exclude_cols = ['customerid', 'customer_id', target_col]

features = [
    'age', 'gender', 'tenure', 'monthlycharges',
    'contracttype', 'internetservice', 'totalcharges', 'techsupport'
]

# Keep only the candidate features that are present in the cleaned frame and
# are not on the exclusion list (previously the list was defined but unused).
available_features = [
    col for col in features
    if col in ecommerce_df.columns and col not in exclude_cols
]
print(f"\nüìä E-Commerce features ({len(available_features)}): {available_features}")

X = ecommerce_df[available_features].copy()

# Split columns by dtype. Everything that is not numeric is treated as
# categorical, so columns with a non-object, non-numeric dtype (bool, string
# dtype, ...) are not silently left out of both lists and then dropped by the
# ColumnTransformer.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = [col for col in X.columns if col not in numeric_features]

# Fill missing values: column mean for numerics, the literal 'unknown' for
# categoricals.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split performed in a later cell, so test rows influence the
# imputed values. Consider moving imputation into the fitted pipeline.
X[numeric_features] = X[numeric_features].fillna(X[numeric_features].mean())
X[categorical_features] = X[categorical_features].fillna('unknown')

print(f"üìà Numeric ({len(numeric_features)}): {numeric_features}")
print(f"üî§ Categorical ({len(categorical_features)}): {categorical_features}")


üìä E-Commerce features (8): ['age', 'gender', 'tenure', 'monthlycharges', 'contracttype', 'internetservice', 'totalcharges', 'techsupport']
üìà Numeric (4): ['age', 'tenure', 'monthlycharges', 'totalcharges']
üî§ Categorical (4): ['gender', 'contracttype', 'internetservice', 'techsupport']


In [8]:
 #4. E-Commerce optimized pipeline
# Imputation lives inside the pipeline so that the persisted model can score
# raw rows containing missing values, and so that imputation statistics are
# learned from the training fold only when the pipeline is fitted.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# No `drop='first'`: combined with handle_unknown='ignore' an unseen category
# is encoded as all zeros, which is exactly the encoding of the dropped
# baseline category, so the two would be indistinguishable. A tree ensemble
# does not need the collinearity workaround anyway.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Random forest with class re-weighting to compensate for the skewed target;
# the seed is fixed so repeated runs give the same model.
ecommerce_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])


In [9]:
# 5. Hold out 20% of the rows for testing, preserving the class ratio
#    (stratified on y) and using a fixed seed.
_split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = _split_parts

# Sanity check: sample count and churn share in each partition.
print(f"\nüîÑ Training: {X_train.shape[0]} samples")
print(f"Train churn: {y_train.mean():.1%}, Test churn: {y_test.mean():.1%}")

# Fit preprocessing and classifier on the training partition only.
ecommerce_pipeline.fit(X_train, y_train)


üîÑ Training: 800 samples
Train churn: 88.2%, Test churn: 88.5%


In [10]:
# 6. Evaluation on the held-out test partition
# NOTE(review): the recorded run reports accuracy and ROC-AUC of exactly 1.000.
# Perfect held-out scores usually mean the target is a deterministic function
# of a feature or that information leaked into the inputs -- investigate
# before trusting or deploying this model.
y_pred = ecommerce_pipeline.predict(X_test)

# ROC-AUC needs probability estimates and both classes present in y_test;
# it stays None when it cannot be computed.
auc_score = None
try:
    proba_output = ecommerce_pipeline.predict_proba(X_test)
    if (proba_output.shape[1] == 2 and len(np.unique(y_test)) == 2):
        auc_score = roc_auc_score(y_test, proba_output[:, 1])
    else:
        print("‚ö†Ô∏è ROC-AUC skipped (single class detected)")
except Exception as e:
    # Best effort: a failure here should not prevent the remaining metrics
    # from being reported.
    print(f"‚ö†Ô∏è ROC-AUC error: {str(e)[:50]}...")

accuracy = ecommerce_pipeline.score(X_test, y_test)

print(f"\nüìà E-COMMERCE MODEL PERFORMANCE")
print(f"‚úÖ Accuracy:     {accuracy:.3f}")
# Compare against None explicitly: a legitimate AUC of 0.0 is falsy and would
# otherwise be silently omitted from the report.
if auc_score is not None:
    print(f"‚úÖ ROC-AUC:     {auc_score:.3f}")

print("\nüìä Classification Report:")
print(classification_report(y_test, y_pred))


üìà E-COMMERCE MODEL PERFORMANCE
‚úÖ Accuracy:     1.000
‚úÖ ROC-AUC:     1.000

üìä Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00       177

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [12]:
# 7. Persist the fitted pipeline plus a small metadata bundle describing the
#    inputs it expects and the scores it achieved on the test partition.
pipeline_info = {
    'features': available_features,
    'numeric_features': numeric_features,
    'categorical_features': categorical_features,
    'target_col': target_col,
    'performance': {'accuracy': accuracy, 'auc': auc_score},
}

joblib.dump(ecommerce_pipeline, "ecommerce_churn_pipeline.pkl")
joblib.dump(pipeline_info, "ecommerce_pipeline_info.pkl")

# Confirm which files were written.
print(f"\nüíæ SAVED:")
print("‚úÖ ecommerce_churn_pipeline.pkl")
print("‚úÖ ecommerce_pipeline_info.pkl")


üíæ SAVED:
‚úÖ ecommerce_churn_pipeline.pkl
‚úÖ ecommerce_pipeline_info.pkl
