In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import joblib

# Sanity check: reaching this line means every import above resolved.
print("All imports successful")

All imports successful


In [23]:
# Location of the raw Telco churn CSV.
# NOTE(review): absolute, machine-specific path -- edit this one constant to
# point at your own copy of the file before running the notebook.
DATA_PATH = "C:/Users/ACER/OneDrive/Desktop/telco churn data.csv"

df = pd.read_csv(DATA_PATH)

# Standardize column names: trim whitespace, lowercase, spaces -> underscores,
# and strip dots. regex=False makes every pattern a literal string, so '.'
# always means a literal dot instead of depending on the pandas version's
# default for `regex`.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('.', '', regex=False)
)

print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
# The label is taken to be the last column of the file.
print(f"Churn distribution:\n{df.iloc[:,-1].value_counts()}")


Dataset shape: (7043, 21)
Columns: ['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents', 'tenure', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod', 'monthlycharges', 'totalcharges', 'churn']
Churn distribution:
churn
No     5174
Yes    1869
Name: count, dtype: int64


In [24]:
# Preview the first three rows to eyeball the normalized column names.
df.head(n=3)


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [25]:
# Exact Telco features (standard dataset), using the normalized column names.
TELCO_FEATURES = [
    'seniorcitizen', 'tenure', 'gender', 'partner', 'dependents',
    'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity',
    'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
    'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod',
    'monthlycharges', 'totalcharges'
]

# Target label spellings (compared after strip + lowercase).
POSITIVE_LABELS = ['yes', '1', 'true']
NEGATIVE_LABELS = ['no', '0', 'false']

# Keep only the expected features that are actually present in this file.
available_features = [f for f in TELCO_FEATURES if f in df.columns]
target_col = df.columns[-1]  # assumes the label is the last column (usually 'churn')

print(f"Using {len(available_features)} features: {available_features}")
print(f"Target column: {target_col}")

# Prepare X and y
X = df[available_features].copy()
y_raw = df[target_col].astype(str).str.strip().str.lower()

# Anything that is not a positive label is encoded as 0. Surface labels that
# are neither a known positive nor a known negative spelling (for example
# missing values, which astype(str) turns into text) so they are not silently
# counted as "no churn".
unrecognized = y_raw[~y_raw.isin(POSITIVE_LABELS + NEGATIVE_LABELS)]
if not unrecognized.empty:
    print(
        f"WARNING: {len(unrecognized)} rows have unrecognized target labels "
        f"{sorted(unrecognized.unique())}; they are encoded as 0"
    )

# Convert churn to binary (0/1) in one vectorized step.
y = y_raw.isin(POSITIVE_LABELS).astype(int)
y.name = None  # keep the encoded target Series unnamed

print(f"Churn distribution: {np.bincount(y)} ({y.mean():.1%} positive)")


Using 19 features: ['seniorcitizen', 'tenure', 'gender', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod', 'monthlycharges', 'totalcharges']
Target column: churn
Churn distribution: [5174 1869] (26.5% positive)


In [26]:
# Columns routed through the numeric branch of the preprocessor.
NUMERIC_FEATURES = ['seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges']

# Every remaining selected feature is treated as categorical; the order of
# `available_features` is preserved.
CATEGORICAL_FEATURES = []
for feature_name in available_features:
    if feature_name in NUMERIC_FEATURES:
        continue
    CATEGORICAL_FEATURES.append(feature_name)

# Summary (only the first five categorical names are shown).
print(f"Numeric features ({len(NUMERIC_FEATURES)}): {NUMERIC_FEATURES}")
print(f"Categorical features ({len(CATEGORICAL_FEATURES)}): {CATEGORICAL_FEATURES[:5]}...")


Numeric features (4): ['seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges']
Categorical features (15): ['gender', 'partner', 'dependents', 'phoneservice', 'multiplelines']...


In [27]:
# Numeric branch: median-impute missing values, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
# NOTE(review): with drop='first', a category unseen during fit is encoded as
# all zeros, which looks the same as the dropped first category -- confirm
# this is acceptable for the deployed app.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each feature group to its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, NUMERIC_FEATURES),
    ('cat', categorical_transformer, CATEGORICAL_FEATURES),
])

# Classifier settings, gathered in one place.
forest_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# Complete pipeline: preprocessing followed by the random forest.
telco_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(**forest_params)),
])

print("Pipeline built successfully")


Pipeline built successfully


In [28]:
# totalcharges can arrive as text; coerce it to numbers, turning anything
# unparseable into NaN and then into 0.
if 'totalcharges' in X.columns:
    charges_numeric = pd.to_numeric(X['totalcharges'], errors='coerce')
    X['totalcharges'] = charges_numeric.fillna(0)

# Stratified 80/20 hold-out split with a fixed seed.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Fit preprocessing and classifier together on the training portion only.
telco_pipeline.fit(X_train, y_train)
print("Training completed")


Training set: (5634, 19)
Test set: (1409, 19)
Training completed


In [29]:
# Hard class predictions feed accuracy and the per-class report.
y_pred = telco_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Column 1 of predict_proba is used as the score for ROC-AUC
# (presumably the churn=1 class, given the 0/1 target -- verify via classes_).
y_pred_proba = telco_pipeline.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)

# Report
print(f"Accuracy: {accuracy:.3f}")
print(f"ROC-AUC: {auc:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.767
ROC-AUC: 0.841

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.77      0.83      1035
           1       0.54      0.75      0.63       374

    accuracy                           0.77      1409
   macro avg       0.72      0.76      0.73      1409
weighted avg       0.80      0.77      0.78      1409



In [30]:
# Persist the fitted pipeline (preprocessing + classifier) as one artifact.
joblib.dump(telco_pipeline, "telco_churn_pipeline.pkl")

# Companion metadata for Streamlit: feature lists, target name and the
# hold-out scores computed in the evaluation cell.
model_scores = {'accuracy': accuracy, 'roc_auc': auc}
metadata = dict(
    features=available_features,
    numeric_features=NUMERIC_FEATURES,
    categorical_features=CATEGORICAL_FEATURES,
    target_col=target_col,
    performance=model_scores,
)
joblib.dump(metadata, "telco_metadata.pkl")

# Confirmation message, one line per entry.
print("\n".join([
    "Files saved:",
    "- telco_churn_pipeline.pkl",
    "- telco_metadata.pkl",
    "Ready for Streamlit deployment",
]))


Files saved:
- telco_churn_pipeline.pkl
- telco_metadata.pkl
Ready for Streamlit deployment
