In [1]:
# 1. IMPORTS
# ============================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [2]:
df = pd.read_csv("C:/Users/ACER/OneDrive/Desktop/subscription churn dataset.csv")

In [20]:
df.head(10)

Unnamed: 0,accountage,monthlycharges,totalcharges,subscriptiontype,paymentmethod,paperlessbilling,contenttype,multideviceaccess,deviceregistered,viewinghoursperweek,...,contentdownloadspermonth,genrepreference,userrating,supportticketspermonth,gender,watchlistsize,parentalcontrol,subtitlesenabled,customerid,churn
0,20,11.055215,221.104302,Premium,Mailed check,No,Both,No,Mobile,36.758104,...,10,Sci-Fi,2.176498,4,Male,3,No,No,CB6SXPNVZA,0
1,57,5.175208,294.986882,Basic,Credit card,Yes,Movies,No,Tablet,32.450568,...,18,Action,3.478632,8,Male,23,No,Yes,S7R2G87O09,0
2,73,12.106657,883.785952,Basic,Mailed check,Yes,Movies,No,Computer,7.39516,...,23,Fantasy,4.238824,6,Male,1,Yes,Yes,EASDC20BDT,0
3,32,7.263743,232.439774,Basic,Electronic check,No,TV Shows,No,Tablet,27.960389,...,30,Drama,4.276013,2,Male,24,Yes,Yes,NPF69NT69N,0
4,57,16.953078,966.325422,Premium,Electronic check,Yes,TV Shows,No,TV,20.083397,...,20,Comedy,3.61617,4,Female,0,No,No,4LGYPK7VOL,0
5,113,7.295744,824.419081,Premium,Mailed check,Yes,Both,No,Mobile,21.67829,...,35,Comedy,3.721134,8,Female,2,Yes,Yes,JY5HS0GWHW,0
6,38,12.340675,468.945639,Premium,Bank transfer,No,Both,No,Computer,36.512761,...,28,Action,4.090868,9,Female,20,No,Yes,79XSO6P5O3,0
7,25,7.24755,181.188753,Standard,Electronic check,Yes,TV Shows,No,TV,16.355816,...,10,Fantasy,3.410221,2,Female,22,No,No,2LDC9AQ3C5,0
8,26,19.803233,514.88405,Standard,Bank transfer,No,Movies,No,Tablet,8.202929,...,28,Fantasy,2.679986,0,Male,5,Yes,Yes,74DURHL3Y8,1
9,14,18.842934,263.80108,Standard,Bank transfer,No,Movies,No,Computer,38.560694,...,0,Comedy,2.993441,0,Male,18,No,No,CY8S2R3A1T,0


In [9]:
print(f"‚úÖ Dataset loaded: {df.shape}")
print(f"üìã ACTUAL COLUMNS ({len(df.columns)} total):")
print(list(df.columns))
print(f"\nüîç Churn column found: {df.columns[-1]}")  # Last column is churn
print("Churn unique values:", df.iloc[:, -1].unique())

‚úÖ Dataset loaded: (243787, 21)
üìã ACTUAL COLUMNS (21 total):
['AccountAge', 'MonthlyCharges', 'TotalCharges', 'SubscriptionType', 'PaymentMethod', 'PaperlessBilling', 'ContentType', 'MultiDeviceAccess', 'DeviceRegistered', 'ViewingHoursPerWeek', 'AverageViewingDuration', 'ContentDownloadsPerMonth', 'GenrePreference', 'UserRating', 'SupportTicketsPerMonth', 'Gender', 'WatchlistSize', 'ParentalControl', 'SubtitlesEnabled', 'CustomerID', 'Churn']

üîç Churn column found: Churn
Churn unique values: [0 1]


In [11]:
 #2. SAFE column cleaning (handles any format)
def safe_clean_columns(df):
    """Clean column names without breaking"""
    new_cols = []
    for col in df.columns:
        clean = str(col).strip().lower().replace(" ", "_").replace(".", "")
        new_cols.append(clean)
    df.columns = new_cols
    return df

subscription_df = safe_clean_columns(df)
print(f"\n‚úÖ Cleaned columns: {list(df.columns)[:5]}...")


‚úÖ Cleaned columns: ['accountage', 'monthlycharges', 'totalcharges', 'subscriptiontype', 'paymentmethod']...


In [12]:
# 3. Robust target (always last column)
target_col = subscription_df.columns[-1]
print(f"üéØ Target column: '{target_col}'")

def safe_churn_target(series):
    series = series.astype(str).str.strip().str.lower()
    churn_clean = pd.Series(0, index=series.index)
    churn_clean[series.isin(['1', 'yes', 'true', 'y'])] = 1
    return churn_clean.astype(int)

y = safe_churn_target(subscription_df[target_col])
print(f"‚úÖ Churn distribution: {np.bincount(y)}")

üéØ Target column: 'churn'
‚úÖ Churn distribution: [199605  44182]


In [13]:
exclude_cols = ['customerid', 'customer_id', target_col]
features = [col for col in subscription_df.columns 
           if col not in exclude_cols and subscription_df[col].dtype != 'object']

print(f"\nüìä Auto-detected {len(features)} features:")
print(features[:10], "..." if len(features) > 10 else "")

# Limit to best subscription features if too many
if len(features) > 15:
    # Prioritize subscription metrics
    priority_features = ['accountage', 'monthlycharges', 'totalcharges', 'viewinghours', 
                        'supporttickets', 'watchlistsize', 'downloads', 'userrating']
    features = [f for f in priority_features if f in subscription_df.columns] + features[:10]
    features = list(set(features))  # Remove duplicates

X = subscription_df[features].copy()

# Safe fillna
numeric_cols = X.select_dtypes(include=[np.number]).columns
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].mean())
object_cols = X.select_dtypes(include=['object']).columns
X[object_cols] = X[object_cols].fillna('unknown')



üìä Auto-detected 9 features:
['accountage', 'monthlycharges', 'totalcharges', 'viewinghoursperweek', 'averageviewingduration', 'contentdownloadspermonth', 'userrating', 'supportticketspermonth', 'watchlistsize'] 


In [14]:
# 5. Auto-detect numeric/categorical
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print(f"üìà Numeric ({len(numeric_features)}): {numeric_features}")
print(f"üî§ Categorical ({len(categorical_features)}): {categorical_features}")

üìà Numeric (9): ['accountage', 'monthlycharges', 'totalcharges', 'viewinghoursperweek', 'averageviewingduration', 'contentdownloadspermonth', 'userrating', 'supportticketspermonth', 'watchlistsize']
üî§ Categorical (0): []


In [15]:
# 6. Dynamic pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)
])

subscription_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=200, max_depth=12, class_weight='balanced', 
        random_state=42, n_jobs=-1
    ))
])

In [16]:
# 7. Train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nüîÑ Training: {X_train.shape}")
subscription_pipeline.fit(X_train, y_train)


üîÑ Training: (195029, 9)


In [18]:
# 8. Safe evaluation
y_pred = subscription_pipeline.predict(X_test)

# SAFE proba handling
try:
    proba_output = subscription_pipeline.predict_proba(X_test)
    if proba_output.shape[1] == 2 and len(np.unique(y_test)) == 2:
        y_pred_proba = proba_output[:, 1]
        auc_score = roc_auc_score(y_test, y_pred_proba)
    else:
        auc_score = None
        print("‚ö†Ô∏è ROC-AUC skipped (single class or proba issue)")
except Exception as e:
    auc_score = None
    print(f"‚ö†Ô∏è ROC-AUC error: {e}")

accuracy = subscription_pipeline.score(X_test, y_test)

print(f"\nüìà RESULTS")
print(f"‚úÖ Accuracy: {accuracy:.3f}")
if auc_score:
    print(f"‚úÖ ROC-AUC:  {auc_score:.3f}")

print("\nüìä Classification Report:")
print(classification_report(y_test, y_pred))

‚ö†Ô∏è ROC-AUC error: name 'roc_auc_score' is not defined

üìà RESULTS
‚úÖ Accuracy: 0.720

üìä Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.75      0.81     39921
           1       0.34      0.60      0.44      8837

    accuracy                           0.72     48758
   macro avg       0.62      0.67      0.62     48758
weighted avg       0.79      0.72      0.75     48758



In [19]:
joblib.dump(subscription_pipeline, "subscription_churn_pipeline.pkl")
joblib.dump({
    'features': features,
    'numeric_features': numeric_features,
    'categorical_features': categorical_features,
    'target_col': target_col
}, "subscription_pipeline_info.pkl")

print(f"\nüíæ SAVED: subscription_churn_pipeline.pkl")


üíæ SAVED: subscription_churn_pipeline.pkl
