In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib

In [16]:
# Read the churn dataset from disk (point the path at your own CSV if needed).
df = pd.read_csv('Subscription_Service_Churn_Dataset.csv')

# Target vector: the 'Churn' column.
y = df['Churn']

# Feature frame: every column except the target.
X = df.drop(columns=['Churn'])

In [17]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Categorical feature columns: one-hot encoded by the preprocessor below.
categorical_cols = ['SubscriptionType', 'PaymentMethod', 'PaperlessBilling',
                    'ContentType', 'MultiDeviceAccess', 'DeviceRegistered',
                    'GenrePreference', 'Gender', 'ParentalControl', 'SubtitlesEnabled']

# Numeric feature columns: mean-imputed and standardized by the preprocessor below.
numeric_cols = ['AccountAge', 'MonthlyCharges', 'TotalCharges',
                'ViewingHoursPerWeek', 'AverageViewingDuration',
                'ContentDownloadsPerMonth', 'UserRating',
                'SupportTicketsPerMonth', 'WatchlistSize']

# Drop the CustomerID column from both splits (it appears in neither feature list).
# errors='ignore' keeps this cell re-runnable: X_train / X_test are rebound here, so on a
# second execution the column is already gone and a plain drop() would raise KeyError.
X_train = X_train.drop(columns=['CustomerID'], errors='ignore')
X_test = X_test.drop(columns=['CustomerID'], errors='ignore')

# Column-wise preprocessing:
#   'num' - fill missing values with the column mean, then standardize
#   'cat' - one-hot encode; categories unseen during fit are ignored at transform time
# Any column not named in either list is discarded (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='drop'
)

# Fit on the training split only, then apply the same fitted transformation to the
# test split so no test-set statistics are used during fitting.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)


In [19]:
# Standardize every column of the preprocessed matrix. This covers the one-hot columns as
# well as the numeric columns, which the preprocessor's own StandardScaler already scaled.
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train_transformed)
X_train_scaled = scaler.transform(X_train_transformed)
X_test_scaled = scaler.transform(X_test_transformed)

In [20]:
# Fit a logistic-regression classifier (library default hyperparameters) on the
# scaled training features.
model = LogisticRegression()
model.fit(X=X_train_scaled, y=y_train)

In [21]:

# Score the held-out split: predict labels, then report the fraction predicted correctly.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8031088082901554


In [22]:
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Persist the fitted artifacts needed to score new data. At inference time they must be
# applied in the same order used during training: preprocessor -> scaler -> model.

# Fitted ColumnTransformer (imputation, numeric scaling, one-hot encoding).
joblib.dump(preprocessor, 'preprocessor.pkl')

# Second-stage StandardScaler. Save the instance that was already fitted on
# X_train_transformed and used to produce X_train_scaled, rather than fitting a fresh one
# here: the saved object is then exactly the one the model was trained against, and the
# `scaler` name is not silently rebound by this cell.
joblib.dump(scaler, 'scaler.pkl')

# Trained LogisticRegression classifier.
joblib.dump(model, 'simple_churn_model.pkl')


['simple_churn_model.pkl']