In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os

In [13]:
import pandas as pd

# Load the dataset
data_path ='C://Users//Shaban Hassan//OneDrive//Desktop//internship//churn.csv'
df = pd.read_csv(data_path)

# Show first few rows
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [15]:
# Basic info
df.info()

# Check for nulls
print("\nMissing values:")
print(df.isnull().sum())

# Optional: check unique values of target
print("\nTarget distribution:")
print(df['Churn'].value_counts())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [3]:

# Load dataset
DATA_PATH = 'C://Users//Shaban Hassan//OneDrive//Desktop//internship//churn.csv'
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset '{DATA_PATH}' not found. Please add it to the project directory.")
data = pd.read_csv(DATA_PATH)

# Example: Assume 'Churn' is the target column, and all others are features
X = data.drop('Churn', axis=1)
y = data['Churn']

# Identify categorical and numerical columns
dtype_map = X.dtypes
categorical_cols = dtype_map[dtype_map == 'object'].index.tolist()
numerical_cols = dtype_map[dtype_map != 'object'].index.tolist()

# Preprocessing for numerical data
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# Define models to try
models = {
    'logreg': LogisticRegression(max_iter=1000),
    'rf': RandomForestClassifier()
}

# Define parameter grid for GridSearchCV
param_grid = [
    {
        'classifier': [LogisticRegression(max_iter=1000)],
        'classifier__C': [0.1, 1.0, 10.0]
    },
    {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 10, 20]
    }
]

# Create the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())  # Placeholder, will be set by GridSearchCV
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Evaluate on test set
y_pred = grid_search.predict(X_test)
print("Test set accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Export the best pipeline
joblib.dump(grid_search.best_estimator_, 'churn_best_pipeline.joblib')
print("Best pipeline exported to 'churn_best_pipeline.joblib'") 

Best parameters: {'classifier': LogisticRegression(max_iter=1000), 'classifier__C': 0.1}
Best cross-validation accuracy: 0.8024
Test set accuracy: 0.7984386089425124
              precision    recall  f1-score   support

          No       0.84      0.89      0.87      1035
         Yes       0.64      0.54      0.59       374

    accuracy                           0.80      1409
   macro avg       0.74      0.72      0.73      1409
weighted avg       0.79      0.80      0.79      1409

Best pipeline exported to 'churn_best_pipeline.joblib'


In [5]:
import joblib

# Load the pipeline
pipeline = joblib.load('churn_best_pipeline.joblib')

# Print the pipeline steps
print("Pipeline steps:")
for name, step in pipeline.named_steps.items():
    print(f"{name}: {step}")

# Show the best model and its parameters
print("\nBest model (classifier):")
print(pipeline.named_steps['classifier'])
print("\nModel parameters:")
print(pipeline.named_steps['classifier'].get_params()) 

Pipeline steps:
preprocessor: ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['SeniorCitizen', 'tenure', 'MonthlyCharges']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['customerID', 'gender', 'Partner',
                                  'Dependents', 'PhoneService', 'MultipleLines',
                                  'InternetService', 'OnlineSecurity',
                                  'OnlineBackup', 'DeviceProtection',
                                  'TechSupport', 'St