# Task 2: End-to-End ML Pipeline with Scikit-learn Pipeline API



**Objective:**
Build a reusable and production-ready machine learning pipeline for predicting customer churn.

**Dataset:**
Telco Churn Dataset

**Install Required Libraries**

In [None]:
!pip install scikit-learn joblib panda

Collecting panda
  Downloading panda-0.3.1.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: panda
  Building wheel for panda (setup.py) ... [?25l[?25hdone
  Created wheel for panda: filename=panda-0.3.1-py3-none-any.whl size=7239 sha256=93dc397b1a512b3b0b2bd3b7fec825c41988231ab8681d620c9464e2849549b8
  Stored in directory: /root/.cache/pip/wheels/98/41/5b/6ca54e0b6a35e1b7248c12f56fcb753dfb7717fefaa0fb45f5
Successfully built panda
Installing collected packages: panda
Successfully installed panda-0.3.1


**Import Libraries**

In [None]:
import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


**Load Dataset**

In [None]:
# Location of the raw Telco churn CSV, named once so it is easy to repoint.
DATA_PATH = "//Telco Customer Churn.csv"

# `pd` is already provided by the notebook's import cell, so it is not
# re-imported here.
df = pd.read_csv(DATA_PATH)

# Preview the first rows. NOTE: the file carries a saved row index, which
# pandas loads as an "Unnamed: 0" column.
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


**Basic Cleaning**

In [None]:
# Parse TotalCharges as numeric; entries that are not valid numbers become NaN
# (errors="coerce") and are removed together with any other incomplete rows.
# A new frame is derived so the raw `df` loaded earlier is left untouched.
df_clean = df.assign(
    TotalCharges=pd.to_numeric(df["TotalCharges"], errors="coerce")
).dropna()

# Target variable: 1 for "Yes" (churned), 0 for "No".
y = df_clean["Churn"].map({"Yes": 1, "No": 0})

# Features: everything except the target, the customer identifier, and any
# "Unnamed: N" column. The latter is a row index that was written into the CSV;
# left in place it would be scaled and fed to the model as a numeric feature.
index_like_cols = [
    col for col in df_clean.columns if str(col).startswith("Unnamed:")
]
X = df_clean.drop(columns=["Churn", "customerID", *index_like_cols])


**Separate Numeric & Categorical Columns**

In [None]:
# Partition the feature columns by dtype *family* instead of exact dtype names.
# The ColumnTransformer drops every column it is not told about, so matching
# only "int64"/"float64"/"object" would silently lose columns stored as e.g.
# int32, bool, category or a dedicated string dtype.
numeric_cols = X.select_dtypes(include="number").columns
# Everything that is not numeric is treated as categorical and one-hot encoded.
categorical_cols = X.select_dtypes(exclude="number").columns


**Preprocessing**

In [None]:
# Numeric columns are standardised (zero mean, unit variance).
numeric_step = ("num", StandardScaler(), numeric_cols)

# Categorical columns are one-hot encoded; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_cols,
)

# Apply each transformer to its own column group and concatenate the results.
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


**Train-Test Split**

In [None]:
# Hold out 20% of the rows for final evaluation, with a fixed seed for
# reproducibility. Stratifying on the target keeps the churn / no-churn ratio
# the same in both splits, so the test score is not skewed by an unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


**Logistic Regression Pipeline**

In [None]:
# Preprocessing followed by a logistic-regression classifier. max_iter is
# raised to 1000 to give the solver more iterations to converge.
logistic_steps = [
    ("preprocessing", preprocessor),
    ("model", LogisticRegression(max_iter=1000)),
]
logistic_pipeline = Pipeline(logistic_steps)

# Fit on the training split; as the last expression, the fitted pipeline is
# displayed as the cell output.
logistic_pipeline.fit(X_train, y_train)


**Evaluate Logistic Regression**


In [None]:
# Predict on the held-out split and report plain accuracy.
y_pred = logistic_pipeline.predict(X_test)
logistic_accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", logistic_accuracy)


Logistic Regression Accuracy: 0.7874911158493249


**Random Forest Pipeline**

In [None]:
# Preprocessing followed by a random forest (seeded for reproducibility).
# The preprocessor is cloned so this pipeline owns an independent, unfitted
# copy: Pipeline.fit fits its steps in place, so sharing one ColumnTransformer
# instance with another pipeline would let fitting one of them silently refit
# the other's preprocessing step.
rf_pipeline = Pipeline(steps=[
    ("preprocessing", clone(preprocessor)),
    ("model", RandomForestClassifier(random_state=42)),
])

# Fit on the training split; the fitted pipeline is shown as the cell output.
rf_pipeline.fit(X_train, y_train)


**Evaluate Random Forest**

In [None]:
# Predict on the held-out split and report plain accuracy.
rf_pred = rf_pipeline.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print("Random Forest Accuracy:", rf_accuracy)


Random Forest Accuracy: 0.7775408670931059


**Simple Hyperparameter Tuning**

In [None]:
# Search space for the forest. The "model__" prefix routes each parameter to
# the pipeline step named "model".
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[None, 10],
)

# Exhaustive search over the grid with 3-fold cross-validation, scored by
# accuracy.
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    cv=3,
    scoring="accuracy",
)

# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)


**Model Result**

In [None]:
# best_score_ is the mean cross-validated accuracy of the winning parameter
# combination on the training data; it is not a test-set score.
print("Best Accuracy:", grid_search.best_score_)

# Pipeline refit on the full training split with the best parameters.
best_model = grid_search.best_estimator_

# Show which configuration won, and score the tuned model on the held-out
# split so it can be compared like-for-like with the untuned models.
print("Best Params:", grid_search.best_params_)
print(
    "Tuned Random Forest Test Accuracy:",
    accuracy_score(y_test, best_model.predict(X_test)),
)


Best Accuracy: 0.8016


**Save Model (Production Ready)**

In [None]:
# Persist the whole tuned pipeline (preprocessing + model) to a single file so
# that raw feature rows can be scored after loading it.
joblib.dump(value=best_model, filename="churn_pipeline.pkl")


['churn_pipeline.pkl']

**Load & Predict**

In [None]:
# Reload the persisted pipeline. joblib files are pickle-based, so only load
# files from a trusted source.
loaded_model = joblib.load(filename="churn_pipeline.pkl")

# Smoke test: score the first five held-out rows with the reloaded pipeline.
sample_rows = X_test.head(5)
loaded_model.predict(sample_rows)


array([0, 0, 1, 0, 0])

In [None]:
# Offer the saved pipeline as a browser download when running in Google Colab.
# The google.colab package only exists inside Colab, so the import is guarded
# to keep the notebook runnable end-to-end in other environments.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download("churn_pipeline.pkl")
else:
    print("Not running in Google Colab; the model file is churn_pipeline.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>