# Recusrsive Feature Elimination

Recursively remove features to improve model's performance reduce computation cost, and reduce overfitting.

## Build a model on the entire set of predictors (features)

In [26]:
import pandas as pd
import numpy as np
import matplotlib as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [13]:
df = pd.read_csv('../../data_sets/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [14]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [15]:
# Clean up data

df.drop(['customerID'], axis='columns', inplace=True)

In [16]:

df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df = df.dropna(subset=["TotalCharges"])


In [17]:

df["TotalCharges"].dtype

dtype('float64')

In [18]:
# Dummy encoding
df = pd.get_dummies(df, drop_first=True)

In [19]:
df.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,False,True,False,True,False,False
1,0,34,56.95,1889.5,True,False,False,True,False,False,...,False,False,False,True,False,False,False,False,True,False
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,False,True,False,False,True,True
3,0,45,42.3,1840.75,True,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
4,0,2,70.7,151.65,False,False,False,True,False,False,...,False,False,False,False,False,True,False,True,False,True


In [21]:
# y-variable = churn
# X variables = everything else

# Drop the obvious domain knowledge features.

X = df.drop(columns=['Churn_Yes'])

y = df['Churn_Yes']

In [22]:
df.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,False,True,False,True,False,False
1,0,34,56.95,1889.5,True,False,False,True,False,False,...,False,False,False,True,False,False,False,False,True,False
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,False,True,False,False,True,True
3,0,45,42.3,1840.75,True,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
4,0,2,70.7,151.65,False,False,False,True,False,False,...,False,False,False,False,False,True,False,True,False,True


In [23]:
# What is the churn rate?

df.Churn_Yes.value_counts()/len(df)*100

Churn_Yes
False    73.421502
True     26.578498
Name: count, dtype: float64

In [24]:
# Train Test Split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [25]:
# Build original model
model_orig = LogisticRegression()
model_orig.fit(X_train, y_train)
y_pred_orig = model_orig.predict(X_test)

accuracy_orig = accuracy_score(y_test, y_pred_orig)
print('Accuracy of the base model is: ', round(accuracy_orig*100, 2))

Accuracy of the base model is:  79.24


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
# Apply chi2

chi2_selector = SelectKBest(chi2, k=5)
X_train_chi2 = chi2_selector.fit_transform(X_train, y_train)

X_test_chi2 = chi2_selector.transform(X_test)

In [28]:
# Build ch2 model
model_chi2 = LogisticRegression()
model_chi2.fit(X_train_chi2, y_train)
y_pred_chi2 = model_chi2.predict(X_test_chi2)

accuracy_chi2 = accuracy_score(y_test, y_pred_chi2)
print('Accuracy of the chi2 model is: ', round(accuracy_chi2*100, 2))

Accuracy of the chi2 model is:  78.2


In [30]:
# ✅ Industry-standard: tune k with cross-validation (no copy/paste guessing)

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Pipeline: feature selection happens inside CV folds (prevents leakage)
pipe = Pipeline(steps=[
    ("chi2", SelectKBest(score_func=chi2)),
    ("lr", LogisticRegression(max_iter=10000))
])

# Try a sensible range of k values (adjust based on your feature count)
param_grid = {
    "chi2__k": [5, 10, 20, 30, 50, 100]
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV accuracy:", round(grid.best_score_ * 100, 2), "%")

# Evaluate on test set using the best (k, model) found by CV
y_pred = grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test accuracy (best k):", round(accuracy * 100, 2), "%")




Best params: {'chi2__k': 30}
Best CV accuracy: 80.31 %
Test accuracy (best k): 79.53 %
