In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector

In [None]:
# Load the raw customer churn table; the path is resolved relative to the working directory.
csv_path = 'CustomerChurn.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five raw rows to check that the file parsed as expected.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:

# Print each column's non-null count and dtype to spot columns that need cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
# Convert TotalCharges to a numeric dtype. errors='coerce' turns any value
# that cannot be parsed as a number into NaN instead of raising.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

In [None]:
# Drop every row that still contains a missing value (e.g. TotalCharges entries
# that could not be parsed as numbers) and remove customerID, which is an
# identifier rather than a predictive feature.
# The frame is rebound instead of mutated in place, and errors='ignore' keeps
# the cell re-runnable: executing it a second time no longer raises KeyError
# once customerID has already been removed.
df = df.dropna().drop(columns='customerID', errors='ignore')

In [None]:
# One-hot encode the categorical columns so the model receives numeric input.
# drop_first=True removes one level per category to avoid redundant columns.
# dtype=int is applied by get_dummies to the generated dummy columns only, so
# the continuous columns (MonthlyCharges, TotalCharges) keep their decimal
# values. Casting the whole frame with .astype(int) would truncate them
# (29.85 -> 29).
df = pd.get_dummies(df, drop_first=True, dtype=int)

In [None]:

# Separate the target (Churn_Yes) from the predictor columns.
target_column = 'Churn_Yes'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Preview the first five rows of the encoded frame that X and y were taken from.
df.head(n=5)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,1,29,29,0,1,0,0,1,0,...,0,0,0,0,0,1,0,1,0,0
1,0,34,56,1889,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,2,53,108,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,1
3,0,45,42,1840,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,0,2,70,151,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,1


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
# NOTE(review): the split is not stratified on y; consider stratify=y if the
# churn classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits so the test data never influences
# the fit.
scaler = StandardScaler().fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

Original model without feature selection (all features)


In [None]:
# Baseline: logistic regression trained on every scaled feature.
model_org = LogisticRegression().fit(X_train_sc, y_train)
y_pred = model_org.predict(X_test_sc)

# Accuracy on the held-out test split (displayed as the cell output).
accuracy_score(y_test, y_pred)

0.7882018479033405

select 5 features

In [None]:

# Sequential feature selection down to 5 features, using a logistic regression
# as the estimator that scores each candidate subset.
model_sfs = LogisticRegression()
sfs = SequentialFeatureSelector(
    estimator=model_sfs,
    n_features_to_select=5,
    direction="forward",  # same as the library default, spelled out for readers
)
sfs.fit(X_train_sc, y_train)

In [None]:
# Map the positions of the selected features back to their column labels in X
# (the scaled arrays no longer carry column names).
X.columns[sfs.get_support(indices=True)]

Index(['tenure', 'PhoneService_Yes', 'InternetService_Fiber optic',
       'StreamingMovies_Yes', 'PaymentMethod_Electronic check'],
      dtype='object')

In [None]:
# Boolean mask over the columns: True for each feature kept by the selector.
selected_features_mask = sfs.get_support()

# Keep only the selected columns in both scaled splits.
X_train_selected, X_test_selected = (
    scaled_split[:, selected_features_mask]
    for scaled_split in (X_train_sc, X_test_sc)
)

In [None]:
# Retrain the logistic regression on the 5 selected features only.
model_sfs_selected = LogisticRegression().fit(X_train_selected, y_train)
y_pred_sfs = model_sfs_selected.predict(X_test_selected)

# Test accuracy with 5 features (displayed as the cell output).
accuracy_score(y_test, y_pred_sfs)

0.7903340440653873

select 20 features

In [None]:
# Sequential feature selection down to 20 features; reuses the model_sfs
# estimator defined in an earlier cell.
sfs_1 = SequentialFeatureSelector(
    estimator=model_sfs,
    n_features_to_select=20,
    direction="forward",  # same as the library default, spelled out for readers
)
sfs_1.fit(X_train_sc, y_train)

In [None]:
# Boolean mask over the columns: True for each of the 20 features kept by sfs_1.
selected_features_mask_1 = sfs_1.get_support()

# Keep only the selected columns in both scaled splits.
X_train_selected_1, X_test_selected_1 = (
    scaled_split[:, selected_features_mask_1]
    for scaled_split in (X_train_sc, X_test_sc)
)

In [None]:
# Retrain the logistic regression on the 20 selected features only.
model_sfs_selected_1 = LogisticRegression().fit(X_train_selected_1, y_train)
y_pred_sfs_1 = model_sfs_selected_1.predict(X_test_selected_1)

# Test accuracy with 20 features (displayed as the cell output).
accuracy_score(y_test, y_pred_sfs_1)

0.7903340440653873

select 15 features

In [None]:
# Sequential feature selection down to 15 features; reuses the model_sfs
# estimator defined in an earlier cell.
sfs_2 = SequentialFeatureSelector(
    estimator=model_sfs,
    n_features_to_select=15,
    direction="forward",  # same as the library default, spelled out for readers
)
sfs_2.fit(X_train_sc, y_train)

In [None]:
# Boolean mask over the columns: True for each of the 15 features kept by sfs_2.
# Uses the public get_support() accessor, like the 5- and 20-feature cells,
# rather than reading the fitted support_ attribute directly.
feature_selection_mask_2 = sfs_2.get_support()

# Keep only the selected columns in both scaled splits.
X_train_selected_2 = X_train_sc[:, feature_selection_mask_2]
X_test_selected_2 = X_test_sc[:, feature_selection_mask_2]

In [None]:
# Retrain the logistic regression on the 15 selected features only.
model_sfs_selected_2 = LogisticRegression().fit(X_train_selected_2, y_train)
y_pred_sfs_2 = model_sfs_selected_2.predict(X_test_selected_2)

# Test accuracy with 15 features (displayed as the cell output).
accuracy_score(y_test, y_pred_sfs_2)

0.7974413646055437