In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


from sklearn.ensemble import RandomForestClassifier



In [3]:
# Read the Excel workbook into a DataFrame and preview its first rows.
DATA_FILE = "telco_customer_churn_adapted_v2.xlsx"
df = pd.read_excel(DATA_FILE)
df.head()

Unnamed: 0,Customer ID,Tenure Months,Location,Device Class,Games Product,Music Product,Education Product,Call Center,Video Product,Use MyApp,Payment Method,Monthly Purchase (Thou. IDR),Churn Label,Longitude,Latitude,CLTV (Predicted Thou. IDR)
0,0,2,Jakarta,Mid End,Yes,Yes,No,No,No,No,Digital Wallet,70.005,Yes,106.816666,-6.2,4210.7
1,1,2,Jakarta,High End,No,No,No,No,No,No,Pulsa,91.91,Yes,106.816666,-6.2,3511.3
2,2,8,Jakarta,High End,No,No,Yes,No,Yes,Yes,Pulsa,129.545,Yes,106.816666,-6.2,6983.6
3,3,28,Jakarta,High End,No,No,Yes,Yes,Yes,Yes,Pulsa,136.24,Yes,106.816666,-6.2,6503.9
4,4,49,Jakarta,High End,No,Yes,Yes,No,Yes,Yes,Debit,134.81,Yes,106.816666,-6.2,6942.0


In [4]:
# Remove the identifier column from the working frame.
# Rebinding (rather than inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns=["Customer ID"], errors="ignore")

In [5]:
# List the column labels of the working frame.
df.columns

Index(['Tenure Months', 'Location', 'Device Class', 'Games Product',
       'Music Product', 'Education Product', 'Call Center', 'Video Product',
       'Use MyApp', 'Payment Method', 'Monthly Purchase (Thou. IDR)',
       'Churn Label', 'Longitude', 'Latitude', 'CLTV (Predicted Thou. IDR)'],
      dtype='object')

In [14]:
# Show each column's dtype; the object / non-object distinction is what the
# later select_dtypes calls use to separate categorical from numeric features.
df.dtypes

Tenure Months                     int64
Location                         object
Device Class                     object
Games Product                    object
Music Product                    object
Education Product                object
Call Center                      object
Video Product                    object
Use MyApp                        object
Payment Method                   object
Monthly Purchase (Thou. IDR)    float64
Churn Label                      object
Longitude                       float64
Latitude                        float64
CLTV (Predicted Thou. IDR)      float64
dtype: object

In [6]:
# Separate the label from the features; the two coordinate columns are also
# left out of the feature matrix.
target_col = "Churn Label"
y = df[target_col]
X = df.drop(columns=[target_col, "Longitude", "Latitude"])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Shapes of the four partitions, in the order they were returned.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((5634, 12), (1409, 12), (5634,), (1409,))

In [7]:
# Object-typed columns are treated as categorical features; every remaining
# column (kept in its original order) is treated as numeric.
cat_cols = list(X_train.select_dtypes(include="object").columns)
num_cols = [col for col in X_train.columns if col not in cat_cols]

In [8]:
# Display both feature-name lists to check the dtype-based split.
cat_cols, num_cols

(['Location',
  'Device Class',
  'Games Product',
  'Music Product',
  'Education Product',
  'Call Center',
  'Video Product',
  'Use MyApp',
  'Payment Method'],
 ['Tenure Months',
  'Monthly Purchase (Thou. IDR)',
  'CLTV (Predicted Thou. IDR)'])

In [13]:
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode into a dense array. handle_unknown="ignore" lets transform accept
# categories that were not present when the encoder was fitted.
# NOTE: scikit-learn >= 1.2 names this argument `sparse_output`; `sparse` is the
# older spelling and must be renamed when running on a release that dropped it.
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(sparse=False, handle_unknown="ignore")),
])

# Numeric branch: fill missing values with the column median, then apply RobustScaler.
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler()),
])

# Send each column group through its own branch; columns listed in neither
# cat_cols nor num_cols are not passed on to the model.
preprocessor = ColumnTransformer([
    ("cat", cat_pipe, cat_cols),
    ("num", num_pipe, num_cols),
])

# Preprocessing followed by exactly one estimator. Step names must be unique, and
# the "algo__" grid below tunes random-forest hyperparameters (min_samples_leaf),
# so the forest is the single final step.
model_pipe = Pipeline([
    ("prep", preprocessor),
    ("algo", RandomForestClassifier(n_jobs=-1, random_state=123)),
])

# 3 x 4 x 4 = 48 hyperparameter combinations for the "algo" step.
param_grid = {
    "algo__n_estimators": [100, 200, 300],
    "algo__max_depth": [None, 5, 7, 9],
    "algo__min_samples_leaf": [1, 3, 5, 7],
}

# Exhaustive search with 3-fold cross-validation (48 candidates -> 144 fits).
# No `scoring` is passed, so candidates are ranked by the estimator's default score.
model = GridSearchCV(model_pipe, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)


Fitting 3 folds for each of 48 candidates, totalling 144 fits




In [15]:
# Report the winning hyperparameters first ...
print(model.best_params_)

# ... then the score on the training split, the mean cross-validated score of the
# best candidate, and the score on the held-out test split, in that order.
train_score = model.score(X_train, y_train)
cv_score = model.best_score_
test_score = model.score(X_test, y_test)
print(train_score, cv_score, test_score)

{'algo__max_depth': 9, 'algo__min_samples_leaf': 7, 'algo__n_estimators': 100}
0.8297834575789848 0.7981895633652822 0.7856635911994322
