# Creating and tracking XGBoost hyperparameter experiments with MLflow and Optuna

## Imports

In [1]:
#----------------------------------------- packages
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
#----------------------------------------- settings
# Lift the column display limit so wide frames are shown without column truncation
pd.set_option('display.max_columns', None)

## Data preparation
We use the [Telco Customer Churn](https://www.kaggle.com/datasets/blastchar/telco-customer-churn) data set, available on Kaggle, which poses a binary classification problem: predicting whether a customer churns. The data set includes information about:
* Customers who left within the last month – the column is called Churn
* Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies
* Customer account information – how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges
* Demographic info about customers – gender, age range, and if they have partners and dependents

### Data processing
There are several Kaggle notebooks that demonstrate how to prepare the Telco churn data. For simplicity, I follow the notebook provided by [Kaan Boke](https://www.kaggle.com/code/kaanboke/xgboost-lightgbm-catboost-imbalanced-data). Beyond basic cleaning and dummy encoding, I will not do any further data transformation or normalisation; however, you could easily add it in this section.

In [2]:
# Read data
df = pd.read_csv('data/telco_customer_churn.csv')

# Remove columns we are not going to use
df = df.drop(columns=['customerID', 'gender', 'PhoneService'])

# 1/0 encode the dependent variable.
# LabelEncoder assigns integer codes in sorted class order
# (presumably 'No' -> 0 and 'Yes' -> 1 -- verify against the raw column).
le = LabelEncoder()
df['Churn'] = le.fit_transform(df['Churn'])

# Missing data: TotalCharges uses blank strings for missing values, so the
# column has to be parsed to float. errors='coerce' turns blank (and any other
# unparsable) entries into NaN instead of raising; the NaNs are kept as-is,
# no imputation is done here.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').astype(float)

# Collapse the 'No internet service' level into 'No' for the add-on service
# columns so each of them ends up with only two levels.
internet_addon_cols = [
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]
df[internet_addon_cols] = df[internet_addon_cols].replace('No internet service', 'No')

# Get dummy variables.
# dtype is pinned because the default changed from uint8 to bool in pandas 2.0;
# pinning keeps the 0/1 indicator columns identical across pandas versions.
df = pd.get_dummies(df, dtype='uint8')

# X and y arrays before train test split
X = df.drop(columns='Churn')
y = df['Churn']

In [3]:
# Preview the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_Yes,TechSupport_No,TechSupport_Yes,StreamingTV_No,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,1,1,0,0,1,0,1,0,0,1,0,0,1,1,0,1,0,1,0,1,0,1,0,0,0,1,0,0,1,0
1,0,34,56.95,1889.5,1,0,1,0,1,0,0,1,0,0,0,1,1,0,0,1,1,0,1,0,1,0,0,1,0,1,0,0,0,0,1
2,0,2,53.85,108.15,1,0,1,0,1,0,0,1,0,0,0,1,0,1,1,0,1,0,1,0,1,0,1,0,0,0,1,0,0,0,1
3,0,45,42.3,1840.75,1,0,1,0,0,1,0,1,0,0,0,1,1,0,0,1,0,1,1,0,1,0,0,1,0,1,0,1,0,0,0
4,0,2,70.7,151.65,1,0,1,0,1,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,0,1,0,0,1,0


In [4]:
# Preview the first five encoded target values
y.head(n=5)

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int32

### Train test split
We use an 80/20 train/test split with a fixed random seed so that the split is reproducible.

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Display the training features (pandas abbreviates long frames to the first and last rows)
X_train

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_Yes,TechSupport_No,TechSupport_Yes,StreamingTV_No,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
2142,0,21,64.85,1336.80,1,0,0,1,1,0,0,1,0,0,0,1,1,0,0,1,1,0,1,0,0,1,0,1,0,1,0,0,0,0,1
1623,0,54,97.20,5129.45,1,0,1,0,0,0,1,0,1,0,1,0,0,1,1,0,1,0,0,1,0,1,0,0,1,0,1,1,0,0,0
6074,0,1,23.45,23.45,0,1,1,0,0,1,0,1,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,0,1,0,0,1,0
1362,0,4,70.20,237.95,1,0,1,0,1,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,0,1,0,0,1,0
6754,0,0,61.90,,1,0,0,1,0,0,1,1,0,0,0,1,0,1,1,0,0,1,1,0,1,0,0,0,1,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,0,1,95.00,95.00,0,1,1,0,1,0,0,0,1,0,0,1,1,0,1,0,1,0,0,1,0,1,1,0,0,0,1,0,0,1,0
5191,0,23,91.10,2198.30,0,1,0,1,0,0,1,1,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1,0,0
5226,0,12,21.15,306.05,0,1,0,1,1,0,0,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,0,1,0,0,1,0
5390,1,12,99.45,1200.15,1,0,1,0,0,0,1,0,1,0,1,0,1,0,0,1,1,0,0,1,0,1,1,0,0,0,1,0,0,1,0
