# Imports

In [1]:
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from constant import TRAIN_DATA_PATH, MODEL_PATH
from process import *

import pandas as pd
import numpy as np

# Data

## Load Data

In [2]:
data = pd.read_csv(TRAIN_DATA_PATH)
df = data.copy()

In [3]:
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [4]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
df.tail()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7042,3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


## Process Data

In [7]:
processed_data = process_data(dataframe=df.drop(["customerID", "Churn"], axis=1))

In [8]:
processed_data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0.0,1,0,-0.608696,0,1,0,0,2,0,0,0,0,0,1,2,-0.745170,-0.402798
1,1,0.0,0,0,0.108696,1,0,0,2,0,2,0,0,0,1,0,3,-0.246550,0.146087
2,1,0.0,0,0,-0.586957,1,0,0,2,2,0,0,0,0,0,1,3,-0.303588,-0.379687
3,1,0.0,0,0,0.347826,0,1,0,2,0,2,2,0,0,1,0,0,-0.516099,0.131698
4,0,0.0,0,0,-0.586957,1,0,1,0,0,0,0,0,0,0,1,2,0.006440,-0.366848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0.0,1,1,-0.108696,1,2,0,2,0,2,2,2,2,1,1,3,0.265869,0.175898
7039,0,0.0,1,1,0.934783,1,2,1,0,2,2,0,2,2,1,1,1,0.604416,1.761589
7040,0,0.0,1,1,-0.391304,0,1,0,2,0,0,0,0,0,0,1,2,-0.749770,-0.309352
7041,1,1.0,1,0,-0.543478,1,2,1,0,0,0,0,0,0,0,1,3,0.074517,-0.321114


# Train

In [9]:
def print_result(cs_results):
    cv_results = cross_validate(xgboost_model, x, y, cv = 5, scoring = ['accuracy', 'f1', 'roc_auc'])
    print(f"test_accuracy: {cv_results['test_accuracy'].mean()}")
    print(f"test_f1: {cv_results['test_f1'].mean()}")
    print(f"test_roc_auc: {cv_results['test_roc_auc'].mean()}")

## Split Data for Train

In [10]:
x = processed_data
y = LabelEncoder().fit_transform(df["Churn"])

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

## Create Model

In [12]:
xgboost_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

## Cross Validation

In [13]:
cv_results = cross_validate(xgboost_model, x, y, cv = 5, scoring = ['accuracy', 'f1', 'roc_auc'])
print_result(cv_results)

test_accuracy: 0.7864555777792115
test_f1: 0.5615893138263909
test_roc_auc: 0.8247425559165483


## Finding Best Params

In [14]:
xgboost_params = {'learning_rate' : [0.005, 0.01, 0.05],
                  'max_depth' : [1, 5, 10],
                  'n_estimators': [500, 750, 1000],
                  'colsample_bytree' : [0.45, 0.45, 0.55]}

In [15]:
xgboost_grid = GridSearchCV(xgboost_model, xgboost_params, cv = 3, n_jobs = -1, verbose = 1).fit(x, y)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [16]:
xgboost_grid.best_params_

{'colsample_bytree': 0.45,
 'learning_rate': 0.05,
 'max_depth': 1,
 'n_estimators': 1000}

In [17]:
xgboost_model = xgboost_model.set_params(**xgboost_grid.best_params_, random_state = 17).fit(x, y)

In [19]:
cv_results = cross_validate(xgboost_model, x, y, cv = 5, scoring = ['accuracy', 'f1', 'roc_auc'])
print_result(cv_results)

test_accuracy: 0.8053383484418349
test_f1: 0.5939760687143099
test_roc_auc: 0.8491313426759062


## Saving Model

In [None]:
with open(MODEL_PATH,'wb') as f:
    pickle.dump(xgboost_model, f)