In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import numpy as np
import pandas as pd


## Load Data

In [9]:
# Feature column names, grouped by kind. The two lists are concatenated
# later in the notebook to select the model inputs.
numerical = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]
categorical = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# The first CSV column is used as the DataFrame index rather than as data.
df_train_full = pd.read_csv('data/train.csv', index_col=0)
df_train_full.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
2499,6061-GWWAV,Male,0,No,Yes,41,Yes,No,DSL,Yes,...,Yes,No,Yes,No,One year,No,Mailed check,70.2,2894.55,0
5807,8464-EETCQ,Male,0,No,No,57,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),18.8,1094.35,0
5118,7621-VPNET,Female,0,Yes,No,42,Yes,Yes,Fiber optic,No,...,Yes,Yes,No,No,Month-to-month,Yes,Credit card (automatic),85.9,3729.75,0
275,6432-TWQLB,Male,0,Yes,No,5,Yes,Yes,Fiber optic,No,...,No,Yes,No,No,Month-to-month,Yes,Electronic check,85.4,401.1,1
1350,4102-HLENU,Female,0,Yes,No,67,Yes,Yes,DSL,Yes,...,Yes,No,No,No,Two year,No,Mailed check,65.65,4322.85,0


In [10]:
# Restrict the training frame to the feature columns and turn every row
# into a {column: value} record, the input format DictVectorizer expects.
feature_columns = numerical + categorical
dicts = df_train_full[feature_columns].to_dict(orient='records')

# Fit the vectorizer on those records; sparse=False asks for a dense array.
encoder = DictVectorizer(sparse=False)
X_encoded = encoder.fit_transform(dicts)

# Target column used by the model searches below.
y = df_train_full['Churn']

In [11]:
def grid_search(model, params):
    """Grid-search `params` for `model` with repeated stratified CV.

    Each parameter combination is scored with ROC AUC over
    RepeatedStratifiedKFold (10 splits, 3 repeats, random_state=0), with
    n_jobs=-1. The search is fitted on the notebook-level `X_encoded`
    and `y`, so those must be defined before this function is called.

    Args:
        model: estimator handed to GridSearchCV.
        params: parameter grid handed to GridSearchCV.

    Returns:
        The value returned by `GridSearchCV.fit`, i.e. the fitted search
        object (callers read `best_estimator_` / `best_score_` from it).
    """
    folds = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=0)
    return GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=folds,
        n_jobs=-1,
        scoring='roc_auc',
    ).fit(X_encoded, y)

# 1. Logistic Regression

In [12]:
# Base estimator; C is not set here because it is supplied by the grid below.
LR_model = LogisticRegression(max_iter=1000, random_state=0)

# Candidate values for C, one per power of ten from 1e-5 up to 100.
LR_params = {
    'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
}

LR_search_result = grid_search(LR_model, LR_params)
print('Best estimator: ', LR_search_result.best_estimator_)
print('Best score: ', LR_search_result.best_score_)


Best estimator:  LogisticRegression(C=1, max_iter=1000, random_state=0)
Best score:  0.8431099141171027


# 2. Random Forest

In [13]:
RF_model = RandomForestClassifier(random_state=0)

# NOTE: 'auto' was removed from the max_features grid. For
# RandomForestClassifier it was only an alias of 'sqrt' (deprecated in
# scikit-learn 1.1, removed in 1.3). Keeping it either fitted every
# candidate twice (older versions) or made the search fail parameter
# validation (current versions). The set of distinct models searched is
# unchanged.
RF_params = {
    'max_depth': [10, 50, 100],
    'max_features': ['sqrt'],
    'n_estimators': [200, 400, 600],
}

RF_search_result = grid_search(RF_model, RF_params)

print('Best estimator: ', RF_search_result.best_estimator_)
print('Best score: ', RF_search_result.best_score_)
# Best: RandomForestClassifier(max_depth=10, n_estimators=600, random_state=0)

Best estimator:  RandomForestClassifier(max_depth=10, n_estimators=600, random_state=0)
Best score:  0.8428031028659636


# 3. XGBoost

XGBClassifier(use_label_encoder=False,eval_metric='auc',random_state=config['random_state'])