In [None]:
import xgboost as xgb
import pandas as pd
from sklearn import *
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Load the training split of the telecom churn data and preview the first rows.
df_train = pd.read_csv(
    filepath_or_buffer="../input/telecom-churn-datasets/churn-bigml-80.csv"
)
df_train.head(n=5)

In [None]:
# Print the column dtypes and non-null counts of the training frame.
df_train.info()

In [None]:
# Absolute number of rows per value of the target column.
df_train["Churn"].value_counts()

In [None]:
# Share of training rows per value of the target column (counts / row count).
df_train["Churn"].value_counts() / df_train.shape[0]

In [None]:
# List the column labels of the training frame.
df_train.columns

In [None]:
# Load the held-out split and print its column dtypes / non-null counts.
df_test = pd.read_csv(
    filepath_or_buffer="../input/telecom-churn-datasets/churn-bigml-20.csv"
)
df_test.info()

In [None]:
# Share of test rows per value of the target column (counts / row count).
df_test["Churn"].value_counts() / df_test.shape[0]

In [None]:
# Size of the test split relative to the training split.
df_test.shape[0] / df_train.shape[0]

In [None]:
# Columns routed to the categorical (one-hot) branch of the preprocessing.
cat_columns = [
    "State",
    "Area code",
    "International plan",
    "Voice mail plan",
]

# Columns routed to the numeric (impute + scale) branch; the order here is the
# order in which they are handed to the column transformer.
num_columns = [
    "Account length",
    "Number vmail messages",
    # daytime usage
    "Total day minutes",
    "Total day calls",
    "Total day charge",
    # evening usage
    "Total eve minutes",
    "Total eve calls",
    "Total eve charge",
    # night usage
    "Total night minutes",
    "Total night calls",
    "Total night charge",
    # international usage
    "Total intl minutes",
    "Total intl calls",
    "Total intl charge",
    "Customer service calls",
]

In [None]:
# Separate the label column from the feature columns for both splits.
target = "Churn"
X_train, y_train = df_train.drop(columns=[target]), df_train[target]
X_test, y_test = df_test.drop(columns=[target]), df_test[target]

In [None]:
# Categorical branch: fill missing values with the literal 'missing', then
# one-hot encode while dropping the first level of every feature.
# NOTE(review): handle_unknown='error' makes transform() raise if the test split
# contains a category that never occurred in the training split.
cat_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', preprocessing.OneHotEncoder(handle_unknown='error', drop="first"))
])

# Numeric branch: median imputation followed by standardisation.
num_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='median')),
    ('scaler', preprocessing.StandardScaler()),
])

# Route each column group through its branch; the outputs are stacked with the
# categorical block first and the numeric block second.
preprocessing_pipe = compose.ColumnTransformer([
    ("cat", cat_pipe, cat_columns),
    ("num", num_pipe, num_columns)
])

# Read the raw frames instead of X_train / X_test: this cell rebinds those two
# names to the encoded matrices, so reading them back here would make a second
# run of the cell feed already-encoded data into the transformer.
# The transformer is fitted on the training split only and then reused as-is
# on the test split.
X_train = preprocessing_pipe.fit_transform(df_train.drop(columns=target))
X_test = preprocessing_pipe.transform(df_test.drop(columns=target))

In [None]:
# ColumnTransformer hands back either a scipy sparse matrix or a dense ndarray
# depending on the density of the stacked output, so densify only when the
# object actually offers .toarray().
X_train_dense = X_train.toarray() if hasattr(X_train, "toarray") else X_train
pd.DataFrame(X_train_dense).describe()

In [None]:
# Baseline model: logistic regression with the liblinear solver, evaluated by
# its accuracy on the test split.
est = linear_model.LogisticRegression(solver="liblinear").fit(X_train, y_train)
y_test_pred = est.predict(X_test)
est.score(X_test, y_test)

In [None]:
# Depth-limited decision tree, evaluated by its accuracy on the test split.
# random_state is pinned because scikit-learn permutes the features at every
# split, so equally good splits could otherwise be picked differently from one
# run to the next.
est = tree.DecisionTreeClassifier(max_depth=6, random_state=1001)
est.fit(X_train, y_train)
y_test_pred = est.predict(X_test)
est.score(X_test, y_test)

In [None]:
# Per-class precision / recall / F1 for whatever model last wrote y_test_pred.
report = metrics.classification_report(y_true=y_test, y_pred=y_test_pred)
print(report)

In [None]:
# Confusion matrix (rows: true labels, columns: predictions) for y_test_pred.
metrics.confusion_matrix(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Data sets on which the metrics are evaluated during fitting: first the
# training split, then the test split.
eval_sets = [
    (X_train, y_train),
    (X_test, y_test),
]

# XGBoost random-forest classifier.
# * `verbosity` replaces the deprecated `silent` flag.
# * The evaluation metrics are configured on the estimator because the
#   `eval_metric` keyword of fit() was deprecated and later removed.
# NOTE(review): the XGBoost random-forest guide recommends learning_rate=1 for
# this estimator; the value below is kept as originally chosen.
cls = xgb.XGBRFClassifier(
    verbosity=1,
    scale_pos_weight=1,
    learning_rate=0.1,
    colsample_bytree=0.99,
    subsample=0.8,
    objective='binary:logistic',
    n_estimators=100,
    reg_alpha=0.003,
    max_depth=10,
    gamma=10,
    min_child_weight=1,
    eval_metric=["error", "logloss"],
)

# No early_stopping_rounds here: the random-forest estimator builds the whole
# forest in a single boosting round, so there are no later rounds to stop, and
# recent xgboost releases reject the argument for this estimator.
print(cls.fit(X_train, y_train, eval_set=eval_sets, verbose=True))
print("test accuracy: ", cls.score(X_test, y_test))

In [None]:
# Metric values recorded during fit() for each entry of the eval_set.
cls.evals_result()

In [None]:
# Overwrite y_test_pred with the class predictions of the XGBoost model.
y_test_pred = cls.predict(X_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predictions) for y_test_pred.
metrics.confusion_matrix(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Keep only column 1 of the predicted probabilities — presumably the
# "churned" class; confirm against cls.classes_.
y_test_prob = cls.predict_proba(X_test)[:, 1]
y_test_prob

In [None]:
# Area under the ROC curve computed from the predicted probabilities.
auc = metrics.roc_auc_score(y_true=y_test, y_score=y_test_prob)
auc

In [None]:
# ROC curve points; roc_curve returns (false-positive rate, true-positive rate,
# thresholds), so `ftr` holds the false-positive rate.
ftr, tpr, thresholds = metrics.roc_curve(y_true=y_test, y_score=y_test_prob)

In [None]:
# Draw the ROC curve on a square 8x8 figure. The size is set through rcParams,
# so it also becomes the default for figures created afterwards.
plt.rcParams['figure.figsize'] = (8, 8)
roc_title = "ROC, auc: " + str(auc)
plt.plot(ftr, tpr)
plt.title(roc_title)
plt.xlabel("FPR")
plt.ylabel("TPR")

In [None]:
# Booster parameters for the native cross-validation API.
# Only booster-level settings belong here: the number of rounds is controlled by
# `num_boost_round` and the metrics by `metrics=` in the xgb.cv call below, so
# the former 'n_estimators', 'verbose' and 'eval_metric' entries (not booster
# parameters / overridden by `metrics=`) are no longer passed.
params = {
    'objective': "binary:logistic",
    'colsample_bytree': 0.9,
    'learning_rate': 0.01,
    'max_depth': 10,
    'alpha': 0.5,
    'min_child_weight': 1,
    'subsample': 1,
}

data_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

# 5-fold cross-validation over at most 100 boosting rounds.
# Early stopping watches the last entry of `metrics` (auc), which has to be
# maximised; `maximize` is a boolean flag, not a metric name.
cv_results = xgb.cv(
    dtrain=data_dmatrix,
    params=params,
    nfold=5,
    maximize=True,
    num_boost_round=100,
    early_stopping_rounds=10,
    metrics=["logloss", "error", "auc"],
    as_pandas=True,
    seed=123,
    verbose_eval=True,
)

cv_results

In [None]:
# Line plot of the "train-error-mean" column of the cross-validation results.
cv_results.loc[:, ["train-error-mean"]].plot()

In [None]:
# Very large canvas so the nodes of the tree stay legible.
plt.rcParams['figure.figsize'] = (50, 50)

# Render tree 0 of the fitted model, laid out left-to-right.
xgb.plot_tree(cls, rankdir='LR', num_trees=0)

In [None]:
# Bar chart of the feature importances of the fitted model on a 15x15 figure.
plt.rcParams['figure.figsize'] = (15, 15)
xgb.plot_importance(cls)

In [None]:
# Raw importance value per column of the encoded feature matrix.
cls.feature_importances_

In [None]:
# Fetch the fitted one-hot encoder by name: the "cat" branch of the column
# transformer, then its "onehot" step.
fitted_cat_pipe = preprocessing_pipe.named_transformers_["cat"]
one_hot_encoder = fitted_cat_pipe.named_steps["onehot"]
one_hot_encoder

In [None]:
# Names of the one-hot columns produced by the encoder.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version offers.
# Passing cat_columns makes the names start with the real column names instead
# of the generic x0, x1, ... placeholders.
_feature_name_getter = getattr(
    one_hot_encoder, "get_feature_names_out", None
) or one_hot_encoder.get_feature_names
_feature_name_getter(cat_columns)

In [None]:
# Show the fitted categorical branch of the column transformer.
preprocessing_pipe.named_transformers_["cat"]

In [None]:
# Search space shared by the hyper-parameter searches.
parameters = {
    'max_depth': range(2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}

# Template estimator; max_depth, n_estimators and learning_rate are overridden
# by each candidate of the search. `verbosity` replaces the deprecated `silent`
# flag.
cls = xgb.XGBRFClassifier(
    verbosity=1,
    scale_pos_weight=1,
    learning_rate=0.01,
    colsample_bytree=0.99,
    subsample=0.8,
    objective='binary:logistic',
    n_estimators=100,
    reg_alpha=0.003,
    max_depth=10,
    gamma=10,
    min_child_weight=1,
)

# Exhaustive search over `parameters`, 10-fold cross-validation, ranked by ROC AUC.
grid_search = model_selection.GridSearchCV(
    estimator=cls,
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=12,
    cv=10,
    verbose=True,
    return_train_score=True
)

grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Mean cross-validated score (scoring='roc_auc') of the best combination.
grid_search.best_score_

In [None]:
# Full per-candidate results of the grid search as a table.
pd.DataFrame(grid_search.cv_results_)

In [None]:
folds = 5        # number of stratified CV folds
param_comb = 5   # number of parameter combinations sampled by the search

# Template estimator; the sampled parameters override the matching values
# below. `verbosity` replaces the deprecated `silent` flag.
cls = xgb.XGBRFClassifier(
    verbosity=1,
    scale_pos_weight=1,
    learning_rate=0.01,
    colsample_bytree=0.99,
    subsample=0.8,
    objective='binary:logistic',
    n_estimators=100,
    reg_alpha=0.003,
    max_depth=10,
    gamma=10,
    min_child_weight=1,
)

skf = model_selection.StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)

# Randomised search over `parameters`, ranked by accuracy.
# The splitter object itself is passed as `cv`: skf.split(...) would hand over a
# one-shot generator, whereas the splitter yields the same seeded folds and can
# be reused.
random_search = model_selection.RandomizedSearchCV(
    cls,
    param_distributions=parameters,
    n_iter=param_comb,
    scoring='accuracy',
    n_jobs=12,
    cv=skf,
    verbose=3,
    random_state=1001,
)

random_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score (scoring='accuracy') and the parameters that achieved it.
random_search.best_score_, random_search.best_params_

In [None]:
# Full per-candidate results of the randomised search as a table.
pd.DataFrame(random_search.cv_results_)