In [None]:
# Training pipeline for the TF-IDF / XGBoost approach

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from skopt import BayesSearchCV

from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Each text column gets its own, independently fitted TF-IDF vectoriser.
# The step names 'tfidf1' / 'tfidf2' are the handles used to address the
# vectorisers' hyper-parameters (e.g. 'transf__tfidf1__min_df').
# remainder="passthrough" forwards any column not listed here unchanged.
column_transformer = ColumnTransformer(
    transformers=[
        ('tfidf1', TfidfVectorizer(), 'body_chat'),
        ('tfidf2', TfidfVectorizer(), 'body_couns'),
    ],
    remainder="passthrough",
)

# `Pipeline` here is the imbalanced-learn variant (imported from
# imblearn.pipeline), which permits a resampling step between the feature
# transformer and the classifier.
# The 'sampling' slot is a placeholder: it starts as None and is meant to be
# overridden through the 'sampling' entry of the hyper-parameter search space.
pipe = Pipeline(
    steps=[
        ('transf', column_transformer),
        ('sampling', None),
        ('classifier', XGBClassifier()),
    ]
)

# Hyper-parameter search space for the pipeline. Despite the variable name,
# every entry is a list of candidate values to choose from, not an exhaustive
# grid definition.
def _tfidf_search_space(step):
    """Return the candidate TF-IDF settings for the vectoriser step named `step`.

    The keys follow the '<pipeline step>__<transformer name>__<parameter>'
    convention, where the pipeline step is 'transf'.
    """
    prefix = 'transf__' + step + '__'
    # NOTE(review): large absolute min_df values combined with a small
    # fractional max_df may be incompatible on small training folds -
    # confirm against the size of the data set.
    return {
        prefix + 'min_df': [1, 2, 5, 10, 25, 50, 75, 100, 150, 200],
        prefix + 'max_df': [pct / 100 for pct in range(20, 101, 10)],
        prefix + 'use_idf': [True, False],
    }


param_grid = {
    # Both TF-IDF vectorisers share the same candidate values.
    **_tfidf_search_space('tfidf1'),
    **_tfidf_search_space('tfidf2'),
    # Resampling strategy: over-sampling, under-sampling, none, or SMOTE.
    'sampling': [RandomOverSampler(), RandomUnderSampler(), None, SMOTE()],
    # XGBoost classifier parameters.
    'classifier__min_child_weight': [1, 5, 10, 20],
    'classifier__gamma': [0, 0.25, 0.5, 1, 1.5, 2, 5, 10],
    'classifier__subsample': [0.2, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0],
    'classifier__colsample_bytree': [0.2, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0],
    'classifier__max_depth': [2, 4, 6, 8, 10, 12, 14, 16],
    'classifier__eta': [0.005, 0.01, 0.05, 0.1, 0.2],
    'classifier__scale_pos_weight': [
        0.001, 0.01, 0.1, 0.2, 0.5, 0.8, 1, 5, 10, 100, 1000,
    ],
}

# Cross-validation scheme: stratified 5-fold, repeated 5 times (25 fits per
# candidate). The fixed seed keeps the fold assignment identical across runs.
rkf = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=5,
    random_state=123,
)

In [None]:

# Bayesian hyper-parameter optimisation over `param_grid` (the variable keeps
# the name `grid_search`, but this is a BayesSearchCV, not a grid search).
grid_search = BayesSearchCV(
    estimator=pipe,
    search_spaces=param_grid,
    # 'roc_auc' is the registered scorer name; 'roc_auc_score' is the name of
    # the metric *function* and is rejected as a scoring string.
    scoring='roc_auc',
    cv=rkf,
    n_jobs=-1,
    n_iter=250,
    # Seed the optimiser's candidate sampling (same seed as the CV splitter).
    # The resamplers and the classifier are not seeded here, so runs may
    # still differ slightly.
    random_state=123,
    verbose=2,
)

# Fit on the two text columns only; the ColumnTransformer selects them by
# name, so the column order in this selection does not matter.
grid_search.fit(
    train[["body_couns", "body_chat"]],
    train["outcome"],
)