**XGBoost Intro**

XGBoost is a tree-based model and one of the most powerful machine learning techniques; it can be applied to pattern, numeric and text problems. However, RNN models are more common for text problems.

**Different models structure and design produce better ensemble or stacking results.**

This model can be used as an ensemble or stacking component alongside RNN models to produce better results than either model alone.

# I. Introduction and Imports

First we load in the **Quora Insincere Data Training set** that has **been preprocessed**. Here we want to predict whether a person's question was either insincere or sincere.

In [None]:
# kaggle standard imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# extra imports
np.random.seed(235)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, make_scorer, balanced_accuracy_score, roc_curve
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.datasets import make_classification
from matplotlib import pyplot
from sklearn import decomposition
import matplotlib.pyplot as plt
import gc
import re

# XGboost related
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import xgboost as xgb
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix, hstack

# Any results you write to the current directory are saved as output.

# 1. Data Preparation
References:
* Data preparation and processing inspired by Shujian Liu's kernels

In [None]:
print('load data') 
# Read the preprocessed training set from the Kaggle input directory.
# Only this one file is read in this cell.
train_df = pd.read_csv("../input/train-processed/train_processed.csv")

# Number of rows loaded (shown as the cell output).
train_df.shape[0]

In [None]:
# Sanity check: list the distinct label values so any unexpected class stands out.
train_df["target"].unique()

In [None]:
# Inspect the column dtypes; question_text is expected to be `object` because it holds text.
train_df.dtypes

In [None]:
# Report the missing values per column, show the affected rows, then drop them.
print(train_df.isna().sum())
# A bare expression in the middle of a cell is evaluated and thrown away, so
# the rows with missing text are printed explicitly.
print(train_df.loc[train_df['question_text'].isna()])
train_df = train_df.dropna()
# Confirm that nothing is missing any more.
print(train_df.isna().sum())


In [None]:
# Class balance: count the questions of each class and draw them as two bars.
classification = ['Sincere', 'Insincere']
insincere = train_df['target'].eq(1).sum()
sincere = train_df['target'].eq(0).sum()
count_of_sincerity = [sincere, insincere]

plt.bar(x=classification, height=count_of_sincerity)
plt.xlabel('Classification')
plt.ylabel('Number of each classification')
plt.title('Sentiment type distribution in training set')
plt.show()

In [None]:
# Visualise the most frequent words among the insincere questions.
# All insincere question texts are glued into one space-separated string.
insincere = " ".join(train_df.loc[train_df['target'] == 1, 'question_text'])

from wordcloud import WordCloud
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
).generate(insincere)

# Render the cloud as an image without axes.
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Per-class row counts; the entry for target == 1 is the number of insincere questions.
count = train_df.groupby(by='target').count().copy()
number_insincere = count['question_text'][1]
display(number_insincere)

In [None]:
# Keep 80% of the rows for training and hold the rest out as test_df;
# rows containing NaN are removed from both parts.
train_df, test_df = (
    part.dropna() for part in train_test_split(train_df, train_size=0.8)
)

In [None]:
# Shape of the training part, then the combined shape of training + hold-out
# rows (both frames come from the same split, so they share their columns).
print('size of training data: ', train_df.shape)
print('Size of total data: ', (train_df.shape[0] + test_df.shape[0], train_df.shape[1]))

# **2. Prepare Vectors For XGBoost Input**

**2.1 Count Vectorizer**

**2.2 Tf-IDF Vectorizer**

In [None]:
def tfidfvectorizer(X_train, X_val, X_test):
    """Build word-level and character-level TF-IDF matrices for three text splits.

    Both vectorizers learn their vocabulary and IDF weights from ``X_train``
    only, so no text from the validation or test split influences the
    features.

    Args:
        X_train: Iterable of training texts; the vectorizers are fitted on it.
        X_val: Iterable of validation texts; transformed only.
        X_test: Iterable of test texts; transformed only.

    Returns:
        Six sparse matrices, in this order: word-level train, valid, test,
        then character-level train, valid, test.
    """
    # word level tf-idf for XGB
    tfidf_vect = TfidfVectorizer(analyzer='word',
                                 token_pattern=r'\w{1,}',
                                 max_features=9000)
    xtrain_tfidf = tfidf_vect.fit_transform(X_train)
    xvalid_tfidf = tfidf_vect.transform(X_val)
    xtest_tfidf = tfidf_vect.transform(X_test)

    # characters level tf-idf for XGB (2- and 3-character n-grams).
    # token_pattern is not passed here: scikit-learn only uses it when
    # analyzer='word' and warns if it is supplied otherwise.
    tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char',
                                             ngram_range=(2, 3),
                                             max_features=9000)
    xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.fit_transform(X_train)
    xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(X_val)
    xtest_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(X_test)

    return (xtrain_tfidf, xvalid_tfidf, xtest_tfidf,
            xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars, xtest_tfidf_ngram_chars)

**2.3 Topic Models for features**

In [None]:
# train a LDA Model
#lda_model = decomposition.LatentDirichletAllocation(n_components=20, 
#                                                    learning_method='online', 
#                                                    max_iter=20)
#X_topics = lda_model.fit_transform(xtrain_count)
#topic_word = lda_model.components_ 
#vocab = count_vect.get_feature_names()

# view the topic models
#n_top_words = 10
#topic_summaries = []
#for i, topic_dist in enumerate(topic_word):
#    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
#    topic_summaries.append(' '.join(topic_words))

**2.4 Features Engineering**

In [None]:
# Run an explicit garbage-collection pass.
gc.collect()

In [None]:
def get_features(data):
    """Add hand-crafted text statistics to every DataFrame in *data*, in place.

    Each frame must contain a string column ``question_text``. The columns
    added are ``text_size``, ``exc_count``, ``quetion_count``,
    ``punctuation_count``, ``symbol_count``, ``words_count``,
    ``unique_words``, ``unique_rate`` and ``word_max_length``.

    Args:
        data: Iterable of DataFrames to extend.

    Returns:
        The same *data* object that was passed in; the frames it contains
        now carry the new columns.
    """
    for dataframe in data:
        # The frame is modified directly (no re-wrapping) so that the
        # caller's own objects receive the new columns.
        text = dataframe["question_text"]
        words = text.apply(lambda x: x.split())

        dataframe["text_size"] = text.apply(len).astype('uint16')
        dataframe["exc_count"] = text.apply(lambda x: x.count("!")).astype('uint16')
        dataframe["quetion_count"] = text.apply(lambda x: x.count("?")).astype('uint16')
        dataframe["punctuation_count"] = text.apply(lambda x: sum(x.count(p) for p in '.,;:^_`')).astype('uint16')
        dataframe["symbol_count"] = text.apply(lambda x: sum(x.count(p) for p in '*&$%')).astype('uint16')
        dataframe["words_count"] = words.apply(len).astype('uint16')
        # Number of distinct whitespace-separated tokens (case-sensitive).
        dataframe["unique_words"] = words.apply(lambda ws: len(set(ws))).astype('uint16')
        # A text without any word would give 0/0; report a rate of 0 instead of NaN.
        rate = dataframe["unique_words"] / dataframe["words_count"]
        dataframe["unique_rate"] = rate.where(dataframe["words_count"] > 0, 0.0)
        # default=0 keeps blank texts from raising on an empty max().
        dataframe["word_max_length"] = words.apply(lambda ws: max(map(len, ws), default=0)).astype('uint16')
    return data

In [None]:
print('generate the features')

# get_features returns the container it was given, so `data` ends up being
# the (train_df, test_df) pair.
data = get_features((train_df, test_df))


In [None]:
# Names of the hand-crafted numeric feature columns fed to the model.
feature_cols = [
    "text_size",
    "exc_count",
    "quetion_count",
    "punctuation_count",
    "symbol_count",
    "words_count",
    "unique_words",
    "unique_rate",
    "word_max_length",
]

# Hyperparameter Tuning

In [None]:
# Stage 1: coarse grid over tree depth and minimum child weight.
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}
# Only the hand-crafted numeric features are used for the search.
X_train = csr_matrix(train_df[feature_cols].values)
y_train = train_df["target"]
# The former `iid=False` argument is omitted: scikit-learn removed the
# parameter, and passing it raises TypeError on current releases.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                            min_child_weight=1, gamma=0, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=4,
    cv=4,
)
gsearch1.fit(X_train, y_train)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Stage 2: finer grid over tree depth and minimum child weight.
param_test2 = {
    'max_depth': [4, 5, 6],
    'min_child_weight': [4, 5, 6],
}
# The former `iid=False` argument is omitted: scikit-learn removed the
# parameter, and passing it raises TypeError on current releases.
gsearch2 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                            min_child_weight=2, gamma=0, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test2,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch2.fit(X_train, y_train)
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

In [None]:
# Stage 3: tune gamma (0.0 to 0.4) with depth and child weight held fixed.
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)],
}
# The former `iid=False` argument is omitted: scikit-learn removed the
# parameter, and passing it raises TypeError on current releases.
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=4,
                            min_child_weight=6, gamma=0, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch3.fit(X_train, y_train)
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Stage 4: tune the learning rate (0.1 to 0.9). A rate of 0.0 is left out
# because boosting rounds with a zero step size cannot change the predictions.
param_test4 = {
    'learning_rate': [i / 10.0 for i in range(1, 10)],
}
# The former `iid=False` argument is omitted: scikit-learn removed the
# parameter, and passing it raises TypeError on current releases.
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=4,
                            min_child_weight=6, gamma=0, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic',
                            nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test4,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch4.fit(X_train, y_train)
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

**Input Final Format**

In [None]:
print('final preparation for input')
def prep_input_vals(train_index, val_df_kfolds):
    """Turn the hand-crafted feature columns into sparse matrices for one fold.

    Reads the module-level ``train_df``, ``test_df`` and ``feature_cols``.

    Args:
        train_index: Positional row indices of ``train_df`` forming the
            training part of the fold.
        val_df_kfolds: DataFrame holding the validation rows of the fold.

    Returns:
        ``(X_train, X_val, X_test)`` as CSR matrices, one row per input row.
    """
    # No rows are dropped here: every split must keep exactly one row per
    # input row so the matrices stay aligned with the labels and with the
    # TF-IDF matrices they are stacked with later.
    X_train = csr_matrix(train_df.iloc[train_index][feature_cols].values)
    X_val = csr_matrix(val_df_kfolds[feature_cols].values)
    X_test = csr_matrix(test_df[feature_cols].values)
    return X_train, X_val, X_test
gc.collect()

In [None]:
def create_hstack(X_train, xtrain_tfidf, xtrain_tfidf_ngram_chars, X_val, xvalid_tfidf, xvalid_tfidf_ngram_chars, 
                 X_test, xtest_tfidf, xtest_tfidf_ngram_chars):
    """Column-stack the three feature groups of each split into one matrix.

    For every split the matrices are joined left to right in the order
    given: first the ``X_*`` matrix, then the word-level TF-IDF matrix,
    then the character-level TF-IDF matrix.

    Returns:
        ``(input_train, input_valid, input_test)``.
    """
    per_split = (
        (X_train, xtrain_tfidf, xtrain_tfidf_ngram_chars),
        (X_val, xvalid_tfidf, xvalid_tfidf_ngram_chars),
        (X_test, xtest_tfidf, xtest_tfidf_ngram_chars),
    )
    input_train, input_valid, input_test = (
        hstack(list(parts)) for parts in per_split
    )
    return input_train, input_valid, input_test

#print('input_train: ', input_train)
# Module-level placeholders for the word/char vectors of each split, all None.
train_word_vector = train_char_vector = None
valid_word_vector = valid_char_vector = None
test_word_vector = test_char_vector = None
#print('input_train: ', input_train)

**Build The model**

In [None]:
def build_xgb(train_X, train_y, valid_X, valid_y=None, subsample=0.75):
    """Wrap the inputs in DMatrix objects and assemble the booster parameters.

    Reference: some settings inspired by Toxic competition kernels.

    Args:
        train_X: Training feature matrix.
        train_y: Training labels.
        valid_X: Validation feature matrix; only used when ``valid_y`` is given.
        valid_y: Validation labels. When None, no validation DMatrix is built.
        subsample: Ratio used both for row subsampling and for column
            subsampling per tree.

    Returns:
        ``(xgtrain, xgvalid, model_params)``; ``xgvalid`` is None when
        ``valid_y`` is None.
    """
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    xgvalid = xgb.DMatrix(valid_X, label=valid_y) if valid_y is not None else None

    model_params = {
        # binary 0 or 1
        'objective': 'binary:logistic',
        # eta is the learning_rate, [default=0.3]
        'eta': 0.3,
        # depth of the tree, deeper more complex.
        'max_depth': 7,
        # 0 = silent. Replaces the deprecated `silent` flag.
        'verbosity': 0,
        'eval_metric': 'auc',
        # minimum sum of instance weight needed in a child [default=1]
        'min_child_weight': 1,
        # subsample ratio for the training instance
        'subsample': subsample,
        # subsample ratio of columns when constructing each tree
        'colsample_bytree': subsample,
        # random seed
        'seed': 2021,
    }

    return xgtrain, xgvalid, model_params

**Train The Model**

In [None]:
def run_model(xgtrain, xgvalid, model_params, num_rounds=500, patience=2):
    """Train a booster, using early stopping when a validation set is given.

    Args:
        xgtrain: Training DMatrix.
        xgvalid: Validation DMatrix, or None when there is no validation set.
        model_params: Booster parameter dict passed to ``xgb.train``.
        num_rounds: Maximum number of boosting rounds.
        patience: Stop if the validation metric does not improve for this
            many rounds. Ignored when ``xgvalid`` is None.

    Returns:
        The trained booster returned by ``xgb.train``.
    """
    # watchlist: which datasets are evaluated and printed after each round.
    watchlist = [(xgtrain, 'train')]
    if xgvalid is None:
        # Without a validation set there is nothing meaningful to stop on,
        # so all num_rounds rounds are run.
        return xgb.train(model_params, xgtrain,
                         num_boost_round=num_rounds, evals=watchlist)

    # The validation set goes last: early stopping monitors the last entry.
    watchlist.append((xgvalid, 'test'))
    model = xgb.train(model_params, xgtrain,
                      num_boost_round=num_rounds, evals=watchlist,
                      early_stopping_rounds=patience)

    return model

In [None]:
# Arithmetic mean of a list
def Average(lst):
    """Return the sum of *lst* divided by its number of elements."""
    total = sum(lst)
    count = len(lst)
    return total / count

In [None]:
def plot_roc(X_test, model, input_test):
    """Plot the model's ROC curve on the hold-out set next to a no-skill baseline.

    The true labels are read from the module-level ``test_df["target"]``.

    Args:
        X_test: Not used by the computation; kept so existing callers
            keep working.
        model: Trained booster; ``predict`` is called with
            ``ntree_limit=model.best_ntree_limit``.
        input_test: Feature matrix of the hold-out rows.
    """
    y_test = test_df["target"]
    # generate a no skill prediction (majority class)
    ns_probs = [0] * len(y_test)
    xg_probs = model.predict(xgb.DMatrix(input_test), ntree_limit=model.best_ntree_limit)

    ns_auc = roc_auc_score(y_test, ns_probs)
    xg_auc = roc_auc_score(y_test, xg_probs)
    # summarize scores (the second line reports the XGBoost model)
    print('No Skill: ROC AUC=%.3f' % (ns_auc))
    print('XGBoost: ROC AUC=%.3f' % (xg_auc))

    # calculate roc curves
    ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
    xg_fpr, xg_tpr, _ = roc_curve(y_test, xg_probs)

    # plot both curves with labelled axes and a legend
    pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
    pyplot.plot(xg_fpr, xg_tpr, marker='.', label='XGBoost')
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    pyplot.legend()
    pyplot.show()

In [None]:
def thresholding(validate_hat, y_val):
    """Sweep decision thresholds over predicted scores and report the best ones.

    Thresholds 0.00, 0.01, ..., 0.99 are tried. For each one the scores are
    binarised with ``score > threshold`` and both the F1 score and the
    balanced accuracy against ``y_val`` are computed and printed. The F1
    curve is plotted at the end.

    Args:
        validate_hat: Predicted scores, either 1-D or a single column.
        y_val: True binary labels, one per score.

    Returns:
        ``(best_f1, best_accuracy, best_threshold)``: the highest F1 score,
        the highest balanced accuracy over all thresholds, and the lowest
        threshold that reaches the highest F1 score.
    """
    # Flatten so an (n, 1) column of scores is handled like a 1-D vector.
    scores = np.asarray(validate_hat).ravel()
    thresholds = np.arange(0, 1, 0.01)
    fscores = []
    accuracies = []
    for threshold in thresholds:
        predicted = (scores > threshold).astype(int)
        score = f1_score(y_val, predicted)
        _accuracy = balanced_accuracy_score(y_val, predicted)
        fscores.append(score)
        accuracies.append(_accuracy)
        print('F1 score: {} for threshold: {}'.format(score, threshold))
        print('accuracy score: {} for threshold: {}'.format(_accuracy, threshold))

    # The best values are picked once, after every threshold has been scored.
    # argmax returns the first maximum, i.e. the lowest best threshold.
    best_index = int(np.argmax(fscores))
    best_threshold = thresholds[best_index]
    best_f1 = fscores[best_index]
    best_accuracy = max(accuracies)
    print('best threshold to generate predictions: ', best_threshold)
    print('best f1 score: ', best_f1)
    print('best accuracy score: ', best_accuracy)

    plt.plot(thresholds, fscores)
    plt.xlabel("Threshold")
    plt.ylabel("F1-Score")
    plt.show()
    return best_f1, best_accuracy, best_threshold

In [None]:
def train_xgboost():
    """Train one XGBoost model per stratified fold and score the hold-out set.

    Reads the module-level ``train_df`` and ``test_df``. For each of the four
    folds the texts are vectorised, stacked with the hand-crafted features,
    a model is trained with early stopping on the validation part, the ROC
    curve on the hold-out set is plotted and the validation thresholds are
    swept.

    NOTE: only the model of the final fold is used for the hold-out
    predictions and returned, and the validation metrics returned are those
    of the final fold.

    Returns:
        ``(model, best_f1, best_accuracy, best_f1_test, best_accuracy_test,
        best_threshold, best_threshold_test)``.
    """
    kf = StratifiedKFold(n_splits=4, random_state=1, shuffle=True)
    X = train_df["question_text"].dropna()
    y = train_df["target"].dropna()
    # The hold-out texts and labels are the same for every fold.
    test_text, y_test = test_df["question_text"], test_df["target"]

    for fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
        print('\n{} of kfold {}'.format(fold, kf.n_splits))
        train_text, val_text = X.iloc[train_index].dropna(), X.iloc[test_index].dropna()
        y_train, y_val = y.iloc[train_index].dropna(), y.iloc[test_index].dropna()

        # Vectorize data
        print("Vectorizing Data...")
        (xtrain_tfidf, xvalid_tfidf, xtest_tfidf,
         xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars,
         xtest_tfidf_ngram_chars) = tfidfvectorizer(train_text, val_text, test_text)
        gc.collect()

        # The numeric feature matrices must have the same number of rows as
        # the TF-IDF matrices for the hstack below.
        print("Prepping input values...")
        feat_train, feat_val, feat_test = prep_input_vals(
            train_index, train_df.iloc[test_index].dropna())
        gc.collect()

        print("Creating Hstack...")
        input_train, input_valid, input_test = create_hstack(
            feat_train, xtrain_tfidf, xtrain_tfidf_ngram_chars,
            feat_val, xvalid_tfidf, xvalid_tfidf_ngram_chars,
            feat_test, xtest_tfidf, xtest_tfidf_ngram_chars)
        gc.collect()

        print("Building model...")
        xgtrain, xgvalid, model_params = build_xgb(input_train, y_train, input_valid, y_val)
        gc.collect()

        print("Running model...")
        model = run_model(xgtrain, xgvalid, model_params)
        gc.collect()

        print('predict validation...')
        validate_hat = np.zeros((feat_val.shape[0], 1))
        validate_hat[:, 0] = model.predict(xgb.DMatrix(input_valid), ntree_limit=model.best_ntree_limit)

        print('Plot roc curve...')
        plot_roc(feat_test, model, input_test)

        print('Thresholding Validated Set...')
        best_f1, best_accuracy, best_threshold = thresholding(validate_hat, y_val)

    # Hold-out predictions use the model and inputs left over from the last fold.
    test_hat = np.zeros((feat_test.shape[0], 1))
    test_hat[:, 0] = model.predict(xgb.DMatrix(input_test), ntree_limit=model.best_ntree_limit)

    print('Thresholding Test Set...')
    best_f1_test, best_accuracy_test, best_threshold_test = thresholding(test_hat, y_test)

    return model, best_f1, best_accuracy, best_f1_test, best_accuracy_test, best_threshold, best_threshold_test

In [None]:
print('train the model')
# Start from 0 so the name exists even before training has produced a value.
best_f1score = 0
(
    model,
    best_f1score,
    best_accuracy,
    best_f1_test,
    best_accuracy_test,
    best_threshold,
    best_threshold_test,
) = train_xgboost()

In [None]:

# The first three lines are the validation metrics, the last three the
# hold-out (test) metrics; both groups use the same labels.
for _label, _value in (
    ("Best F1_score:", best_f1score),
    ("Best Accuracy: ", best_accuracy),
    ("Best Threshold: ", best_threshold),
    ("Best F1_score:", best_f1_test),
    ("Best Accuracy: ", best_accuracy_test),
    ("Best Threshold: ", best_threshold_test),
):
    print(_label, _value)

In [None]:
#kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
#print(model_params)
#grid_search = GridSearchCV(model, model_params, scoring="neg_log_loss", n_jobs=-1, cv=kfold, verbose=1)
#grid_result = grid_search.fit(X, label_encoded_y)

**Predict And Export Results**

In [None]:
#print('predict results')
#predictions = np.zeros(( X_test.shape[0], 1) )
#predictions[:,0] = model.predict(xgb.DMatrix(input_test), ntree_limit=model.best_ntree_limit)
#print(predictions)

In [None]:
#def save_results(submit, y_hat, name, threshold=0.35):
#    print('threshold is: ', threshold)
#    results = (y_hat > threshold).astype(int)
#    print(results[:100])
#    submit['prediction'] = results
#    save_to = (name+'.csv')
#    submit.to_csv(save_to, index=False)