# Importing Libraries:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
import distance
from wordcloud import WordCloud

In [None]:
# Load the Quora question-pair training file and preview its first rows.
train_csv_path = '../input/quora-train/train.csv'
data = pd.read_csv(train_csv_path)
data.head(4)

In [None]:
print(data.shape)

In [None]:
# Getting 50,000 random rows from a total of 4 Lakh rows.
# The seed is fixed so that the sample, and every statistic, plot and score
# derived from it below, is the same on every run (the splits and models
# further down are seeded too).

data_ = data.sample(n = 50000, random_state = 0)
data_.head(4)

In [None]:
# Storing the two questions in separate variables.
# Columns are picked by name instead of by position, and missing questions
# are replaced by '' here: the cleaning cells call str() on every value, and
# str(NaN) would otherwise inject the literal word "nan" into the text.

q1 = data_['question1'].fillna('').values
q2 = data_['question2'].fillna('').values
y = data_['is_duplicate'].values # class label

In [None]:
labels = data_['is_duplicate'].value_counts()
labels

# Distribution of class label: 0 = Not Duplicate, 1 = Duplicate


In [None]:
# Bar chart with one bar per class label, sized by how many pairs carry it.
x_axis = ['0', '1']
y_axis = [labels[class_label] for class_label in (0, 1)]

plt.bar(x_axis, y_axis, width=0.4)
plt.title('Frequency of Duplicate questions')
plt.xlabel('Duplicate or not')
plt.ylabel('Number of questions')
plt.show()

In [None]:
# Share of each class label in the sample, expressed as a percentage.

pair_count = labels[0] + labels[1]
similar = (labels[1] / pair_count) * 100
not_similar = (labels[0] / pair_count) * 100
print('Percentage of Similar question pairs in dataset is: {}%'.format(similar))
print('Percentage of Not Similar questions in dataset is: {}%'.format(not_similar))


# Checking null values:

In [None]:
# Show the rows that contain a null, blank them out, then confirm none remain.
null_value = data_[data_.isnull().any(axis=1)]
print(null_value)
print('*'*75)
data_ = data_.fillna('')
nan_rows = data_[data_.isnull().any(axis=1)]
print (nan_rows)

# .any(axis=1) flags every row that has at least one null value.
# The axis is passed by keyword because pandas 2.0 no longer accepts it positionally.

# General idea about the data:

In [None]:
# Every question id in the sample, taken from both sides of each pair.
ques = pd.Series(data_['qid1'].tolist() + data_['qid2'].tolist())
values = ques.value_counts() # occurrences per qid, most frequent first; computed once and reused
unique_ques = len(np.unique(ques))
repeat = np.sum(values > 1) # number of qids that occur more than once
max_ques = values.max()

idx = values.index[0] # To get the qid value for question that repeats maximum number of times

# The most frequent qid may sit on either side of a pair, so look at both columns.
as_q1 = data_['qid1'] == idx
as_q2 = data_['qid2'] == idx
maxi = data_.loc[as_q1 | as_q2]

# Text of that question: taken from question1 when the id occurs as qid1
# (the original behaviour), otherwise from question2 so the frame is never empty.
if as_q1.any():
    maxi2 = data_.loc[as_q1, ['question1']]
else:
    maxi2 = data_.loc[as_q2, ['question2']]

print('Question ID and their number of occurences')
print(values[:5])


In [None]:
# Summary of the question-id statistics computed in the previous cell.
separator = '*'*75
repeat_percentage = round((repeat/unique_ques)*100,2)

print('Total number of unique questions are: ',unique_ques)
print(separator)
print('{}% of the unique questions repeat themselves'.format(repeat_percentage))
print(separator)
print('The following question is repeated {} times'.format(max_ques))
maxi2

# Data Cleaning:

In [None]:
# not removing stop-words yet:

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
lemma = WordNetLemmatizer()

import re
def clean(text):
    '''
    Normalise one question.

    The text is lower-cased, number/currency shorthands and contracted words
    are expanded, every remaining non-letter character is replaced by a space
    and the resulting words are returned in their lemma form.

    text: the raw question string.
    returns: the cleaned words joined by single spaces ('' if nothing is left).
    '''
    # Keep working on the lower-cased copy: the patterns below are lower case.
    sent = text.lower().strip() # Done for whole sentence
    # Expand shorthands and contractions BEFORE stripping punctuation; once the
    # apostrophes, commas and symbols are gone none of these patterns can match.
    sent = sent.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\
                           .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\
                           .replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\
                           .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\
                           .replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\
                           .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
                           .replace("€", " euro ").replace("'ll", " will").replace("doesn't", "does not")
    # Whatever is still not a letter (digits, punctuation) becomes a space.
    sent = re.sub(r'[^a-zA-Z]', ' ', sent) # Done for whole sentence
    final = [lemma.lemmatize(word) for word in sent.split()]
    final_sent = ' '.join(final)
    return final_sent

# The strip() method removes any leading (spaces at the beginning) and trailing (spaces at the end) characters

In [None]:
# Cleaned text of every question1, in the same order as q1.
q1_clean = [clean(str(raw_question)) for raw_question in q1]

In [None]:
# Cleaned text of every question2, in the same order as q2.
q2_clean = [clean(str(raw_question)) for raw_question in q2]

# Wordcloud:

In [None]:
# Rows split by class label: 1 = duplicate pair, 0 = non-duplicate pair.
label_column = data_['is_duplicate']
duplicate = data_[label_column == 1]
not_duplicate = data_[label_column == 0]

# Stack question1/question2 side by side and flatten, giving one flat array
# in which the two questions of each pair follow one another.
similar = np.dstack((duplicate["question1"], duplicate["question2"])).flatten()
no_similar = np.dstack((not_duplicate["question1"], not_duplicate["question2"])).flatten()

In [None]:
# Small demonstration of np.dstack followed by flatten():
# the two arrays are paired element by element, then laid out in one row.
gfg1 = np.array([1, 2, 3])
gfg2 = np.array([4, 5, 6])

stacked = np.dstack((gfg1, gfg2))
print(stacked)
print(stacked.flatten())


In [None]:
print((similar)[:4])

In [None]:
print(no_similar[:4])

In [None]:
# Word Cloud for Duplicate question:

stop_words = set(stopwords.words("english"))

# Lower-case every whitespace-separated word of every duplicate question and
# glue them into one text (each word followed by a space). Built with a single
# join instead of repeated `+=`, which re-copies the growing string each time.
duplicate_words = "".join(
    word.lower() + " "
    for sentence in similar
    for word in str(sentence).split()
)

wc_q = WordCloud(width = 800, height = 800,background_color ='white',stopwords = stop_words,min_font_size = 10)
wc_q.generate(duplicate_words)
print('Word Cloud for Duplicate question')
plt.imshow(wc_q, interpolation='bilinear')
plt.axis("off")
plt.show()
    

In [None]:
# Word Cloud for Non-Duplicate question:

# Lower-case every whitespace-separated word of every non-duplicate question
# and glue them into one text (each word followed by a space). Built with a
# single join instead of repeated `+=`, which re-copies the growing string.
not_duplicate_words = "".join(
    word.lower() + " "
    for sentence in no_similar
    for word in str(sentence).split()
)

wc_q_ = WordCloud(width = 800, height = 800,background_color ='white',stopwords = stop_words,min_font_size = 10)
wc_q_.generate(not_duplicate_words)
print('Word Cloud for Non Duplicate question')
plt.imshow(wc_q_, interpolation='bilinear')
plt.axis("off")
plt.show()

#  Feature Extraction:

In [None]:
# Number of whitespace-separated words in each cleaned question
q1_len = [len(tokens) for tokens in map(str.split, q1_clean)]
q2_len = [len(tokens) for tokens in map(str.split, q2_clean)]

# Common words between two sentences
def comm(k):
    '''
    Count the distinct words shared by the k-th cleaned question pair.

    k: position of the pair in q1_clean / q2_clean.
    returns: size of the intersection of the two lower-cased word sets.
    '''
    # split() without an argument returns no tokens for an empty question;
    # split(" ") would return [''] and make two empty questions "share" a word.
    w1 = {word.lower() for word in q1_clean[k].split()}
    w2 = {word.lower() for word in q2_clean[k].split()}
    return len(w1 & w2)
common = [comm(r) for r in range(len(q1_clean))] # storing the number of common words

# Word_share: (common_word_count/total number of words)

total = [a+b for a,b in zip(q1_len,q2_len)]
# When both cleaned questions are empty the total is 0; report 0 instead of
# dividing by zero (same convention as cwc_min).
word_share = [round((c/d),3) if d != 0 else 0 for c,d in zip(common,total)]

# cwc_min & cwc_max: (Ratio of common_word_count to min & max length of word count of Q1 and Q2)

min_len = [min(u,t) for u,t in zip(q1_len,q2_len)] # word count of the shorter question of each pair
max_len = [max(u_,t_) for u_,t_ in zip(q1_len,q2_len)] # word count of the longer question of each pair

# A length of 0 means a division by zero; the ratio is reported as 0 in that
# case. max_len is 0 only when both questions are empty, but it still needs
# the same guard as min_len.
cwc_min = [round((e/f),3) if f!=0 else 0 for e,f in zip(common,min_len)]
cwc_max = [round((e/f),3) if f!=0 else 0 for e,f in zip(common,max_len)]

# first word / last word equal or not:
# 1 when both cleaned questions are non-empty and start (resp. end) with the
# same word, 0 otherwise (including when either question has no words).

first_word = []
last_word = []
for sentence_a, sentence_b in zip(q1_clean, q2_clean):
    tokens_a = sentence_a.split()
    tokens_b = sentence_b.split()
    if not tokens_a or not tokens_b: # handling the condition when length of question = 0
        first_word.append(0)
        last_word.append(0)
        continue
    first_word.append(1 if tokens_a[0] == tokens_b[0] else 0)
    last_word.append(1 if tokens_a[-1] == tokens_b[-1] else 0)

        
length_pairs = list(zip(q1_len, q2_len))
len_diff = [abs(first - second) for first, second in length_pairs] # absolute difference between number of words in Q1 & Q2
avg_len = [(first + second)/2 for first, second in length_pairs] # average number of words in Q1 & Q2

In [None]:
# Adding these new extracted features to the dataframe
# (column name -> list of per-pair values, added in this order)

basic_features = {
    'q1_length': q1_len,
    'q2_length': q2_len,
    'common words': common,
    'word share': word_share,
    'cwc_min': cwc_min,
    'cwc_max': cwc_max,
    'first word equal': first_word,
    'last word equal': last_word,
    'difference in no.of words': len_diff,
    'avg length of words': avg_len,
}
for column_name, column_values in basic_features.items():
    data_[column_name] = column_values

In [None]:
(data_.head())

In [None]:
# Data visualization for "Word Share":
# left: violin plot per class label, right: density of the feature per class.

share_dup = data_.loc[data_['is_duplicate'] == 1.0, 'word share']
share_non_dup = data_.loc[data_['is_duplicate'] == 0.0, 'word share']

plt.figure(figsize=(8, 6))

plt.subplot(1,2,1)
sns.violinplot(x='is_duplicate', y='word share', data=data_)

plt.subplot(1,2,2)
sns.kdeplot(share_dup, label="1", color='red')
sns.kdeplot(share_non_dup, label="0", color='blue')
plt.show()

-- There is significant overlap when word_share is used as a feature to classify question pairs as duplicate or not. So, this feature doesn't help much.

In [None]:
# Data visualization for "Common words":
# left: violin plot per class label, right: density of the feature per class.

common_dup = data_.loc[data_['is_duplicate'] == 1.0, 'common words']
common_non_dup = data_.loc[data_['is_duplicate'] == 0.0, 'common words']

plt.figure(figsize=(8, 6))

plt.subplot(1,2,1)
sns.violinplot(x='is_duplicate', y='common words', data=data_)

plt.subplot(1,2,2)
sns.kdeplot(common_dup, label="1", color='red')
sns.kdeplot(common_non_dup, label="0", color='blue')
plt.show()

-- A highly overlapping distribution is observed. Hence, this feature also doesn't do much good for classifying labels 0 & 1.

# Advanced Feature Extraction: (FuzzyWuzzy)

In [None]:
# To get longest substring ratio

def lsubstring_ratio(a, b):
    '''
    Length of a longest common substring of a and b, divided by
    (length of the shorter string + 1); 0 when nothing is shared.
    '''
    # distance.lcsubstrings yields the longest common substring(s); they all
    # have the same length, so the first one is enough.
    strs = list(distance.lcsubstrings(a, b))
    if not strs:
        return 0
    shorter_length = min(len(a), len(b))
    return len(strs[0]) / (shorter_length + 1) # the +1 keeps the denominator non-zero

# The fuzzywuzzy scorers compare strings. The cleaned questions are therefore
# passed as they are: handing over k.split() (a list) makes the library fall
# back to the list's printed form, so brackets, quotes and commas take part
# in the comparison.

# partial ratio:
partial = [fuzz.partial_ratio(k,l) for k,l in zip(q1_clean,q2_clean)]

# token sort ratio:
token_sort = [fuzz.token_sort_ratio(k,l) for k,l in zip(q1_clean,q2_clean)]

# token set ratio:
token_set = [fuzz.token_set_ratio(k,l) for k,l in zip(q1_clean,q2_clean)]

# WRatio:
wratio = [fuzz.WRatio(k,l) for k,l in zip(q1_clean,q2_clean)]

# LCSubstring ratio
lcs = [lsubstring_ratio(k,l) for k,l in zip(q1_clean,q2_clean)]


In [None]:
# Adding these new advanced features to the dataframe:
# (column name -> list of per-pair values, added in this order)

advanced_features = {
    "token_set_ratio": token_set,
    "token_sort_ratio": token_sort,
    "fuzz_WRatio": wratio,
    "fuzz_partial_ratio": partial,
    "longest_substr_ratio": lcs,
}
for column_name, column_values in advanced_features.items():
    data_[column_name] = column_values

data_.head()

# Data Visualization of Advanced Features:

In [None]:
# Pair plot of some of the advanced features, coloured by class label:

ratio_columns = ['token_set_ratio', 'token_sort_ratio', 'fuzz_partial_ratio', 'fuzz_WRatio']
n = len(data_)
pairplot_frame = data_[ratio_columns + ['is_duplicate']][0:n]
sns.pairplot(pairplot_frame, hue='is_duplicate', vars=ratio_columns)
plt.show()

# vars:list of variable names
# Variables within data to use, otherwise use every column with a numeric datatype.

-- From the above combinations of features we see that 'token sort/set ratio' & 'fuzz_partial_ratio' do fairly well.

In [None]:
# Distribution of the fuzz_partial_ratio:
# left: violin plot per class label, right: density of the feature per class.

partial_dup = data_.loc[data_['is_duplicate'] == 1.0, 'fuzz_partial_ratio']
partial_non_dup = data_.loc[data_['is_duplicate'] == 0.0, 'fuzz_partial_ratio']

plt.figure(figsize=(8, 6))

plt.subplot(1,2,1)
sns.violinplot(x='is_duplicate', y='fuzz_partial_ratio', data=data_)

plt.subplot(1,2,2)
sns.kdeplot(partial_dup, label="1", color='red')
sns.kdeplot(partial_non_dup, label="0", color='blue')
plt.show()

-- Fuzz partial ratio as a feature has major overlap for values between 45 - 100.


In [None]:
# Distribution of the WRatio:
# left: violin plot per class label, right: density of the feature per class.

wratio_dup = data_.loc[data_['is_duplicate'] == 1.0, 'fuzz_WRatio']
wratio_non_dup = data_.loc[data_['is_duplicate'] == 0.0, 'fuzz_WRatio']

plt.figure(figsize=(8, 6))

plt.subplot(1,2,1)
sns.violinplot(x='is_duplicate', y='fuzz_WRatio', data=data_)

plt.subplot(1,2,2)
sns.kdeplot(wratio_dup, label="1", color='red')
sns.kdeplot(wratio_non_dup, label="0", color='blue')
plt.show()

-- For WRatio <45, class label 0 is easily distinguishable

In [None]:
# Distribution of the Token Set ratio:
# left: violin plot per class label, right: density of the feature per class.

token_set_dup = data_.loc[data_['is_duplicate'] == 1.0, 'token_set_ratio']
token_set_non_dup = data_.loc[data_['is_duplicate'] == 0.0, 'token_set_ratio']

plt.figure(figsize=(8, 6))

plt.subplot(1,2,1)
sns.violinplot(x='is_duplicate', y='token_set_ratio', data=data_)

plt.subplot(1,2,2)
sns.kdeplot(token_set_dup, label="1", color='red')
sns.kdeplot(token_set_non_dup, label="0", color='blue')
plt.show()

-- Token set ratio as a feature works fairly well as seen from kdeplot.

# Data Preprocessing:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from gensim.models import Word2Vec

In [None]:
# Removing stop words:
all_stopwords = stopwords.words('english')

# These words are taken OUT of the stop-word list, i.e. they stay in the questions.
words_to_keep = (
    'not', 'but', 'because', 'against', 'between', 'up', 'down', 'in', 'out',
    'once', 'before', 'after', 'few', 'more', 'most', 'no', 'nor', 'same',
    'some',
)
for kept_word in words_to_keep:
    all_stopwords.remove(kept_word)

def remove_stopwords(texts):
    '''
    Remove stop words from a sentence.

    texts: the sentence to filter.
    returns: the lower-cased sentence without any word found in
             all_stopwords, words joined by single spaces.
    '''
    # Build the lookup set once per call instead of once per word.
    stop_set = set(all_stopwords)
    sentence = texts.lower().strip().split()
    final1 = [word1 for word1 in sentence if word1 not in stop_set]
    final1_ = ' '.join(final1)
    return final1_

# question1 / question2 with the stop words removed, in the original order.
q1_clean1 = [remove_stopwords(str(cleaned)) for cleaned in q1_clean]
q2_clean2 = [remove_stopwords(str(cleaned)) for cleaned in q2_clean]

In [None]:
print(q1_clean1[:3])
print('*'*50)
print(q2_clean2[:3])

In [None]:
# TF-IDF Vector:
# One vocabulary is fitted on both question columns together, then each
# column is transformed separately.

total_questions = (q1_clean1) + (q2_clean2)
tfidf = TfidfVectorizer(max_features = 500, min_df=50)
tfidf.fit(total_questions)
q1_vector = tfidf.transform(q1_clean1).toarray()
q2_vector = tfidf.transform(q2_clean2).toarray()

# Creating a dictionary with word as a key, and the idf as a value
# This is done so that we can get TF-IDF values.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_vocabulary = list(tfidf.get_feature_names_out())
else:
    tfidf_vocabulary = tfidf.get_feature_names()

tfidf_values = dict(zip(tfidf_vocabulary, list(tfidf.idf_)))
print('Shape of tf-idf vector is: ',q1_vector.shape)

In [None]:
# Vocabulary kept by the TF-IDF vectorizer, as a plain list of words.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
if hasattr(tfidf, 'get_feature_names_out'):
    features = list(tfidf.get_feature_names_out())
else:
    features = tfidf.get_feature_names()
print(features[:25])

# Creating my own W2V model on the Quora question pair corpus:

In [None]:
# Creating w2v model on question 1:
# each stop-word-free question becomes a list of tokens, the model's input.
q1_list = [question.split() for question in q1_clean1]

w2v_q1 = Word2Vec(q1_list, min_count=5, vector_size=200, workers=2)
w2v_words_q1 = list(w2v_q1.wv.key_to_index) # words the model has a vector for
print("sample words ", w2v_words_q1[0:50])

In [None]:
# Creating w2v model on question 2:
# each stop-word-free question becomes a list of tokens, the model's input.
q2_list = [question.split() for question in q2_clean2]

w2v_q2 = Word2Vec(q2_list, min_count=5, vector_size=200, workers=2)
w2v_words_q2 = list(w2v_q2.wv.key_to_index) # words the model has a vector for

In [None]:
# Getting TF-IDF*W2V values for question 1 :
# each question becomes the tf-idf weighted average of its word vectors.

tfidf_w2v_q1 = [] # the tfidf-w2v for each question1 is stored in this list

# Dict lookups instead of scanning Python lists for every word:
# key_to_index holds the same words as w2v_words_q1, and the keys of
# tfidf_values are the TF-IDF feature names.
q1_vocab = w2v_q1.wv.key_to_index
q1_dim = w2v_q1.wv.vector_size # length of one word vector

for sent1 in tqdm(q1_list): # for each question1
    sent_vec1 = np.zeros(q1_dim)
    tfidf_sum1 = 0
    for word1 in sent1: # for each word in question1
        if word1 in q1_vocab and word1 in tfidf_values:
            vec1 = w2v_q1.wv[word1] # w2v vector for the word
            tf_idf_q1 = tfidf_values[word1]*(sent1.count(word1)/len(sent1)) # idf * tf = tfidf
            sent_vec1 += (vec1 * tf_idf_q1) # w2v * tfidf
            tfidf_sum1 += tf_idf_q1 # summation of tfidf
    if tfidf_sum1 != 0: # no usable word in the question: keep the zero vector
        sent_vec1 = sent_vec1/tfidf_sum1
    tfidf_w2v_q1.append(sent_vec1)

tfidf_w2v_q1_list = list(tfidf_w2v_q1)

In [None]:
# Getting TF-IDF*W2V values for question 2 :
# each question becomes the tf-idf weighted average of its word vectors.

tfidf_w2v_q2 = [] # the tfidf-w2v for each question 2 is stored in this list

# Dict lookups instead of scanning Python lists for every word:
# key_to_index holds the same words as w2v_words_q2, and the keys of
# tfidf_values are the TF-IDF feature names.
q2_vocab = w2v_q2.wv.key_to_index
q2_dim = w2v_q2.wv.vector_size # length of one word vector

for sent2 in tqdm(q2_list): # for each question2
    sent_vec2 = np.zeros(q2_dim)
    tfidf_sum2 = 0
    for word2 in sent2: # for each word in question2
        if word2 in q2_vocab and word2 in tfidf_values:
            vec2 = w2v_q2.wv[word2] # w2v vector for the word
            tf_idf_q2 = tfidf_values[word2]*(sent2.count(word2)/len(sent2)) # idf * tf = tfidf
            sent_vec2 += (vec2 * tf_idf_q2) # w2v * tfidf
            tfidf_sum2 += tf_idf_q2 # summation of tfidf
    if tfidf_sum2 != 0: # no usable word in the question: keep the zero vector
        sent_vec2 = sent_vec2/tfidf_sum2
    tfidf_w2v_q2.append(sent_vec2)

tfidf_w2v_q2_list = list(tfidf_w2v_q2)
   

In [None]:
print('The dimensions of TF_IDF_W2V is:',len(tfidf_w2v_q2[0]))

In [None]:
# Dropping the features which won't be required in Modeling
# (question ids and the raw question text)

unused_columns = ['qid1','qid2','question1','question2']
new_data = data_.drop(columns=unused_columns)
print(new_data.shape)

In [None]:
# One frame per question side holding its TF-IDF weighted W2V vector,
# indexed like new_data and tagged with the pair's 'id' for the merge below.
n1 = pd.DataFrame(tfidf_w2v_q1_list, index=new_data.index) # Storing the q1 vector here
n1['id'] = new_data['id']

n2 = pd.DataFrame(tfidf_w2v_q2_list, index=new_data.index) # Storing the q2 vector here
n2['id'] = new_data['id']

In [None]:
# Merging all the dataframes:
# q1 vectors + q2 vectors on 'id', then the hand-made features + both vectors.

n3 = pd.merge(n1, n2, how='left', on='id')
final_data = pd.merge(new_data, n3, how='left', on='id')
print('Final dimensions of the data:',final_data.shape)
final_data

# Modeling:

In [None]:
import math
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, log_loss
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold 
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, auc, roc_curve
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix (everything except the label and the pair id) and target.
x = final_data.drop(columns=['is_duplicate','id'])
y = final_data['is_duplicate']

In [None]:
# Splitting the data into Train,CV and Test Sets:
# first 80/20 into train/test, then the train part 80/20 into train1/cv;
# both splits are stratified on the label and use the same seed.

split_options = dict(test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, **split_options)
x_train1, x_cv, y_train1, y_cv = train_test_split(x_train, y_train, stratify=y_train, **split_options)

In [None]:
# Report the shape of every split.
split_shapes = (
    ('Train data dimensions:', x_train1),
    ('Cross validation data dimensions:', x_cv),
    ('Test data dimensions:', x_test),
    ('Total train data dimensions:', x_train),
)
for caption, frame in split_shapes:
    print(caption, frame.shape)

# Random Model: (To check the log-loss to beat)

In [None]:
# Baseline for a binary problem: every test pair gets a random probability
# vector (two uniform draws normalised so that they sum to 1). The log-loss
# of these random probabilities is the score any real model has to beat.

random_prob = np.random.rand(len(y_test), 2)
predicted_y = random_prob / random_prob.sum(axis=1, keepdims=True)
# `eps` is not passed: 1e-15 was already the default on older scikit-learn,
# and the keyword was removed in scikit-learn 1.5.
print("Log loss on Test Data using Random Model",log_loss(y_test, predicted_y))

# Logistic Regression with Hyperparameter Tuning:

In [None]:
# Logistic regression trained with SGD; alpha (L2 strength) is picked on the
# CV split by calibrated log-loss, then the model is refitted on all of train.
import re
import sklearn

# The logistic loss is called 'log_loss' from scikit-learn 1.1 onwards and
# the old name 'log' was removed in 1.3; choose the name the installed
# version understands.
_sk_version = tuple(int(part) for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups())
sgd_log_loss = 'log_loss' if _sk_version >= (1, 1) else 'log'

alpha = [10 ** exponent for exponent in range(-5, 2)] # hyperparameter for SGD classifier.

log_error_array=[] # this list contains the log-loss obtained with different values of alpha
for i in alpha:
    clf = SGDClassifier(alpha=i, penalty='l2', loss=sgd_log_loss, random_state=42)
    clf.fit(x_train1, y_train1)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(x_train1, y_train1)
    predict_y = sig_clf.predict_proba(x_cv) # getting the predictions in form of probabilities to use log_loss as a metric

    # `eps` is not passed to log_loss: 1e-15 was already the default on older
    # scikit-learn and the keyword was removed in 1.5.
    cv_log_loss = log_loss(y_cv, predict_y, labels=clf.classes_)
    log_error_array.append(cv_log_loss)
    print('For values of alpha = ', i, "The log loss is:",cv_log_loss)


best_alpha = np.argmin(log_error_array) # position (in alpha) of the value with minimum log_loss
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss=sgd_log_loss, random_state=42)
clf.fit(x_train, y_train) # now fitting on entire train data
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(x_train, y_train)

predict_y = sig_clf.predict_proba(x_train)
print('*'*100)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(y_train, predict_y, labels=clf.classes_))
predict_y = sig_clf.predict_proba(x_test)
print('*'*100)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(y_test, predict_y, labels=clf.classes_))

# Random Forest:

In [None]:
# Shallow random forest fitted on the whole train split; its probabilities
# are calibrated with a sigmoid before the log-loss is measured.
classifier = RandomForestClassifier(n_estimators = 15, criterion = 'entropy', random_state = 0, max_depth = 5)
classifier.fit(x_train, y_train)

sig_clf = CalibratedClassifierCV(classifier, method="sigmoid")
sig_clf.fit(x_train, y_train)

predict_y_rf_train = sig_clf.predict_proba(x_train)
print('*'*100)

# `eps` is not passed to log_loss: 1e-15 was already the default on older
# scikit-learn and the keyword was removed in 1.5.
print("The train log loss is:",log_loss(y_train, predict_y_rf_train, labels=classifier.classes_))
predict_y_rf_test = sig_clf.predict_proba(x_test)
print('*'*100)
print( "The test log loss is:",log_loss(y_test, predict_y_rf_test, labels=classifier.classes_))

## XGBoost:

In [None]:
import xgboost as xgb
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.01
params['max_depth'] = 5

# Early stopping must not look at the test set, otherwise the reported test
# log-loss is optimistically biased. The model is therefore trained on the
# inner train split and monitored on the CV split; the test split is only
# used for the final score.
d_train = xgb.DMatrix(x_train1, label=y_train1)
d_valid = xgb.DMatrix(x_cv, label=y_cv)
d_test = xgb.DMatrix(x_test, label=y_test)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 300, watchlist, early_stopping_rounds = 20, verbose_eval=10)

predict_y = bst.predict(d_test) # probability of class 1 for every test pair
# The labels are listed explicitly so this cell does not depend on `clf` from
# the logistic-regression cell; `eps` is not passed because 1e-15 was already
# the default on older scikit-learn and the keyword was removed in 1.5.
print("The test log loss is:",log_loss(y_test, predict_y, labels=[0, 1]))

# https://xgboost.readthedocs.io/en/stable/python/python_intro.html