### Problem Statement
Multiple questions with the same intent can cause seekers to spend more time finding the best answer to their question, and make writers feel they need to answer multiple versions of the same question.

### Dataset Description
1. id - the id of a training set question pair
2. qid1, qid2 - unique ids of each question (only available in train.csv)
3. question1, question2 - the full text of each question
4. is_duplicate - the target variable, set to 1 if question1 and question2 have essentially the same meaning, and 0 otherwise.

In [None]:
import zipfile

# Unpack the competition archives into the working directory. The context
# manager closes each archive handle as soon as extraction finishes.
for archive_path in ('../input/quora-question-pairs/train.csv.zip',
                     '../input/quora-question-pairs/test.csv.zip'):
    with zipfile.ZipFile(archive_path) as z:
        z.extractall()

In [None]:
import nltk
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import warnings as wg
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import plot_confusion_matrix, log_loss

from nltk.stem import WordNetLemmatizer
import spacy
from tqdm import tqdm

import vaex
from vaex.ml.sklearn import IncrementalPredictor
from collections import Counter, defaultdict
wg.filterwarnings("ignore")

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

In [None]:
df = pd.read_csv("./train.csv")
df.head(2)

In [None]:
# df[~df['Unnamed: 8'].isna()]

In [None]:
# df = df.drop(["Unnamed: 6", "Unnamed: 7", "Unnamed: 8", "Unnamed: 9", "Unnamed: 10", "Unnamed: 11", 
#          "Unnamed: 12"], axis = 'columns')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.nunique()

In [None]:
df.isna().sum()

In [None]:
df.info()

In [None]:
df["is_duplicate"].value_counts()

In [None]:
df.loc[75, "question1"], df.loc[75, "question2"], df.loc[75, "is_duplicate"]

# Exploratory Data Analysis

In [None]:
df["is_duplicate"].value_counts().plot(kind = "bar")

In [None]:
df.shape

In [None]:
df.nunique()

In [None]:
text1 = " ".join(review for review in df.question1.astype(str))

In [None]:
# Count whitespace-separated tokens; len(text1) alone would report characters, not words.
print ("There are {} words in the combination of all cells in column 'question1'.".format(len(text1.split())))

In [None]:
stop_words = set(STOPWORDS)
wordcloud1 = WordCloud(stopwords=stop_words, background_color="black", width=800, height=400).generate(text1)

**Common words in question1 column**

In [None]:
plt.figure(figsize=(40,20))
plt.tight_layout(pad=0)
plt.imshow(wordcloud1, interpolation='bilinear')
plt.show()

In [None]:
# Concatenate every question2 cell and report its token count (the message
# previously named question1 and reported a character count).
text2 = " ".join(review for review in df.question2.astype(str))
print ("There are {} words in the combination of all cells in column 'question2'.".format(len(text2.split())))

In [None]:
wordcloud2 = WordCloud(stopwords=stop_words, background_color="black", width=800, height=400).generate(text2)

**Common words in question2 column**

In [None]:
plt.figure(figsize=(40,20))
plt.tight_layout(pad=0)
plt.imshow(wordcloud2, interpolation='bilinear')
plt.show()

# Feature Engineering

In [None]:
df.head()

In [None]:
# Series.replace only swaps cells whose *entire* value equals " ?", so it never
# touched text inside a question. The .str accessor performs the substring
# replacement; regex=False keeps "?" a literal character.
df["question1"] = df["question1"].str.replace(" ?", "?", regex=False)
df["question2"] = df["question2"].str.replace(" ?", "?", regex=False)

In [None]:
def to_string(string):
    """Return `string` converted with the built-in `str` (a float NaN becomes "nan")."""
    text = str(string)
    return text

In [None]:
df["question1"] = df["question1"].apply(to_string)
df["question2"] = df["question2"].apply(to_string)

In [None]:
def string_length(string):
    """Return the number of characters in `string`."""
    n_chars = len(string)
    return n_chars

In [None]:
# Number of characters in each string
df["char_count1"] = df["question1"].apply(string_length)
df["char_count2"] = df["question2"].apply(string_length)

In [None]:
def num_of_words(string):
    """Return the number of whitespace-separated tokens in `string`."""
    tokens = string.split()
    return len(tokens)

In [None]:
# Number of words in each question
df["word_count1"] = df["question1"].apply(num_of_words)
df["word_count2"] = df["question2"].apply(num_of_words)

In [None]:
def common_words(string1, string2):
    """Return how many distinct whitespace-separated tokens occur in both strings.

    The comparison is case-sensitive and punctuation stays attached to tokens.
    """
    return len(set(string1.split()) & set(string2.split()))

In [None]:
# Number of common words in question1 and question2
df["words_common"] = df.apply(lambda x: common_words(x.question1, x.question2), axis=1)

In [None]:
# Total number of words in question1 and question2
df["words_total"] = df["word_count1"] + df["word_count2"]

In [None]:
# Shared words ratio:- common_words/total_words
df["shared_words_ratio"] = df["words_common"] / df["words_total"]

In [None]:
def first_word_eq(string1, string2):
    """Return 1 if both strings start with the same whitespace-separated token, else 0.

    An empty or whitespace-only string has no first token; such pairs yield 0
    instead of raising IndexError.
    """
    q1 = string1.split()
    q2 = string2.split()

    if not q1 or not q2:
        return 0
    return int(q1[0] == q2[0])

def last_word_eq(string1, string2):
    """Return 1 if both strings end with the same whitespace-separated token, else 0.

    An empty or whitespace-only string has no last token; such pairs yield 0
    instead of raising IndexError.
    """
    q1 = string1.split()
    q2 = string2.split()

    if not q1 or not q2:
        return 0
    return int(q1[-1] == q2[-1])

In [None]:
# 1 if first word is same else 0
df["first_word_eq"] = df.apply(lambda x: first_word_eq(x.question1, x.question2), axis=1)
# 1 if last word is same else 0
df["last_word_eq"] = df.apply(lambda x: last_word_eq(x.question1, x.question2), axis=1)

**Calculating string similarity between question1 and question2 using fuzzywuzzy**<br>
 - ratio
 - partial_ratio
 - token_sort_ratio
 - token_set_ratio
 
No single ratio is reliable on its own, so all four are computed and kept as separate features.<br>
[FuzzyWuzzy: Fuzzy String Matching in Python](https://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)

In [None]:
df["question1"][1], df["question2"][1]

In [None]:
print("Taking simple ratio", fuzz.ratio(df["question1"][1], df["question2"][1]))
print("Taking partial ratio", fuzz.partial_ratio(df["question1"][1], df["question2"][1]))
print("Taking token sort ratio", fuzz.token_sort_ratio(df["question1"][1], df["question2"][1]))
print("Taking token set ratio", fuzz.token_set_ratio(df["question1"][1], df["question2"][1]))

In [None]:
def fuzz_ratio(string1, string2):
    """Return fuzzywuzzy's `fuzz.ratio` score for the two strings."""
    score = fuzz.ratio(string1, string2)
    return score

def fuzz_partial_ratio(string1, string2):
    """Return fuzzywuzzy's `fuzz.partial_ratio` score for the two strings."""
    score = fuzz.partial_ratio(string1, string2)
    return score

def fuzz_token_sort_ratio(string1, string2):
    """Return fuzzywuzzy's `fuzz.token_sort_ratio` score for the two strings."""
    score = fuzz.token_sort_ratio(string1, string2)
    return score

def fuzz_token_set_ratio(string1, string2):
    """Return fuzzywuzzy's `fuzz.token_set_ratio` score for the two strings."""
    score = fuzz.token_set_ratio(string1, string2)
    return score

In [None]:
df["fuzz_ratio"] = df.apply(lambda x: fuzz_ratio(x.question1, x.question2), axis=1)
df["fuzz_partial_ratio"] = df.apply(lambda x: fuzz_partial_ratio(x.question1, x.question2), axis=1)
df["fuzz_token_sort_ratio"] = df.apply(lambda x: fuzz_token_sort_ratio(x.question1, x.question2), axis=1)
df["fuzz_token_set_ratio"] = df.apply(lambda x: fuzz_token_set_ratio(x.question1, x.question2), axis=1)

In [None]:
# Average number of words in the two questions
df["avg_words"] = (df["word_count1"] + df["word_count2"])/2


# difference in the number of words in the two strings
df["word_diff"] = np.abs(df["word_count1"] - df["word_count2"])

In [None]:
df.head(2)

**Checking how much the new features are able to distinguish the duplicate and non duplicate questions**

In [None]:
# Compare the fuzz_ratio feature between duplicate (1) and non-duplicate (0) pairs.
plt.figure(figsize=(15, 8))

plt.subplot(1,2,1)
sns.violinplot(x = 'is_duplicate', y = 'fuzz_ratio', data = df)

plt.subplot(1,2,2)
sns.distplot(df[df['is_duplicate'] == 1.0]['fuzz_ratio'] , label = "1", color = 'red')
sns.distplot(df[df['is_duplicate'] == 0.0]['fuzz_ratio'] , label = "0" , color = 'blue' )
# The labels above are only rendered once a legend is requested.
plt.legend(title = 'is_duplicate')

plt.suptitle("fuzz_ratio distribution for duplicate and non duplicate", fontsize = 16)
plt.show()

In [None]:
# Compare the shared_words_ratio feature between duplicate (1) and non-duplicate (0) pairs.
plt.figure(figsize=(15, 8))

plt.subplot(1,2,1)
sns.violinplot(x = 'is_duplicate', y = 'shared_words_ratio', data = df)

plt.subplot(1,2,2)
sns.distplot(df[df['is_duplicate'] == 1.0]['shared_words_ratio'] , label = "1", color = 'red')
sns.distplot(df[df['is_duplicate'] == 0.0]['shared_words_ratio'] , label = "0" , color = 'blue' )
# The labels above are only rendered once a legend is requested.
plt.legend(title = 'is_duplicate')

plt.suptitle("shared_words_ratio distribution for duplicate and non duplicate", fontsize = 16)
plt.show()

In [None]:
# Compare the word_diff feature between duplicate (1) and non-duplicate (0) pairs.
plt.figure(figsize=(15, 8))

plt.subplot(1,2,1)
sns.violinplot(x = 'is_duplicate', y = 'word_diff', data = df)

plt.subplot(1,2,2)
sns.distplot(df[df['is_duplicate'] == 1.0]['word_diff'] , label = "1", color = 'red')
sns.distplot(df[df['is_duplicate'] == 0.0]['word_diff'] , label = "0" , color = 'blue' )
# The labels above are only rendered once a legend is requested.
plt.legend(title = 'is_duplicate')

plt.suptitle("word_diff distribution for duplicate and non duplicate", fontsize = 16)
plt.show()

# Text Preprocessing Techniques

* Expand Contractions
* Lower Case
* Remove Punctuations
* Remove Stopwords
* Stemming and Lemmatization
* Remove White spaces

In [None]:
# Lookup table mapping a contraction (or chat shorthand) to its expanded form.
# Every key is lowercase, so text should be lower-cased before lookup if
# capitalised forms such as "Don't" are to be matched.
# Some keys are prefixes of longer keys (e.g. "can't" / "can't've"); code that
# replaces keys one after another in insertion order will consume the shorter
# key first and never match the longer one.
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
# Chat shorthand: keys are padded with spaces so they only match when the
# token is surrounded by spaces (not at the very start or end of a string).
" u ": " you ",
" ur ": " your ",
" n ": " and "}

In [None]:
lemmatizer = WordNetLemmatizer()
# stop_words = set(stopwords.words('english'))
class text_preprocessing:
    """Stateless collection of text-cleaning steps for the question columns.

    Several methods read module-level names defined in other cells:
    `contractions` (dict), `stop_words` (set) and `lemmatizer`.
    """

    def __init__(self):
        pass

    def cont_to_exp(self, x, mapping=None): # Contraction to expansion
        """Expand contractions found in `x`.

        Parameters
        ----------
        x : any
            Text to process. Non-string values are returned unchanged.
        mapping : dict, optional
            Contraction -> expansion table. Defaults to the module-level
            `contractions` dict.

        Matching is a case-sensitive substring replacement. Keys are applied
        longest first so that a short key ("can't") cannot consume part of a
        longer one ("can't've") before the longer key is tried.
        """
        if not isinstance(x, str):
            return x
        if mapping is None:
            mapping = contractions
        # The former x.replace("\w*\\", "") call treated a regex-looking
        # pattern as literal text and therefore never matched; backslashes are
        # stripped later by special_char_removal, so it is dropped here.
        for key in sorted(mapping, key=len, reverse=True):
            x = x.replace(key, mapping[key])
        return x

    def to_lower(self, text):
        """Return `text` lower-cased."""
        return text.lower()

    def special_char_removal(self, x):
        """Remove every character that is not a word character, '+' or a space.

        NOTE(review): '+' is kept because it appears inside the character
        class; confirm that this is intentional (e.g. for "c++").
        """
        return re.sub(r"[^\w+ ]+", "", x)

    def remove_stopwords(self,text):
        """Drop whitespace-separated tokens contained in the module-level `stop_words`."""
        return " ".join([word for word in str(text).split() if word not in stop_words])

    def lemmatize_words(self, text):
        """Lemmatize each whitespace-separated token with the module-level `lemmatizer`."""
        return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

    def digit_removal(self, text): # Some questions are like maths equations
        """Remove every token that contains at least one digit.

        The previous pattern required a word character *before* the digit run,
        so single-digit tokens ("7") and tokens such as "2nd" were left in
        place while "25" or "x2" were removed. Surrounding spaces are kept.
        """
        return re.sub(r'\w*\d\w*', '', text)

    def removing_spaces(self, text):
        """Strip leading and trailing whitespace from `text`."""
        return text.strip()

In [None]:
pro = text_preprocessing()

In [None]:
# Clean both question columns with one ordered pipeline.
# Lower-casing runs first because every key in `contractions` is lowercase;
# expanding beforehand skipped capitalised forms such as "Don't" or "I'm".
cleaning_steps = [
    pro.to_lower,
    pro.cont_to_exp,
    pro.special_char_removal,
    pro.remove_stopwords,
    pro.lemmatize_words,
    pro.digit_removal,
    pro.removing_spaces,
]

for question_col in ("question1", "question2"):
    for step in cleaning_steps:
        df[question_col] = df[question_col].apply(step)

In [None]:
df.head()

# Generating Word embeddings

## 1. Word2Vec

In [None]:
nlp = spacy.load('en_core_web_lg')

In [None]:
def get_vec(string):
    """Run the module-level spaCy pipeline `nlp` on `string` and return the document's `.vector`."""
    return nlp(string).vector

**Uncomment the below code to get the final dataframe having word2vec embeddings and derived features**

In [None]:
# %%time
# df["q1_vector"] = df["question1"].apply(lambda x: get_vec(x))
# df["q2_vector"] = df["question2"].apply(lambda x: get_vec(x))

# df_q1 = pd.DataFrame(df.q1_vector.values.tolist(), index= df.index)
# df_q2 = pd.DataFrame(df.q2_vector.values.tolist(), index= df.index)

# df_features = df.drop(['id', 'question1', 'question2',
#                       'q1_vector', 'q2_vector'], axis = 'columns')

# print("Number of independent features generated after feature engineering:", df_features.shape[1] - 2)
# print("Features generated through word2vec of question_1:", df_q1.shape[1])
# print("Features generated through word2vec of question_2:", df_q2.shape[1])

# final_df = pd.concat([df_features, df_q1, df_q2], axis = 'columns')

In [None]:
from IPython.display import HTML

def create_download_link(filename, title = "Download CSV file"):
    """Return an IPython `HTML` object containing an anchor that points at `filename`.

    Parameters
    ----------
    filename : str
        Target of the link (placed in the `href` attribute).
    title : str
        Visible link text.

    The href is quoted and both values are HTML-escaped, so file names or
    titles containing spaces, quotes or angle brackets produce valid markup.
    """
    from html import escape  # local import: the name `html` stays free for callers

    link = '<a href="{filename}">{title}</a>'.format(
        filename=escape(str(filename), quote=True),
        title=escape(str(title)),
    )
    return HTML(link)

In [None]:
# create a link to download the dataframe which was saved with .to_csv method

# create_download_link(filename='word2vec.csv')

## 2. Sent2Vec transformer

In [None]:
pip install -U sentence-transformers

[Click here to read more about the below transformer](https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L3-v2)

<br>
Reasons for using this particular transformer:<br>
* It occupies less memory.<br>
* It was far faster than the other transformers considered.

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L3-v2')
# Model Size:- 61MB, Model Speed:- 19000, Avg Performance:- 50.74

In [None]:
def get_sent_vec(string):
    """Encode `string` with the module-level sentence-transformer `model`, with the progress bar disabled."""
    return model.encode(string, show_progress_bar = False)

**Uncomment the below code to get the final dataframe having sent2vec embeddings and derived features**

In [None]:
# %%timeit
# df["q1_vector"] = df["question1"].apply(lambda x: get_sent_vec(x))
# df["q2_vector"] = df["question2"].apply(lambda x: get_sent_vec(x))

# df_q1 = pd.DataFrame(df.q1_vector.values.tolist(), index= df.index)
# df_q2 = pd.DataFrame(df.q2_vector.values.tolist(), index= df.index)

# df_features = df.drop(['id', 'question1', 'question2',
#                       'q1_vector', 'q2_vector'], axis = 'columns')

# final_df = pd.concat([df_features, df_q1, df_q2], axis = 'columns')

In [None]:
# create a link to download the dataframe which was saved with .to_csv method

# create_download_link(filename='sent2vec.csv')

In [None]:
w2v_df = pd.read_csv("../input/embeddings/embeddings_data/word2vec.csv")
sent2v_df = pd.read_csv("../input/embeddings/embeddings_data/sent2vec.csv")

In [None]:
w2v_df.shape, sent2v_df.shape

**Because the data is very large, preprocessing it as CSV would take a lot of time, so the data is converted to HDF5, which stores data in a hierarchical format. vaex is used because it works efficiently on large datasets.**

In [None]:
vaexdf_w2v = vaex.from_pandas(w2v_df)
vaexdf_sent2v = vaex.from_pandas(sent2v_df)

vaexdf_w2v.export_hdf5('word2vec.hdf5')
vaexdf_sent2v.export_hdf5('sent2vec.hdf5')

vaexdf_w2v = vaex.open("./word2vec.hdf5")
vaexdf_sent2v = vaex.open("./sent2vec.hdf5")

**Taking random 100,000 records for fast model training**

In [None]:
vaexdf_w2v = vaexdf_w2v.shuffle(random_state = 0)
vaexdf_sent2v = vaexdf_sent2v.shuffle(random_state = 1)

vaexdf_w2v = vaexdf_w2v[:100000]
vaexdf_sent2v = vaexdf_sent2v[:100000]

# Train Test Split

In [None]:
w2v_train , w2v_test = vaexdf_w2v.ml.train_test_split(test_size = 0.2)
sent2v_train, sent2v_test = vaexdf_sent2v.ml.train_test_split(test_size = 0.2)

In [None]:
w2v_train.shape, w2v_test.shape, sent2v_train.shape, sent2v_test.shape

**Checking class distribution in train and test data**

In [None]:
# Report the share of each class in the Word2Vec train and test splits.
print("-"*10, "Distribution of output variable in WORD2VEC train data", "-"*10)
train_labels = w2v_train['is_duplicate'].to_pandas_series()  # convert once, reuse below
train_distr = Counter(train_labels)
train_len = len(train_labels)
print("\t Class 0: ",int(train_distr[0])/train_len,"Class 1: ", int(train_distr[1])/train_len)

print("\n")

print("-"*10, "Distribution of output variable in WORD2VEC test data", "-"*10)
test_labels = w2v_test['is_duplicate'].to_pandas_series()
test_distr = Counter(test_labels)
test_len = len(test_labels)
# Class 0 previously printed test_distr[1], i.e. the class-1 share twice.
print("\t Class 0: ",int(test_distr[0])/test_len, "Class 1: ",int(test_distr[1])/test_len)

In [None]:
# Report the share of each class in the Sent2Vec train and test splits.
print("-"*10, "Distribution of output variable in SENT2VEC train data", "-"*10)
train_labels = sent2v_train['is_duplicate'].to_pandas_series()  # convert once, reuse below
train_distr = Counter(train_labels)
train_len = len(train_labels)
print("\t Class 0: ",int(train_distr[0])/train_len,"Class 1: ", int(train_distr[1])/train_len)

print("\n")

print("-"*10, "Distribution of output variable in SENT2VEC test data", "-"*10)
test_labels = sent2v_test['is_duplicate'].to_pandas_series()
test_distr = Counter(test_labels)
test_len = len(test_labels)
# Class 0 previously printed test_distr[1], i.e. the class-1 share twice.
print("\t Class 0: ",int(test_distr[0])/test_len, "Class 1: ",int(test_distr[1])/test_len)

# Scaling

In [None]:
w2v_features = list(w2v_train.column_names)
w2v_features.remove('is_duplicate')

sent2v_features = list(sent2v_train.column_names)
sent2v_features.remove('is_duplicate')

In [None]:
len(w2v_features), len(sent2v_features)

In [None]:
# Feature Scaling of both the dataframes

scaler_w2v = vaex.ml.MinMaxScaler(features = w2v_features, prefix = "scaled_")
scaler_sent2v = vaex.ml.MinMaxScaler(features = sent2v_features, prefix = "scaled_")

# Fitting the scaler to the train set
scaler_w2v.fit(w2v_train)
scaler_sent2v.fit(sent2v_train)

# Getting the transformed training and test set
w2v_train_trans = scaler_w2v.transform(w2v_train)
w2v_test_trans = scaler_w2v.transform(w2v_test)

sent2v_train_trans = scaler_sent2v.transform(sent2v_train)
sent2v_test_trans = scaler_sent2v.transform(sent2v_test)

# Removing the initial non scaled features from the scaled dataframe
w2v_train_trans.drop(w2v_features, inplace = True)
w2v_test_trans.drop(w2v_features, inplace = True)

sent2v_train_trans.drop(sent2v_features, inplace = True)
sent2v_test_trans.drop(sent2v_features, inplace = True)

# Logistic Regression and Linear SVM With Hyperparameter Tuning

In [None]:
def _fit_calibrated_sgd(alpha, loss, X, y):
    """Fit an L2-penalised SGDClassifier wrapped in sigmoid probability calibration.

    CalibratedClassifierCV clones and fits the base estimator itself, so the
    base classifier is not fitted separately beforehand.
    """
    base_clf = SGDClassifier(alpha=alpha, penalty='l2', loss=loss, random_state=0)
    calibrated = CalibratedClassifierCV(base_clf, method="sigmoid")
    calibrated.fit(X, y)
    return calibrated


def sgd_tuned_model(X_train, y_train, X_test, y_test, loss, val_size=0.2):
    """Tune `alpha` for a calibrated linear SGD model and report its log loss.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels, used only for the final report.
    loss : str
        Loss name forwarded to SGDClassifier (the notebook passes 'log' or 'hinge').
    val_size : float, default 0.2
        Fraction of the training data held out to choose `alpha`.

    Prints the validation log loss for every candidate alpha, plots them,
    refits the best alpha on the full training data, prints train and test
    log loss and draws a row-normalised confusion matrix for the test data.
    Returns None.
    """
    # Candidate regularisation strengths (multiplier of the L2 penalty term).
    alpha = [10 ** x for x in range(-5, 2)]

    # Select alpha on a validation split carved out of the training data;
    # choosing it on the test set would make the reported test loss optimistic.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=val_size, stratify=y_train, random_state=0)

    log_error_array=[]
    for i in alpha:
        sig_clf = _fit_calibrated_sgd(i, loss, X_fit, y_fit)
        val_loss = log_loss(y_val, sig_clf.predict_proba(X_val), labels=sig_clf.classes_)
        log_error_array.append(val_loss)
        print('For values of alpha = ', i, "The log loss is:", val_loss)

    fig, ax = plt.subplots()
    ax.plot(alpha, log_error_array,c='g')
    ax.set_xscale('log')  # the candidates span seven orders of magnitude
    for i, txt in enumerate(np.round(log_error_array,3)):
        ax.annotate((alpha[i],np.round(txt,3)), (alpha[i],log_error_array[i]))
    plt.grid()
    plt.title("Cross Validation Error for each alpha")
    plt.xlabel("Alpha i's")
    plt.ylabel("Error measure")
    plt.show()

    # Refit the winning configuration on all of the training data.
    best_alpha = np.argmin(log_error_array)
    sig_clf = _fit_calibrated_sgd(alpha[best_alpha], loss, X_train, y_train)

    predict_y = sig_clf.predict_proba(X_train)
    print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(y_train, predict_y, labels=sig_clf.classes_))
    predict_y = sig_clf.predict_proba(X_test)
    print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(y_test, predict_y, labels=sig_clf.classes_))
    print("Total number of data points :", len(predict_y))
    plot_confusion_matrix(sig_clf,X_test, y_test, normalize = 'true')

In [None]:
X_train_w2v = w2v_train_trans.drop('is_duplicate').to_pandas_df()
y_train_w2v = w2v_train_trans['is_duplicate'].to_pandas_series()
X_test_w2v = w2v_test_trans.drop('is_duplicate').to_pandas_df()
y_test_w2v = w2v_test_trans['is_duplicate'].to_pandas_series()

X_train_sent2v = sent2v_train_trans.drop('is_duplicate').to_pandas_df()
y_train_sent2v = sent2v_train_trans['is_duplicate'].to_pandas_series()
X_test_sent2v = sent2v_test_trans.drop('is_duplicate').to_pandas_df()
y_test_sent2v = sent2v_test_trans['is_duplicate'].to_pandas_series()

In [None]:
X_train_w2v.shape,X_train_sent2v.shape

In [None]:
# Implementing LR with tuning on Word2Vec embeddings data
sgd_tuned_model(X_train_w2v, y_train_w2v, X_test_w2v, y_test_w2v, 'log')

In [None]:
# Implementing LR with tuning on Sent2Vec embeddings data
sgd_tuned_model(X_train_sent2v, y_train_sent2v, X_test_sent2v, y_test_sent2v, 'log')

In [None]:
# Implementing Linear SVM with tuning on Word2Vec embeddings data
sgd_tuned_model(X_train_w2v, y_train_w2v, X_test_w2v, y_test_w2v, 'hinge')

In [None]:
# Implementing Linear SVM with tuning on Sent2Vec embeddings data
sgd_tuned_model(X_train_sent2v, y_train_sent2v, X_test_sent2v, y_test_sent2v, 'hinge')

# Xgboost with hyperparameter tuning

In [None]:
w2v_train.shape, w2v_test.shape

In [None]:
def xgb_classifier(params, X_train, X_test, y_train, y_test,
                   num_boost_round=400, early_stopping_rounds=20):
    """Train an XGBoost booster with early stopping and print its test log loss.

    Parameters
    ----------
    params : dict
        Booster parameters forwarded to `xgb.train`.
    X_train, y_train : training features and labels.
    X_test, y_test : evaluation features and labels.
    num_boost_round : int, default 400
        Maximum number of boosting rounds.
    early_stopping_rounds : int, default 20
        Forwarded to `xgb.train` as its early-stopping patience.

    Returns None; the result is printed.

    NOTE(review): the evaluation set that drives early stopping is the same
    data the log loss is reported on, so the printed figure is likely
    optimistic — consider a separate validation split.
    """
    d_train = xgb.DMatrix(X_train, label=y_train)
    d_test = xgb.DMatrix(X_test, label=y_test)

    watchlist = [(d_train, 'train'), (d_test, 'valid')]

    bst = xgb.train(params, d_train, num_boost_round, evals=watchlist,
                    verbose_eval=False, early_stopping_rounds=early_stopping_rounds)

    predict_y = bst.predict(d_test)
    print("The test log loss is:",log_loss(y_test, predict_y))

In [None]:
# params for hyperparameter tuning
# w2v_params={
#     "learning_rate"    : [0.0001, 0.001, 0.01, 0.10] ,
#     "max_depth"        : [5, 10, 20, 100, 200],
#     "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7],
#     "n_estimators" : [50, 100, 150, 200]
# }

# sent2v_params={
#     "learning_rate"    : [0.0001, 0.001, 0.01, 0.10],
#     "max_depth"        : [4, 5, 10, 15, 20],
#     "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7],
#     "n_estimators" : [50, 100, 150, 200]
# }

In [None]:
# # Implementing XGB classifier with tuning on Word2Vec embeddings data
# X_train = sent2v_train_trans.drop('is_duplicate').to_pandas_df()
# y_train = sent2v_train_trans['is_duplicate'].to_pandas_series()
# X_test = sent2v_test_trans.drop('is_duplicate').to_pandas_df()
# y_test = sent2v_test_trans['is_duplicate'].to_pandas_series()

# clf = xgb.XGBClassifier(objective = 'binary:logistic', eval_metric = 'logloss')

# random_search = RandomizedSearchCV(clf,
#                                    param_distributions = sent2v_params,
#                                    n_iter = 3,
#                                    n_jobs = -1,
#                                    cv = 5,
#                                    verbose = 3)
# random_search.fit(X_train, y_train)

# random_search.best_params_

Best params using random search cv for Xgboost on W2V embeddings: <br>
{'n_estimators': 200,
 'max_depth': 10,
 'learning_rate': 0.1,
 'colsample_bytree': 0.3}

In [None]:
# Implementing XGBclasifier on Word2Vec data

# 'n_parameters' is not an XGBoost parameter (it looks like a typo for
# n_estimators) and is dropped. With the native API the number of boosting
# rounds is set by the xgb.train call inside xgb_classifier, not by `params`.
params = {'objective' : 'binary:logistic', 
          'eval_metric' : 'logloss',
          'max_depth' : 10, 
          'learning_rate' : 0.1,
          'colsample_bytree' : 0.3}

X_train = w2v_train.drop('is_duplicate').to_pandas_df()
y_train = w2v_train['is_duplicate'].to_pandas_series()
X_test = w2v_test.drop('is_duplicate').to_pandas_df()
y_test = w2v_test['is_duplicate'].to_pandas_series()

xgb_classifier(params, X_train, X_test, y_train, y_test)

Best params using random search cv for xgboost on Sent2Vec embeddings:<br>
{'n_estimators': 100,
 'max_depth': 15,
 'learning_rate': 0.01,
 'colsample_bytree': 0.5}

In [None]:
# Implementing XGBclasifier on Sent2Vec data
# 'n_parameters' is not an XGBoost parameter (it looks like a typo for
# n_estimators) and is dropped. With the native API the number of boosting
# rounds is set by the xgb.train call inside xgb_classifier, not by `params`.
params = {'objective' : 'binary:logistic', 
          'eval_metric' : 'logloss',
          'max_depth' : 15, 
          'learning_rate' : 0.01,
          'colsample_bytree' : 0.5}

X_train = sent2v_train.drop('is_duplicate').to_pandas_df()
y_train = sent2v_train['is_duplicate'].to_pandas_series()
X_test = sent2v_test.drop('is_duplicate').to_pandas_df()
y_test = sent2v_test['is_duplicate'].to_pandas_series()

xgb_classifier(params, X_train, X_test, y_train, y_test)