# Quora Question Pairs similarity prediction:

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import zipfile # to read/ extract zip files
import matplotlib.pyplot as plt # to plot graphs
import seaborn as sns # to plot graphs

# ignoring any/ all types of warnings:
import warnings
warnings.filterwarnings(action='ignore')

import gc # to free-up memory occasionally

import tqdm
from tqdm import notebook

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read train.csv straight out of the zip archive. The context managers close
# both the archive and the member handle once the dataframe has been loaded.
with zipfile.ZipFile('/kaggle/input/quora-question-pairs/train.csv.zip') as zf:
    with zf.open('train.csv') as train_file:
        df = pd.read_csv(train_file)

## Basic Exploratory Data Analysis:

In [None]:
# Quick overview of the raw dataframe: schema, a few rows and its size.
print("Dataframe info.:")
df.info()  # info() writes the report itself and returns None, so it is not wrapped in print()
print("Example data points: ")
display(df.head(5))
print(">> Size of the dataset: ", df.shape[0])
print(">> Shape of the dataset: ", df.shape)

#### Count of questions that are similar or different:

In [None]:
# Share of each class: is_duplicate == 1 marks a similar (duplicate) pair,
# is_duplicate == 0 marks a different pair.
label_share = df['is_duplicate'].value_counts(normalize=True)

print(">> Total number of questions pairs for training: ", df.shape[0])
print(">> Questions that are similar: {}%".format(round(label_share.get(1, 0.0)*100,2)))
print(">> Questions that are different: {}%".format(round(label_share.get(0, 0.0)*100,2)))

# Bar chart of the number of pairs per class.
plt.figure(figsize=(10,6))
df.groupby('is_duplicate')['id'].count().plot.bar()
plt.show()

#### Count of questions that are repeated or unique:

In [None]:
# Every question id that appears on either side of a pair.
qids = pd.Series(df['qid1'].tolist() + df['qid2'].tolist())
qid_counts = qids.value_counts()  # occurrences per question id, computed once

unique_qids = qids.nunique()
qs_morethan_onetime = int((qid_counts > 1).sum())
print(">> Total questions:", len(qids))
print(">> Total unique Questions:", unique_qids)
print(">> Total questions that repeat more than once: {} i.e. {:.2%} of unique questions".format(qs_morethan_onetime,qs_morethan_onetime/unique_qids))
print(">> Maximum no. of times a single question is repeated:", qid_counts.max())

x = ['unique questions', 'repeated questions']
y = [unique_qids, qs_morethan_onetime]
plt.figure(figsize=(10,6))
plt.title('Plot representing the unique & repeated questions')
# Data vectors are passed by keyword: recent seaborn releases no longer
# accept them positionally.
sns.barplot(x=x, y=y)
plt.show()

#### Occurrences of each question:

In [None]:
# Histogram (log-scaled y axis) of how many times each question id appears.
question_counts = qids.value_counts()

plt.figure(figsize=(15,6))
plt.hist(question_counts, bins=160)
# `nonpositive` is the current name of the former `nonposy` keyword; empty
# bins are clipped instead of breaking the log scale.
plt.yscale('log', nonpositive='clip')
plt.title('Log-histogram of question appearance counts')
plt.xlabel('Number of occurrences of question')
plt.ylabel('Number of questions')
print('Maximum time a single question is repated is: ', question_counts.max())

#### Checking for null values:

In [None]:
# Show the rows that contain at least one null value.
# (`axis` is passed by keyword; the positional form is rejected by pandas 2.)
nan_rows = df[df.isnull().any(axis=1)]
display(nan_rows)  # a bare expression in the middle of a cell would not be rendered

# Replace NaN values with a single space and verify that none are left.
df = df.fillna(' ')
nan_rows = df[df.isnull().any(axis=1)]
print(nan_rows)
if nan_rows.empty:
    print(">> No more null or NaN values.")
else:
    print(">> WARNING: {} rows still contain null or NaN values.".format(len(nan_rows)))

## Basic featurization:

In [None]:
# The basic features are added to a separate copy so `df` stays untouched.
df_basic = df.copy()
display(df_basic.head())

In [None]:
import tqdm
from tqdm import notebook

# Basic per-pair features, one list entry per row of df_basic.
quest1_len = []         # character length of question 1
quest2_len = []         # character length of question 2
quest1_word_count = []  # number of whitespace-separated words in question 1
quest2_word_count = []  # number of whitespace-separated words in question 2
word_total = []         # unique words in question 1 + unique words in question 2
word_common = []        # unique words present in both questions
word_shared = []        # word_common / word_total

# Iterate over the two text columns directly instead of looking rows up by
# `id`, so the loop does not rely on the index being identical to `id`.
question_pairs = zip(df_basic['question1'], df_basic['question2'])
for q1, q2 in notebook.tqdm(question_pairs, total=len(df_basic)):
    q1_words = q1.split()
    q2_words = q2.split()
    q1_unique = set(q1_words)
    q2_unique = set(q2_words)
    total = len(q1_unique) + len(q2_unique)
    unique_common = len(q1_unique & q2_unique)

    quest1_len.append(len(q1))
    quest2_len.append(len(q2))
    quest1_word_count.append(len(q1_words))
    quest2_word_count.append(len(q2_words))
    word_total.append(total)
    word_common.append(unique_common)
    # Both questions can be blank (NaN values were replaced by ' '); report a
    # ratio of 0 in that case instead of dividing by zero.
    word_shared.append(unique_common / total if total else 0.0)

df_basic['quest1_len'] = quest1_len
df_basic['quest2_len'] = quest2_len
df_basic['quest1_word_count'] = quest1_word_count
df_basic['quest2_word_count'] = quest2_word_count
df_basic['word_total'] = word_total
df_basic['word_common'] = word_common
df_basic['word_shared'] = word_shared

display(df_basic.head(5))

# Free the temporary lists: drop the references first, then collect.
del quest1_len, quest2_len, quest1_word_count, quest2_word_count, word_total, word_common, word_shared
gc.collect()

### Exploratory data analysis of basic features:
Checking if any of these features are useful or not using the violin & pdf plots:

In [None]:
# Violin, box and density plots of 'word_common', split by is_duplicate.
plt.figure(figsize=(22,8))

# Optional seaborn styling.
sns.set_context("notebook", font_scale=1.0)
sns.set_style("whitegrid")

# 1st subplot - violin plot:
plt.subplot(1,3,1)
sns.violinplot(x='is_duplicate', y='word_common', data=df_basic)

# 2nd subplot - box plot:
plt.subplot(1,3,2)
sns.boxplot(x='is_duplicate', y='word_common', data=df_basic)

# 3rd subplot - one density curve per class:
plt.subplot(1,3,3)
for class_label in (0, 1):
    class_values = df_basic[df_basic['is_duplicate']==class_label]['word_common']
    sns.distplot(class_values, label=str(class_label))
plt.legend(title='is_duplicate')  # without a legend the curve labels are never shown

plt.show()

In [None]:
# Violin, box and density plots of 'word_shared', split by is_duplicate.
plt.figure(figsize=(22,8))

sns.set_context("notebook", font_scale=1.0)
sns.set_style("whitegrid")

# 1st subplot - violin plot:
plt.subplot(1,3,1)
sns.violinplot(x='is_duplicate', y='word_shared', data=df_basic)

# 2nd subplot - box plot:
plt.subplot(1,3,2)
sns.boxplot(x='is_duplicate', y='word_shared', data=df_basic)

# 3rd subplot - one density curve per class:
plt.subplot(1,3,3)
for class_label in (0, 1):
    class_values = df_basic[df_basic['is_duplicate']==class_label]['word_shared']
    sns.distplot(class_values, label=str(class_label))
plt.legend(title='is_duplicate')  # without a legend the curve labels are never shown

plt.show()

### 'word_shared' definitely seems to be a more useful feature than 'word_common': in the density plots, the two classes overlap less for 'word_shared' than for 'word_common'.

## Pre-processing of the text data:
1. Removing HTML tags
2. Removing punctuations
3. Performing Stemming
4. Removing stop-words
5. Expanding contractions, etc.

In [None]:
import regex as re
from bs4 import BeautifulSoup
from nltk.stem import *

# Get all the HTML tags, special characters & puctuations and remove them:
def text_cleanup(text):
    """Normalise one question string for feature extraction.

    Steps, in order:
      1. strip HTML markup and lower-case the text;
      2. expand common contractions and normalise currency / percent symbols
         and large round numbers (e.g. "5,000" -> "5k");
      3. replace every non-word character with a space;
      4. Porter-stem each whitespace-separated token.

    Args:
        text: raw question text (may contain HTML).

    Returns:
        The cleaned text with the stemmed tokens joined by single spaces.
    """
    # An explicit parser keeps the result independent of which optional
    # parsers happen to be installed.
    clean_text = BeautifulSoup(text, "html.parser").get_text(strip=True).lower()

    # Order matters: e.g. "won't" and "can't" must be expanded before the
    # generic "n't" rule, and ",000,000" before ",000".
    replacements = (
        (",000,000", "m"), (",000", "k"),
        ("′", "'"), ("’", "'"), ("won't", "will not"),
        ("cannot", "can not"), ("can't", "can not"),
        ("n't", " not"), ("what's", "what is"), ("it's", "it is"),
        ("'ve", " have"), ("i'm", "i am"), ("'re", " are"),
        ("he's", "he is"), ("she's", "she is"),
        ("%", " percent "), ("₹", " rupee "), ("$", " dollar "),
        ("€", " euro "), ("'ll", " will"),
    )
    for old, new in replacements:
        clean_text = clean_text.replace(old, new)
    clean_text = re.sub(r"([0-9]+)000000", r"\1m", clean_text)
    clean_text = re.sub(r"([0-9]+)000", r"\1k", clean_text)
    clean_text = clean_text.replace("'s", " own")

    # Punctuation and other special characters become spaces.
    clean_text = re.sub(r"\W", " ", clean_text)

    # Stem token by token: the stemmer works on single words, so passing the
    # whole sentence at once would leave it practically unchanged.
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in clean_text.split())

## Advanced featurization:

Definitions:
> 1. **Token**: Token is obtained by splitting up the sentence.
> 2. **Stop words**: Stop words as per NLTK
> 3. **Word**: A token that is not a stop-word.

Features:
> 1. **cwc_min**: Ratio of common word count to the min(length(Q1),length(Q2))
        cwc_min = common_word_count / min(length(Q1),length(Q2))
> 2. **cwc_max**: Ratio of common word count to the max(length(Q1),length(Q2))
        cwc_max = common_word_count / max(length(Q1),length(Q2))
> 3. **csc_min**: Ratio of common stop-word count to the min(length(Q1),length(Q2))
        csc_min = common_stop_count / min(length(Q1),length(Q2))
> 4. **csc_max**: Ratio of common stop-word count to the max(length(Q1),length(Q2))
        csc_max = common_stop_count / max(length(Q1),length(Q2))
> 5. **ctc_min**: Ratio of common token count to the min(length(Q1),length(Q2))
        ctc_min = common_token_count / min(length(Q1),length(Q2))
> 6. **ctc_max**: Ratio of common token count to the max(length(Q1),length(Q2))
        ctc_max = common_token_count / max(length(Q1),length(Q2))
> 7. **abs_len_diff**: Absolute token-length difference between both the questions
        abs_len_diff = abs(length(Q1) - length(Q2))
> 8. **mean_len**: Average token-length of both the questions
        mean_len = [length(Q1) + length(Q2)] / 2
> 9. **longest_substr_ratio**: Ratio of length of longest common substring to the   min(length(Q1),length(Q2))
> 10. **last_word_eq**: Check if last word of both questions are same or not (boolean)
> 11. **first_word_eq**: Check if first word of both questions are same or not (boolean)
> 12. We'll be using the fuzzywuzzy library to get these features - **fuzz_ratio**, **fuzz_partial_ratio**, **token_sort_ratio** & **token_set_ratio**     
Developer's blog :http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/     
Github link: https://github.com/seatgeek/fuzzywuzzy


In [None]:
!python -m pip install --upgrade pip
!pip install fuzzywuzzy
!pip install distance

In [None]:
from fuzzywuzzy import fuzz # to calculate fuzzy ratios
from nltk.corpus import stopwords # to get the  stop-words
import distance # to get the similarity between two words

df_advanced = df.copy()  # cleaned text and advanced features go on a separate copy
stop_words = set(stopwords.words('english'))  # NLTK English stop-words

# Pre-process both question columns with text_cleanup().
print("Text-cleanup initiated - Part 1/2")
quest1 = [text_cleanup(text) for text in notebook.tqdm(df_advanced['question1'])]

print("Part 1 completed, Part 2/2 started...")
quest2 = [text_cleanup(text) for text in notebook.tqdm(df_advanced['question2'])]

# Store the cleaned questions back in the dataframe.
df_advanced['question1'] = quest1
df_advanced['question2'] = quest2
print("Part 2 completed, Text-cleanup completed!")

display(df_advanced)

print('Advanced featurization initiated...')

safe_div = 0.0001  # added to every denominator to avoid division by zero
cwc_min = []
cwc_max = []
csc_min = []
csc_max = []
ctc_min = []
ctc_max = []
abs_len_diff = []
mean_len = []
longest_substr_ratio = []
last_word_eq = []
first_word_eq = []
fuzz_ratio = []
fuzz_partial_ratio = []
fuzz_token_sort_ratio = []
fuzz_token_set_ratio = []

# Walk over the cleaned question pairs in row order (no lookup by `id`, so
# the loop does not rely on the index being identical to `id`).
for q1, q2 in notebook.tqdm(zip(quest1, quest2), total=len(quest1)):
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    q1_token_set = set(q1_tokens)
    q2_token_set = set(q2_tokens)

    # Unique stop-words and unique non-stop-words ("words") of each question.
    q1_stop = q1_token_set & stop_words
    q2_stop = q2_token_set & stop_words
    q1_words = q1_token_set - stop_words
    q2_words = q2_token_set - stop_words

    unique_common_word = len(q1_words & q2_words)
    unique_common_stop = len(q1_stop & q2_stop)
    unique_common_tokens = len(q1_token_set & q2_token_set)

    cwc_min.append(unique_common_word/(min(len(q1_words),len(q2_words))+safe_div))
    cwc_max.append(unique_common_word/(max(len(q1_words),len(q2_words))+safe_div))
    csc_min.append(unique_common_stop/(min(len(q1_stop),len(q2_stop))+safe_div))
    csc_max.append(unique_common_stop/(max(len(q1_stop),len(q2_stop))+safe_div))
    ctc_min.append(unique_common_tokens/(min(len(q1_tokens),len(q2_tokens))+safe_div))
    ctc_max.append(unique_common_tokens/(max(len(q1_tokens),len(q2_tokens))+safe_div))
    abs_len_diff.append(abs(len(q1_tokens) - len(q2_tokens)))
    mean_len.append((len(q1_tokens) + len(q2_tokens))/2)

    # Length of a longest common substring relative to the shorter question
    # (character lengths); 0 when the questions share no substring.
    common_substrings = list(distance.lcsubstrings(q1,q2))
    if common_substrings:
        longest_substr_ratio.append(len(common_substrings[0])/(min(len(q1),len(q2))+1))
    else:
        longest_substr_ratio.append(0)

    # First / last token equality; 0 when either question has no tokens.
    if q1_tokens and q2_tokens:
        first_word_eq.append(int(q1_tokens[0]==q2_tokens[0]))
        last_word_eq.append(int(q1_tokens[-1]==q2_tokens[-1]))
    else:
        first_word_eq.append(0)
        last_word_eq.append(0)

    fuzz_ratio.append(fuzz.ratio(q1,q2))
    fuzz_partial_ratio.append(fuzz.partial_ratio(q1,q2))
    fuzz_token_sort_ratio.append(fuzz.token_sort_ratio(q1,q2))
    fuzz_token_set_ratio.append(fuzz.token_set_ratio(q1,q2))

# Move the lists into new dataframe columns.
df_advanced['cwc_min'] = cwc_min
df_advanced['cwc_max'] = cwc_max
df_advanced['csc_min'] = csc_min
df_advanced['csc_max'] = csc_max
df_advanced['ctc_min'] = ctc_min
df_advanced['ctc_max'] = ctc_max
df_advanced['abs_len_diff'] = abs_len_diff
df_advanced['mean_length'] = mean_len
df_advanced['longest_substr_ratio'] = longest_substr_ratio
df_advanced['first_word_eq'] = first_word_eq
df_advanced['last_word_eq'] = last_word_eq
df_advanced['fuzz_ratio'] = fuzz_ratio
df_advanced['fuzz_partial_ratio'] = fuzz_partial_ratio
df_advanced['fuzz_token_sort_ratio'] = fuzz_token_sort_ratio
df_advanced['fuzz_token_set_ratio'] = fuzz_token_set_ratio

gc.collect()

display(df_advanced)

print("Advanced featurization complete!")

## Exploratory Data Analysis of the above calculated Advanced features:

In [None]:
# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# import zipfile # to read/ extract zip files
# import matplotlib.pyplot as plt # to plot graphs
# import seaborn as sns # to plot graphs

# # ignoring any/ all types of warnings:
# import warnings
# warnings.filterwarnings(action='ignore')

# import gc # to free-up memory occasionally

# df_advanced = pd.read_csv("../input/df-advanced/df_advanced.csv")
# display(df_advanced.head(5))

### Generating Word clouds for all the words that come under duplicate & non-duplicate question pairs:

In [None]:
from wordcloud import WordCloud, STOPWORDS

stop_words = set(STOPWORDS)  # word-cloud stop-words; rebinds the `stop_words` name

# Keep these words in the clouds: they may carry information in this scenario.
# discard() is used so that a word missing from STOPWORDS does not raise KeyError.
for informative_word in ('like', 'cannot', 'no', 'not'):
    stop_words.discard(informative_word)

# Text of question1 followed by question2 for every NON-duplicate pair. The
# questions are joined with a space so the last word of one question is not
# fused with the first word of the next; non-string cells are skipped.
non_duplicate_pairs = df_advanced[df_advanced['is_duplicate']==0]
words_in_non_duplicates = " ".join(
    text.strip()
    for column in ('question1', 'question2')
    for text in non_duplicate_pairs[column].values
    if isinstance(text, str)
)

print("Generating the word cloud...")

# Number of whitespace-separated words in the combined text.
total_nodup_words = len(words_in_non_duplicates.split())

print("Word Cloud for NON-DUPLICATE question pairs:")
wordcloud = WordCloud(background_color='white',width=1200,height=400,max_words=total_nodup_words,stopwords=stop_words)
wordcloud = wordcloud.generate(words_in_non_duplicates)

plt.figure(figsize=(30,15))
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
print("Total no. of words in NON-DUPLICATE questions: ",total_nodup_words)

In [None]:
# Text of question1 followed by question2 for every DUPLICATE pair. The
# questions are joined with a space so the last word of one question is not
# fused with the first word of the next; non-string cells are skipped.
duplicate_pairs = df_advanced[df_advanced['is_duplicate']==1]
words_in_duplicates = " ".join(
    text.strip()
    for column in ('question1', 'question2')
    for text in duplicate_pairs[column].values
    if isinstance(text, str)
)

print("Generating word cloud...")

# Number of whitespace-separated words in the combined text.
total_dup_words = len(words_in_duplicates.split())

print("Word Cloud for DUPLICATE question pairs:")
wordcloud = WordCloud(background_color='white',width=1200,height=400,max_words=total_dup_words,stopwords=stop_words)
wordcloud = wordcloud.generate(words_in_duplicates)

plt.figure(figsize=(30,15))
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
print("Total no. of words in DUPLICATE questions: ",total_dup_words)

### Plotting pair-plots for all the combinations of the advanced features calculated above:

In [None]:
# Global seaborn look for the pair-plots: muted colours, slightly larger fonts.
sns.set_palette('muted')
sns.set_context('notebook', font_scale=1.2)

# Common stop-word ratios against common word ratios, coloured by class.
plot = sns.pairplot(
    data=df_advanced[['csc_min','csc_max','cwc_min','cwc_max','is_duplicate']],
    hue='is_duplicate',  # class variable used to colour the points
    height=3,            # size of each facet
    corner=True,         # draw the lower triangle only
)
plot.fig.suptitle("Pair-plots among COMMON WORDS & COMMON STOP-WORDS", y=1.05)
plt.show()

In [None]:
# Common word ratios against common token ratios, coloured by class.
plot = sns.pairplot(
    data=df_advanced[['cwc_min','cwc_max','ctc_min','ctc_max','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among COMMON WORDS & COMMON TOKENS", y=1.05)
plt.show()

In [None]:
# Common token ratios against common stop-word ratios, coloured by class.
plot = sns.pairplot(
    data=df_advanced[['ctc_min','ctc_max','csc_min','csc_max','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among COMMON TOKENS & COMMON STOP-WORDS", y=1.05)
plt.show()

In [None]:
# The three *_min ratios against fuzz_ratio, coloured by class.
plot = sns.pairplot(
    data=df_advanced[['ctc_min','csc_min','cwc_min','fuzz_ratio','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among MIN (COMMON TOKENS,WORDS, STOP-WORDS) & FUZZ RATIO", y=1.05)
plt.show()

In [None]:
# The three *_max ratios against fuzz_ratio, coloured by class.
plot = sns.pairplot(
    data=df_advanced[['ctc_max','csc_max','cwc_max','fuzz_ratio','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among MAX (COMMON TOKENS,WORDS, STOP-WORDS) & FUZZ RATIO", y=1.05)
plt.show()

In [None]:
# The three *_min ratios against fuzz_partial_ratio, coloured by class.
plot = sns.pairplot(
    data=df_advanced[['ctc_min','csc_min','cwc_min','fuzz_partial_ratio','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among MIN (COMMON TOKENS,WORDS, STOP-WORDS) & FUZZ PARTIAL RATIO", y=1.05)
plt.show()

In [None]:
# The three *_max ratios against fuzz_partial_ratio, coloured by class.
plot = sns.pairplot(
    data=df_advanced[['ctc_max','csc_max','cwc_max','fuzz_partial_ratio','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among MAX (COMMON TOKENS,WORDS, STOP-WORDS) & FUZZ PARTIAL RATIO", y=1.05)
plt.show()

In [None]:
# The three *_min ratios against fuzz_token_set_ratio, coloured by class.
plot = sns.pairplot(
    data=df_advanced[['ctc_min','csc_min','cwc_min','fuzz_token_set_ratio','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among MIN (COMMON TOKENS,WORDS, STOP-WORDS) & FUZZ TOKEN SET RATIO", y=1.05)
plt.show()

In [None]:
# The three *_max ratios against fuzz_token_set_ratio, coloured by class.
plot = sns.pairplot(
    data=df_advanced[['ctc_max','csc_max','cwc_max','fuzz_token_set_ratio','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among MAX (COMMON TOKENS,WORDS, STOP-WORDS) & FUZZ TOKEN SET RATIO", y=1.05)
plt.show()

In [None]:
# The three *_min ratios against fuzz_token_sort_ratio, coloured by class.
plot = sns.pairplot(
    data=df_advanced[['ctc_min','csc_min','cwc_min','fuzz_token_sort_ratio','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among MIN (COMMON TOKENS,WORDS, STOP-WORDS) & FUZZ TOKEN SORT RATIO", y=1.05)
plt.show()

In [None]:
# The three *_max ratios against fuzz_token_sort_ratio, coloured by class.
plot = sns.pairplot(
    data=df_advanced[['ctc_max','csc_max','cwc_max','fuzz_token_sort_ratio','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among MAX (COMMON TOKENS,WORDS, STOP-WORDS) & FUZZ TOKEN SORT RATIO", y=1.05)
plt.show()

In [None]:
# The four fuzzy ratios against each other, coloured by class.
plot = sns.pairplot(
    data=df_advanced[['fuzz_ratio','fuzz_partial_ratio','fuzz_token_set_ratio','fuzz_token_sort_ratio','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among FUZZ RATIO, FUZZ PARTIAL, TOKEN SET & TOKEN SORT RATIOS", y=1.05)
plt.show()

In [None]:
# fuzz_ratio and fuzz_partial_ratio against the common word ratios.
plot = sns.pairplot(
    data=df_advanced[['fuzz_ratio','fuzz_partial_ratio','cwc_min','cwc_max','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among COMMON WORDS, FUZZ RATIO & FUZZ PARTIAL RATIO", y=1.05)
plt.show()

In [None]:
# fuzz_ratio and fuzz_partial_ratio against the common stop-word ratios.
plot = sns.pairplot(
    data=df_advanced[['fuzz_ratio','fuzz_partial_ratio','csc_min','csc_max','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among COMMON STOP-WORDS, FUZZ RATIO & FUZZ PARTIAL RATIO", y=1.05)
plt.show()

In [None]:
# fuzz_ratio and fuzz_partial_ratio against the common token ratios.
plot = sns.pairplot(
    data=df_advanced[['fuzz_ratio','fuzz_partial_ratio','ctc_min','ctc_max','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among COMMON TOKENS, FUZZ RATIO & FUZZ PARTIAL RATIO", y=1.05)
plt.show()

In [None]:
# Token set / token sort ratios against the common word ratios.
plot = sns.pairplot(
    data=df_advanced[['fuzz_token_set_ratio','fuzz_token_sort_ratio','cwc_min','cwc_max','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among COMMON WORDS, TOKEN SET RATIO & TOKEN SORT RATIO", y=1.05)
plt.show()

In [None]:
# Token set / token sort ratios against the common stop-word ratios.
plot = sns.pairplot(
    data=df_advanced[['fuzz_token_set_ratio','fuzz_token_sort_ratio','csc_min','csc_max','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among COMMON STOP-WORDS, TOKEN SET RATIO & TOKEN SORT RATIO", y=1.05)
plt.show()

In [None]:
# Token set / token sort ratios against the common token ratios.
plot = sns.pairplot(
    data=df_advanced[['fuzz_token_set_ratio','fuzz_token_sort_ratio','ctc_min','ctc_max','is_duplicate']],
    hue='is_duplicate',
    height=3,
    corner=True,
)
plot.fig.suptitle("Pair-plots among COMMON TOKENS, TOKEN SET RATIO & TOKEN SORT RATIO", y=1.05)
plt.show()

### Plotting violin, box & PDF plots of the fuzzy ratios:

In [None]:
# Violin, box and density plots of 'fuzz_ratio', split by is_duplicate.
plt.figure(figsize=(22,8))

# Optional seaborn styling.
sns.set_context("notebook", font_scale=1.0)
sns.set_style("whitegrid")

# 1st subplot - violin plot:
plt.subplot(1,3,1)
sns.violinplot(x='is_duplicate', y='fuzz_ratio', data=df_advanced)

# 2nd subplot - box plot:
plt.subplot(1,3,2)
sns.boxplot(x='is_duplicate', y='fuzz_ratio', data=df_advanced)

# 3rd subplot - one density curve per class:
plt.subplot(1,3,3)
for class_label in (0, 1):
    class_values = df_advanced[df_advanced['is_duplicate']==class_label]['fuzz_ratio']
    sns.distplot(class_values, label=str(class_label))
plt.legend(title='is_duplicate')  # without a legend the curve labels are never shown

plt.show()

In [None]:
# Violin, box and density plots of 'fuzz_token_set_ratio', split by is_duplicate.
plt.figure(figsize=(22,8))

# Optional seaborn styling.
sns.set_context("notebook", font_scale=1.0)
sns.set_style("whitegrid")

# 1st subplot - violin plot:
plt.subplot(1,3,1)
sns.violinplot(x='is_duplicate', y='fuzz_token_set_ratio', data=df_advanced)

# 2nd subplot - box plot:
plt.subplot(1,3,2)
sns.boxplot(x='is_duplicate', y='fuzz_token_set_ratio', data=df_advanced)

# 3rd subplot - one density curve per class:
plt.subplot(1,3,3)
for class_label in (0, 1):
    class_values = df_advanced[df_advanced['is_duplicate']==class_label]['fuzz_token_set_ratio']
    sns.distplot(class_values, label=str(class_label))
plt.legend(title='is_duplicate')  # without a legend the curve labels are never shown

plt.show()

In [None]:
# Violin, box and density plots of 'fuzz_token_sort_ratio', split by is_duplicate.
plt.figure(figsize=(22,8))

# Optional seaborn styling.
sns.set_context("notebook", font_scale=1.0)
sns.set_style("whitegrid")

# 1st subplot - violin plot:
plt.subplot(1,3,1)
sns.violinplot(x='is_duplicate', y='fuzz_token_sort_ratio', data=df_advanced)

# 2nd subplot - box plot:
plt.subplot(1,3,2)
sns.boxplot(x='is_duplicate', y='fuzz_token_sort_ratio', data=df_advanced)

# 3rd subplot - one density curve per class:
plt.subplot(1,3,3)
for class_label in (0, 1):
    class_values = df_advanced[df_advanced['is_duplicate']==class_label]['fuzz_token_sort_ratio']
    sns.distplot(class_values, label=str(class_label))
plt.legend(title='is_duplicate')  # without a legend the curve labels are never shown

plt.show()

### High-level observations from the pair-plots:
Below pair plots seem to separate the classes more than the rest of the combinations:
1. **fuzz_ratio vs ctc_min**
2. **fuzz_ratio vs cwc_min**
3. **token_set_ratio vs csc_min**
4. **token_sort_ratio vs ctc_min**

### T-SNE visualizations of the advanced features: 

In [None]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler

# T-SNE is expensive on high-dimensional data, so only the first 10,000 rows
# are embedded.
sampled_df = df_advanced[0:10000]
# Scale every feature column to [0, 1]; identifiers, text and the label are dropped.
X_arr = MinMaxScaler().fit_transform(sampled_df.drop(['id','qid1','qid2','question1','question2','is_duplicate'],axis=1))
y_arr = sampled_df[['is_duplicate']].values

print("Scaling completed!")

print("Generating T-SNE visualizations, this may take time if the dataset has many rows...")

# The iteration count is left at the library default (1000, the value that
# was previously passed explicitly): the keyword was renamed from `n_iter`
# to `max_iter`, so omitting it works with both old and new scikit-learn.
tsne2d_df = TSNE(
    n_components=2,
    init='random', # pca
    random_state=101,
    method='barnes_hut',
    verbose=2,
    angle=0.5
).fit_transform(X_arr)

In [None]:
# Put the 2-D embedding in a dataframe (columns x, y) and attach the class label.
tsne_df = pd.DataFrame({'x': tsne2d_df[:, 0], 'y': tsne2d_df[:, 1]})
tsne_df['label'] = y_arr
display(tsne_df.head())

In [None]:
# Scatter plot of the embedding, one colour per class label.
tsne_grid = sns.FacetGrid(tsne_df, hue='label', height=8)
tsne_grid.map(plt.scatter, 'x', 'y')
tsne_grid.add_legend()
plt.show()

### In the above T-SNE visualization, there are some areas where the blue points are completely separated from the orange points, thus, we can say that there are definitely some combinations among the advanced features that separate both these classes.

## Featurization using vectorizers:

In [None]:
!pip install --upgrade pip
!pip install spacy
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import tqdm
from tqdm import notebook

# Coerce both question columns to plain strings before vectorizing.
for question_column in ('question1', 'question2'):
    df_advanced[question_column] = df_advanced[question_column].apply(str)

df_advanced

In [None]:
# Fit TF-IDF on the text of both question columns together.
questions = list(df_advanced['question1']) + list(df_advanced['question2'])

tfidf = TfidfVectorizer(lowercase=False,)
tfidf.fit(questions)  # only the fitted vocabulary / idf values are needed, not the matrix
print("TF-IDF vectorizer created!")

# get_feature_names() was replaced by get_feature_names_out(); fall back to
# the old name only on scikit-learn versions that predate the new one.
try:
    vocabulary = tfidf.get_feature_names_out()
except AttributeError:
    vocabulary = tfidf.get_feature_names()

# dict with key: word and value: idf score of that word
word2tfidf = dict(zip(vocabulary, tfidf.idf_))

* After we find TF-IDF scores, we convert each question to a weighted average of word2vec vectors by these scores.
* Here we use a pre-trained GLOVE model which comes free with "Spacy". https://spacy.io/usage/vectors-similarity
* It is trained on Wikipedia and therefore, it is stronger in terms of word semantics.

In [None]:
# Persist the featurized dataframe as a pickle file named 'df_advanced.df'
# (relative path, i.e. in the current working directory).
df_advanced.to_pickle('df_advanced.df')

In [None]:
# Reload the featurized dataframe from a pickle file.
# NOTE(review): this path differs from the 'df_advanced.df' written by the
# to_pickle() cell — presumably the file was re-uploaded as an input dataset;
# confirm. Unpickling executes arbitrary code, so only load trusted files.
df_advanced = pd.read_pickle('../input/df-advanced/df_advanced.df')
display(df_advanced)

In [None]:
# en_vectors_web_lg, which includes over 1 million unique vectors.
#!python -m spacy download en_vectors_web_lg
#nlp = spacy.load('en_vectors_web_lg')

nlp = spacy.load('en_core_web_sm')

# Width of a token vector for this pipeline, probed once so that a question
# without any token still gets an all-zero vector of the right size (instead
# of a placeholder with a different shape).
embedding_dim = len(nlp("question")[0].vector)

# One vector per question1: the sum of its token vectors, each weighted by the
# TF-IDF score of the token (weight 0 for tokens missing from word2tfidf).
quest1_vecs = []
for question in notebook.tqdm(df_advanced['question1']):
    document = nlp(question)
    weighted_sum = np.zeros(embedding_dim)
    for word in document:
        weighted_sum += word.vector * word2tfidf.get(str(word), 0)
    quest1_vecs.append(weighted_sum)

# One row per question pair, one column per vector component.
df3_quest1 = pd.DataFrame(quest1_vecs, index=df_advanced.index)
df3_quest1

In [None]:
# en_vectors_web_lg, which includes over 1 million unique vectors.
#!python -m spacy download en_vectors_web_lg
#nlp = spacy.load('en_vectors_web_lg')

nlp = spacy.load('en_core_web_sm')

# Width of a token vector for this pipeline, probed once so that a question
# without any token still gets an all-zero vector of the right size (instead
# of a placeholder with a different shape).
embedding_dim = len(nlp("question")[0].vector)

# One vector per question2: the sum of its token vectors, each weighted by the
# TF-IDF score of the token (weight 0 for tokens missing from word2tfidf).
quest2_vecs = []
for question in notebook.tqdm(df_advanced['question2']):
    document = nlp(question)
    weighted_sum = np.zeros(embedding_dim)
    for word in document:
        weighted_sum += word.vector * word2tfidf.get(str(word), 0)
    quest2_vecs.append(weighted_sum)

# One row per question pair, one column per vector component.
df3_quest2 = pd.DataFrame(quest2_vecs, index=df_advanced.index)
df3_quest2

### Merging the dataframes:

In [None]:
# Give all four tables an index called 'id' so they can later be joined on it.
for vector_table in (df3_quest1, df3_quest2):
    vector_table.index.name = 'id'
for feature_table in (df_basic, df_advanced):
    feature_table.set_index('id', inplace=True)

# Strip the raw question columns; df1 also loses the label, which stays in df2.
df1 = df_basic.drop(columns=['qid1', 'qid2', 'question1', 'question2', 'is_duplicate'])
df2 = df_advanced.drop(columns=['qid1', 'qid2', 'question1', 'question2'])

In [None]:
# Show the first rows of every feature table, then the combined column count.
feature_groups = [
    ("Basic features: ", df1),
    ("Advanced features: ", df2),
    ("Question 1 vector features: ", df3_quest1),
    ("Question 2 vector features: ", df3_quest2),
]
for caption, table in feature_groups:
    print(caption)
    display(table.head())

print("Total no. of features: ", sum(table.shape[1] for _, table in feature_groups))

In [None]:
# Left-join the remaining tables onto df1, one after another, matching on 'id'.
for right_table in (df2, df3_quest1, df3_quest2):
    df1 = df1.merge(right_table, how='left', on='id')
display(df1)

In [None]:
# Write the merged feature table to 'finaldb.csv'; index=False leaves the
# dataframe index out of the file.
df1.to_csv('finaldb.csv',index=False)

In [None]:
!pip install --upgrade pip command
!pip install csv-to-sqlite
import csv_to_sqlite
print("----")
# all the usual options are supported
options = csv_to_sqlite.CsvOptions() 
input_files = ["./finaldb.csv"] # pass in a list of CSV files
csv_to_sqlite.write_csv(input_files, "finaldb.sqlite", options)
print("Sqlite file created successfully!")

In [None]:
import sqlite3
from contextlib import closing

# closing() releases the connection even when the query raises.
with closing(sqlite3.connect("./finaldb.sqlite")) as conn:
    table_rows = conn.execute("SELECT name from sqlite_master where type='table'").fetchall()

# Every row is a 1-tuple holding a table name. Joining them prints all tables
# (the same output as before when there is exactly one) and no longer fails
# with an IndexError when the database has no table at all.
print("Tables in the database: ", ", ".join(row[0] for row in table_rows))

In [None]:
from contextlib import closing

print("Fetching rows...")
# closing() releases the connection even when the query raises.
with closing(sqlite3.connect("./finaldb.sqlite")) as conn:
    finaldf = pd.read_sql_query(
                "SELECT * FROM finaldb;",conn)
print("Rows fetched successfully from the database!")

print("Converting strings to numerics...")
# reading the data from database converts all the data into string,
# thus we need to convert them back to numbers
for column in notebook.tqdm(list(finaldf.columns)):
    try:
        # one vectorised conversion per column instead of one call per cell
        finaldf[column] = pd.to_numeric(finaldf[column])
    except ValueError:
        # The column holds values that are not numbers. It is still replaced
        # by zeros (as before), but now this is reported instead of silent.
        print("Column '{}' could not be converted to numbers and was set to 0".format(column))
        finaldf[column] = 0

# removing the rows that have any NaN values:
# (`how` and `thresh` must not be passed together - recent pandas versions
# raise a TypeError for that - so only `how` is given)
finaldf.dropna(axis=0, how="any", inplace=True)

display(finaldf)

# target kept as a single-column dataframe
y = finaldf[['is_duplicate']]
print(type(y))
display(y)

# every remaining column is a model feature
X = finaldf.drop(['is_duplicate'],axis=1)
display(X)

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split. random_state makes the split (and therefore every score
# reported below) reproducible between runs; stratify=y keeps the share of
# duplicate / non-duplicate pairs the same in the train and the test part.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)
print("x_train:", x_train.shape)
print("x_test:", x_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

## Applying machine learning models:

### 1. Random model

In [None]:
# This function plots the confusion matrices given y_i, y_i_hat.
def plot_confusion_matrix(test_y, predict_y):
    """Plot the confusion, precision and recall matrices as three heatmaps.

    Parameters
    ----------
    test_y : array-like
        True class labels (0 or 1).
    predict_y : array-like
        Predicted class labels (0 or 1), one per entry of ``test_y``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    labels = [0, 1]
    # Passing the labels pins C to a 2x2 matrix that lines up with the tick
    # labels used below, even if one class never occurs in the inputs.
    # C[i, j] = number of points of class i that were predicted as class j
    C = confusion_matrix(test_y, predict_y, labels=labels)

    # Recall matrix A: every cell divided by the total of its ROW (original
    # class), so each non-empty row sums to 1.
    # C = [[1, 2],
    #      [3, 4]]
    # C.sum(axis=1) = [3, 7]
    # A = [[1/3, 2/3],
    #      [3/7, 4/7]]
    row_totals = C.sum(axis=1, keepdims=True)
    A = np.divide(C, row_totals, out=np.zeros(C.shape, dtype=float), where=row_totals != 0)

    # Precision matrix B: every cell divided by the total of its COLUMN
    # (predicted class), so each non-empty column sums to 1.
    # C.sum(axis=0) = [4, 6]
    # B = [[1/4, 2/6],
    #      [3/4, 4/6]]
    column_totals = C.sum(axis=0, keepdims=True)
    # `where=` leaves the pre-filled zeros in place for an empty row/column
    # instead of dividing by zero.
    B = np.divide(C, column_totals, out=np.zeros(C.shape, dtype=float), where=column_totals != 0)

    plt.figure(figsize=(20,4))
    cmap=sns.light_palette("orange")

    # raw counts are integers, so they are annotated without decimals
    plt.subplot(1, 3, 1)
    sns.heatmap(C, annot=True, cmap=cmap, fmt="d", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Confusion matrix")

    plt.subplot(1, 3, 2)
    # representing B in heatmap format
    sns.heatmap(B, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Precision matrix")

    plt.subplot(1, 3, 3)
    # representing A in heatmap format
    sns.heatmap(A, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Recall matrix")

    plt.show()

In [None]:
import tqdm
from tqdm import notebook
# `sklearn.metrics.classification` was a private module that newer
# scikit-learn versions no longer ship; the public package exports the same names.
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss

test_len = len(y_test)

# Random model: draw two uniform random numbers per test point and scale each
# pair so it sums to 1, giving a random probability for class 0 and class 1.
# One vectorised draw replaces the former Python loop over all test rows.
rand_probas = np.random.rand(test_len, 2)
rand_y_pred = rand_probas / rand_probas.sum(axis=1, keepdims=True)
print("Log-loss on test data using a random model: ", log_loss(y_test,rand_y_pred))
print(rand_y_pred.shape)
# the predicted class is the one with the larger random probability
rand_y_pred = np.argmax(rand_y_pred,axis=1)
plot_confusion_matrix(y_test,rand_y_pred)

### 2. Logistic Regression with hyperparameter tuning

In [None]:
# HYPER-PARAMETER TUNING:

from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

# Candidate values for alpha, the regularisation strength of SGDClassifier:
# the powers of ten from 10^-1 up to 10^4.
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
alpha = [10 ** exponent for exponent in range(-1, 5)]

# For every candidate: fit an SGD classifier with logistic loss on the training
# data, calibrate its probabilities with a sigmoid on the already fitted model
# (cv='prefit') and record the log loss measured on x_test.
# NOTE(review): the candidates are scored on the test split, not on a separate
# validation split.
print("Checking log loss of the hyperparameter alpha for the values - ",alpha)
log_error_array = []
for candidate in notebook.tqdm(alpha):
    clf = SGDClassifier(loss='log', penalty='l2', alpha=candidate, random_state=42)
    clf.fit(x_train, y_train)
    sig_clf = CalibratedClassifierCV(clf, cv='prefit', method="sigmoid")
    sig_clf.fit(x_train, y_train)
    log_pred_y = sig_clf.predict_proba(x_test)
    loss = log_loss(y_test, log_pred_y, labels=clf.classes_, eps=1e-15)
    log_error_array.append(loss)
    print('For value of alpha = ', candidate, ", the log loss is:",loss)

# Line plot of log loss against alpha, each point labelled (alpha, rounded loss).
print("Generating the plot to determine best alpha...")
fig, ax = plt.subplots()
ax.plot(alpha, log_error_array, c='g')
for point_alpha, point_loss in zip(alpha, log_error_array):
    ax.annotate((point_alpha, np.round(point_loss, 3)), (point_alpha, point_loss))
plt.grid()
plt.title("Cross Validation Error for each alpha using log loss (lower is better)")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

In [None]:
# Refit logistic regression with the alpha that gave the lowest log loss above,
# calibrate it, and report its train / test log loss and confusion matrices.
print("Training for Logistic regression has started...")
best_alpha = np.argmin(log_error_array)
clf = SGDClassifier(loss='log', penalty='l2', alpha=alpha[best_alpha], random_state=42)
clf.fit(x_train, y_train)
sig_clf = CalibratedClassifierCV(clf, cv='prefit', method="sigmoid")
sig_clf.fit(x_train, y_train)
print("Training completed!")

print("Generating confusion matrices to compare with the random model...")
# train split first, test split second; log_pred_y ends up holding the test
# probabilities that feed the confusion matrix below
for caption, features, target in (
    ("The train log loss is:", x_train, y_train),
    ("The test log loss is:", x_test, y_test),
):
    log_pred_y = sig_clf.predict_proba(features)
    print('For values of best alpha = ', alpha[best_alpha], caption,
          log_loss(target, log_pred_y, labels=clf.classes_, eps=1e-15))
# the class with the higher probability becomes the predicted label
log_pred_y = np.argmax(log_pred_y, axis=1)
print("Total number of data points :", len(log_pred_y))
plot_confusion_matrix(y_test, log_pred_y)

### 3. Linear SVM with hyperparameter tuning

In [None]:
# HYPER-PARAMETER TUNING:

# Candidate values for alpha, the regularisation strength of SGDClassifier:
# the powers of ten from 10^-3 up to 10^2.
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
alpha = [10 ** x for x in range(-3, 3)] # hyperparam for SGD classifier.

# For every candidate: fit an L1-penalised SGD classifier with hinge loss
# (a linear SVM) on the training data, calibrate it with a sigmoid on the
# already fitted model (cv='prefit') and record the log loss on x_test.
# NOTE(review): the candidates are scored on the test split, not on a separate
# validation split.
print("Checking log-loss of the hyperparameter alpha for the values - ",alpha)
log_error_array=[]
for i in alpha:
    clf = SGDClassifier(alpha=i, penalty='l1', loss='hinge', random_state=42)
    clf.fit(x_train, y_train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv='prefit')
    sig_clf.fit(x_train, y_train)
    linsvm_pred_y = sig_clf.predict_proba(x_test)
    # computed once and reused for both the record and the message
    loss = log_loss(y_test, linsvm_pred_y, labels=clf.classes_, eps=1e-15)
    log_error_array.append(loss)
    print('For values of alpha = ', i, "The log loss is:",loss)

# Line plot of log loss against alpha, each point labelled (alpha, rounded loss).
print("Generating the plot to determine best alpha...")
fig, ax = plt.subplots()
ax.plot(alpha, log_error_array,c='g')
for i, txt in enumerate(np.round(log_error_array,3)):
    ax.annotate((alpha[i],np.round(txt,3)), (alpha[i],log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha using hinge loss (lower is better)")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

In [None]:
# Refit the linear SVM (hinge loss, L1 penalty) with the alpha that gave the
# lowest log loss in the search above, then calibrate its probabilities.
print("Training for Linear SVM has started...")
best_alpha = np.argmin(log_error_array)
svm_settings = dict(loss='hinge', penalty='l1', alpha=alpha[best_alpha], random_state=42)
clf = SGDClassifier(**svm_settings)
clf.fit(x_train, y_train)
# sigmoid calibration on top of the already fitted classifier
sig_clf = CalibratedClassifierCV(clf, cv='prefit', method="sigmoid")
sig_clf.fit(x_train, y_train)
print("Training completed!")

In [None]:
# Report the calibrated linear SVM's log loss on both splits and plot its
# confusion matrices for the test split.
print("Generating confusion matrices to compare with the random model...")
linsvm_pred_y = sig_clf.predict_proba(x_train)
# alpha[best_alpha] is the single value the model was trained with; printing
# `alpha` itself showed the whole candidate list as the "best alpha".
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(y_train, linsvm_pred_y, labels=clf.classes_, eps=1e-15))
linsvm_pred_y = sig_clf.predict_proba(x_test)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(y_test, linsvm_pred_y, labels=clf.classes_, eps=1e-15))
# the class with the higher probability becomes the predicted label
linsvm_pred_y =np.argmax(linsvm_pred_y,axis=1)
print("Total number of data points :", len(linsvm_pred_y))
plot_confusion_matrix(y_test, linsvm_pred_y)

### 4. GBDT using XGBoost with hyperparameter tuning:

In [None]:
import xgboost as xgb
from sklearn.metrics import log_loss

# DMatrix versions of both splits; the watchlist makes xgb.train print the
# metric for each of them under the names 'train' and 'valid'.
d_train = xgb.DMatrix(x_train,label=y_train)
d_test = xgb.DMatrix(x_test,label=y_test)
watchlist = [(d_train,'train'),(d_test,'valid')]

# eta (the learning rate) is documented with the range [0, 1], so the sweep
# covers the powers of ten up to 1 only (10 and 100 were tried before).
eta = [10 ** x for x in range(-3,1)]

print("Checking log-loss of the hyperparameter eta for the values - ",eta)
log_error_array=[]
parms = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
}
for i in eta:
    print("For the value of eta = ",i,", below are the train & test loss: ")
    parms['eta'] = i
    # 50 boosting rounds, metric printed every 10 rounds.
    # NOTE(review): early_stopping_rounds=0 is kept as it was; how 0 is
    # interpreted may depend on the installed xgboost version - confirm intent.
    bst = xgb.train(parms,d_train,50,watchlist,early_stopping_rounds=0,verbose_eval=10)
    # binary:logistic yields the predicted probability of class 1
    xgb_pred_y = bst.predict(d_test)
    # test log loss per eta, so the best value can be picked with np.argmin
    log_error_array.append(log_loss(y_test, xgb_pred_y))
    

### From the above output, eta=1 appears to be the best value, since it gives the lowest loss of all the values tried. So let's see whether we can do even better with it:

In [None]:
# Train with eta = 1 for up to 400 rounds; training stops early once the
# log loss on the last watchlist entry ('valid') has not improved for 20 rounds.
d_train = xgb.DMatrix(x_train,label=y_train)
d_test = xgb.DMatrix(x_test,label=y_test)
watchlist = [(d_train,'train'),(d_test,'valid')]
parms = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
    'eta': 1,
}
# (the extra, never used DMatrix copy of the training data is no longer built)
bst = xgb.train(parms,d_train,400,watchlist,early_stopping_rounds=20,verbose_eval=10)
# binary:logistic yields the predicted probability of class 1
xgb_pred_y = bst.predict(d_test)

In [None]:
# Applying the XGBoost algorithm with the best eta obtained above:
from sklearn.metrics import log_loss

print("Training for XGBoosted Gradient Decision Tree has started...")
d_train = xgb.DMatrix(x_train,label=y_train)
d_test = xgb.DMatrix(x_test,label=y_test)
watchlist = [(d_train,'train'),(d_test,'valid')]
# The parameters are built here, so this cell no longer relies on a `parms`
# dict left behind by an earlier cell.
parms = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
    'eta': 1,
}
# The model is now really trained in this cell (up to 400 rounds, stopping
# after 20 rounds without improvement on 'valid'); before, the cell announced
# training but only reused whatever `bst` an earlier cell had produced.
bst = xgb.train(parms,d_train,400,watchlist,early_stopping_rounds=20,verbose_eval=10)
print("Training completed!")

# binary:logistic yields the predicted probability of class 1
xgb_pred_y = bst.predict(d_test)
print("The test log loss is:", log_loss(y_test, xgb_pred_y))

# Plotting the confusion matrix:
# probabilities above 0.5 count as class 1
xgb_pred_y =np.array(xgb_pred_y>0.5,dtype=int)
print("Total number of data points :", len(xgb_pred_y))
plot_confusion_matrix(y_test, xgb_pred_y)

### We can see that GBDT (using XGBoost) obtains much better confusion-matrix, precision and recall scores overall; thus GBDT would be the best algorithm to go with in this particular scenario.