# Quora Question Pairs with Bert

# context

Where else but Quora can a physicist help a chef with a math problem and get cooking tips in return? Quora is a place to gain and share knowledge—about anything. It’s a platform to ask questions and connect with people who contribute unique insights and quality answers. This empowers people to learn from each other and to better understand the world.

Over 100 million people visit Quora every month, so it's no surprise that many people ask similarly worded questions. Multiple questions with the same intent can cause seekers to spend more time finding the best answer to their question, and make writers feel they need to answer multiple versions of the same question. Quora values canonical questions because they provide a better experience to active seekers and writers, and offer more value to both of these groups in the long term.

Currently, Quora uses a Random Forest model to identify duplicate questions. In this competition, Kagglers are challenged to tackle this natural language processing problem by applying advanced techniques to classify whether question pairs are duplicates or not. Doing so will make it easier to find high quality answers to questions resulting in an improved experience for Quora writers, seekers, and readers.

In short, detecting whether a question is a duplicate improves search efficiency and also helps the platform save storage space.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file found under the Kaggle input directory, one full path per line.
input_files = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Dataset


In [None]:
# Load the training CSV straight from its zip archive and preview the first rows.
data = pd.read_csv("/kaggle/input/quora-question-pairs/train.csv.zip")
data.head()

The task is to judge whether question 1 and question 2 ask the same thing; `is_duplicate` is used as the label.

# Task analysis
1. Overall this is a text matching task
2. Need an indicator to measure whether question 1 and question 2 are similar
3. Unstructured data(text) is stored in structured tables

# Code
Because my English level is poor, the comment part of the code is in Chinese

In [None]:
# Import the required packages; this time PyTorch is used to run a BERT model on the data
import torch
import torch.nn as nn

# Plotting libraries used for the charts below
import seaborn as sns
import matplotlib.pyplot as plt

# Progress bar
from tqdm import tqdm

# sklearn is used to split the dataset
from sklearn.model_selection import train_test_split

# Hugging Face transformers package
# NOTE: the imported BertModel name is re-bound later in this notebook by a
# local classifier class of the same name, which accesses the original via
# transformers.BertModel.
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
transformers.logging.set_verbosity_error()

In [None]:
# Choose the compute device: the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Fix the random seeds (NumPy, PyTorch CPU and CUDA) and request deterministic cuDNN kernels
SEED = 1024

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Analysis

In [None]:
# Read the training data and inspect its basic structure, including missing values
train_df = pd.read_csv("/kaggle/input/quora-question-pairs/train.csv.zip")
train_df.info()

In [None]:
# Drop rows with missing values; because the data is text, look at the question lengths first
train_df.dropna(inplace=True)
# Length is measured as the number of space-separated pieces in each question
train_sentences_lens = train_df['question1'].apply(lambda x: len(x.split(' '))).tolist()
train_sentences_lens.extend(train_df['question2'].apply(lambda x: len(x.split(' '))).tolist())
# Plot the length distribution of question1 and question2 combined
# NOTE(review): distplot is reported as deprecated in recent seaborn releases — confirm against the installed version
sns.distplot(train_sentences_lens)

In [None]:
# Very few questions are longer than 40 words, so the maximum text length
# (in words) that the model reads per question is set to 40
MAX_LEN = 40

In [None]:
# Plot the balance of positive and negative samples; every record in this
# dataset holds two questions plus a label saying whether they are similar
def pie_chart(similar_questions_num, different_questions_num, set_type):
    """Show a pie chart comparing similar and different question-pair counts.

    Args:
        similar_questions_num: number of pairs labelled as similar.
        different_questions_num: number of pairs labelled as different.
        set_type: text used as the chart title.
    """
    _, axis = plt.subplots()
    axis.set_title(set_type)
    axis.pie(
        [similar_questions_num, different_questions_num],
        labels=('Similiar', 'Different'),
        autopct='%1.2f%%',
        shadow=True,
        startangle=90,
    )

    plt.show()

# The label sum is the number of duplicate pairs; every other row is a non-duplicate pair
similar_samples_num = sum(train_df['is_duplicate'].values)
pie_chart(similar_samples_num, len(train_df['is_duplicate']) - similar_samples_num, 'train set')

In [None]:
# See how many distinct questions there are in total
qids = pd.Series(list(train_df['qid1']) + list(train_df['qid2']))

print ('Unique Questions number: {}\n'.format(len(np.unique(qids))))

# The five question ids that occur most often across both id columns
q_vals = qids.value_counts().head(5)
print ('Top 5 most frequently asked questions: ')

# Series.items() works on every pandas version; iteritems() was removed in pandas 2.0.
for qid, count in q_vals.items():
    # Fetch the text of the question itself: a qid found in the qid1 column
    # belongs to question1, one found in the qid2 column belongs to question2.
    as_first = train_df.loc[train_df['qid1'] == qid, 'question1']
    as_second = train_df.loc[train_df['qid2'] == qid, 'question2']
    # value_counts() only yields ids present in one of the two columns,
    # so at least one of the two selections is non-empty.
    text = as_first.iloc[0] if not as_first.empty else as_second.iloc[0]
    print(f"{text} count: {count}")

q_vals=q_vals.values

In [None]:
# Check whether the data contains fully repeated (qid1, qid2) pairs
duplicate_rows = train_df[train_df.duplicated(['qid1','qid2'])]
print ("Number of duplicate questions : ", len(duplicate_rows))

In [None]:
# No fully repeated pairs, good. Next compare the number of distinct
# questions with the number of questions that occur more than once
x = ["Unique" , "Repeated"]
y =  [len(np.unique(qids)), np.sum(qids.value_counts() > 1)]

plt.figure(figsize=(10, 8))
plt.title ("Unique and Repeated questions counts")
# Pass the data by keyword: newer seaborn releases no longer accept x/y
# positionally, while the keyword form works on old and new versions alike.
sns.barplot(x=x, y=y)
plt.show()


In [None]:
# Look at how the words of question 1 and question 2 overlap: number of common words
def common_words(row):
    """Return, as a float, how many distinct tokens both questions share.

    Tokens are obtained by splitting on single spaces, then lower-casing and
    stripping each piece.

    Args:
        row: mapping with string entries 'question1' and 'question2'.
    """
    def tokenize(text):
        return {piece.lower().strip() for piece in text.split(" ")}

    overlap = tokenize(row['question1']).intersection(tokenize(row['question2']))
    return float(len(overlap))

# Compute the common-word count for every row
train_df['common_words'] = train_df.apply(common_words, axis=1)
plt.figure(figsize=(15, 10))

# Right panel: distribution of the count for duplicate (red) and non-duplicate (blue) pairs
plt.subplot(1,2,2)
sns.distplot(train_df[train_df['is_duplicate'] == 1]['common_words'][0:] , label = "1", color = 'red')
sns.distplot(train_df[train_df['is_duplicate'] == 0]['common_words'][0:] , label = "0" , color = 'blue' )

# Left panel: violin plot of the count for each label value
plt.subplot(1,2,1)
sns.violinplot(x = 'is_duplicate', y = 'common_words', data = train_df[0:])

plt.show()

**我们可以看到，问题一和问题二中的独有词的分布或多或少是接近的，无论是在正样本还是负样本中**

In [None]:
# We can also look at the share of words that appear in both question 1 and question 2
def shared_words(row):
    """Return the shared-token ratio of a question pair.

    The ratio is the number of distinct tokens present in both questions
    divided by the sum of the two questions' distinct-token counts. Tokens are
    obtained by splitting on single spaces, then lower-casing and stripping.

    Args:
        row: mapping with string entries 'question1' and 'question2'.
    """
    first_tokens = {piece.lower().strip() for piece in row['question1'].split(" ")}
    second_tokens = {piece.lower().strip() for piece in row['question2'].split(" ")}

    shared_count = len(first_tokens.intersection(second_tokens))
    total_count = len(first_tokens) + len(second_tokens)
    return float(shared_count) / total_count

# Compute the shared-word ratio for every row
train_df['shared_words'] = train_df.apply(shared_words, axis=1)
plt.figure(figsize=(15, 10))

# Right panel: distribution of the ratio for duplicate (red) and non-duplicate (green) pairs
plt.subplot(1,2,2)
sns.distplot(train_df[train_df['is_duplicate'] == 1]['shared_words'][0:] , label = "1", color = 'red')
sns.distplot(train_df[train_df['is_duplicate'] == 0]['shared_words'][0:] , label = "0" , color = 'green' )

# Left panel: violin plot of the ratio for each label value
plt.subplot(1,2,1)
sns.violinplot(x = 'is_duplicate', y = 'shared_words', data = train_df[0:])

plt.show()

**可以看出共有词的分布在正负样本中的分布还是有比较大的差异**

# Define a Bert model and read data into the model

In [None]:
# Name the pretrained model to use and load its tokenizer
BERT_VERSION = 'bert-base-uncased'
# Input width of the classifier's linear layer, which is applied to BERT's pooled output
POOLED_OUTPUT_DIM = 768 
tokenizer = BertTokenizer.from_pretrained(BERT_VERSION)

In [None]:
# Split the training data into a training set and a validation set (10% held out)
train_df, val_df = train_test_split(train_df, test_size=0.1)
# Reset to a fresh 0..n-1 index on both frames after the split
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [None]:
# Dataset wrapper that makes the question pairs easy for BERT to consume
class BertDataSet:
    """Indexable dataset that tokenizes question pairs for BERT.

    Every item is encoded as ``[CLS] question1 [SEP] question2 [SEP] ... [PAD]``
    and padded or truncated to ``2 * MAX_LEN + 3`` tokens.
    """

    def __init__(self, first_questions, second_questions, targets, tokenizer):
        """Keep the parallel question/label sequences and the tokenizer.

        Args:
            first_questions: indexable collection of first questions.
            second_questions: indexable collection of second questions.
            targets: indexable collection of labels, or ``None`` when the
                data is unlabeled.
            tokenizer: object exposing ``encode_plus``.
        """
        self.tokenizer = tokenizer
        self.targets = targets
        self.first_questions = first_questions
        self.second_questions = second_questions
        self.length = len(first_questions)

    def __len__(self):
        """Return the number of question pairs."""
        return self.length

    def __getitem__(self, item):
        """Return the tensors for the pair at position ``item``.

        The result maps ``ids``, ``mask`` and ``token_type_ids`` to long
        tensors, and ``targets`` to a long tensor holding the label (or the
        plain integer 0 when no targets were supplied).
        """
        # Collapse redundant whitespace inside each question
        question_a = " ".join(str(self.first_questions[item]).split())
        question_b = " ".join(str(self.second_questions[item]).split())

        encoded = self.tokenizer.encode_plus(
            question_a,
            question_b,
            add_special_tokens=True,
            padding='max_length',
            # room for two questions plus the three special tokens
            max_length=2 * MAX_LEN + 3,
            truncation=True,
        )

        # Without targets, 0 is used as a placeholder label
        if self.targets is None:
            label = 0
        else:
            label = torch.tensor(int(self.targets[item]), dtype=torch.long)

        sample = {
            output_key: torch.tensor(encoded[encoded_key], dtype=torch.long)
            for output_key, encoded_key in (
                ("ids", "input_ids"),
                ("mask", "attention_mask"),
                ("token_type_ids", "token_type_ids"),
            )
        }
        sample["targets"] = label
        return sample

In [None]:
# Build the dataset and return a loader over it
def get_data_loader(df, targets, batch_size, shuffle, tokenizer):
    """Wrap the question columns of ``df`` in a ``DataLoader``.

    Args:
        df: frame providing the "question1" and "question2" columns.
        targets: labels aligned with ``df``, or ``None`` for unlabeled data.
        batch_size: number of pairs per batch.
        shuffle: whether the loader reshuffles the data.
        tokenizer: tokenizer handed to the dataset.

    Returns:
        A ``torch.utils.data.DataLoader`` over a ``BertDataSet``.
    """
    question_pairs = BertDataSet(
        df["question1"].values,
        df["question2"].values,
        targets,
        tokenizer,
    )
    return torch.utils.data.DataLoader(
        question_pairs, batch_size=batch_size, shuffle=shuffle
    )

In [None]:
# Batch size used for training; the validation loader uses 4 * BS
BS = 128

In [None]:
# Build the loaders for the training data and the validation data
train_data_loader = get_data_loader(
    df=train_df,
    targets=train_df["is_duplicate"].values,
    batch_size=BS,
    shuffle=True,
    tokenizer=tokenizer
)

# Validation data is only scored, never trained on, so keep it in a fixed
# order: the batch-averaged validation metric then sees the same batches at
# every evaluation, and successive checkpoints are compared like for like.
val_data_loader = get_data_loader(
    df=val_df,
    targets=val_df["is_duplicate"].values,
    batch_size=4 * BS,
    shuffle=False,
    tokenizer=tokenizer
)

In [None]:
# Define the BERT classifier
class BertModel(nn.Module):
    """Pretrained BERT encoder followed by dropout and a one-logit linear head.

    The encoder is loaded through ``transformers.BertModel`` explicitly, since
    this class itself is named ``BertModel``.
    """

    def __init__(self, bert_path):
        """Load the pretrained encoder named by ``bert_path`` and build the head."""
        super().__init__()
        self.bert_path = bert_path
        self.bert = transformers.BertModel.from_pretrained(self.bert_path)
        self.dropout = nn.Dropout(0.3)
        self.out = nn.Linear(POOLED_OUTPUT_DIM, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw (pre-sigmoid) output of the linear head for a batch."""
        # With return_dict=False the encoder returns a tuple; only the second
        # element, the pooled output, is used here.
        _sequence_output, pooled_output = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )

        # Dropout on the pooled output to limit overfitting
        return self.out(self.dropout(pooled_output))

# For convenience the model is instantiated right here and moved to the chosen device
model = BertModel(BERT_VERSION).to(device)

In [None]:
# Binary classification, so use binary cross-entropy on sigmoid-scaled outputs
def loss_fn(outputs, targets):
    """Return the mean binary cross-entropy between logits and 0/1 targets.

    Args:
        outputs: raw (pre-sigmoid) model outputs holding one value per sample,
            e.g. shape ``(batch, 1)``.
        targets: float tensor of 0/1 labels with one entry per sample.

    Returns:
        A scalar tensor with the loss averaged over the batch.
    """
    # Align the logits with the targets' shape. A bare squeeze() would also
    # drop the batch dimension when the batch holds a single sample, and the
    # loss would then reject the mismatching shapes.
    logits = outputs.reshape(targets.shape)
    # BCEWithLogitsLoss applies the sigmoid internally in a numerically stable
    # way, so large-magnitude logits do not saturate the loss.
    return nn.BCEWithLogitsLoss()(logits, targets)

In [None]:
# Compute the loss on the validation set and turn it into a perplexity
def calculate_perplexity(data_loader, model, device):
    """Return ``exp`` of the mean per-batch loss over ``data_loader``.

    The model is put in eval mode for the pass and switched back to train
    mode before returning.

    Args:
        data_loader: iterable of batches with "ids", "mask",
            "token_type_ids" and "targets" entries.
        model: the classifier to evaluate.
        device: device the batch tensors are moved to.
    """
    model.eval()
    batch_losses = []

    # Gradients are not needed for evaluation, so do not track them
    with torch.no_grad():
        for batch in data_loader:
            long_inputs = {
                key: batch[key].to(device, dtype=torch.long)
                for key in ("ids", "mask", "token_type_ids")
            }
            targets = batch["targets"].to(device, dtype=torch.float)

            outputs = model(**long_inputs)
            batch_losses.append(loss_fn(outputs, targets).item())

    model.train()

    mean_loss = sum(batch_losses) / len(data_loader)
    return np.exp(mean_loss)

# Define Training

In [None]:
# Everything is ready: define the training function
def train_loop(epochs, train_data_loader, val_data_loader, model, optimizer, device, scheduler=None):
    """Train ``model`` and save the weights with the best validation perplexity.

    Every 100 iterations the average training loss of the last 100 batches is
    printed. Every 500 iterations the validation perplexity is recomputed, and
    the model's state dict is written to 'saved_model' whenever it improves.

    Args:
        epochs: number of passes over ``train_data_loader``.
        train_data_loader: batches with "ids", "mask", "token_type_ids" and
            "targets" entries used for training.
        val_data_loader: batches of the same layout used for validation.
        model: the classifier being trained.
        optimizer: optimizer updating the model's parameters.
        device: device the batch tensors are moved to.
        scheduler: optional learning-rate scheduler, stepped once per batch.
    """
    it = 1
    total_loss = 0
    curr_perplexity = None
    best_perplexity = None

    model.train()
    for epoch in range(epochs):
        print('Epoch: ', epoch + 1)
        for batch in train_data_loader:
            ids = batch["ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.float)

            optimizer.zero_grad()

            # Forward pass and loss
            outputs = model(ids, mask=mask, token_type_ids=token_type_ids)

            loss = loss_fn(outputs, targets)
            total_loss += loss.item()

            # Backward pass: compute the gradients
            loss.backward()

            # Update the model parameters
            optimizer.step()

            # Advance the schedule once per optimizer step. The scheduler is
            # built from the total number of training batches, so stepping it
            # only at evaluation time would leave the rate almost undecayed.
            if scheduler is not None:
                scheduler.step()

            # Report the training state every 100 iterations
            if it % 100 == 0:

                # Recompute the validation perplexity every 500 iterations
                if it % 500 == 0:
                    curr_perplexity = calculate_perplexity(val_data_loader, model, device)

                    # Keep the parameters of the best model seen so far
                    if best_perplexity is None or curr_perplexity < best_perplexity:
                        torch.save(model.state_dict(), 'saved_model')
                        best_perplexity = curr_perplexity

                print('| Iter', it, '| Avg Train Loss', total_loss / 100, '| Dev Perplexity', curr_perplexity)
                total_loss = 0

            it += 1

In [None]:
# Function that runs the training
def run(model, train_df, device, train_data_loader, val_data_loader):
    """Set up the optimizer and linear LR schedule, then run one training epoch.

    Args:
        model: the classifier to train.
        train_df: accepted but not used inside this function.
        device: device the batches are moved to.
        train_data_loader: loader over the training data.
        val_data_loader: loader over the validation data.
    """
    epoch_count = 1
    learning_rate = 3e-5

    # One scheduler step is planned for every training batch
    total_steps = int(len(train_data_loader) * epoch_count)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    train_loop(
        epoch_count,
        train_data_loader,
        val_data_loader,
        model,
        optimizer,
        device,
        scheduler,
    )

# Train

In [None]:
# Train the model
run(model, train_df, device, train_data_loader, val_data_loader)

# Test and get result

In [None]:
# Read the test data and preview the first rows
test_df = pd.read_csv("/kaggle/input/quora-question-pairs/test.csv")
test_df.head()

In [None]:
# Predict on the test set; the model is large and this takes a while, so a progress bar is shown
def test(model, test_df, device):
    """Return the predicted duplicate probability for every row of ``test_df``.

    Args:
        model: trained classifier producing one raw output per pair.
        test_df: frame providing the "question1" and "question2" columns.
        device: device the batches are moved to.

    Returns:
        A 1-D numpy array with one probability per row, in row order.
    """
    test_dataset = BertDataSet(
        first_questions=test_df["question1"].values,
        second_questions=test_df["question2"].values,
        targets=None,
        tokenizer=tokenizer
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=512
    )

    # Collect each batch's result on the CPU and concatenate once at the end;
    # re-concatenating onto a growing device tensor copies all earlier
    # predictions on every batch.
    batch_probabilities = []

    with torch.no_grad():
        model.eval()
        for batch in tqdm(test_data_loader):
            ids = batch["ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)

            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            # Flatten explicitly so a single-row result stays 1-D
            batch_probabilities.append(torch.sigmoid(outputs).reshape(-1).cpu())

    # No rows at all: return an empty 1-D float array
    if not batch_probabilities:
        return torch.empty(0, dtype=torch.float).numpy()

    return torch.cat(batch_probabilities).numpy()

# Run inference over the whole test set and check how many predictions came back
predictions = test(model, test_df, device)
len(predictions)

In [None]:
# Store the predictions in the test frame
test_df['is_duplicate'] = predictions

In [None]:
# Save the test ids and predictions to submission.csv (without the index column)
test_df[['test_id', 'is_duplicate']].to_csv('./submission.csv', index=False)

# Demonstration

In [None]:
# Helper used for the demonstration below; it is optional
def eval(model, tokenizer, first_question, second_question, device):
    """Score one question pair and print whether the model finds them similar.

    NOTE: this function shadows Python's built-in ``eval`` in this namespace.

    Args:
        model: trained classifier producing one raw output per pair.
        tokenizer: tokenizer exposing ``encode_plus``.
        first_question: text of the first question.
        second_question: text of the second question.
        device: device the input tensors are created on.
    """
    inputs = tokenizer.encode_plus(
        first_question,
        second_question,
        add_special_tokens=True,
        # Cap the encoded pair at the tokenizer's maximum model length so that
        # over-long questions cannot exceed what the model accepts.
        truncation=True,
    )

    # Wrap each encoded sequence in a list to form a batch of one
    ids = torch.tensor([inputs["input_ids"]], dtype=torch.long, device=device)
    mask = torch.tensor([inputs["attention_mask"]], dtype=torch.long, device=device)
    token_type_ids = torch.tensor([inputs["token_type_ids"]], dtype=torch.long, device=device)

    with torch.no_grad():
        model.eval()
        output = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        prob = torch.sigmoid(output).item()

        print("questions [{}] and [{}] are {} with score {}".format(first_question, second_question, 'similar' if prob > 0.5 else 'not similar', prob))

In [None]:
# Feed in two questions to see whether the model considers them similar
first_question = "how to register on hackerrank with google account?"
second_question = "Can I sign using google account on hackerrank?"

eval(model, tokenizer, first_question, second_question, device)