In [None]:
pip install transformers

In [None]:
import transformers
from transformers import BertModel, BertTokenizer

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Seed every random number generator used by this notebook with one shared value.
RANDOM_SEED = 4144959
for seed_fn in (random.seed, np.random.seed):
    seed_fn(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [None]:
# Prefer the GPU when one is present; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# torch.cuda.get_device_name() raises when CUDA is unavailable, so only query
# it on the GPU path; otherwise the CPU fallback above could never be reached.
if device.type == 'cuda':
    print(torch.cuda.get_device_name())
else:
    print('CUDA not available; running on CPU')
df_test = pd.read_csv('../input/quora-question-pairs/test.csv')
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# Length (in tokens) that every encoded question pair is padded/truncated to.
MAX_LEN = 335

In [None]:
# Report the (rows, columns) size of the loaded test frame.
test_shape = df_test.shape
print('Test set shape', test_shape)

In [None]:
class SimilarityMeasurer(nn.Module):
    """BERT encoder followed by a two-layer linear head producing 2 logits.

    The attribute names ``bert`` and ``out`` are part of the saved
    state_dict keys and must not be renamed.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.Linear(128, 2),
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the head's output for the first token of each sequence.

        The three arguments are forwarded unchanged to the BERT encoder.
        The result has one row per input sequence and 2 columns (raw,
        un-normalised scores).
        """
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # First element of the encoder output holds the per-token hidden
        # states; take the vector at sequence position 0 for every item.
        token_states = encoder_outputs[0]
        first_token_state = token_states[:, 0]
        return self.out(first_token_state)

In [None]:
# Rebuild the architecture, then restore the saved weights into it.
Model = SimilarityMeasurer()
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written from a GPU session still loads on a CPU-only machine.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
Model.load_state_dict(
    torch.load(
        '../input/quora-question-pairs-model-state/model_state.pkl',
        map_location=device,
    )
)
# eval() puts every sub-module in inference mode (e.g. disables dropout);
# chained into the assignment so the cell produces no display output.
Model = Model.to(device).eval()

In [None]:
import csv

In [None]:
# Score every test pair with the model and write the submission CSV
# (columns: test_id, is_duplicate).
Model.eval()  # make sure inference-mode behaviour is active before scoring
with open('./submission.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["test_id", "is_duplicate"])

    n_rows = len(df_test)
    # Missing questions are loaded by pandas as NaN (a float), which the
    # tokenizer rejects; treat them as empty strings instead of crashing.
    questions1 = df_test['question1'].fillna('').astype(str)
    questions2 = df_test['question2'].fillna('').astype(str)

    with torch.no_grad():
        # Iterating the columns directly avoids building a full row Series
        # twice per iteration as df_test.iloc[i][...] did.
        for i, (text1, text2) in enumerate(zip(questions1, questions2)):
            encoded = tokenizer.encode_plus(
                text=text1,
                text_pair=text2,
                padding='max_length',
                truncation=True,
                max_length=MAX_LEN,
                return_tensors='pt'
            )
            logits = Model(
                input_ids=encoded['input_ids'].to(device),
                attention_mask=encoded['attention_mask'].to(device),
                token_type_ids=encoded['token_type_ids'].to(device)
            )
            probs = F.softmax(logits, dim=1)
            # Write the probability of the "duplicate" class, not the max over
            # both classes (which is always >= 0.5 and ignores which class won).
            # Assumes class index 1 corresponds to is_duplicate == 1 -- TODO
            # confirm against the training notebook's label encoding.
            ans = probs[0, 1].item()
            # The row position is used as test_id, as in the original code.
            writer.writerow([i, ans])
            if i % 1000 == 0:
                print(f"Progress: {i}/{n_rows}")
        