In [1]:
import torch
torch.__version__

'1.7.1+cu110'

In [2]:
from simcse import SimCSE

In [3]:
import pandas as pd
import torch
import numpy as np

In [4]:
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader, TensorDataset

In [5]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [6]:
from transformers import BertConfig, BertModel, BertTokenizer

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
from tqdm import tqdm

In [9]:
import gc
cuda = torch.device('cuda')     # Default CUDA device

# Read data

In [10]:
df_All = pd.read_csv('../quora-question-pairs/train/train.csv')
len(df_All)

404290

In [11]:
df = df_All[:int(len(df_All))]
df.shape

(404290, 6)

In [12]:
df = df.dropna()
df.reset_index(inplace = True)
df.shape

(404287, 7)

In [13]:
idx = 1

In [14]:
df.iloc[idx]['question1']

'What is the story of Kohinoor (Koh-i-Noor) Diamond?'

In [15]:
df.iloc[idx]['question2']

'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?'

# encoder

In [16]:
class SimcseModel(nn.Module):
    """Simcse有监督模型定义"""
    def __init__(self, pretrained_model: str, pooling: str):
        super(SimcseModel, self).__init__()
        # config = BertConfig.from_pretrained(pretrained_model)   # 有监督不需要修改dropout
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.pooling = pooling
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        
        # out = self.bert(input_ids, attention_mask, token_type_ids)
        out = self.bert(input_ids, attention_mask, token_type_ids, output_hidden_states=True)

        if self.pooling == 'cls':
            return out.last_hidden_state[:, 0]  # [batch, 768]  

In [17]:
model_path = 'bert-base-uncased'
POOLING = 'cls'

CSE_model = SimcseModel(pretrained_model = model_path, pooling = POOLING)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
SAVE_PATH = 'simcse_sup_neg.pt'
CSE_model.load_state_dict(torch.load(SAVE_PATH))
CSE_model.cuda()

SimcseModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


# load data use encoder

In [19]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(D_in, H1)
        self.fc2 = nn.Linear(H1, H2)
        self.fc3 = nn.Linear(H2, D_out)
        self.dropout = nn.Dropout(0.2)
        self.softmax = torch.nn.Softmax(dim=0)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [20]:
def TrainData(df):
    
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 
    features = []
    labels = []

    with torch.no_grad():
        for i in tqdm(range(len(df))):
            question1 = df['question1'][i]
            question2 = df['question2'][i]

            label = df['is_duplicate'][i]

            sen1 = tokenizer(question1, max_length=64, truncation=True, padding='max_length', return_tensors='pt')
            sen2 = tokenizer(question2, max_length=64, truncation=True, padding='max_length', return_tensors='pt')

            input_ids1 = sen1.get('input_ids').cuda()
            attention_mask1 = sen1.get('attention_mask').cuda()
            token_type_ids1 = sen1.get('token_type_ids').cuda()


            output = CSE_model(input_ids1,attention_mask1,token_type_ids1)
            x1 =  output.squeeze(0)

            input_ids2 = sen2.get('input_ids').cuda()
            attention_mask2 = sen2.get('attention_mask').cuda()
            token_type_ids2 = sen2.get('token_type_ids').cuda()

            output = CSE_model(input_ids2,attention_mask2,token_type_ids2)
            x2 =  output.squeeze(0)
            
            #print(x1.size())
            #print(x2.size())

            feature = torch.cat((x1,x2))
            label = label

            features.append(feature.cpu().detach().numpy().tolist())
            labels.append(label)
    
    return features, labels

In [21]:
def Data_Transform(sentence_weights, sentence_labels):
    #sentence_weights, sentence_labels = average_word2vec(df_simple)
    
    X_train, X_test, y_train, y_test = train_test_split(sentence_weights, sentence_labels, test_size=0.2, random_state = None)
    
    dtype1 = torch.LongTensor # y
    dtype2 = torch.FloatTensor # x
    dtype3 = torch.cuda.FloatTensor # x
    dtype4 = torch.cuda.LongTensor # y
    train_data = torch.tensor(X_train).type(dtype3)
    test_data = torch.tensor(X_test).type(dtype3)
    train_label = torch.tensor(y_train).type(dtype4) 
    test_label = torch.tensor(y_test).type(dtype4) 
    
    return train_data, test_data, train_label, test_label

In [22]:
def Data_loader(train_data, test_data, train_label, test_label, batch_size):
    # number of subprocesses to use for data loading
    num_workers = 0
    # how many samples per batch to load
    #batch_size = 20
    # percentage of training set to use as validation
    valid_size = 0.2
    
    num_train = len(train_data)
    indices = list(range(num_train))
    np.random.shuffle(indices)
    split = int(np.floor(valid_size * num_train))
    train_idx, valid_idx = indices[split:], indices[:split]
    
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)
    
    deal_dataset_training = TensorDataset(train_data, train_label)
    deal_dataset_test = TensorDataset(test_data, test_label)
    
    train_loader = torch.utils.data.DataLoader(deal_dataset_training, batch_size=batch_size, 
                                           sampler=train_sampler, num_workers=num_workers)

    valid_loader = torch.utils.data.DataLoader(deal_dataset_training, batch_size=batch_size, 
                                               sampler=valid_sampler, num_workers=num_workers)

    test_loader = torch.utils.data.DataLoader(deal_dataset_test, batch_size=batch_size, 
                                              num_workers=num_workers)
    
    return train_loader, valid_loader, test_loader

# train & prediction

In [23]:
def train(input_model,train_loader,valid_loader, epochs, model_name):
    optimizer = torch.optim.Adam(input_model.parameters(), lr=learning_rate)
    n_epochs = epochs
    valid_loss_min = np.Inf # set initial "min" to infinity
    for epoch in range(n_epochs):
        train_loss = 0.0
        valid_loss = 0.0

        input_model.train() # prep model for training
        for data, target in train_loader:
#             print(data.size())
#             print(target)            
            output = input_model(data)
#             print(output)
            loss = criterion(output, target.cuda())
            optimizer.zero_grad()    
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        input_model.eval() # prep model for evaluation
        for data, target in valid_loader:                
            output = input_model(data)
            loss = criterion(output, target.cuda())
            valid_loss += loss.item()
        train_loss = train_loss/len(train_loader)
        valid_loss = valid_loss/len(valid_loader)
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch+1, 
            train_loss,
            valid_loss
            ))
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
            torch.save(input_model.state_dict(), model_name)
            valid_loss_min = valid_loss

In [24]:
def predict(model, dataloader):
    prediction_list = []
    actual_list = []
    for data, target in dataloader:
        outputs = model(data)
        _, predicted = torch.max(outputs.data, 1)
#         print(predicted)
        prediction_list.append(predicted.tolist())
        actual_list.append(target.tolist())
    return prediction_list[0], actual_list[0]

In [25]:
#Net
D_in, H1, H2, D_out = 1536, 300, 150, 2 

# train
learning_rate = 0.001

batch_size = 250

epochs = 20

validSize = 0.2

criterion = nn.CrossEntropyLoss()

In [26]:
net = Net().cuda()

features, labels = TrainData(df)

train_data, test_data, train_label, test_label = Data_Transform(features, labels)

train_loader, valid_loader, test_loader = Data_loader(train_data, test_data, train_label, test_label, batch_size)

100%|████████████████████████████████████████████████████████████████████████| 404287/404287 [2:14:09<00:00, 50.23it/s]


In [27]:
train(net,train_loader, valid_loader, epochs,"model.pt")

Epoch: 1 	Training Loss: 0.387729 	Validation Loss: 0.343805
Validation loss decreased (inf --> 0.343805).  Saving model ...
Epoch: 2 	Training Loss: 0.305459 	Validation Loss: 0.323345
Validation loss decreased (0.343805 --> 0.323345).  Saving model ...
Epoch: 3 	Training Loss: 0.272277 	Validation Loss: 0.318809
Validation loss decreased (0.323345 --> 0.318809).  Saving model ...
Epoch: 4 	Training Loss: 0.244139 	Validation Loss: 0.321755
Epoch: 5 	Training Loss: 0.217451 	Validation Loss: 0.328134
Epoch: 6 	Training Loss: 0.191638 	Validation Loss: 0.361086
Epoch: 7 	Training Loss: 0.168240 	Validation Loss: 0.378488
Epoch: 8 	Training Loss: 0.147112 	Validation Loss: 0.417059
Epoch: 9 	Training Loss: 0.129633 	Validation Loss: 0.444546
Epoch: 10 	Training Loss: 0.113161 	Validation Loss: 0.487325
Epoch: 11 	Training Loss: 0.101100 	Validation Loss: 0.520849
Epoch: 12 	Training Loss: 0.090546 	Validation Loss: 0.543315
Epoch: 13 	Training Loss: 0.080792 	Validation Loss: 0.584776
E

In [28]:
predictions, labels = predict(net,test_loader)

print(accuracy_score(predictions, labels))

print(precision_score(predictions, labels))

print(f1_score(predictions, labels))

0.852
0.8555555555555555
0.8062827225130891
