In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# install needed lib
!pip install sentence-transformers

In [None]:
# imports
from sentence_transformers import SentenceTransformer, util
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from torch.nn.functional import normalize
from sklearn import preprocessing
from torch import nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

In [None]:
# Load the training table from the competition input folder and preview it.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv',
)
df.head(5)

In [None]:
# Build a `section` column: take the first character of `context` and swap that
# letter for the descriptive title listed in `changedict`.
changedict = dict(
    A="Human Necessities",
    B="Performing Operations; Transporting",
    C="Chemistry; Metallurgy",
    D="Textiles; Paper",
    E="Fixed Constructions",
    F="Mechanical Engineering; Lighting; Heating; Weapons; Blasting",
    G="Physics",
    H="Electricity",
)

df["section"] = df["context"].str.get(0)
# Letters missing from `changedict` are left as they are by `replace`.
df = df.replace(to_replace={"section": changedict})

In [None]:
# function to create and load inputs and outputs for models
def LoadData(df, model_name):
    """Encode the text columns of ``df`` and split them into train and test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``anchor``, ``target``, ``section`` and ``score``.
    model_name : str
        Model name or path handed to ``SentenceTransformer``.

    Returns
    -------
    dict
        ``{"train": [anchor_emb, target_emb, section_emb, scores],
        "test": [anchor_emb, target_emb, section_emb, scores]}``.
        The three embeddings are 2-D tensors with one normalized row per sample
        (``torch.nn.functional.normalize`` with its default arguments) and
        ``scores`` is a plain list taken from ``df.score``.  20% of the rows go
        to the test split, shuffled with ``random_state=42``.
    """
    # load the requested sentence-embedding model
    model = SentenceTransformer(model_name)

    # encode every text column, then normalize the embedding rows
    embedding1 = normalize(model.encode(df.anchor.values, convert_to_tensor=True))
    embedding2 = normalize(model.encode(df.target.values, convert_to_tensor=True))
    embeddingc = normalize(model.encode(df.section.values, convert_to_tensor=True))
    y = df.score.to_list()

    # Split row positions instead of a Python list of per-row tensor triples.
    # The shuffle depends only on the number of rows and `random_state`, so the
    # partition matches splitting the rows themselves, without the per-row
    # Python loops and the re-stacking afterwards.
    train_idx, test_idx = train_test_split(
        np.arange(len(embedding1)), test_size=0.2, random_state=42, shuffle=True
    )

    def _take(row_idx):
        """Gather the embeddings and scores belonging to the rows in ``row_idx``."""
        rows = torch.as_tensor(row_idx, dtype=torch.long, device=embedding1.device)
        return [
            embedding1[rows],
            embedding2[rows],
            embeddingc[rows],
            [y[i] for i in row_idx],
        ]

    return {"train": _take(train_idx), "test": _take(test_idx)}

In [None]:
# Encode the dataset with the selected embedding model, then unpack both splits
# into the variables used for training and evaluation.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
#model_name = "anferico/bert-for-patents"
d = LoadData(df, model_name)

# each split is stored as [anchor embeddings, target embeddings, section embeddings, scores]
x_train1, x_train2, x_trainc, y_train = d["train"]
x_test1, x_test2, x_testc, y_test = d["test"]

In [None]:
class SquareNet1(nn.Module):
    """Bilinear scorer: ``sigmoid(sum(x * W(y), dim=1) + b)``.

    Parameters
    ----------
    dim : int
        Width of each input embedding.
    device : str or torch.device, optional
        Device that holds the parameters.  When omitted, ``"cuda"`` is used if
        CUDA is available and ``"cpu"`` otherwise, so the model can also be
        built on machines without a GPU.
    """

    def __init__(self, dim, device=None):
        super().__init__()
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.W = nn.Linear(dim, dim, bias=False).to(device)
        self.b = nn.Parameter(torch.randn(1, device=device))

    def forward(self, x, y):
        """Return one sigmoid-squashed score per row of ``x`` and ``y`` (``(batch, dim)`` each)."""
        z = torch.sum(x * self.W(y), dim=1) + self.b
        return torch.sigmoid(z)

In [None]:
class SquareNet2(nn.Module):
    """Weighted sum of three bilinear terms over anchor ``x``, target ``y`` and context ``c``.

    The score is ``sigmoid(Bxy*<x, Wxy(y)> + Bxc*<x, Wxc(c)> + Byc*<y, Wyc(c)> + b)``
    where ``<., .>`` is the row-wise dot product.

    Parameters
    ----------
    dim : int
        Width of each input embedding.
    device : str or torch.device, optional
        Device that holds the parameters.  When omitted, ``"cuda"`` is used if
        CUDA is available and ``"cpu"`` otherwise, so the model can also be
        built on machines without a GPU.
    """

    def __init__(self, dim, device=None):
        super().__init__()
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        # one bias-free linear map per pair of inputs
        self.Wxy = nn.Linear(dim, dim, bias=False).to(device)
        self.Wxc = nn.Linear(dim, dim, bias=False).to(device)
        self.Wyc = nn.Linear(dim, dim, bias=False).to(device)
        # learnable scalar weight for each pairwise term, plus a shared offset
        self.Bxy = nn.Parameter(torch.randn(1, device=device))
        self.Bxc = nn.Parameter(torch.randn(1, device=device))
        self.Byc = nn.Parameter(torch.randn(1, device=device))
        self.b = nn.Parameter(torch.randn(1, device=device))

    def forward(self, x, y, c):
        """Return one sigmoid-squashed score per row of the ``(batch, dim)`` inputs."""
        xy = torch.sum(x * self.Wxy(y), dim=1)
        xc = torch.sum(x * self.Wxc(c), dim=1)
        yc = torch.sum(y * self.Wyc(c), dim=1)
        z = self.Bxy * xy + self.Bxc * xc + self.Byc * yc + self.b
        return torch.sigmoid(z)

In [None]:
class SquareNet3(nn.Module):
    """Unweighted sum of three bilinear terms over anchor ``x``, target ``y`` and context ``c``.

    The score is ``sigmoid(<x, Wxy(y)> + <x, Wxc(c)> + <y, Wyc(c)> + b)`` where
    ``<., .>`` is the row-wise dot product.

    Parameters
    ----------
    dim : int
        Width of each input embedding.
    device : str or torch.device, optional
        Device that holds the parameters.  When omitted, ``"cuda"`` is used if
        CUDA is available and ``"cpu"`` otherwise, so the model can also be
        built on machines without a GPU.
    """

    def __init__(self, dim, device=None):
        super().__init__()
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        # one bias-free linear map per pair of inputs
        self.Wxy = nn.Linear(dim, dim, bias=False).to(device)
        self.Wxc = nn.Linear(dim, dim, bias=False).to(device)
        self.Wyc = nn.Linear(dim, dim, bias=False).to(device)
        self.b = nn.Parameter(torch.randn(1, device=device))

    def forward(self, x, y, c):
        """Return one sigmoid-squashed score per row of the ``(batch, dim)`` inputs."""
        xy = torch.sum(x * self.Wxy(y), dim=1)
        xc = torch.sum(x * self.Wxc(c), dim=1)
        yc = torch.sum(y * self.Wyc(c), dim=1)
        z = xy + xc + yc + self.b
        return torch.sigmoid(z)

In [None]:
class SquareNet4(nn.Module):
    """Two-layer ReLU network on ``[x, y, c]`` followed by a quadratic read-out.

    The concatenated inputs pass through ``S1`` and ``S2`` (each followed by a
    ReLU); the hidden vector ``z`` is then scored as
    ``sigmoid(sum(z * W(z), dim=1) + b)``.

    Parameters
    ----------
    dim : int
        Width of each of the three input embeddings; every layer is ``3*dim`` wide.
    device : str or torch.device, optional
        Device that holds the parameters.  When omitted, ``"cuda"`` is used if
        CUDA is available and ``"cpu"`` otherwise, so the model can also be
        built on machines without a GPU.
    """

    def __init__(self, dim, device=None):
        super().__init__()
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        h1, h2 = 3*dim, 3*dim
        self.S1 = nn.Linear(3*dim, h1).to(device)
        self.S2 = nn.Linear(h1, h2).to(device)
        self.W = nn.Linear(h2, h2).to(device)
        self.b = nn.Parameter(torch.randn(1, device=device))

    def forward(self, x, y, c):
        """Return one sigmoid-squashed score per row of the ``(batch, dim)`` inputs."""
        z = torch.cat([x, y, c], dim=1)
        z = torch.relu(self.S1(z))
        z = torch.relu(self.S2(z))
        z = torch.sum(z * self.W(z), dim=1) + self.b
        return torch.sigmoid(z)

In [None]:
class SquareNet4v2(nn.Module):
    """Dropout-regularised variant of the two-layer network with quadratic read-out.

    Dropout (p=0.5) is applied to the concatenated inputs and again to the first
    hidden activation.  The final hidden vector ``z`` is scored as
    ``sigmoid(sum(z * W(z), dim=1) + b)``.

    Parameters
    ----------
    dim : int
        Width of each of the three input embeddings; hidden layers are ``4*dim`` wide.
    device : str or torch.device, optional
        Device that holds the parameters.  When omitted, ``"cuda"`` is used if
        CUDA is available and ``"cpu"`` otherwise, so the model can also be
        built on machines without a GPU.
    """

    def __init__(self, dim, device=None):
        super().__init__()
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        h1, h2 = 4*dim, 4*dim
        self.S1 = nn.Linear(3*dim, h1).to(device)
        self.do1 = nn.Dropout(0.5)
        self.S2 = nn.Linear(h1, h2).to(device)
        self.do2 = nn.Dropout(0.5)
        self.W = nn.Linear(h2, h2).to(device)
        self.b = nn.Parameter(torch.randn(1, device=device))

    def forward(self, x, y, c):
        """Return one sigmoid-squashed score per row of the ``(batch, dim)`` inputs."""
        z = torch.cat([x, y, c], dim=1)
        z = self.do1(z)  # dropout on the raw concatenated embeddings
        z = torch.relu(self.S1(z))
        z = self.do2(z)  # dropout on the first hidden activation
        z = torch.relu(self.S2(z))
        z = torch.sum(z * self.W(z), dim=1) + self.b
        return torch.sigmoid(z)

In [None]:
# Take the input width from the encoded data rather than a hard-coded number, so
# changing `model_name` needs no manual edit here (the original note listed 384
# and 1024 as the widths in use).
net = SquareNet4v2(x_train1.shape[1])

In [None]:
# Objective: mean squared error between the predicted and the reference score.
loss = nn.MSELoss()
# Adam is the optimizer in use; the SGD configuration is kept for reference.
#optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9)
optimizer = optim.Adam(params=net.parameters(), lr=1e-4)

In [None]:
# Per-epoch loss histories (filled by the training loop) and training hyper-parameters.
train_loss_arr, test_loss_arr = [], []
batch_size, epoch_num = 100, 50

In [None]:
# Mini-batch training. After every epoch the loss on the held-out split is
# measured too, and both losses are appended to the history lists.
device = x_train1.device  # work on whatever device the embeddings live on
# The targets never change, so build their tensors once instead of every epoch.
y_train_tensor = torch.tensor(y_train, device=device)
y_test_tensor = torch.tensor(y_test, device=device)
numB = len(x_train1) // batch_size  # full batches per epoch; a trailing partial batch is skipped

net.train()

for epoch in range(epoch_num):
    sumL = 0
    # draw a fresh random order of the training rows for this epoch
    idx = torch.randperm(len(x_train1)).to(device)
    x_train1_select = x_train1[idx]
    x_train2_select = x_train2[idx]
    x_trainc_select = x_trainc[idx]
    y_train_select = y_train_tensor[idx]

    for i in range(0, numB * batch_size, batch_size):
        x1b = x_train1_select[i:i+batch_size]
        x2b = x_train2_select[i:i+batch_size]
        xcb = x_trainc_select[i:i+batch_size]
        yb = y_train_select[i:i+batch_size]

        y_train_result = net(x1b, x2b, xcb)
        L = loss(y_train_result, yb)

        optimizer.zero_grad()
        L.backward()
        optimizer.step()
        sumL += L.item()

    # Evaluate in evaluation mode and without recording an autograd graph:
    # the test loss is only logged, never back-propagated.
    net.eval()
    with torch.no_grad():
        test_result = net(x_test1, x_test2, x_testc)
        L_test = loss(test_result, y_test_tensor)
    net.train()

    # guard against fewer training rows than one batch (numB == 0)
    train_loss = sumL / numB if numB else float("nan")
    test_loss = L_test.item()
    train_loss_arr.append(train_loss)
    test_loss_arr.append(test_loss)

    if (epoch+1) % 5 == 0:
        print("Epoch ["+str(epoch+1)+"]\t", "Train:", round(train_loss, 10), "\tTest:", round(test_loss, 10))

In [None]:
# Plot the per-epoch training and test MSE collected by the training loop.
fig, ax = plt.subplots(figsize=(12, 8), dpi=80)
ax.plot(train_loss_arr, label="Train")
ax.plot(test_loss_arr, label="Test")
ax.set_xlabel("Epoch")
ax.set_ylabel("MSE")
ax.set_title("Training and test loss per epoch")
ax.legend()  # the line labels are only drawn once a legend is requested
plt.show()

In [None]:
# Final check: Pearson correlation between the predictions and the reference
# scores of the held-out split.
net = net.train(False)
with torch.no_grad():  # inference only, so no autograd graph is needed
    result = net(x_test1, x_test2, x_testc)
np.corrcoef(result.cpu().numpy(), y_test)[0][1]