## MSDS 631 - Final Project

### Project description

The patent review process involves searching for previously filed patents in the same domain as a new patent application. The search keywords used by legal analysts depend on the context of the patent application. For example, "strong material" could mean "steel" for construction purposes or "denim" for textiles. This Kaggle competition aims to predict a relevance score (five levels from 0 to 1) between a target keyword and a given anchor word in the context of a domain.

### Training data description


### Preprocessing steps


### NLP model selection

### Results


In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from sklearn.model_selection import KFold
from scipy.stats import pearsonr

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

In [None]:
torch.cuda.is_available()

In [None]:
# Use the first GPU when one is available; otherwise fall back to the CPU so
# that creating the model does not fail on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the competition training set from the Kaggle input mount and preview it.
# Later cells read its 'anchor', 'target', 'context' and 'score' columns.
train = pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv')
train.head()

In [None]:
# Load the lookup table that provides the 'code' and 'title' columns used by
# the following cells, and preview it.
code = pd.read_csv('/kaggle/input/titles/titles.csv')
code.head()

In [None]:
code[code['code'] == 'A47']['title']

In [None]:
# Attach the matching title to every training row by joining train.context
# to code.code. Because this is an inner join, any training row whose context
# has no matching code is dropped.
train = train.merge(code, how='inner', left_on='context', right_on='code')

train.head()

In [None]:
import string
def clean_txt(row):
    """Return *row* lower-cased with all ASCII punctuation characters removed."""
    # Mapping every punctuation character to None makes translate() delete it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return row.lower().translate(drop_punctuation)
train['title'] = train['title'].apply(clean_txt)

In [None]:
# Model input text: the anchor and target phrases joined by a single space.
# NOTE(review): the cleaned 'title' column is not part of this text in the
# visible cells - confirm whether the context title was meant to be included.
train['combined_text'] = train['anchor'] + ' ' + train['target']

In [None]:
def change_values(row):
    """Translate a similarity score into its ordinal class index.

    The five score levels 0.00, 0.25, 0.50, 0.75 and 1.00 map to 0, 1, 2, 3
    and 4 respectively. Any other value raises ``KeyError``.
    """
    levels = (0.00, 0.25, 0.50, 0.75, 1.00)
    lookup = {level: position for position, level in enumerate(levels)}
    return lookup[row]
train['score'] = train['score'].apply(change_values)

In [None]:
train.head()

In [None]:
train['score'].value_counts()

### Dataset and dataloader preparation

We will use PyTorch's `Dataset` class to build a custom dataset class that can wrap either the augmented train or the augmented test dataset.

In [None]:
# Longest combined_text measured in space-separated tokens; this is the fixed
# sequence length every sample is padded to. Computed with vectorised pandas
# string methods instead of a row-by-row iloc loop.
token_counts = train['combined_text'].str.split(' ').str.len()
# An empty frame has no maximum, so keep the original default of 0 in that case.
max_length = int(token_counts.max()) if len(token_counts) else 0
print(max_length)

In [None]:
# Vocabulary lookup: token -> integer id.
# Id 0 is reserved for padding: the dataset front-pads with zeros and the
# embedding layers use padding_idx=0, so no real word may share that id.
# The explicit '<pad>' entry keeps len(word_dict) equal to the number of
# embedding rows required (largest id + 1).
vocab = train['combined_text'].str.split(' ', expand=True).stack().unique()
word_dict = {'<pad>': 0}
word_dict.update({word: i for i, word in enumerate(vocab, start=1)})

In [None]:
class PatentDataset(Dataset):
    """Dataset yielding ``(token_ids, score)`` pairs from a patent phrase frame.

    Each row's ``combined_text`` is split on single spaces, mapped to integer
    ids through ``word_dict`` and front-padded with zeros to ``max_length``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the ``combined_text`` and ``score`` columns.
    max_length : int
        Fixed length of every returned id tensor.
    word_dict : dict
        Maps a token (str) to its integer id. Id 0 is treated as padding.
    """

    def __init__(self, df, max_length, word_dict):
        self.df = df
        self.max_len = max_length
        self.word_dict = word_dict

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(x, y)``: a long tensor of ``max_len`` ids and a float score."""
        row = self.df.iloc[idx]
        tokens = row['combined_text'].split(' ')

        # Texts longer than max_len keep only their last max_len tokens;
        # indexing past the front of the tensor would otherwise wrap around.
        if len(tokens) > self.max_len:
            tokens = tokens[len(tokens) - self.max_len:]

        # Tokens missing from the vocabulary fall back to the padding id 0
        # instead of raising KeyError.
        token_ids = [self.word_dict.get(token, 0) for token in tokens]

        # Front padding: real tokens occupy the tail so the recurrent layer
        # sees them last.
        x = torch.zeros(self.max_len, dtype=torch.long)
        if token_ids:
            x[self.max_len - len(token_ids):] = torch.tensor(token_ids, dtype=torch.long)

        y = torch.tensor(row['score']).float()
        return x, y

In [None]:
class RNN(nn.Module):
    """Single-layer GRU regressor over padded token-id sequences.

    Parameters
    ----------
    dict_length : int
        Number of rows in the embedding table (largest token id + 1).
    embedding_size : int
        Dimension of each word embedding.
    hidden_size : int
        Dimension of the GRU hidden state.
    """

    def __init__(self, dict_length, embedding_size, hidden_size):
        super().__init__()

        # Id 0 is the padding id: its embedding stays at zero and is not trained.
        self.word_emb = nn.Embedding(dict_length, embedding_size, padding_idx=0)
        self.rnn = nn.GRU(input_size=embedding_size, hidden_size=hidden_size, batch_first=True)
        self.relu = nn.ReLU()
        self.g = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one score per sequence for a ``(batch, seq_len)`` id tensor."""
        embedded = self.word_emb(x)
        # The GRU returns (output, h_n); h_n[-1] is the final hidden state of
        # the last layer, shape (batch, hidden_size).
        _, h_n = self.rnn(embedded)
        out = self.g(self.relu(h_n[-1]))

        # Drop only the trailing feature axis so that a batch of one sample
        # still yields a 1-D tensor rather than a 0-D scalar.
        return out.squeeze(-1)

In [None]:
class LSTM(nn.Module):
    """Single-layer LSTM regressor over padded token-id sequences.

    Parameters
    ----------
    dict_length : int
        Number of rows in the embedding table (largest token id + 1).
    embedding_size : int
        Dimension of each word embedding.
    hidden_size : int
        Dimension of the LSTM hidden state.
    """

    def __init__(self, dict_length, embedding_size, hidden_size):
        super().__init__()

        # Id 0 is the padding id: its embedding stays at zero and is not trained.
        self.word_emb = nn.Embedding(dict_length, embedding_size, padding_idx=0)
        # batch_first=True because inputs arrive as (batch, seq_len).
        self.lstm = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, batch_first=True)
        self.g = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one score per sequence for a ``(batch, seq_len)`` id tensor."""
        embedded = self.word_emb(x)
        # The LSTM returns (output, (h_n, c_n)); h_n[-1] is the hidden state
        # after the last time step, shape (batch, hidden_size).
        _, (h_n, _) = self.lstm(embedded)
        out = self.g(h_n[-1])

        # Drop only the trailing feature axis so the batch axis always survives.
        return out.squeeze(-1)

In [None]:
def one_pass(model, dataloader, optimizer, lossFun, backwards=True):
    """Run one full pass over ``dataloader`` and return the mean batch loss.

    Parameters
    ----------
    model : torch.nn.Module
        Model to train or evaluate.
    dataloader : iterable of (X, y) batches
        Must support ``len()``; the loss is averaged over the batch count.
    optimizer : torch.optim.Optimizer
        Only used when ``backwards`` is true.
    lossFun : callable
        ``lossFun(y_pred, y)`` returning a scalar tensor.
    backwards : bool, default True
        True performs a training pass (gradient step after every batch);
        False performs an evaluation pass with gradients disabled.

    Returns
    -------
    float
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    if backwards:
        model.train()
    else:
        model.eval()

    # Send each batch to wherever the model lives rather than assuming CUDA.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    total_loss = 0.0
    # Gradient tracking is switched off for evaluation to avoid building
    # autograd graphs that are never used.
    with torch.set_grad_enabled(bool(backwards)):
        for X, y in dataloader:
            if target_device is not None:
                X, y = X.to(target_device), y.to(target_device)

            y_pred = model(X)
            loss = lossFun(y_pred, y)
            total_loss += loss.item()

            if backwards:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

    return total_loss / len(dataloader)

In [None]:
def train_model(model, optimizer, train_dl, valid_dl, num_epochs, loss_fn=None):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Parameters
    ----------
    model : torch.nn.Module
    optimizer : torch.optim.Optimizer
    train_dl, valid_dl : dataloaders yielding (X, y) batches
    num_epochs : int
    loss_fn : callable, optional
        Loss passed to ``one_pass``. When omitted, the module-level
        ``lossFun`` is used, as before.

    Returns
    -------
    (list of float, list of float)
        Per-epoch mean training losses and validation losses.
    """
    if loss_fn is None:
        # Keep the previous behaviour of reading the notebook-level loss.
        loss_fn = lossFun

    train_losses = []
    valid_losses = []

    for epoch in tqdm(range(num_epochs)):

        train_loss = one_pass(model, train_dl, optimizer, loss_fn)
        train_losses.append(train_loss)

        valid_loss = one_pass(model, valid_dl, optimizer, loss_fn, backwards=False)
        valid_losses.append(valid_loss)

        # Report progress every fifth epoch.
        if epoch % 5 == 0:
            print('Epoch: ', epoch)
            print('Train loss: ', train_loss)
            print('Valid loss: ', valid_loss)

    return train_losses, valid_losses

In [None]:
def pearsonr_(x, y, eps=1e-8):
    """
    Negated Pearson correlation, usable as a loss to minimise.

    Follows the computation of `scipy.stats.pearsonr`, but returns ``-r`` so
    that a lower value means a stronger positive correlation.

    Arguments
    ---------
    x : 1D torch.Tensor
    y : 1D torch.Tensor
    eps : float
        Lower bound applied to the denominator. It only takes effect when an
        input has (near-)zero variance, where the result becomes 0 instead of
        NaN.
    Returns
    -------
    loss : 0-D torch.Tensor
        ``-r`` where r is the Pearson correlation coefficient between x and y,
        clamped to [-1, 1].

    Scipy docs ref:
        https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html

    Scipy code ref:
        https://github.com/scipy/scipy/blob/v0.19.0/scipy/stats/stats.py#L2975-L3033
    Example:
        >>> x = np.random.randn(100)
        >>> y = np.random.randn(100)
        >>> sp_corr = scipy.stats.pearsonr(x, y)[0]
        >>> th_corr = -pearsonr_(torch.from_numpy(x), torch.from_numpy(y))
        >>> np.allclose(sp_corr, th_corr)
    """
    mean_x = torch.mean(x)
    mean_y = torch.mean(y)
    xm = x.sub(mean_x)
    ym = y.sub(mean_y)
    r_num = xm.dot(ym)
    # Bound the denominator away from zero: a constant input would otherwise
    # produce 0/0 = NaN.
    r_den = torch.clamp(torch.norm(xm, 2) * torch.norm(ym, 2), min=eps)
    r_val = r_num / r_den
    # Clamp guards against rounding slightly outside [-1, 1]; the sign is
    # flipped so that minimising the result maximises the correlation.
    return -torch.clamp(r_val, min=-1., max=1.)

In [None]:
# Reproducible random hold-out split: each row lands in the training set with
# probability 0.8, the remaining rows form the validation set.
np.random.seed(3)
msk = np.random.rand(len(train)) < 0.8
# reset_index() gives each subset a fresh positional index; without drop=True
# the old index is kept as an extra 'index' column.
train_ds = PatentDataset(train[msk].reset_index(), max_length, word_dict)
valid_ds = PatentDataset(train[~msk].reset_index(), max_length, word_dict)

In [None]:
# Hold-out run: GRU model with 400-dimensional embeddings and hidden state.
model = RNN(len(word_dict), 400, 400).to(device)
# Loss is the negated Pearson correlation; train_model reads this
# module-level name.
lossFun = pearsonr_#nn.CrossEntropyLoss()

train_dl = DataLoader(train_ds, batch_size=1000, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=1000, shuffle=False)

# Two-stage schedule: 31 epochs at lr=0.01 with a small weight decay, then
# 16 epochs at lr=0.0002 with a fresh Adam optimizer (optimizer state is reset).
optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay=5e-6)
train_model(model, optimizer, train_dl, valid_dl, 31)
optimizer = optim.Adam(model.parameters(), lr = 0.0002)
train_model(model, optimizer, train_dl, valid_dl, 16)

In [None]:
# 5-fold cross-validation: train a fresh GRU model on each fold and record the
# Pearson correlation between its predictions and the held-out scores.
kf = KFold(n_splits=5)
rs = []
for train_index, test_index in kf.split(train):
    train_ds = PatentDataset(train.iloc[train_index].reset_index(), max_length, word_dict)
    valid_ds = PatentDataset(train.iloc[test_index].reset_index(), max_length, word_dict)
    train_dl = DataLoader(train_ds, batch_size=1000, shuffle=True)
    # The whole validation fold is served as a single batch.
    valid_dl = DataLoader(valid_ds, batch_size=len(valid_ds), shuffle=False)

    # Two-stage schedule: 31 epochs at lr=0.01, then 16 epochs at lr=0.0002.
    model = RNN(len(word_dict), 400, 400).to(device)
    optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay=5e-6)
    train_model(model, optimizer, train_dl, valid_dl, 31)
    optimizer = optim.Adam(model.parameters(), lr = 0.0002)
    train_model(model, optimizer, train_dl, valid_dl, 16)

    # Predict in eval mode with gradients disabled so no autograd graph is
    # kept for the fold-sized batch; concatenating covers every batch the
    # loader yields instead of only the first one.
    model.eval()
    with torch.no_grad():
        fold_preds = [model(x.to(device)).reshape(-1) for x, _ in valid_dl]
    preds = torch.cat(fold_preds).cpu().numpy()
    y = train.iloc[test_index]['score'].values
    r, _ = pearsonr(preds, y)
    rs.append(r)

In [None]:
print(rs)

In [None]:
np.mean(rs)

In [None]:
print([round(r, 4) for r in rs])