In [2]:
from google.colab import drive
drive.mount('/content/drive') #mounts google colab to my drive. Can be commented out when running locally

%cd /content/drive/MyDrive/"Colab Notebooks"/"us-patent-phrase-to-phrase-matching"

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/us-patent-phrase-to-phrase-matching


In [45]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, SimpleRNN, Dense, LSTM, GRU, Flatten, Dropout, Input
from keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
import keras
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from scipy.stats import pearsonr

In [84]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [85]:
def preprocess(df):
    data = df.to_numpy()
    all_ids = data[:, 0]
    sentences = [data[i, 1].lower() + " " + data[i, 2].lower() + " " + data[i, 3].lower() for i in range(len(data))]
    classes = {0: 0, .25: 1, .5: 2, .75: 3, 1: 4}
    y_train = None
    if data.shape[1] == 5:
        y_train = [classes[val] for val in data[:, 4]]
        y_train = tf.one_hot(y_train, 5)
    return sentences, y_train

In [86]:
X_train, y_train = preprocess(train_df)
X_test, _ = preprocess(test_df)

In [87]:
train_df

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00
...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50
36471,756ec035e694722b,wood article,wooden material,B44,0.75


In [50]:
max_train_len = max([len(sentence) for sentence in X_train])
max_test_len = max([len(sentence) for sentence in X_test])
maxlen = max(max_train_len, max_test_len)

In [90]:
tk = Tokenizer()
tk.fit_on_texts(X_train)
X_train = tk.texts_to_sequences(X_train)
X_test = tk.texts_to_sequences(X_test)

In [91]:
EMBEDDING_SIZE = 128
MAX_SENTENCE_LEN = maxlen
vocab_size = len(tk.word_index)
print(vocab_size)

9036


In [92]:
X_train = sequence.pad_sequences(X_train, maxlen=MAX_SENTENCE_LEN, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen=MAX_SENTENCE_LEN, padding='post')

In [93]:
model = Sequential()
model.add(Embedding(vocab_size + 1, EMBEDDING_SIZE, input_length=MAX_SENTENCE_LEN))
model.add(Flatten())
# model.add(Dense(32))
# model.add(Dropout(.3))
model.add(Dense(5, activation='softmax'))
model.compile(loss='binary_crossentropy', metrics=['accuracy'])

In [94]:
print(X_train.shape)

(36473, 111)


In [95]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [96]:
def one_hot(y):
    arr = np.zeros((y.shape[0], np.max(y) + 1))
    for i in range(arr.shape[0]):
        arr[i, y[i]] = 1
    return arr

In [97]:
y_train_temp = np.argmax(y_train, axis=1)
X_train_real, X_test_real, y_train_vec, y_test_vec = train_test_split(X_train, y_train_temp, test_size=.2)
y_train_split = one_hot(y_train_vec)
model.fit(X_train_real, y_train_split, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc0912ca550>

In [100]:
(X_test_real)

array([[ 437,   76,  153, ...,    0,    0,    0],
       [ 487,  147,  172, ...,    0,    0,    0],
       [1077,  403,  858, ...,    0,    0,    0],
       ...,
       [1400,  256, 1044, ...,    0,    0,    0],
       [ 624,  515,  624, ...,    0,    0,    0],
       [1122, 5685,   43, ...,    0,    0,    0]], dtype=int32)

In [59]:
pred = model.predict(X_test_real)
res = np.argmax(pred, axis=1)
# real = np.argmax(y_test_real, axis=1)
print(accuracy_score(res, y_test_vec))

0.49856065798492116


In [60]:
r, _ = pearsonr(y_test_vec/4, np.sum(np.arange(0, 1.25, .25) * pred, axis=1))
print(r)

0.4358318007584639


In [61]:
model.save("model1")

INFO:tensorflow:Assets written to: model1/assets


In [62]:
from sklearn.feature_extraction.text import CountVectorizer

In [63]:
train_df['sentence'] = train_df['anchor'].str.lower() + " " + train_df['target'].str.lower() + " " + train_df['context'].str.lower()

In [64]:
corpus = train_df['sentence'].to_numpy()

In [65]:
print(corpus[0:5])

['abatement abatement of pollution a47' 'abatement act of abating a47'
 'abatement active catalyst a47' 'abatement eliminating process a47'
 'abatement forest region a47']


In [66]:
vectorizer = CountVectorizer()

In [67]:
X_train = vectorizer.fit_transform(corpus).toarray()

In [68]:
model_2 = Sequential()

In [69]:
model_2.add(Dense(512))
model_2.add(Dense(64))
model_2.add(Dense(5, activation='softmax'))
model_2.compile(loss='binary_crossentropy', metrics=['accuracy'])

In [70]:
y_train_temp = np.argmax(y_train, axis=1)
X_train_real, X_test_real, y_train_vec, y_test_vec = train_test_split(X_train, y_train_temp, test_size=.2)
y_train_split = one_hot(y_train_vec)
model_2.fit(X_train_real, y_train_split, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fc0a1083c50>

In [71]:
pred = model_2.predict(X_test_real)
res = np.argmax(pred, axis=1)
print(accuracy_score(res, y_test_vec))

0.42590815627141876


In [72]:
model_2.save("model2")

INFO:tensorflow:Assets written to: model2/assets


In [31]:
y_labels = y_test_vec/4

In [32]:
print(np.arange(0, 1.25, .25))
real_accuracy = np.sum(np.arange(0, 1.25, .25) * pred, axis=1)
r, p_val = pearsonr(y_labels, real_accuracy)

[0.   0.25 0.5  0.75 1.  ]


In [33]:
import matplotlib.pyplot as plt

In [101]:
train_df['anchor_context'] = train_df['anchor'].str.lower() + " " + train_df['context'].str.lower()
train_df['target_context'] = train_df['target'].str.lower() + " " + train_df['context'].str.lower()

In [102]:
anchor_context = train_df['anchor_context'].to_numpy()
target_context = train_df['target_context'].to_numpy()
max_anchor_len = max([len(sentence) for sentence in anchor_context])
max_target_len = max([len(sentence) for sentence in target_context])
maxlen = max(max_anchor_len, max_target_len)
print(maxlen)

102


In [103]:
print(anchor_context.shape)

(36473,)


In [104]:
tk = Tokenizer()
tk.fit_on_texts(np.concatenate((anchor_context, target_context), axis=0))
anchor_token = tk.texts_to_sequences(anchor_context)
target_token = tk.texts_to_sequences(target_context)

In [105]:
from nltk.stem import PorterStemmer
from collections import defaultdict

In [106]:
def count_num_duplicate_stems(arr, threshold=2):
    _, counts = np.unique(arr, return_counts=True)
    return np.count_nonzero(counts >= threshold)
        

In [107]:
X_train_anchor = sequence.pad_sequences(anchor_token, maxlen=maxlen, padding='post')
X_train_target = sequence.pad_sequences(target_token, maxlen=maxlen, padding='post')
ps = PorterStemmer()
sentences = (train_df['anchor'] + " " + train_df['target']).to_numpy()
all_stems = [[ps.stem(w) for w in sentence.split()] for sentence in sentences]
all_counts = np.array([count_num_duplicate_stems(stems) for stems in all_stems]).reshape(-1, 1)

In [108]:
from keras import layers
anchor_input = Input(shape=(maxlen,))
target_input = Input(shape=(maxlen,))
num_dup_stems = Input(shape=(1,))
embedding = Embedding(vocab_size + 1, EMBEDDING_SIZE, input_length=maxlen)
anchor_embedding = embedding(anchor_input)
target_embedding = embedding(target_input)
final_layer = layers.concatenate([Flatten()(anchor_embedding), Flatten()(target_embedding), num_dup_stems])
first_dense = Dense(512, activation='relu')(final_layer)
first_dense = Dropout(rate=.5)(first_dense)
output = Dense(5, activation='softmax')(first_dense)
model = Model([anchor_input, target_input, num_dup_stems], output)
model.compile(loss='binary_crossentropy', metrics=['accuracy', 'AUC'])
indices = np.random.choice(X_train_anchor.shape[0], X_train_anchor.shape[0]//5)
mask = np.ones(X_train_anchor.shape[0], dtype=bool)
mask[indices] = False
X_A_train = X_train_anchor[mask, ...]
X_T_train = X_train_target[mask, ...]
X_num_train = all_counts[mask, ...]
X_A_test = X_train_anchor[~mask]
X_T_test = X_train_target[~mask]
X_num_test = all_counts[~mask]
y_t = y_train[mask]
y_te = y_train[~mask]
history = model.fit([X_A_train, X_T_train, X_num_train], y_t, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [109]:
pred = model.predict([X_A_test, X_T_test, X_num_test])
res = np.argmax(pred, axis=1)
classes = np.argmax(y_te, axis=1)
print(accuracy_score(res, classes))

0.5705391040242976


In [110]:
real_accuracy = np.sum(np.arange(0, 1.25, .25) * pred, axis=1)
r, p_val = pearsonr(classes/4, real_accuracy)
print(r)

0.6103685615860248


In [113]:
all_counts

array([[1],
       [1],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [44]:
model.save("BOW_model")

INFO:tensorflow:Assets written to: BOW_model/assets


In [74]:
print(X_T_test)

[[ 964  618   29 ...    0    0    0]
 [1360 1275  343 ...    0    0    0]
 [5420   61   29 ...    0    0    0]
 ...
 [ 600  219    0 ...    0    0    0]
 [ 600   61  219 ...    0    0    0]
 [ 600  107  219 ...    0    0    0]]


['abatement abatement of pollution' 'abatement act of abating'
 'abatement active catalyst' 'abatement eliminating process'
 'abatement forest region']
[['abat', 'abat', 'of', 'pollut'], ['abat', 'act', 'of', 'abat'], ['abat', 'activ', 'catalyst'], ['abat', 'elimin', 'process'], ['abat', 'forest', 'region']]
