In [None]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.utils import shuffle

## Text Preprocessing

In [None]:
# Competition CSVs from the Kaggle input directory.
path = '../input/us-patent-phrase-to-phrase-matching/train.csv'
path_test = '../input/us-patent-phrase-to-phrase-matching/test.csv'
df = pd.read_csv(path)
df_test = pd.read_csv(path_test)

# Keep the test ids for the submission file before the column is dropped.
test_id = df_test['id']

# Only the anchor/target phrases (and the training score) are used downstream.
df = df.drop(columns=['id', 'context'])
df_test = df_test.drop(columns=['id', 'context'])

# Shuffle with a fixed seed: the train/validation split made later is purely
# positional, so an unseeded shuffle would change it on every run.
df = shuffle(df, random_state=42)
df = df.reset_index(drop=True)

In [None]:
# Sanity check: (rows, columns) of the test frame after 'id' and 'context' were dropped.
df_test.shape

In [None]:
# Pull the two phrase columns and the target score out of the training frame.
x_data_1, x_data_2, score = (df[column] for column in ('anchor', 'target', 'score'))

## Generate text tokens

In [None]:
# Join every anchor/target pair into one string so that a single vocabulary
# can be fitted over all phrases from both the training and the test data.
x_combined = x_data_1 + " " + x_data_2
test_combined = df_test['anchor'] + ' ' + df_test['target']

# Test rows first, then training rows, stacked into one Series.
df_tokens = pd.concat([test_combined, x_combined], axis=0)
df_tokens.shape

In [None]:
# Build the word -> integer-index vocabulary from the combined train + test phrases.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_tokens)

In [None]:
# Convert each training phrase into its list of vocabulary indices.
anchor_tokenized, target_tokenized = (
    tokenizer.texts_to_sequences(phrases) for phrases in (x_data_1, x_data_2)
)

In [None]:
# Bring every sequence to a fixed length: 7 tokens per anchor, 17 per target.
padded_anchor = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=anchor_tokenized,
    maxlen=7,
)
padded_target = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=target_tokenized,
    maxlen=17,
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each distinct score value to an integer class id. The fitted encoder
# stays in LE so that predicted class ids can be decoded back to scores.
LE = LabelEncoder().fit(score)
y_score = LE.transform(score)

## Build Model

In [None]:
class PositionalEmbedding(keras.layers.Layer):
    """Token embedding plus a learned embedding of each token's position.

    Args:
        vocab_size: Number of rows in the token embedding table.
        output_dim: Width of both embedding tables.
        input_dim: Number of rows in the position embedding table, i.e. the
            longest sequence length this layer can embed.
    """

    def __init__(self, vocab_size, output_dim, input_dim):
        super().__init__()
        # `input_length` is deliberately not passed to Embedding: Keras 3
        # deprecates the argument and it has no effect on the lookup itself.
        self.word_embedding = layers.Embedding(vocab_size, output_dim=output_dim)
        self.positional_embedding = layers.Embedding(input_dim, output_dim)

    def call(self, inputs):
        """Embed the token ids in `inputs` and add the position embeddings."""
        # Positions 0 .. seq_len-1, where seq_len is the size of the last axis.
        position_indices = tf.range(tf.shape(inputs)[-1])
        embedded_words = self.word_embedding(inputs)
        embedded_indices = self.positional_embedding(position_indices)
        # The (seq_len, output_dim) position table broadcasts over the
        # leading axes of the token embeddings.
        return embedded_words + embedded_indices

In [None]:
class Transformer(keras.layers.Layer):
    """Single encoder block: self-attention followed by a feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    Args:
        num_heads: Number of attention heads.
        embed_dim: Used as the per-head `key_dim` of the attention layer and
            as the output width of the feed-forward network.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate shared by both dropout layers.
    """

    def __init__(self, num_heads, embed_dim, ff_dim, rate=0.1):
        super().__init__()
        self.att = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`.

        Args:
            inputs: Tensor used as both query and value of the self-attention.
            training: Forwarded to the attention and dropout layers. Defaults
                to None so the layer can be called without it, in which case
                Keras decides from the calling context.

        Returns:
            The normalized output of the feed-forward sub-layer.
        """
        # Self-attention sub-layer with residual connection.
        out1 = self.att(inputs, inputs, training=training)
        out1 = self.dropout1(out1, training=training)
        out1 = self.layernorm1(inputs + out1)

        # Feed-forward sub-layer with residual connection. The sum below needs
        # the width of `inputs` to be compatible with `embed_dim`.
        out2 = self.ffn(out1)
        out2 = self.dropout2(out2, training=training)
        output = self.layernorm2(out1 + out2)

        return output

In [None]:
class AutoEncoderModel(keras.Model):
    """Two-branch classifier over (anchor, target) token-id sequences.

    Each input is embedded by its own PositionalEmbedding, passed through its
    own Transformer block and average-pooled over the sequence axis. The two
    pooled vectors are concatenated and fed through a dense stack that ends
    in a 5-way softmax.

    Args:
        vocab_size: Size of the token embedding tables.
        num_heads: Number of attention heads in each Transformer block.
        embed_dim: `embed_dim` of each Transformer block.
        ff_dim: Feed-forward hidden width of each Transformer block.
        output_dim: Embedding width of each PositionalEmbedding.
        input_dim_1: Maximum anchor sequence length.
        input_dim_2: Maximum target sequence length.
    """

    def __init__(self, vocab_size, num_heads, embed_dim, ff_dim, output_dim, input_dim_1, input_dim_2):
        super().__init__()
        # Anchor branch.
        self.embed_layer1 = PositionalEmbedding(vocab_size, output_dim, input_dim_1)
        self.att1 = Transformer(num_heads, embed_dim, ff_dim)
        self.global_avg1 = layers.GlobalAveragePooling1D()

        # Target branch.
        self.embed_layer2 = PositionalEmbedding(vocab_size, output_dim, input_dim_2)
        self.att2 = Transformer(num_heads, embed_dim, ff_dim)
        self.global_avg2 = layers.GlobalAveragePooling1D()

        # Created once here instead of being rebuilt on every forward pass.
        self.concat = layers.Concatenate(axis=1)

        # Classification head. dense4 and dense5 are given no activation.
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(64, activation='relu')
        self.dense3 = layers.Dense(64, activation='relu')
        self.dense4 = layers.Dense(32)
        self.dense5 = layers.Dense(16)
        self.drop_out_clf = layers.Dropout(rate=0.2)
        self.dense_clf = layers.Dense(5, activation='softmax')

    def call(self, inputs, training=None):
        """Return class probabilities for a batch of (anchor, target) pairs.

        Args:
            inputs: Pair `(anchor, target)` of padded token-id tensors.
            training: Forwarded to the Transformer blocks and the dropout
                layer; None lets Keras decide from the calling context.

        Returns:
            Tensor with 5 softmax probabilities per example.
        """
        anchor, target = inputs

        out_anchor = self.embed_layer1(anchor)
        out_anchor = self.att1(out_anchor, training=training)
        out_anchor = self.global_avg1(out_anchor)

        out_target = self.embed_layer2(target)
        out_target = self.att2(out_target, training=training)
        out_target = self.global_avg2(out_target)

        output = self.concat([out_anchor, out_target])
        output = self.dense1(output)
        output = self.dense2(output)
        output = self.dense3(output)
        output = self.dense4(output)
        output = self.dense5(output)
        output = self.drop_out_clf(output, training=training)
        output = self.dense_clf(output)
        return output

In [None]:
# Tokenizer word indices start at 1 (index 0 is reserved and is also the
# padding value), so the embedding table needs len(word_index) + 1 rows.
# Without the +1 the highest word index falls outside the table.
vocab_size = len(tokenizer.word_index) + 1
output_dim = 32   # embedding width
input_dim_1 = 7   # padded anchor length
input_dim_2 = 17  # padded target length
num_heads = 8
embed_dim = 32    # must stay compatible with output_dim (residual sums in the Transformer block)
ff_dim = 256

In [None]:
# Instantiate the two-branch model from the hyperparameters defined above.
model = AutoEncoderModel(
    vocab_size=vocab_size,
    num_heads=num_heads,
    embed_dim=embed_dim,
    ff_dim=ff_dim,
    output_dim=output_dim,
    input_dim_1=input_dim_1,
    input_dim_2=input_dim_2,
)

In [None]:
# Integer class labels + softmax output -> sparse categorical cross-entropy.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Positional hold-out split: the first 33000 rows train, the remainder validates.
x_anchor, anchor_val = padded_anchor[:33000], padded_anchor[33000:]
x_target, target_val = padded_target[:33000], padded_target[33000:]
y_data, y_val = y_score[:33000], y_score[33000:]

In [None]:
# Stop on the held-out loss rather than the training loss: training loss
# keeps falling while the model overfits, so it is a poor stopping signal.
# restore_best_weights rolls the model back to its best validation epoch.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
history = model.fit(
    [x_anchor, x_target],
    y_data,
    validation_data=([anchor_val, target_val], y_val),
    epochs=100,
    batch_size=128,
    callbacks=[callback],
)

## Model Evaluation

In [None]:
# Loss and accuracy on the held-out rows (index 33000 onward).
model.evaluate([anchor_val, target_val], y_val)

In [None]:
# Print the layer/parameter overview of the trained model.
model.summary()

In [None]:
# Predict the first 20 validation pairs and decode them back to score values.
pre = model.predict([anchor_val[:20], target_val[:20]])
# argmax over the class axis picks the most probable class for every row at
# once, replacing the per-row Python loop.
predicted = LE.inverse_transform(np.argmax(pre, axis=1))
predicted

In [None]:
# Ground-truth scores for the same 20 validation rows, decoded from class ids.
True_values = LE.inverse_transform(y_val[:20])
True_values

In [None]:
# Convert each test phrase into its list of vocabulary indices.
anchor_test, target_test = (
    tokenizer.texts_to_sequences(df_test[column]) for column in ('anchor', 'target')
)

In [None]:
# Pad the test sequences to the same fixed lengths used for training (7 and 17).
padded_anchor_test = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=anchor_test,
    maxlen=7,
)
padded_target_test = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=target_test,
    maxlen=17,
)

In [None]:
# Class probabilities for every test pair.
test_predicted = model.predict([padded_anchor_test, padded_target_test])

In [None]:
# Most probable class id per test row, computed in one vectorized call
# instead of a Python loop over the rows.
predicted_arr = np.argmax(test_predicted, axis=1)

In [None]:
# Decode class ids back to score values. The ids are derived from
# test_predicted rather than from predicted_arr itself, so re-running this
# cell does not feed already-decoded scores back into the encoder.
predicted_arr = LE.inverse_transform(np.argmax(test_predicted, axis=1))

In [None]:
# Convert ids and predictions to NumPy arrays and print both shapes so a
# length mismatch is visible before the submission frame is built.
test_id_1 = np.array(test_id)
predicted_arr_1 = np.array(predicted_arr)
print(test_id_1.shape, predicted_arr_1.shape)

In [None]:
# Two-column submission frame: test id and its predicted score.
Submission = pd.DataFrame(
    data={'id': test_id_1, 'score': predicted_arr_1},
    columns=['id', 'score'],
)

In [None]:
# Write the submission file into the working directory, without the index column.
filename = 'submission.csv'
Submission.to_csv(path_or_buf=filename, index=False)