# A Very Basic Attempt
* Basic preprocessing of text data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import string
from sklearn.model_selection import *
import pickle

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.utils import *
from tensorflow.keras.callbacks import ModelCheckpoint,ReduceLROnPlateau

In [None]:
# Read the competition's training and test tables from the Kaggle input folder.
train_df = pd.read_csv(
    '../input/us-patent-phrase-to-phrase-matching/train.csv'
)
test_df = pd.read_csv(
    '../input/us-patent-phrase-to-phrase-matching/test.csv'
)

In [None]:
# Report the shape of each loaded table.
for label, frame in (
    ('Training Data Shape: ', train_df),
    ('Testing Data shape: ', test_df),
):
    print(label, frame.shape)

In [None]:
# Preview the first five training rows (notebook cell output).
train_df.head(n=5)

**Clean both Anchor and Target**

In [None]:
# NLTK helpers for text normalisation.
# `ps` and `sp` are used by cleaning(); `wl` is created but not used in the visible cells.
ps = PorterStemmer()
wl = WordNetLemmatizer()
sp = stopwords.words('english')

def cleaning(data):
    """Normalise an iterable of phrases before tokenisation.

    For each phrase: strip ASCII punctuation, lower-case, drop English stop
    words, then Porter-stem the remaining words.

    Stop words are removed *before* stemming. The stop-word list holds
    unstemmed forms, so stemming first (e.g. "this" -> "thi") would let
    stemmed stop words slip past the filter.

    Args:
        data: Iterable of strings (e.g. a pandas Series of phrases).

    Returns:
        A list of cleaned strings, one per input phrase, in input order.
    """
    # Built once: the translation table and stop-word set do not change per phrase.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stop_words = set(sp)

    filtered_txt = []
    for text in tqdm(data):
        words = text.translate(strip_punctuation).lower().split()
        kept = (ps.stem(word) for word in words if word not in stop_words)
        filtered_txt.append(' '.join(kept))
    return filtered_txt

In [None]:
# Clean both phrase columns of the training table, then pull out the target scores.
train_filtered_anchor, train_filtered_target = (
    cleaning(train_df[column]) for column in ('anchor', 'target')
)
training_score = train_df['score'].values

In [None]:
# Hold out 20% of the rows for validation. The three inputs are split with the
# same shuffling, so anchor/target/score stay aligned row by row.
# random_state is fixed so the validation set (and therefore the checkpoint
# chosen on val_loss) is the same on every run.
(
    train_anchor, val_anchor,
    train_target, val_target,
    train_score, val_score,
) = train_test_split(
    train_filtered_anchor,
    train_filtered_target,
    training_score,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report how many rows landed in each split.
for label, split in (
    ('Training Size: ', train_anchor),
    ('Validation Size: ', val_anchor),
):
    print(label, len(split))

In [None]:
#Token hyperParameters
# Tokenizer hyper-parameters:
#   num_words - vocabulary cap passed to Tokenizer and used as the embedding row count
#   maxlen    - sequence length passed to pad_sequences and used as the model input length
num_words, maxlen = 5000, 10

In [None]:
#Anchor
# Anchor vocabulary, fitted on the training anchors only.
tokenizer_anchor = Tokenizer(num_words=num_words)
tokenizer_anchor.fit_on_texts(train_anchor)
word_index_anchor = tokenizer_anchor.word_index

# Convert phrases to integer id sequences and pad them to `maxlen`.
train_anchor = pad_sequences(
    tokenizer_anchor.texts_to_sequences(train_anchor), maxlen=maxlen
)
val_anchor = pad_sequences(
    tokenizer_anchor.texts_to_sequences(val_anchor), maxlen=maxlen
)

In [None]:
#Target
# Target vocabulary, fitted on the training targets only.
tokenizer_target = Tokenizer(num_words=num_words)
tokenizer_target.fit_on_texts(train_target)
word_index_target = tokenizer_target.word_index

# Convert phrases to integer id sequences and pad them to `maxlen`.
train_target = pad_sequences(
    tokenizer_target.texts_to_sequences(train_target), maxlen=maxlen
)
val_target = pad_sequences(
    tokenizer_target.texts_to_sequences(val_target), maxlen=maxlen
)

In [None]:
#Load Word Embedding
# Load the pickled embedding lookup; the loop cells below query it with
# embedding_dict.get(word).
# NOTE(review): pickle.load can execute arbitrary code from the file - only
# load this from a trusted dataset.
embedding_path = '../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl'
with open(embedding_path, mode='rb') as pickle_file:
    embedding_dict = pickle.load(pickle_file)
print('Found %s word vectors.' % len(embedding_dict))

In [None]:
#Load Word Embedding for Anchor

# Build the anchor embedding matrix: row `ix` receives the pretrained vector of
# the word whose tokenizer index is `ix`. Rows with no pretrained vector stay zero.
embedding_matrix_anchor = np.zeros((num_words, 300))
print('Loading Embedding Matrix..\n')
for word, ix in tqdm(word_index_anchor.items()):
    if ix >= num_words:
        # Index falls outside the matrix (beyond the vocabulary cap).
        continue
    embed_vec = embedding_dict.get(word)
    if embed_vec is not None:
        embedding_matrix_anchor[ix] = embed_vec

# A row is "null" only when every component is zero. Testing the row *sum*
# against zero would also count a non-zero vector whose components cancel out.
null_rows = ~embedding_matrix_anchor.any(axis=1)
print('Null word embeddings: %d' % np.sum(null_rows))

In [None]:
#Load Word Embedding for target

# Build the target embedding matrix: row `ix` receives the pretrained vector of
# the word whose tokenizer index is `ix`. Rows with no pretrained vector stay zero.
embedding_matrix_target = np.zeros((num_words, 300))
print('Loading Embedding Matrix..\n')
for word, ix in tqdm(word_index_target.items()):
    if ix >= num_words:
        # Index falls outside the matrix (beyond the vocabulary cap).
        continue
    embed_vec = embedding_dict.get(word)
    if embed_vec is not None:
        embedding_matrix_target[ix] = embed_vec

# A row is "null" only when every component is zero. Testing the row *sum*
# against zero would also count a non-zero vector whose components cancel out.
null_rows = ~embedding_matrix_target.any(axis=1)
print('Null word embeddings: %d' % np.sum(null_rows))

In [None]:
# Model Anchor
# Two-branch network: each phrase gets its own Embedding + BiLSTM encoder, the
# per-timestep encodings are subtracted, and a small Conv1D stack followed by
# global average pooling produces a single sigmoid output.
#
# The Embedding layers are created with default initial weights here; the
# pretrained matrices are copied in by a later cell.

# Anchor branch.
# `shape` must be a tuple; `(maxlen)` without a trailing comma is just an int.
inp_anchor = Input(shape=(maxlen,))
emb_anchor = Embedding(num_words, 300)(inp_anchor)
lstm_anchor = Bidirectional(LSTM(128, return_sequences=True))(emb_anchor)

# Target branch (same layout, separate weights).
inp_target = Input(shape=(maxlen,))
emb_target = Embedding(num_words, 300)(inp_target)
lstm_target = Bidirectional(LSTM(128, return_sequences=True))(emb_target)

# Element-wise difference between the two encoded sequences.
lstm = Subtract()([lstm_anchor, lstm_target])

conv = Conv1D(filters=512, kernel_size=3, strides=1, activation='relu')(lstm)
conv = Conv1D(filters=1024, kernel_size=3, strides=2, activation='relu')(conv)
gap = GlobalAveragePooling1D()(conv)
out = Dense(1, activation='sigmoid')(gap)

In [None]:
# Wrap the two inputs and the single output into a Keras model and print its layer table.
model = Model(inputs=[inp_anchor, inp_target], outputs=out)
model.summary()

In [None]:
#Set Layer weights
# Copy the pretrained matrices into the model's embedding layers.
# NOTE(review): assumes model.layers[2] / model.layers[3] are the anchor / target
# Embedding layers, in that order - confirm against model.summary().
for layer_position, pretrained_matrix in (
    (2, embedding_matrix_anchor),
    (3, embedding_matrix_target),
):
    model.layers[layer_position].set_weights([pretrained_matrix])

In [None]:
#Set layers trainable
# Mark layers 2 and 3 (the ones that received pretrained weights) as trainable.
for layer_position in (2, 3):
    model.layers[layer_position].trainable = True

In [None]:
#Compile Model
# Compile with mean-squared-error loss and the Adam optimizer at a 1e-3 learning rate.
model.compile(
    optimizer=Adam(1e-3),
    loss='mean_squared_error',
)

In [None]:
#Callbacks
# Training callbacks.
#
# rop: lower the learning rate when val_loss stops improving. The number of
#      stagnant epochs to wait is `patience`; `period` is not a
#      ReduceLROnPlateau option, so the intended 5-epoch wait never applied.
# mc:  keep only the checkpoint with the best val_loss in 'best_model.h5'.
#      The deprecated `period=1` is dropped; checking once per epoch is the default.
rop = ReduceLROnPlateau(
    monitor='val_loss',
    patience=5,
    min_lr=0.000000005,
    verbose=1,
)
mc = ModelCheckpoint(
    'best_model.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

In [None]:
#Model Fitting
# Train on the (anchor, target) pairs, validating on the held-out split after
# each epoch; the checkpoint and LR-schedule callbacks run during training.
history = model.fit(
    x=[train_anchor, train_target],
    y=train_score,
    batch_size=64,
    epochs=200,
    validation_data=([val_anchor, val_target], val_score),
    callbacks=[mc, rop],
)

In [None]:
#Plot Metrics
# Plot training vs. validation loss per epoch.
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)  # 1-based epoch numbers for the x axis

plt.figure()
# The colour is given only via `color=`; also passing the 'b' format string
# specified the colour twice, with conflicting values for the training curve.
plt.plot(epochs, loss, color='red', label='Training loss')
plt.plot(epochs, val_loss, color='blue', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
#Test Data

#Clean test data
# Apply the training-time preprocessing to the test phrases: clean the text,
# map it to ids with the already-fitted tokenizers, then pad to `maxlen`.
anchor_test, target_test = (
    cleaning(test_df[column]) for column in ('anchor', 'target')
)

test_anchor_tokenized = tokenizer_anchor.texts_to_sequences(anchor_test)
test_target_tokenized = tokenizer_target.texts_to_sequences(target_test)

test_anchor = pad_sequences(test_anchor_tokenized, maxlen=maxlen)
test_target = pad_sequences(test_target_tokenized, maxlen=maxlen)

In [None]:
#Made Predictions
# Make predictions with the best checkpoint saved during training.
# Predicting with `model` would use the weights from the final epoch and leave
# the loaded checkpoint unused.
best_model = load_model('best_model.h5')
predictions = best_model.predict([test_anchor, test_target])
# Each prediction row holds a single output value; flatten to one score per test row.
predictions = [row[0] for row in predictions]

In [None]:
# Fill the sample submission's score column with the predictions and write it out.
# NOTE(review): assumes sample_submission.csv rows are in the same order as test.csv - confirm.
sub = pd.read_csv(
    filepath_or_buffer='../input/us-patent-phrase-to-phrase-matching/sample_submission.csv'
)
sub['score'] = predictions
sub.to_csv(path_or_buf='submission.csv', index=False)