In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training split of the competition data.
train_df = pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv')
# DataFrame.info() writes its summary to stdout and returns None, so call it
# directly; wrapping it in print() emits an extra "None" line.
train_df.info()
train_df.head()

In [None]:
# Load the test split of the competition data.
test_df = pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv')
# DataFrame.info() writes its summary to stdout and returns None, so call it
# directly; wrapping it in print() emits an extra "None" line.
test_df.info()
test_df.head()

In [None]:
# Load the sample submission file and preview its first rows.
sample_sub = pd.read_csv(
    '/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv'
)
sample_sub.head(5)

In [None]:
# How often each distinct anchor phrase occurs in the training data.
train_df['anchor'].value_counts()

In [None]:
# How often each distinct target phrase occurs in the training data.
train_df['target'].value_counts()

In [None]:
# How often each context code occurs in the training data.
train_df['context'].value_counts()

In [None]:
# Distribution of the score values; these become the class labels further down.
train_df['score'].value_counts()

In [None]:
# Load the CPC title table (one row per code) and preview it.
cpc = pd.read_csv(
    '../input/cpc-codes/titles.csv'
)
cpc.head(5)

In [None]:
# Rename the CPC key so it matches the `context` column of the competition data,
# then attach the title to every train/test row. The left join keeps rows whose
# context has no match (their title is NaN).
cpc = cpc.rename(columns={"code": "context"})
title_lookup = cpc[["context", "title"]]
train_df = train_df.merge(title_lookup, on="context", how="left")
test_df = test_df.merge(title_lookup, on="context", how="left")

In [None]:
# Preview train_df to check the `title` column added by the merge.
train_df.head()

In [None]:
def clean(x):
    """Lower-case a title and remove the characters ``[``, ``]``, ``;``, ``,`` and ``:``.

    Parameters
    ----------
    x : str or missing value
        Title text. Missing values (``None``, ``NaN``, ``pd.NA``) are accepted
        because the left merge that adds the title column leaves NaN for any
        context code without a match.

    Returns
    -------
    str
        The cleaned title, or an empty string when ``x`` is missing.
    """
    # A missing title has no text to clean; return '' instead of failing on
    # float('nan').lower().
    if x is None or (not isinstance(x, str) and pd.isna(x)):
        return ''
    # A single translate pass deletes all five punctuation characters.
    return x.lower().translate(str.maketrans('', '', '[];,:'))

# Normalise the merged title text in both splits.
for frame in (train_df, test_df):
    frame['title'] = frame['title'].apply(clean)

In [None]:
# Preview train_df after the title text has been cleaned.
train_df.head()

In [None]:
def _anchor_with_title(frame):
    """Return '<anchor> <title>' for every row of ``frame`` as a string Series."""
    return frame['anchor'].astype('str') + ' ' + frame['title'].astype('str')


train_df['sen1'] = _anchor_with_title(train_df)
test_df['sen1'] = _anchor_with_title(test_df)

In [None]:
# The anchor, context and title are now folded into `sen1`, so drop the originals.
dropped_cols = ['anchor', 'context', 'title']
train_df = train_df.drop(columns=dropped_cols)
test_df = test_df.drop(columns=dropped_cols)
train_df.head()

In [None]:
# Model input text: the target phrase followed by the anchor + title sentence.
for frame in (train_df, test_df):
    frame['all_sen'] = frame['target'] + ' ' + frame['sen1']


In [None]:
from sklearn.preprocessing import LabelEncoder
# Map each distinct score value to an integer class id. The fitted encoder is
# kept so predictions can be mapped back to scores with inverse_transform.
label_encode = LabelEncoder().fit(train_df['score'])
train_df['label'] = label_encode.transform(train_df['score'])

In [None]:
# Preview train_df with the new `all_sen` and `label` columns.
train_df.head()

In [None]:
# Preview test_df; it has the same text columns as train_df but no score/label.
test_df.head()

In [None]:
# seq1_len = [len(i.split()) for i in train_df['all_sen'].values]
# pd.Series(seq1_len).hist(bins = 30)

# Whitespace-token count of every test sentence, shown as a histogram.
tar_len = list(map(len, map(str.split, test_df['all_sen'].values)))
length_series = pd.Series(tar_len)
length_series.hist(bins=30)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Corpus for fitting the tokenizer: train sentences followed by test sentences.
all_text = [*train_df['all_sen'].values, *test_df['all_sen'].values]

(len(all_text), type(all_text))

In [None]:
# Build the word index over the combined train + test text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_text)

n_documents = tokenizer.document_count
n_indexed_words = len(tokenizer.word_index)
n_counted_words = len(tokenizer.word_counts)
print("The document count", n_documents)
print(n_indexed_words, n_counted_words)
# print("The count of words",tokenizer.word_counts)

In [None]:
# Every sequence is padded/truncated to this many tokens.
max_length = 32
# +1 leaves room for index 0, which no word in word_index maps to here.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Turn each training sentence into word indices, then pad/truncate to max_length.
train_seq = tokenizer.texts_to_sequences(texts=train_df['all_sen'].values)
train_seq_pad = pad_sequences(sequences=train_seq, maxlen=max_length)


In [None]:
# Inspect the first ten padded training sequences.
train_seq_pad[:10]

In [None]:
# Raw score values as an array, with a quick look at the size and first ten entries.
train_score = train_df['score'].values
(len(train_score), train_score[:10])

In [None]:
# One-hot encode the integer class labels for categorical cross-entropy.
# The dtype is set explicitly: get_dummies returns uint8 on older pandas and
# bool on pandas >= 2.0, and the training targets should not depend on that.
train_y = pd.get_dummies(train_df['label'], dtype='float32').values

In [None]:
# Inspect the first ten one-hot target rows.
train_y[:10]

### Using GloVe embeddings

In [None]:
path_to_glove_file = '../input/glove6b/glove.6B.100d.txt'

# Parse the GloVe text file into {word: float32 vector}. Each line is a word
# followed by its space-separated coefficients.
embeddings_index = {}
# The GloVe files are UTF-8; pass the encoding explicitly so parsing does not
# depend on the platform's default locale encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Split off the word only; the rest of the line is the vector.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

In [None]:
# All-zero fallback vector for words missing from GloVe (100 = vector width of
# the glove.6B.100d file loaded above).
embeddings_index['<oov>'] = np.zeros(100, dtype=np.float32)

In [None]:
# Display the fallback vector to confirm it was stored.
embeddings_index.get('<oov>')

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense

In [None]:
# Model hyper-parameters. vocab_size and max_length are assigned the same values
# as in the tokenisation cells; embd_dim is the width of the GloVe vectors.
max_length = 32
vocab_size = 1 + len(tokenizer.word_index)
embd_dim = 100

In [None]:
# Row i holds the GloVe vector of the word with tokenizer index i. Words that
# have no GloVe vector, and the unused index 0, stay all-zero.
# The width comes from embd_dim (not a literal 100) so the matrix always matches
# the Embedding layer's output_dim.
embd_matrix = np.zeros((vocab_size, embd_dim))
for word, i in tokenizer.word_index.items():
    vec = embeddings_index.get(word)
    if vec is not None:
        embd_matrix[i] = vec

len(embd_matrix), embd_matrix.size

In [None]:
# Classifier: embedding initialised from the GloVe matrix -> LSTM -> dense head
# with a 5-way softmax.
model = Sequential()
model.add(
    Embedding(
        vocab_size,
        embd_dim,
        input_length=max_length,
        weights=[embd_matrix],  # optional: trainable=False to freeze the vectors
    )
)
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(128, activation="relu"))
model.add(Dense(5, activation="softmax"))

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=['accuracy'],
)
model.summary()

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
# Training configuration.
batch_size = 512
epochs = 20
filepath = 'my_best_model.hdf5'

# Save the model to `filepath` whenever the validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks = [checkpoint]

# 20% of the training data is held out for validation.
model.fit(
    train_seq_pad,
    train_y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=callbacks,
)

In [None]:
import matplotlib.pyplot as plt

# History object of the fit() call above, read back from the model.
history = model.history

# One figure per metric. The second curve comes from the validation_split
# hold-out, so it is labelled "validation" rather than "test".
for metric, title in (('loss', 'Loss'), ('accuracy', 'Accuracy')):
    plt.title(title)
    plt.plot(history.history[metric], label='train')
    plt.plot(history.history['val_' + metric], label='validation')
    plt.xlabel('epoch')
    plt.legend()
    plt.show()

#### Earlier approach (kept for reference): cosine similarity of averaged GloVe vectors

In [None]:
# from sklearn.metrics.pairwise import cosine_similarity

# res = []
# for idx,row in train_df.iterrows():
#     sen1 = row['sen1'].split(' ')
#     sen2 = row['target'].split(' ')
    
#     vec1 = np.mean([embeddings_index.get(word,embeddings_index['<oov>']) for word in sen1],axis=0)
#     vec2 = np.mean([embeddings_index.get(word,embeddings_index['<oov>']) for word in sen2],axis=0)
#     cosine = cosine_similarity([vec1], [vec2])
#     res.append(round(float(cosine[0]),3))

# print(len(res))
# print(train_df.shape)
# train_df['word_embed_score'] = res


# train_df.head()

In [None]:
# from sklearn.metrics import mean_squared_error

# error = mean_squared_error(train_df['score'].values,train_df['word_embed_score'].values)
# error

### Predictions on the test set

In [None]:
# from sklearn.metrics.pairwise import cosine_similarity

# res = []
# for idx,row in test_df.iterrows():
#     sen1 = row['sen1'].split(' ')
#     sen2 = row['target'].split(' ')
    
#     vec1 = np.mean([embeddings_index.get(word,embeddings_index['<oov>']) for word in sen1],axis=0)
#     vec2 = np.mean([embeddings_index.get(word,embeddings_index['<oov>']) for word in sen2],axis=0)
#     cosine = cosine_similarity([vec1], [vec2])
#     res.append([row['id'],round(float(cosine[0]),3)])


In [None]:
# Encode and pad the test sentences the same way as the training sentences.
test_seq = tokenizer.texts_to_sequences(texts=test_df['all_sen'].values)
test_seq_pad = pad_sequences(sequences=test_seq, maxlen=max_length)
len(test_seq_pad)

In [None]:
# ModelCheckpoint saved the lowest-val_loss model to `filepath`, but `model`
# still holds the weights from the final epoch. Restore the best checkpoint
# before predicting so the submission uses it. If no checkpoint file exists,
# the current weights are used as before.
if os.path.isfile(filepath):
    model.load_weights(filepath)
y = model.predict(test_seq_pad)

In [None]:
# Raw softmax output: one row of class probabilities per test sentence.
y

In [None]:
# Most probable class per row, then map the class ids back to score values.
res = list(np.argmax(y, axis=1))
print(res)
ans1 = label_encode.inverse_transform(res)


In [None]:
# Predicted score for each test row, decoded from the class ids.
ans1

In [None]:
# Pair every test id with its predicted score as [id, score] rows.
ans = zip(test_df['id'].values, ans1)
l = [[row_id, score] for row_id, score in ans]

In [None]:
# Display the [id, score] rows that will go into the submission.
l

In [None]:
# Assemble the submission table from the [id, score] rows.
s = pd.DataFrame(data=l, columns=['id', 'score'])
s.head(5)

In [None]:
# Write the submission file to the working directory, without the index column.
s.to_csv('submission.csv',index=False)