Problem statement

In this dataset, you are presented with pairs of phrases (an anchor and a target phrase) and asked to rate how similar they are on a scale from 0 (not at all similar) to 1 (identical in meaning).

Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

List available input files

In [None]:
import os

# Recursively walk the Kaggle input directory and print the full path of
# every file found, so the available data files are visible in the output.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

Read datasets

In [None]:
# Directory holding the competition CSVs. Kept in one place so the notebook
# can be pointed at a different mount location by editing a single line.
DATA_DIR = "/kaggle/input/us-patent-phrase-to-phrase-matching"

# Training pairs, test pairs, and the sample submission file.
train = pd.read_csv(DATA_DIR + "/train.csv")
test = pd.read_csv(DATA_DIR + "/test.csv")
submission = pd.read_csv(DATA_DIR + "/sample_submission.csv")

In [None]:
# Display the training frame read from train.csv.
train

In [None]:
# Display the test frame read from test.csv.
test

In [None]:
# Display the frame read from sample_submission.csv.
submission

Analyse score

In [None]:
# Distribution of the training scores. sns.distplot is deprecated (since
# seaborn 0.11) and scheduled for removal; histplot with a KDE overlay on a
# density scale is its documented replacement.
sns.histplot(train.score, kde=True, stat="density")

In [None]:
# Box plot of the training scores.
plt.boxplot(train["score"])

Define target

In [None]:
# The column to predict: the `score` column of the training frame.
target = train["score"]

Combine train and test

In [None]:
# Stack the training rows (without the score column) on top of the test rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
# pd.concat is used instead. ignore_index is deliberately NOT set: each frame
# keeps its original row labels, exactly as append did.
combi = pd.concat([train.drop(columns=['score']), test])
combi

Define X and y

In [None]:
# Split the combined frame back into its two parts by position: the first
# len(train) rows are the training rows, everything after is the test rows.
n_train = len(train)

y = target
X = combi.iloc[:n_train]
X_test = combi.iloc[n_train:]

Match phrase similarities

In [None]:
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

# Bag-of-words cosine similarity between the anchor and target phrase of
# every test row. One (1, 1) result is collected per row and the results are
# stacked at the end, giving one row per test pair.
simularity = []

# Iterate over the two columns positionally so the loop does not depend on
# the row labels of X_test (which are inherited from the original test frame).
for anchor_phrase, target_phrase in zip(X_test['anchor'], X_test['target']):

    # count word occurrences; the phrase must be split first, because
    # Counter applied to a bare string counts characters, not words
    anchor_vals = Counter(anchor_phrase.split())
    target_vals = Counter(target_phrase.split())

    # convert to word-vectors over the union of both vocabularies
    words = list(anchor_vals.keys() | target_vals.keys())
    anchor_vect = [anchor_vals.get(word, 0) for word in words]
    target_vect = [target_vals.get(word, 0) for word in words]

    # cosine of the angle between the two count vectors
    sim = cosine_similarity([anchor_vect], [target_vect])

    simularity.append(sim)

simularity = np.concatenate(simularity, axis=0)

print(len(simularity))
print(simularity)

Normalise cosine similarity

In [None]:
# Min-max scale the similarities: subtract the smallest value and divide by
# the spread between the largest and smallest value.
sim_min = simularity.min()
sim_range = simularity.max() - sim_min
simularity = (simularity - sim_min) / sim_range
simularity

Prepare submission

In [None]:
# Store the normalised similarities as the `score` column of the submission
# frame (presumably overwriting the placeholder scores from
# sample_submission.csv -- confirm the file has a `score` column).
submission['score'] = simularity
# Write the submission file; index=False leaves the row index out of the CSV.
submission.to_csv("submission.csv", index=False)
# Display the final frame for a visual check.
submission