Problem statement

In this dataset, you are presented with pairs of phrases (an anchor and a target phrase) and asked to rate how similar they are on a scale from 0 (not at all similar) to 1 (identical in meaning).

Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

List available input files

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

Read datasets

In [None]:
# Read the three competition CSV files in one pass: the training pairs, the
# test pairs, and the sample submission whose `score` column is filled in later.
train, test, submission = map(
    pd.read_csv,
    (
        "/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv",
        "/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv",
        "/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv",
    ),
)

In [None]:
# Show the training frame as this cell's output for a quick visual check.
train

In [None]:
# Show the test frame as this cell's output for a quick visual check.
test

In [None]:
# Show the sample submission frame to see the layout the output file must follow.
submission

Analyse score

In [None]:
# distplot is deprecated in seaborn and scheduled for removal; histplot with a
# KDE overlay on a density axis is the documented replacement for its default
# histogram-plus-density output.
sns.histplot(train.score, kde=True, stat="density")

In [None]:
# Box plot of the score column to show its spread and any outlying values.
plt.boxplot(train["score"])

Define target

In [None]:
# Keep the label column separately before it is dropped from the feature frame.
target = train["score"]

Combine train and test

In [None]:
# Stack the training rows (without the label) on top of the test rows so that
# feature engineering is applied to both in one pass.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append's default, keeps each frame's original index.
combi = pd.concat([train.drop(columns=['score']), test])
combi

Match phrase similarities

In [None]:
# Upgrade spaCy to the newest release that pip can find.
!pip install -U spacy

In [None]:
!python -m spacy download en

In [None]:
import spacy

# Load the large English pipeline used to compare the two phrases of each row.
nlp = spacy.load("en_core_web_lg")

# nlp.pipe streams texts through the pipeline in batches, which avoids the
# overhead of calling nlp() once per phrase (twice per row) by positional index.
anchor_docs = nlp.pipe(combi['anchor'].tolist())
target_docs = nlp.pipe(combi['target'].tolist())

# One similarity value per row of combi, in row order. Both generators are
# consumed lazily and in lockstep, so only a batch of Docs is alive at a time.
simularity = [
    anchor_doc.similarity(target_doc)
    for anchor_doc, target_doc in zip(anchor_docs, target_docs)
]

# Sanity check: should equal the number of rows in combi.
print(len(simularity))

Create new column in combi

In [None]:
# Attach the per-row similarity values as a new feature column; the list is
# assigned positionally, so it must have one entry per row of combi.
combi['similar'] = simularity
combi

Ordinal encode context

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Replace each distinct context value with a numeric code so that the tree
# model can consume the column.
enc = OrdinalEncoder()
context_column = combi['context'].values.reshape(-1, 1)  # single feature as a 2-D column
combi['context'] = enc.fit_transform(context_column)
combi

Define X and y

In [None]:
features = ['similar', 'context']
n_train = len(train)

# combi holds the training rows first and the test rows after them, so a
# positional split at len(train) recovers the two parts.
model_frame = combi[features]

y = target
X = model_frame[:n_train]
X_test = model_frame[n_train:]

Define model

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a single decision tree on the two engineered features; the fixed seed
# keeps the fitted tree reproducible between runs.
model = DecisionTreeRegressor(random_state=42)
model.fit(X, y)

# NOTE(review): this score is computed on the same rows the tree was fitted
# on, so it is not a held-out estimate of how well the model generalises.
train_score = model.score(X, y)
print(train_score)

Predict on test set

In [None]:
# Score the held-back test rows with the fitted model and show the raw output.
prediction = model.predict(X_test)
prediction

Prepare submission

In [None]:
# Overwrite the placeholder scores in the sample submission with the model's
# predictions; assignment is positional, so row order must match the test set.
submission['score'] = prediction
# Write the file without the index column so only the frame's own columns are saved.
submission.to_csv("submission.csv", index=False)
submission