# U.S. Patent Phrase to Phrase Matching
In this notebook I use the [spaCy](https://spacy.io/) package to compute the semantic similarity between pairs of phrases.

# Load Packages

In [None]:
import os
import sys
import pandas as pd
import spacy
import time

# Set paths

In [None]:
# All competition files live in the same input directory.
INPUT_DIR = '../input/us-patent-phrase-to-phrase-matching'

# Training set, test set and sample submission file.
TRAIN_FILE_PATH = INPUT_DIR + '/train.csv'
TEST_FILE_PATH = INPUT_DIR + '/test.csv'
SAMPLE_SUBMISSION_PATH = INPUT_DIR + '/sample_submission.csv'

# Configure parameters

In [None]:
class config:
    """Settings for the text progress report printed while scoring rows."""

    # The progress line is refreshed every time this many rows have been processed.
    PRINT_EVERY_N_WORD = 100
    # Scale used to size the progress bar drawn between '[' and ']'.
    BAR_LEN = 50

# Read files

In [None]:
# Load the training set, the test set and the sample submission, in that order.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_FILE_PATH, TEST_FILE_PATH, SAMPLE_SUBMISSION_PATH)
)

# Print the (rows, columns) shape of each loaded frame.
for label, frame in (
    ('train_df shape:', train_df),
    ('test_df shape:', test_df),
    ('submission_df shape:', submission_df),
):
    print(label, frame.shape)

# Get similarity score

In [None]:
# Score every (anchor, target) pair of the training set with spaCy's document
# similarity and store the raw scores in train_df['similarity_score'].
similarity_score = []
n_words = len(train_df)
start = time.time()
nlp = spacy.load('en_core_web_lg')

# Count processed rows with enumerate instead of trusting the DataFrame index
# labels to be 0..n-1, so the progress report is correct for any index.
for n_done, (anchor, target) in enumerate(zip(train_df['anchor'], train_df['target']), start=1):
    similarity_score.append(nlp(anchor).similarity(nlp(target)))

    is_last = n_done == n_words
    if n_done % config.PRINT_EVERY_N_WORD == 0 or is_last:
        time_elapsed = time.time() - start
        if is_last:
            bar = '[' + '=' * config.BAR_LEN + ']'
        else:
            # Fill, '>' head and padding always add up to exactly BAR_LEN
            # characters, so each '\r' redraw fully overwrites the previous one.
            n_filled = n_done * config.BAR_LEN // n_words
            bar = '[' + '=' * n_filled + '>' + '.' * (config.BAR_LEN - n_filled - 1) + ']'
        perc = n_done * 100 / n_words
        sys.stdout.write('\r')
        sys.stdout.write("%i/%i words completed %s %d%% %.1fs %.1fms/word" % (n_done, n_words, bar, perc, time_elapsed, time_elapsed*1000/n_done))
        sys.stdout.flush()

train_df['similarity_score'] = similarity_score

In [None]:
# Snap the raw similarity onto the allowed labels 0, 0.25, 0.5, 0.75 and 1:
#   [0.000, 0.125) -> 0.00
#   [0.125, 0.375) -> 0.25
#   [0.375, 0.625) -> 0.50
#   [0.625, 0.875) -> 0.75
#   [0.875, 1.000] -> 1.00
# Each entry is label -> [lower bound (inclusive), upper bound (exclusive)].
mapping = {0.00: [0.000, 0.125],
           0.25: [0.125, 0.375],
           0.50: [0.375, 0.625],
           0.75: [0.625, 0.875],
           1.00: [0.875, 1.000]}

# Clip first: a cosine-style similarity may be negative or, through
# floating-point error, marginally above 1. Such values match none of the
# buckets above and would otherwise be left unsnapped.
train_df['similarity_score'] = train_df['similarity_score'].clip(lower=0.0, upper=1.0)

for label, (lower, upper) in mapping.items():
    in_bucket = (train_df['similarity_score'] >= lower) & (train_df['similarity_score'] < upper)
    train_df['similarity_score'] = train_df['similarity_score'].mask(in_bucket, label)

In [None]:
from scipy.stats import pearsonr

# Correlate the ground-truth scores with the bucketed similarity; the p-value
# returned alongside the coefficient is not used.
corr, _ = pearsonr(train_df['score'], train_df['similarity_score'])
print('Training Pearson Correlation: %0.3f' % corr)

# Get prediction for test data

In [None]:
# Score every (anchor, target) pair of the test set with spaCy's document
# similarity and store the raw scores in test_df['score'].
# The `nlp` pipeline loaded for the training pass is reused here; loading the
# large model a second time only costs time and memory.
similarity_score = []
n_words = len(test_df)
start = time.time()

# Count processed rows with enumerate instead of trusting the DataFrame index
# labels to be 0..n-1, so the progress report is correct for any index.
for n_done, (anchor, target) in enumerate(zip(test_df['anchor'], test_df['target']), start=1):
    similarity_score.append(nlp(anchor).similarity(nlp(target)))

    is_last = n_done == n_words
    if n_done % config.PRINT_EVERY_N_WORD == 0 or is_last:
        time_elapsed = time.time() - start
        if is_last:
            bar = '[' + '=' * config.BAR_LEN + ']'
        else:
            # Fill, '>' head and padding always add up to exactly BAR_LEN
            # characters, so each '\r' redraw fully overwrites the previous one.
            n_filled = n_done * config.BAR_LEN // n_words
            bar = '[' + '=' * n_filled + '>' + '.' * (config.BAR_LEN - n_filled - 1) + ']'
        perc = n_done * 100 / n_words
        sys.stdout.write('\r')
        sys.stdout.write("%i/%i words completed %s %d%% %.1fs %.1fms/word" % (n_done, n_words, bar, perc, time_elapsed, time_elapsed*1000/n_done))
        sys.stdout.flush()

test_df['score'] = similarity_score

In [None]:
# Snap the raw test similarities onto the labels defined in `mapping`
# (label -> [inclusive lower bound, exclusive upper bound]).
# Clip first: a cosine-style similarity may be negative or, through
# floating-point error, marginally above 1. Such values match no bucket and
# would otherwise reach the submission unsnapped.
test_df['score'] = test_df['score'].clip(lower=0.0, upper=1.0)

for label, (lower, upper) in mapping.items():
    in_bucket = (test_df['score'] >= lower) & (test_df['score'] < upper)
    test_df['score'] = test_df['score'].mask(in_bucket, label)

In [None]:
# Write the id and predicted score columns of the test set to submission.csv,
# leaving out the DataFrame index.
submission_columns = ['id', 'score']
submission_df = test_df[submission_columns]
submission_df.to_csv('submission.csv', index=False)