In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and echo the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### import libraries

In [None]:
import os
import sys
import time
import spacy
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
from scipy.stats import pearsonr
# Load spaCy's small English pipeline once; the similarity loops further down reuse it.
# NOTE(review): per the spaCy docs the "sm" packages ship without static word vectors, so
# Doc.similarity falls back to context tensors — confirm this is intended (vs. "md"/"lg").
nlp = spacy.load("en_core_web_sm")

### Load data

In [None]:
# Read the competition's train and test CSV files into DataFrames, then preview
# the first rows of the training set.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/us-patent-phrase-to-phrase-matching/train.csv",
        "../input/us-patent-phrase-to-phrase-matching/test.csv",
    )
)
train_data.head()

### check shape of data

In [None]:
# Report (rows, columns) for both frames, one line each.
for caption, frame in (("Shape of Training Data", train_data),
                       ("Shape of testing data", test_data)):
    print(caption, frame.shape)

### Check for missing values in the training data

In [None]:
# Number of NaN entries in each column of the training frame.
train_data.isna().sum()

### count the score values

In [None]:
# How often each distinct score occurs; value_counts() lists the most frequent
# score first, not the smallest score first.
train_data["score"].value_counts()

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and later
# releases dropped the old names, so use whichever spelling this install provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")

# value_counts() orders wedges by frequency. Labels come from its index so every
# label is guaranteed to match its wedge, rather than relying on a hard-coded order.
score_counts = train_data["score"].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(x=score_counts,
       colors=["skyblue", "pink", "green", 'lightblue', "yellow"],
       labels=score_counts.index,
       shadow=True,
       autopct="%1.2f%%",
       # Push each successive wedge a little further out: 0, 0.1, 0.2, ...
       explode=[step / 10 for step in range(len(score_counts))]
       )
plt.show()

In [None]:
# Tally the scores once, ordered from the rarest to the most common, and reuse the
# result for bar positions, bar heights and tick labels.
score_counts_ascending = train_data["score"].value_counts().sort_values(ascending=True)

plt.figure(figsize=(10, 8))
plt.rcParams["font.size"] = 18
plt.bar(
    score_counts_ascending.index,
    score_counts_ascending,
    tick_label=score_counts_ascending.index,
    width=0.2,
    color="#dda0dd",
)

### test data

In [None]:
# Preview the first rows of the test frame.
test_data.head()

### check missing values in test data

In [None]:
# Number of null entries in each column of the test frame.
test_data.isnull().sum()

In [None]:
class config:
    """Settings for the text progress bar printed while phrase pairs are scored."""

    # Width of the bar, in characters, between the square brackets.
    BAR_LEN = 50
    # The progress line is refreshed each time this many rows have been processed.
    PRINT_EVERY_N_WORD = 100

### Find Similarity Score

In [None]:
# Score every (anchor, target) pair of the training set with spaCy's Doc.similarity,
# keeping a one-line text progress bar up to date while the loop runs.
similarity_score = []
n_words = train_data.shape[0]
start = time.time()

# Iterate over the two text columns directly and count rows by position. This skips
# the per-row Series that iterrows() builds and keeps the progress counter correct
# even when the frame's index is not the default 0..n-1 range.
for done, (anchor_text, target_text) in enumerate(
        zip(train_data["anchor"], train_data["target"]), start=1):
    similarity_score.append(nlp(anchor_text).similarity(nlp(target_text)))

    if done % config.PRINT_EVERY_N_WORD == 0 or done == n_words:
        time_elapsed = time.time() - start
        # Number of '=' cells; derived from the same count on every branch so the
        # bar is always exactly BAR_LEN characters wide.
        filled = done * config.BAR_LEN // n_words
        if done == n_words:
            bar = '[' + '=' * filled + ']'
        else:
            # '>' is the moving head of the bar, the remainder is padded with dots.
            bar = '[' + '=' * filled + '>' + '.' * (config.BAR_LEN - filled - 1) + ']'
        perc = done * 100 / n_words
        # '\r' returns to the start of the line so the bar redraws in place.
        sys.stdout.write('\r')
        sys.stdout.write("%i/%i words completed %s %d%% %.1fs %.1fms/word" % (done, n_words, bar, perc, time_elapsed, time_elapsed*1000/done))
        sys.stdout.flush()

train_data['similarity_score'] = similarity_score

### Map similarity values onto the discrete score levels

In [None]:
# Each target score level owns a half-open similarity interval [low, high).
mapping = {0.00: [0.000, 0.125],
           0.25: [0.125, 0.375],
           0.50: [0.375, 0.625],
           0.75: [0.625, 0.875],
           1.00: [0.875, 1.000]}

# Bin the whole column in one vectorised pass. The outermost edges are opened to
# -inf/+inf so that similarities outside [0, 1) are snapped to the nearest level.
# Cosine similarity can be negative, and float rounding can push it past 1.0; with
# the strict `< 1.000` bound such values matched no interval and stayed unmapped.
score_levels = sorted(mapping)
bin_edges = [-np.inf] + [mapping[level][0] for level in score_levels[1:]] + [np.inf]
train_data['similarity_score'] = pd.cut(
    train_data['similarity_score'],
    bins=bin_edges,
    labels=score_levels,
    right=False,          # intervals are closed on the left, open on the right
).astype(float)           # back from categorical to plain floats

In [None]:
# Pearson's r between the labelled scores and the similarity-based predictions;
# the p-value that pearsonr also returns is not needed here.
corr = pearsonr(train_data["score"], train_data["similarity_score"])[0]
print('Training Pearson Correlation: %0.3f' % corr)

### prediction on test data

In [None]:
# Score every (anchor, target) pair of the test set with spaCy's Doc.similarity,
# keeping a one-line text progress bar up to date while the loop runs.
similarity_score = []
n_words = test_data.shape[0]
start = time.time()

# Iterate over the two text columns directly and count rows by position. This skips
# the per-row Series that iterrows() builds and keeps the progress counter correct
# even when the frame's index is not the default 0..n-1 range.
for done, (anchor_text, target_text) in enumerate(
        zip(test_data["anchor"], test_data["target"]), start=1):
    similarity_score.append(nlp(anchor_text).similarity(nlp(target_text)))

    if done % config.PRINT_EVERY_N_WORD == 0 or done == n_words:
        time_elapsed = time.time() - start
        # Number of '=' cells; derived from the same count on every branch so the
        # bar is always exactly BAR_LEN characters wide.
        filled = done * config.BAR_LEN // n_words
        if done == n_words:
            bar = '[' + '=' * filled + ']'
        else:
            # '>' is the moving head of the bar, the remainder is padded with dots.
            bar = '[' + '=' * filled + '>' + '.' * (config.BAR_LEN - filled - 1) + ']'
        perc = done * 100 / n_words
        # '\r' returns to the start of the line so the bar redraws in place.
        sys.stdout.write('\r')
        sys.stdout.write("%i/%i words completed %s %d%% %.1fs %.1fms/word" % (done, n_words, bar, perc, time_elapsed, time_elapsed*1000/done))
        sys.stdout.flush()

test_data['score'] = similarity_score

In [None]:
# Snap the raw test similarities onto the score levels defined by `mapping`
# (level -> [low, high)). The outermost edges are opened to -inf/+inf so values
# outside [0, 1) — negative cosine similarity, or float rounding just past 1.0 —
# land on the nearest level instead of reaching the submission unmapped.
score_levels = sorted(mapping)
bin_edges = [-np.inf] + [mapping[level][0] for level in score_levels[1:]] + [np.inf]
test_data['score'] = pd.cut(
    test_data['score'],
    bins=bin_edges,
    labels=score_levels,
    right=False,          # intervals are closed on the left, open on the right
).astype(float)           # back from categorical to plain floats

In [None]:
# Keep only the id and predicted score, write them out without the index column,
# then load the file back to inspect what was actually saved.
submission = test_data.loc[:, ['id', 'score']]
submission.to_csv('submission.csv', index=False)

results = pd.read_csv('submission.csv')
results.head()

In [None]:
# How often each predicted score occurs in the saved submission, most frequent first.
results["score"].value_counts()

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and later
# releases dropped the old names, so use whichever spelling this install provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")

# How many distinct predicted scores exist (and in which frequency order) depends on
# the data. Labels and wedge offsets are derived from the counts themselves, because
# pie() raises ValueError when their lengths differ from the number of wedges.
predicted_counts = results["score"].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(x=predicted_counts,
       colors=["skyblue", "pink", "green", 'lightblue', "yellow"],
       labels=predicted_counts.index,
       shadow=True,
       autopct="%1.2f%%",
       # Push each successive wedge a little further out: 0, 0.1, 0.2, ...
       explode=[step / 10 for step in range(len(predicted_counts))]
       )
plt.show()