In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Open one 15x10 inch figure at 80 dpi.
# NOTE(review): this creates a single figure rather than changing the default
# size for later plots; if a notebook-wide default was intended,
# plt.rcParams["figure.figsize"] is the usual knob — confirm.
plt.figure(figsize=(15, 10), dpi=80)

# Reading data

In [None]:
# Load the train, test and sample-submission CSVs of the competition dataset.
train = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/train.csv')
test = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/test.csv')
sample = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/sample_submission.csv')

In [None]:
# (rows, columns) of each loaded frame, shown side by side.
train.shape, test.shape, sample.shape

In [None]:
# Peek at the first five training rows.
train.head(5)

# anchor

In [None]:

# Ten anchors with the most rows (count of non-null `id` per anchor),
# drawn as a labelled bar chart.
ax = (
    train.groupby('anchor')['id']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .plot(kind='bar', color=list('rgbkymc'))
)
# Write the count on top of every bar.
for bars in ax.containers:
    ax.bar_label(bars)
plt.title("top 10 anchor in data")
plt.show()

# target

In [None]:
# Ten targets with the most rows (count of non-null `id` per target),
# drawn as a labelled bar chart.
ax = (
    train.groupby('target')['id']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .plot(kind='bar', color=list('rgbkymc'))
)
# Write the count on top of every bar.
for bars in ax.containers:
    ax.bar_label(bars)
plt.title("top 10 target in data")
plt.show()

# context

In [None]:
# Ten contexts with the most rows (count of non-null `id` per context),
# drawn as a labelled bar chart.
ax = (
    train.groupby('context')['id']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .plot(kind='bar', color=list('rgbkymc'))
)
# Write the count on top of every bar.
for bars in ax.containers:
    ax.bar_label(bars)
plt.title("top 10 context in data")
plt.show()

# Score

In [None]:
# Fraction of training rows at each score value, ordered by score.
(
    train['score']
    .value_counts(normalize=True)
    .sort_index()
    .plot(kind='barh', title='Score', xlabel="Level Score")
)

# Target length in characters

In [None]:

# Character length of each `target`, summed within every score value.
# NOTE(review): this is a sum, so bar heights also grow with the number of
# rows per score; a mean may be what is wanted — confirm.
(
    train
    .assign(Len_Target=lambda frame: frame['target'].astype(str).str.len())
    .filter(items=['score', 'Len_Target'])
    .groupby('score')['Len_Target']
    .sum()
    .plot(kind='bar', color='green', title='Score by Len String')
);

# Target length in words

In [None]:
# Number of whitespace-separated words in each `target`, summed within
# every score value.
# NOTE(review): this is a sum, so bar heights also grow with the number of
# rows per score; a mean may be what is wanted — confirm.
(
    train
    .assign(Len_Target=lambda frame: frame['target'].astype(str).str.split().map(len))
    .filter(items=['score', 'Len_Target'])
    .groupby('score')['Len_Target']
    .sum()
    .plot(kind='bar', title='Score by Words', color='orange')
);

# Similarity distance calculation using the Levenshtein metric

In [None]:
def Levenshtein(s0, s1):
    """Return the Levenshtein edit distance between two sequences.

    Only two rows of the edit matrix are kept at any time: the row for the
    previous element of ``s0`` and the row being filled in.

    Args:
        s0: first sequence (must support ``len`` and iteration).
        s1: second sequence.

    Returns:
        ``0.0`` (a float) when the inputs compare equal; otherwise the
        minimum number of single-element insertions, deletions and
        substitutions turning ``s0`` into ``s1``, as an int.

    Raises:
        TypeError: if either argument is ``None``.
    """
    if s0 is None:
        raise TypeError("Argument s0 is NoneType.")
    if s1 is None:
        raise TypeError("Argument s1 is NoneType.")
    if s0 == s1:
        return 0.0

    rows, cols = len(s0), len(s1)
    # Against an empty sequence the distance is the other one's length.
    if rows == 0:
        return cols
    if cols == 0:
        return rows

    # previous[j] = distance between the empty prefix of s0 and s1[:j].
    previous = list(range(cols + 1))
    current = [0] * (cols + 1)

    for i, left in enumerate(s0):
        current[0] = i + 1
        for j, right in enumerate(s1):
            substitution = previous[j] + (0 if left == right else 1)
            current[j + 1] = min(current[j] + 1, previous[j + 1] + 1, substitution)
        # The freshly filled row becomes the reference row for the next element.
        previous, current = current, previous

    return previous[cols]

def distance(s0, s1):
    """Return the Levenshtein distance scaled by the longer input's length.

    Args:
        s0: first sequence.
        s1: second sequence.

    Returns:
        ``0.0`` when the inputs compare equal or both have length zero;
        otherwise ``Levenshtein(s0, s1) / max(len(s0), len(s1))``.
    """
    if s0 == s1:
        return 0.0

    longest = max(len(s0), len(s1))
    # Guard the division: two empty inputs that do not compare equal.
    return 0.0 if longest == 0 else Levenshtein(s0, s1) / longest

def similarity(s0, s1):
    """Return ``1.0 - distance(s0, s1)``; equal inputs give 1.0."""
    dist = distance(s0, s1)
    return 1.0 - dist

In [None]:
def Leveshtein_Metric(x):
    """Row helper for ``DataFrame.apply``: similarity of anchor and target.

    Args:
        x: row-like object indexable by the keys 'anchor' and 'target'.

    Returns:
        The value of ``similarity(x['anchor'], x['target'])``.
    """
    return similarity(x['anchor'], x['target'])

In [None]:
# Similarity between anchor and target for every training row.
LMetric = train.apply(Leveshtein_Metric, axis='columns')

In [None]:
# Pair each similarity value with the row's score in one frame.
# NOTE: LMetric is rebound from a Series to a DataFrame here, so this cell
# relies on the previous cell having just been run.
LMetric = LMetric.to_frame(name='Leveshtein').assign(score=train['score'].copy())

In [None]:
# Scatter of similarity (x) against score (y).
sns.set_style('dark')
sns.relplot(data=LMetric, x='Leveshtein', y='score', height=10);

In [None]:
# Histogram of similarity values, stacked by score.
sns.histplot(
    data=LMetric,
    x="Leveshtein",
    hue="score",
    multiple="stack",
    palette="light:m_r",
    edgecolor=".3",
    linewidth=0.5,
);

In [None]:
# Pearson correlation between the similarity column and the score.
LMetric.corr(method='pearson')

In [None]:
# Similarity between anchor and target for every test row.
LMetric_Test = test.apply(Leveshtein_Metric, axis='columns')

In [None]:
# Use the similarity computed on the test rows as the predicted score and
# write the submission file.
# Bracket assignment always targets the column; attribute assignment
# (`sample.score = ...`) would only set a plain Python attribute if the
# column were ever missing from the frame.
# Values are aligned on the row index — assumes `sample` rows are in the
# same order as `test` rows; TODO confirm.
sample['score'] = LMetric_Test
sample.to_csv("submission.csv", index=False)

In [None]:
# First five rows of the submission frame, as written to disk.
sample.head(5)

# Testing

In [None]:
# Spot check: 13 insertions over the longer length of 22 -> 1 - 13/22 (about 0.409).
similarity(s0='abatement', s1='abatement of pollution')