This notebook analyses a very simple baseline model that uses the [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) as a string-matching similarity measure.

In [None]:
%matplotlib inline
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd
from pathlib import Path

from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Default figure size (width, height) and plotting style for every figure below.
plt.rcParams['figure.figsize'] = 15, 10
plt.style.use('fivethirtyeight')

# List every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


def highlight_max(s, props=''):
    """Build a style mask that marks the largest value(s) of ``s``.

    Parameters
    ----------
    s : pandas Series or DataFrame
        Values handed over by ``Styler.apply``.
    props : str
        CSS declarations to apply to the cell(s) holding the maximum.

    Returns
    -------
    numpy.ndarray
        Same shape as ``s``; ``props`` where the value equals the NaN-ignoring
        maximum, the empty string everywhere else.
    """
    peak = np.nanmax(s.values)
    is_peak = s == peak
    return np.where(is_peak, props, '')

# Basic Exploration 

In this step I try to understand how the score behaves with respect to the other features and texts, without going too deep but looking beyond the most obvious relationships.


In [None]:
# Set up the input path and load the train / test datasets
print("Datasets")
PATHDATA=Path('/kaggle/input/us-patent-phrase-to-phrase-matching')
dtrain=pd.read_csv(PATHDATA/'train.csv')
dtest=pd.read_csv(PATHDATA/'test.csv')
print()
print("#"*30)
print("Description")

print(f"Shape Dtrain:{dtrain.shape}")
print(f"Shape Dtest: {dtest.shape}")
print()
print("#"*30)

print("Info")
print("Dataset:Train")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly: wrapping it in print() would append a stray "None" line.
dtrain.info()

print("#"*30)
print("Dataset:Test")
dtest.info()


In [None]:
# First rows of the train and test sets
dtrain.head()
dtest.head()

In [None]:
# Does the data have anything strange?
# Number of distinct values per column of the train set
print("Unique elements per column")
dtrain.nunique()
print()
# Number of missing values per column of the train set
print("Missing Values?")
dtrain.isnull().sum()

## Score Exploration

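These are the objectives of the exploration:

* Understand how the score is distributed.
* See how the score varies by anchor and by context.
* Check whether the length of the target and anchor texts relates to the score.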


In [None]:
# Score exploration: number of records per score level, largest count highlighted
(
    dtrain.score
    .value_counts()
    .sort_index()
    .to_frame()
    .style.apply(highlight_max, props='color:white;background-color:purple', axis=None)
)

# Share of records per score level
(
    dtrain.score
    .value_counts(normalize=True)
    .sort_index()
    .plot(kind='barh', title='Normalized Score', color='purple', xlabel="Level Score")
)

### 0.5 and 0.25 have more than 60% of the records. On the other hand 1.0 does not even have 5%.

In [None]:
# Score by anchor: heatmap of the number of records per (anchor, score) pair
(
    dtrain
    .groupby(['anchor', 'score'], as_index=False)
    .id.count()
    .pivot_table(index=['anchor'], columns=['score'], values='id', aggfunc='sum')
    .pipe(sns.heatmap)
);

As expected, the scores with the most "diversity" are 0.25 and 0.5. On the other hand, some anchors have no cases at all with a score equal to 1.0.

In [None]:
# Score by anchor: clustermap of the number of records per (anchor, score) pair.
# (anchor, score) combinations with no records are filled with -20 before clustering.
(
    dtrain
    .groupby(['anchor', 'score'], as_index=False)
    .id.count()
    .pivot_table(index=['anchor'], columns=['score'], values='id', aggfunc='sum')
    .fillna(-20)
    .pipe(sns.clustermap)
);

It is not clear whether there are any clusters...

In [None]:
# Score by context: heatmap of the number of records per (context, score) pair
(
    dtrain
    .groupby(['context', 'score'], as_index=False)
    .id.count()
    .pivot_table(index=['context'], columns=['score'], values='id', aggfunc='sum')
    .pipe(sns.heatmap)
);

This heatmap shows nothing different from the previous one (score vs. anchor).

In [None]:
# Score by context: clustermap of the number of records per (context, score) pair.
# (context, score) combinations with no records are filled with -20 before clustering.
(
    dtrain
    .groupby(['context', 'score'], as_index=False)
    .id.count()
    .pivot_table(index=['context'], columns=['score'], values='id', aggfunc='sum')
    .fillna(-20)
    .pipe(sns.clustermap)
);

Some clusters can be seen, most clearly for the score 0.5:
* Group 1: A61,C07,G06
* Group 2: F42,B65,E04,H02,A01
* Group 3: B24,D03,G05,G09,F04,C04
* Group 4: all the others

In [None]:
# Score by context section (capital letter): heatmap of record counts.
# Removing the digits from a context code keeps only its letter part
# (e.g. a code such as "A61" becomes "A"). The pattern is a raw string because
# "\d" in a normal string literal is an invalid escape sequence; case=False was
# dropped since case-insensitivity has no effect on a digit pattern.
(
    dtrain
    .assign(Capital_Letters=dtrain.context.str.replace(r"\d", "", regex=True))
    .groupby(['Capital_Letters', 'score'], as_index=False)
    .id.count()
    .pivot_table(index=['Capital_Letters'], columns=['score'], values='id', aggfunc='sum')
    .pipe(sns.heatmap, linewidths=0.5)
);

In [None]:
# Score by context section (capital letter): clustermap of record counts.
# Removing the digits from a context code keeps only its letter part
# (e.g. a code such as "A61" becomes "A"). The pattern is a raw string because
# "\d" in a normal string literal is an invalid escape sequence; case=False was
# dropped since case-insensitivity has no effect on a digit pattern.
(
    dtrain
    .assign(Capital_Letters=dtrain.context.str.replace(r"\d", "", regex=True))
    .groupby(['Capital_Letters', 'score'], as_index=False)
    .id.count()
    .pivot_table(index=['Capital_Letters'], columns=['score'], values='id', aggfunc='sum')
    .pipe(sns.clustermap, linewidths=0.5)
);

The groups that are observed are the following:
* D & E
* A & F
* H & G

Contexts B and C look different than the others.


In [None]:
# What is the relationship between contexts and anchors?
# Context vs anchor: mean score per (context, anchor) pair, 0 where a pair has no records
(
    dtrain
    .pivot_table(index=['context'], columns=['anchor'], values='score', aggfunc='mean', fill_value=0)
    .pipe(sns.heatmap)
);

In [None]:
# Context vs anchor: the same mean-score table, clustered with the correlation metric
(
    dtrain
    .pivot_table(index=['context'], columns=['anchor'], values='score', aggfunc='mean', fill_value=0)
    .pipe(sns.clustermap, metric="correlation")
);

The relationship between **contexts** and **anchors** seems random; no clusters are observed.

In [None]:
# Cluster: context section (capital letter) vs anchor, using the mean score.
# Removing the digits from a context code keeps only its letter part
# (e.g. a code such as "A61" becomes "A"). The pattern is a raw string because
# "\d" in a normal string literal is an invalid escape sequence; case=False was
# dropped since case-insensitivity has no effect on a digit pattern.
(
    dtrain
    .assign(Capital_Letters=dtrain.context.str.replace(r"\d", "", regex=True))
    .pivot_table(index=['Capital_Letters'], columns=['anchor'], values='score', aggfunc='mean', fill_value=0)
    .pipe(sns.clustermap)
);

The relationship between the context capital letters and the anchors does not show very clear **clusters**.

In [None]:
# Target length in characters: mean per score level against the overall mean
fig, ax = plt.subplots()

# Build the derived column once and reuse it for both the overall and per-score means
target_chars = dtrain.assign(Len_Target=dtrain.target.astype(str).str.len())
avg = target_chars.Len_Target.mean()

(
    target_chars
    .groupby('score')
    .Len_Target.mean()
    .plot(kind='barh', color='orange', title='Score - Length String', stacked=True, ax=ax)
)

ax.axvline(x=avg, color='r', label='Average', linestyle='--', linewidth=3);
ax.legend();

In [None]:
# Target length in words: mean per score level against the overall mean
fig, ax = plt.subplots()

# Build the derived column once and reuse it for both the overall and per-score means
target_words = dtrain.assign(Len_Target=dtrain.target.astype(str).str.split().apply(len))
avg = target_words.Len_Target.mean()

(
    target_words
    .groupby('score')
    .Len_Target.mean()
    .plot(kind='barh', title='Score by Words', color="green", ax=ax)
)

ax.axvline(x=avg, color='r', label='Average', linestyle='--', linewidth=3);
ax.legend();

In [None]:
# Relationship between the length metrics of anchor and target, coloured by score
(
    dtrain
    .assign(
        Len_Target=dtrain.target.astype(str).str.len(),
        NWords_Target=dtrain.target.astype(str).str.split().apply(len),
        Len_Anchor=dtrain.anchor.astype(str).str.len(),
        NWords_Anchor=dtrain.anchor.astype(str).str.split().apply(len),
    )
    .filter(items=['Len_Target', 'NWords_Target', 'Len_Anchor', 'NWords_Anchor', 'score'])
    .pipe(sns.pairplot, hue="score")
);

One thing that stands out here is the number of words in the target and the anchor: most targets have fewer than 10 "words", and most anchors have 2 words.

##  Levenshtein Distance

The goal is to use a very simple similarity metric and try to explore the results of it.

Reference: 
* https://en.wikipedia.org/wiki/Levenshtein_distance

* https://github.com/seatgeek/thefuzz



In [None]:
 def Levenshtein(s0, s1):
        if s0 is None:
            raise TypeError("Argument s0 is NoneType.")
        if s1 is None:
            raise TypeError("Argument s1 is NoneType.")
        if s0 == s1:
            return 0.0
        if len(s0) == 0:
            return len(s1)
        if len(s1) == 0:
            return len(s0)

        v0 = [0] * (len(s1) + 1)
        v1 = [0] * (len(s1) + 1)

        for i in range(len(v0)):
            v0[i] = i

        for i in range(len(s0)):
            v1[0] = i + 1
            for j in range(len(s1)):
                cost = 1
                if s0[i] == s1[j]:
                    cost = 0
                v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost)
            v0, v1 = v1, v0

        return v0[len(s1)]

def distance(s0, s1):
    """Return the Levenshtein distance normalised by the longer input's length.

    Equal inputs give ``0.0``. Otherwise the raw edit distance is divided by
    ``max(len(s0), len(s1))``; when both lengths are zero the result is ``0.0``.
    """
    if s0 == s1:
        return 0.0

    longest = max(len(s0), len(s1))
    # Guard the division: two unequal but empty inputs have no length to normalise by.
    return Levenshtein(s0, s1) / longest if longest else 0.0

def similarity(s0, s1):
    """Return the normalised Levenshtein similarity, i.e. ``1.0 - distance(s0, s1)``."""
    gap = distance(s0, s1)
    return 1.0 - gap

In [None]:
def Leveshtein_Metric(x):
    """Row-wise helper: similarity between the 'anchor' and 'target' entries of ``x``."""
    return similarity(x['anchor'], x['target'])


In [None]:
# Worked examples: similarity of the same anchor against a target phrase
# written with and without the word "of"
print("Example 1:Normal Version")
similarity('abatement','abatement of pollution')
print("Example 2:Without stopwords")
similarity('abatement','abatement pollution')
print("Expected Value: 0.50")


In [None]:
# Estimation: Levenshtein similarity between anchor and target for every training row
LMetric=dtrain.apply(Leveshtein_Metric,axis=1)

## What can be learned from such a simple similarity metric?

In [None]:
# DataFrame creation: similarity and true score side by side.
# NOTE: this rebinds LMetric to a DataFrame, so re-running this cell requires
# re-running the cell that computes LMetric first.
LMetric=LMetric.to_frame('Leveshtein')
LMetric['score']=dtrain.score.copy()


# Analysis frame: metric, score, source texts, context section letter,
# word counts of target / anchor, and the error Diff = Leveshtein - score.
# The digit pattern is a raw string because "\d" in a normal string literal is
# an invalid escape sequence; case=False was dropped since it has no effect on digits.
DF_LM=LMetric.assign(Anchor=dtrain.anchor,
               Target=dtrain.target,
               Context=dtrain.context,
               Capital_Letters=dtrain.context.str.replace(r"\d", "", regex=True),
               Len_Target=dtrain.target.astype(str).str.split().apply(len),
               Len_Anchor=dtrain.anchor.astype(str).str.split().apply(len),
               Diff=(LMetric.Leveshtein-LMetric.score))

In [None]:
# Preview the analysis frame
DF_LM.head()

In [None]:
# Pearson correlation between the Levenshtein similarity and the true score
DF_LM[['Leveshtein', 'score']].corr()

Based on this result, the *Pearson correlation coefficient* on the test data should be close to **0.4558**.

In [None]:
# Orange gradient used to shade the summary tables below
cm = sns.light_palette("orange", as_cmap=True)

# Count and mean of the error (Leveshtein - score) by number of words in the target
DF_LM.groupby('Len_Target').Diff.agg(['count','mean']).style.background_gradient(cmap=cm)

# Same summary by number of words in the anchor
DF_LM.groupby('Len_Anchor').Diff.agg(['count','mean']).style.background_gradient(cmap=cm)


In [None]:
# Summary statistics for the rows whose target has more than three words
DF_LM[DF_LM.Len_Target>3].describe()

In [None]:
# Two-word targets: error, similarity and score by context section and anchor word count
(
    DF_LM[DF_LM.Len_Target == 2]
    .groupby(['Capital_Letters', 'Len_Anchor'])
    .agg({'Diff': ['mean', 'count'], 'Leveshtein': ['mean'], 'score': ['mean']})
    .style.background_gradient(cmap=cm)
)

# Two-word anchors: the same summary by context section and target word count
(
    DF_LM[DF_LM.Len_Anchor == 2]
    .groupby(['Capital_Letters', 'Len_Target'])
    .agg({'Diff': ['mean', 'count'], 'Leveshtein': ['mean'], 'score': ['mean']})
    .style.background_gradient(cmap=cm)
)


In [None]:
# Distribution of the error (Leveshtein - score), with its mean marked by a dashed line
nofig, ax = plt.subplots()
avg=DF_LM.Diff.mean()
sns.histplot(DF_LM.Diff,ax=ax,kde=True)
ax.axvline(x=avg, color='r', label='Average', linestyle='--', linewidth=3);
ax.legend();

In [None]:
# Distribution of the Levenshtein similarity, stacked by true score level
sns.histplot(
    DF_LM,
    x="Leveshtein", hue="score",
    multiple="stack",
    palette="light:m_r",
    edgecolor=".3",
    linewidth=.5
);

In [None]:
#
def similarity2(s0, s1):
    """Levenshtein similarity with hand-tuned adjustments based on word counts.

    Word counts come from whitespace-splitting ``str(s0)`` and ``str(s1)``:

    * ``s1`` has 2 words: similarity minus 0.03 when ``s0`` also has 2 words,
      otherwise similarity minus 0.02.
    * ``s1`` has 1 word and ``s0`` has 2 words: similarity plus 0.02.
    * ``s1`` has more than 4 words: the constant 0.50.
    * anything else: the plain similarity ``1.0 - distance(s0, s1)``.
    """
    anchor_words = len(str(s0).split())
    target_words = len(str(s1).split())

    if target_words == 2:
        base = 1.0 - distance(s0, s1)
        return base - 0.03 if anchor_words == 2 else base - 0.02

    if target_words == 1 and anchor_words == 2:
        return (1.0 - distance(s0, s1)) + 0.02

    # Long targets get a fixed value; the distance is not computed for them.
    if target_words > 4:
        return 0.50

    return 1.0 - distance(s0, s1)
    
    
def Leveshtein_Metric2(x):
    """Row-wise helper: adjusted similarity between the 'anchor' and 'target' entries of ``x``."""
    return similarity2(x['anchor'], x['target'])


In [None]:
# Adjusted similarity between anchor and target for every training row
LMetric2=dtrain.apply(Leveshtein_Metric2,axis=1)

In [None]:
# Put the adjusted similarity next to the true score.
# NOTE: this rebinds LMetric2 to a DataFrame, so re-running this cell requires
# re-running the cell that computes LMetric2 first.
LMetric2=LMetric2.to_frame('Leveshtein')
LMetric2['score']=dtrain.score.copy()

In [None]:
# Correlation between the adjusted similarity and the true score
LMetric2.corr()

In [None]:
# Adjusted similarity for every row of the test set (used as the prediction)
LMetric_Test=dtest.apply(Leveshtein_Metric2,axis=1)

In [None]:
# Write the submission file: test-set predictions in the sample-submission layout.
# The sample file is read through PATHDATA, the same input directory used for
# train.csv / test.csv, instead of a second path relative to the working directory.
sample_submission = pd.read_csv(PATHDATA/'sample_submission.csv')
# Bracket assignment always sets the column; attribute assignment only does so
# when the column already exists.
sample_submission['score'] = LMetric_Test
sample_submission.to_csv("submission.csv", index=False)

In [None]:
# Preview the submission
sample_submission.head()