Applying basic string matchers to the training data to establish baseline errors.

- [Hamming distance](http://en.wikipedia.org/wiki/Hamming_distance)
- [Levenshtein distance](http://en.wikipedia.org/wiki/Levenshtein_distance)
- [Damerau–Levenshtein distance](http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
- [Jaro–Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from difflib import SequenceMatcher
import os
from sklearn.metrics import mean_absolute_error
import Levenshtein
from scipy.spatial.distance import hamming
from math import floor, ceil
import scipy.stats as stats

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the competition's train and test CSV files into DataFrames.
train_df = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv")
test_df = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv")

In [None]:
# Display the (rows, columns) shape of the train and test frames.
train_df.shape, test_df.shape

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# Count how many training rows carry each distinct score value.
train_df.groupby("score").size().reset_index(name='counts')

In [None]:
# Per-matcher results, keyed by matcher name:
# mean absolute error and Pearson correlation against the true score.
basic_matcher = dict()
pearsonr_coefficient = dict()

### Python Inbuilt Sequence Matcher

In [None]:
# difflib SequenceMatcher similarity ratio for every anchor/target pair.
train_df["sequence_matcher"] = [
    SequenceMatcher(None, anchor, target).ratio()
    for anchor, target in zip(train_df["anchor"], train_df["target"])
]

In [None]:
# Score the SequenceMatcher ratio against the true score: MAE and Pearson r,
# both rounded to 4 decimals and stored under the "Sequence Matcher" key.
sequence_matcher_score = round((mean_absolute_error(train_df['score'],train_df['sequence_matcher'])), 4)
print(f"Baseline Mean absolute error : {sequence_matcher_score}")
pearsonr_coefficient["Sequence Matcher"]=round(stats.pearsonr(train_df['score'], train_df['sequence_matcher'])[0], 4)
basic_matcher["Sequence Matcher"] = sequence_matcher_score

### Levenshtein Distance


In [None]:
# Levenshtein.ratio similarity for every anchor/target pair.
train_df["levenshtein"] = [
    Levenshtein.ratio(anchor, target)
    for anchor, target in zip(train_df["anchor"], train_df["target"])
]

In [None]:
# Score the Levenshtein ratio against the true score: MAE and Pearson r,
# both rounded to 4 decimals and stored under the "Levenshtein" key.
levenshtein_score = round((mean_absolute_error(train_df['score'],train_df['levenshtein'])), 4)
print(f"Mean absolute error : {levenshtein_score}")
pearsonr_coefficient["Levenshtein"]=round(stats.pearsonr(train_df['score'], train_df['levenshtein'])[0], 4)
basic_matcher["Levenshtein"] = levenshtein_score

### Hamming Distance

In [None]:
def hamming_similarity(s1, s2):
    """Return a per-character Hamming similarity of two strings in [0, 1].

    scipy.spatial.distance.hamming works on 1-D arrays, so handing it two
    whole strings does not compare them character by character. This helper
    counts the positions at which the strings differ, treats every position
    beyond the end of the shorter string as a mismatch, and normalises by the
    longer length. The result is a similarity (1.0 means identical), which is
    the same orientation as the other matchers in this notebook.

    Parameters:
        s1, s2: the strings to compare.

    Returns:
        1.0 - mismatches / max(len(s1), len(s2)); 1.0 when both are empty.
    """
    longest = max(len(s1), len(s2))
    if longest == 0:
        # Two empty strings are identical; also avoids dividing by zero.
        return 1.0
    mismatches = sum(a != b for a, b in zip(s1, s2)) + abs(len(s1) - len(s2))
    return 1.0 - mismatches / longest


# Character-level Hamming similarity for every anchor/target pair.
train_df["hamming"] = train_df.apply(lambda x: hamming_similarity(x["anchor"], x["target"]), axis=1)

In [None]:
# Score the "hamming" column against the true score: MAE and Pearson r,
# both rounded to 4 decimals and stored under the "Hamming" key.
hamming_score = round((mean_absolute_error(train_df['score'],train_df['hamming'])), 4)
print(f"Mean absolute error : {hamming_score}")
pearsonr_coefficient["Hamming"]=round(stats.pearsonr(train_df['score'], train_df['hamming'])[0], 4)
basic_matcher["Hamming"] = hamming_score

### Damerau–Levenshtein distance

In [None]:
def damerau_levenshtein_distance(s1, s2):
    """Return a normalised Damerau–Levenshtein similarity of two strings.

    Despite its name, this function returns a similarity, not a distance:
    the edit distance (insertions, deletions, substitutions and swaps of two
    adjacent characters) is divided by the longer string's length and
    subtracted from 1. Identical strings give 1.0; strings that need as many
    edits as the longer string has characters give 0.0.

    Parameters:
        s1, s2: the sequences (normally strings) to compare.

    Returns:
        float in [0, 1]. Two empty inputs are treated as identical and
        return 1.0 instead of raising ZeroDivisionError.
    """
    len1, len2 = len(s1), len(s2)
    longest = max(len1, len2)
    if longest == 0:
        # Nothing to edit, and the normalisation below would divide by zero.
        return 1.0

    # table[i][j] holds the edit distance between s1[:i] and s2[:j].
    table = [[0] * (len2 + 1) for _ in range(len1 + 1)]
    for i in range(len1 + 1):
        table[i][0] = i
    for j in range(len2 + 1):
        table[0][j] = j

    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            best = min(
                table[i - 1][j] + 1,         # deletion
                table[i][j - 1] + 1,         # insertion
                table[i - 1][j - 1] + cost,  # substitution
            )
            # Swap of two adjacent characters.
            if i > 1 and j > 1 and s1[i - 1] == s2[j - 2] and s1[i - 2] == s2[j - 1]:
                best = min(best, table[i - 2][j - 2] + cost)
            table[i][j] = best

    return 1.0 - float(table[len1][len2]) / longest

In [None]:
# Normalised Damerau–Levenshtein similarity for every anchor/target pair.
train_df["damerau–levenshtein"] = [
    damerau_levenshtein_distance(anchor, target)
    for anchor, target in zip(train_df["anchor"], train_df["target"])
]

In [None]:
# Score the Damerau–Levenshtein column against the true score: MAE and Pearson r,
# both rounded to 4 decimals and stored under the "Damerau Levenshtein" key.
damerau_levenshtein_score = round((mean_absolute_error(train_df['score'],train_df['damerau–levenshtein'])), 4)
print(f"Mean absolute error : {damerau_levenshtein_score}")
pearsonr_coefficient["Damerau Levenshtein"]=round(stats.pearsonr(train_df['score'], train_df['damerau–levenshtein'])[0], 4)
basic_matcher["Damerau Levenshtein"] = damerau_levenshtein_score

### Jaro–Winkler distance

In [None]:
def jaro_distance(s1, s2):
    """Compute the Jaro similarity of two strings.

    Returns 1.0 for equal inputs and 0.0 when no characters match;
    otherwise the mean of (matches / len(s1)), (matches / len(s2)) and
    ((matches - transpositions) / matches).
    """
    # Equal inputs (including two empty strings) are a perfect match.
    if s1 == s2:
        return 1.0

    n1, n2 = len(s1), len(s2)

    # Two characters only count as a match when their positions differ by
    # at most this many places.
    window = max(n1, n2) // 2 - 1

    # Flags marking which characters of each string were paired up.
    used1 = [False] * n1
    used2 = [False] * n2

    for i, ch in enumerate(s1):
        start = max(0, i - window)
        stop = min(n2, i + window + 1)
        for j in range(start, stop):
            # Pair with the first still-unused equal character in the window.
            if not used2[j] and ch == s2[j]:
                used1[i] = used2[j] = True
                break

    matches = sum(used1)
    if not matches:
        return 0.0

    # Walk the matched characters of both strings in order; every position
    # where they disagree is half a transposition.
    matched1 = [c for c, flag in zip(s1, used1) if flag]
    matched2 = [c for c, flag in zip(s2, used2) if flag]
    transpositions = sum(a != b for a, b in zip(matched1, matched2)) // 2

    return (matches / n1 + matches / n2 +
            (matches - transpositions) / matches) / 3.0

def jaro_Winkler(s1, s2):
    """Compute the Jaro–Winkler similarity of two strings.

    Starts from jaro_distance(s1, s2). When that value is above 0.7 it is
    increased by 0.1 * prefix * (1 - jaro), where prefix is the length of
    the common leading run of characters, capped at 4.
    """
    similarity = jaro_distance(s1, s2)

    # The prefix bonus only applies to pairs that are already similar.
    if similarity <= 0.7:
        return similarity

    # Length of the shared prefix.
    shared = 0
    for left, right in zip(s1, s2):
        if left != right:
            break
        shared += 1

    # At most 4 prefix characters contribute to the bonus.
    shared = min(4, shared)

    similarity += 0.1 * shared * (1 - similarity)
    return similarity

In [None]:
# Jaro–Winkler similarity for every anchor/target pair.
train_df["jaro–winkler"] = [
    jaro_Winkler(anchor, target)
    for anchor, target in zip(train_df["anchor"], train_df["target"])
]

In [None]:
# Score the Jaro–Winkler column against the true score: MAE and Pearson r,
# both rounded to 4 decimals and stored under the "Jaro Winkler" key.
jaro_winkler_score = round((mean_absolute_error(train_df['score'],train_df['jaro–winkler'])), 4)
print(f"Mean absolute error : {jaro_winkler_score}")
pearsonr_coefficient["Jaro Winkler"]=round(stats.pearsonr(train_df['score'], train_df['jaro–winkler'])[0], 4)
basic_matcher["Jaro Winkler"] = jaro_winkler_score

In [None]:
# Display the collected mean absolute errors, keyed by matcher name.
basic_matcher

In [None]:
# Tabulate the Pearson correlation obtained by each matcher.
pd.DataFrame({
    'String Matcher': list(pearsonr_coefficient.keys()),
    'pearsonr': list(pearsonr_coefficient.values()),
})

In [None]:
# Tabulate the mean absolute error obtained by each matcher.
pd.DataFrame({
    'String Matcher': list(basic_matcher.keys()),
    'MAE': list(basic_matcher.values()),
})

#### Applying Damerau–Levenshtein to the test data

In [None]:
# Use the normalised Damerau–Levenshtein similarity as the predicted score
# for every anchor/target pair in the test set.
test_df["score"] = [
    damerau_levenshtein_distance(anchor, target)
    for anchor, target in zip(test_df["anchor"], test_df["target"])
]

In [None]:
# Preview the first rows of the test data, now including the predicted score.
test_df.head()

In [None]:
# Write only the id and score columns to submission.csv, without the index.
header = ["id", "score"]
test_df.to_csv('submission.csv', columns = header, index=False)