# Positional Character-based Shingling

This notebook details the algorithms discussed in Section 2 of the paper "Alignment Analysis of Sequential Segmentation of Lexicons to Improve Automatic Cognate Detection".

## K-gram shingling

In [2]:
def shingle(input, k):
    """Return the k-gram shingles of ``input`` plus its shorter edge shingles.

    The returned list contains, in this order:

    1. the prefixes of length ``1 .. k-1``,
    2. every contiguous k-gram, scanning left to right,
    3. the suffixes of length ``k-1 .. 1``.

    ``k`` is clamped to ``len(input)``, so no shingle is ever longer than the
    input itself. An empty string yields ``[""]`` (one empty shingle).

    Args:
        input: Sequence to shingle; anything that supports ``len`` and
            slicing (typically a ``str``). The name shadows the ``input``
            builtin but is kept so keyword callers keep working.
        k: Shingle width; must be at least 1.

    Returns:
        A list of slices of ``input``.

    Raises:
        ValueError: If ``k`` is smaller than 1.

    Example:
        >>> shingle("apple", 2)
        ['a', 'ap', 'pp', 'pl', 'le', 'e']
    """
    # A non-positive width has no meaningful shingles: k == 0 would return
    # nothing but empty slices and a negative k would return arbitrary ones.
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    k = min(len(input), k)
    start_combinations = [input[:i] for i in range(1, k)]
    kgrams = [input[i:i + k] for i in range(len(input) - k + 1)]
    end_combinations = [input[-i:] for i in range(k - 1, 0, -1)]
    return start_combinations + kgrams + end_combinations

## Positional Shingling from 2 Ends

In [9]:
def one_end(input, k):
    """Tag every shingle of ``input`` with its position from the nearer end.

    The shingles come from ``shingle(input, k)``. Each one is numbered with
    its 1-based position counted from the start and from the end of that
    list; the smaller number is attached:

    * nearer the start (or exactly in the middle): the position is
      *prepended*, e.g. ``"2ap"``;
    * nearer the end: the position is *appended*, e.g. ``"le2"``.

    Args:
        input: String to shingle.
        k: Shingle width, passed through to ``shingle``.

    Returns:
        A list of position-tagged shingle strings, in shingle order.

    Example:
        >>> one_end("apple", 2)
        ['1a', '2ap', '3pp', 'pl3', 'le2', 'e1']
    """
    basic = shingle(input, k)
    total = len(basic)
    result = []
    for i, gram in enumerate(basic, start=1):
        from_end = total - i + 1
        # Split on the real shingle count: a midpoint computed from
        # len(input) alone only matches it when k == 2, and skews the
        # start/end numbering for every other width.
        if i <= from_end:
            result.append(str(i) + gram)  # numbered from the start
        else:
            result.append(gram + str(from_end))  # numbered from the end
    return result

## Similarity Checking

In [10]:
def jaccard(str1, str2, ends = 2, k = 2):
    """Jaccard similarity between the shingle sets of two strings.

    Args:
        str1: First string.
        str2: Second string.
        ends: ``2`` compares the position-tagged shingles from ``one_end``;
            any other value compares the plain shingles from ``shingle``.
        k: Shingle width handed to the chosen shingling function.

    Returns:
        ``|A & B| / |A | B|`` as a float, where ``A`` and ``B`` are the
        sets of shingles of ``str1`` and ``str2``.
    """
    make_shingles = one_end if ends == 2 else shingle
    first = set(make_shingles(str1, k))
    second = set(make_shingles(str2, k))
    return len(first & second) / len(first | second)

In [11]:
# Positional bigram similarity (defaults ends=2, k=2): 4 of the 7 distinct shingles are shared.
print(jaccard("apple", "appe"))

0.5714285714285714
