In [None]:
# Tokens Distance Examples

In [1]:
cd ../../

/home/abelm


In [2]:
import textsim
from textsim import *

In [3]:
# Sentence pair used as input by the cells below, either directly or after
# preprocessing. The triple-quoted literals keep the line break and the
# leading spaces of their second line, so that whitespace is part of each string.
s1 = """PCCW's chief operating officer, Mike Butcher, and Alex Arena, 
        the chief financial officer, will report directly to Mr So."""
s2 = """Current Chief Operating Officer Mike Butcher and Group Chief 
        Financial Officer Alex Arena will report to So."""

### **Listing all available distances!**

In [4]:
# Show how many distance names the module exports, then the names themselves.
distance_names = textsim.tokendists.__all__
print('total', len(distance_names))
distance_names

total 35


['mahalanobis_distance_scipy',
 'chebyshev_distance_scipy',
 'interval_distance',
 'overlap_distance_textsim',
 'manhattan_distance_sklearn',
 'correlation_distance_scipy',
 'masi_distance',
 'kulsinski_distance_scipy',
 'cityblock_distance_scipy',
 'hamming_distance_scipy',
 'yule_distance_scipy',
 'matching_distance_scipy',
 'jaccard_distance_textsim',
 'matching_coefficient_textsim',
 'sqeuclidean_distance_scipy',
 'dice_coefficient_textsim',
 'braycurtis_distance_scipy',
 'cosine_distance_scipy',
 'sokalsneath_distance_scipy',
 'euclidean_distance_scipy',
 'matching_coefficient_pablo',
 'sokalmichener_distance_scipy',
 'cosine_similarity_sklearn',
 'jaccard_distance_scipy',
 'jaccard_distance_nltk',
 'dice_distance_scipy',
 'minkowski_distance_scipy',
 'euclidean_distance_sklearn',
 'canberra_distance_scipy',
 'seuclidean_distance_scipy',
 'rogerstanimoto_distance_scipy',
 'russellrao_distance_scipy',
 'token_containment_distance',
 'qgram_distance',
 'cosine_distance_sklearn']

### **Calling all distances in a flash!**

In [5]:
# Evaluate every registered paired distance on the sentence pair (s1, s2).
# `dictdist` maps metric name -> distance as a float. Metrics whose call or
# float conversion fails are recorded in `failed_metrics` (name -> exception)
# instead of being dropped silently.
registry = textsim.tokendists.PAIRED_DISTANCES
dictdist = {}
failed_metrics = {}
for metric in registry:
    func = registry[metric]
    try:
        dictdist[metric] = float(func(s1, s2))
    except Exception as exc:
        # Best effort: keep evaluating the remaining metrics, but remember
        # the failure. KeyboardInterrupt is deliberately not caught here.
        failed_metrics[metric] = exc

# Report in ascending order of distance. sorted() is stable, so metrics with
# equal values keep their dictionary order, and each line shows the metric's
# own value.
for word, value in sorted(dictdist.items(), key=lambda item: item[1]):
    print('%.3f: %s' % (value, word))

if failed_metrics:
    print('skipped (raised an error): %s' % ', '.join(sorted(failed_metrics)))

-0.267: sokalsneath_distance_scipy
-0.235: sokalmichener_distance_scipy
-0.235: rogerstanimoto_distance_scipy
-0.118: kulsinski_distance_scipy
-0.105: matching_distance_scipy
-0.056: dice_distance_scipy
-0.000: yule_distance_scipy
-0.000: russellrao_distance_scipy
0.135: cosine_distance_scipy
0.135: cosine_distance_sklearn
0.167: braycurtis_distance_scipy
0.316: hamming_distance_scipy
0.316: jaccard_distance_scipy
0.412: matching_coefficient_pablo
0.438: qgram_distance
0.545: jaccard_distance_textsim
0.545: jaccard_distance_nltk
0.584: correlation_distance_scipy
0.588: token_containment_distance
0.625: dice_coefficient_textsim
0.667: overlap_distance_textsim
0.850: masi_distance
0.865: cosine_similarity_sklearn
1.000: chebyshev_distance_scipy
2.449: euclidean_distance_scipy
2.449: euclidean_distance_sklearn
4.000: interval_distance
4.243: mahalanobis_distance_scipy
6.000: manhattan_distance_sklearn
6.000: cityblock_distance_scipy
6.000: sqeuclidean_distance_scipy
6.000: minkowski_dista

  return np.sqrt(((XA - XB) ** 2 / V).sum())


### **Calling a specific distance!**

In [6]:
# A single distance can be called directly by name; the name is in scope
# because of the `from textsim import *` at the top of the notebook.
jaccard_distance_textsim(s1,s2)

0.5454545454545454

## Performance between Token-Based Similar Distances

Some distances return the same value but come from different implementations. Which is better?
The names of the distances inside textsim were changed after this performance test was first run.

    E.g. jaccard_distance = jaccard_distance_nltk
         
jaccard_distance_textsim is textsim's own implementation of the Jaccard distance. Its code is included in the package so that students can use it as an example of how to implement this kind of distance.

In [7]:
# Time the three Jaccard variants on the same sentence pair.
%timeit jaccard_distance_nltk(s1,s2)
%timeit jaccard_distance_textsim(s1,s2)
%timeit jaccard_distance_scipy(s1,s2)

100000 loops, best of 3: 12 µs per loop
100000 loops, best of 3: 11.1 µs per loop
1000 loops, best of 3: 1.71 ms per loop


In [8]:
# Time the two cosine-distance variants on the same sentence pair.
%timeit cosine_distance_scipy(s1,s2)
%timeit cosine_distance_sklearn(s1,s2)

1000 loops, best of 3: 1.72 ms per loop
1000 loops, best of 3: 1.87 ms per loop


In [9]:
# Display the module's `__not_implemented__` attribute
# (presumably the distances still pending implementation; confirm in textsim).
textsim.tokendists.__not_implemented__

['Kullback-Leibler distance',
 'Hellinger distance',
 'Jensen Shanon divergence',
 'Harmonic Mean distance',
 'Skew divergence',
 'Tau distance',
 'Containment distance',
 'Q-gram Overlap',
 'Skip-grams distance',
 'Greedy String Tiling']

# Qgram Distance Examples

In [10]:
# Q-gram distance on the raw sentences under four settings:
# defaults, n=2, n=3, and n=2 with method='overlap'.
for options in ({}, {'n': 2}, {'n': 3}, {'n': 2, 'method': 'overlap'}):
    print(qgram_distance(s1, s2, **options))

0.4375
0.058823529411764705
0.0
0.05555555555555555


# Syntactic Qgram Distance Examples

In [11]:
import time
# Syntactic preprocessing of the texts
from preprocess import deep, shallow

# Output formats requested from each tagger, in the order the results are
# unpacked into the A/B/C and D/E/F variables below.
output_modes = ('raw_value', 'tuple_list', 'raw_tag')

init = time.time()

# Named-entity transformations
s1A, s1B, s1C = [deep.ner(s1, multioutput=mode) for mode in output_modes]
s2A, s2B, s2C = [deep.ner(s2, multioutput=mode) for mode in output_modes]

# Part-of-speech transformations
s1D, s1E, s1F = [shallow.pos(s1, multioutput=mode) for mode in output_modes]
s2D, s2E, s2F = [shallow.pos(s2, multioutput=mode) for mode in output_modes]

# `end` holds the elapsed wall-clock seconds, not a timestamp.
end = time.time() - init
print(end)

28.136187076568604


In [12]:
# Q-gram distance on the NER 'raw_value' outputs: defaults, n=2, n=3,
# and n=2 with method='overlap'.
for options in ({}, {'n': 2}, {'n': 3}, {'n': 2, 'method': 'overlap'}):
    print(qgram_distance(s1A, s2A, **options))

0.375
0.058823529411764705
0.0
0.05555555555555555


In [13]:
# Q-gram distance on the NER 'tuple_list' outputs: defaults, n=2, n=3,
# and n=2 with method='overlap'.
for options in ({}, {'n': 2}, {'n': 3}, {'n': 2, 'method': 'overlap'}):
    print(qgram_distance(s1B, s2B, **options))

0.375
0.058823529411764705
0.0
0.05555555555555555


In [14]:
# Q-gram distance on the NER 'raw_tag' outputs: defaults, n=2, n=3,
# and n=2 with method='overlap'.
for options in ({}, {'n': 2}, {'n': 3}, {'n': 2, 'method': 'overlap'}):
    print(qgram_distance(s1C, s2C, **options))

0.38461538461538464
0.1935483870967742
0.06451612903225806
0.17647058823529413


In [15]:
# Q-gram distance on the POS 'raw_value' outputs: defaults, n=2, n=3,
# and n=2 with method='overlap'.
for options in ({}, {'n': 2}, {'n': 3}, {'n': 2, 'method': 'overlap'}):
    print(qgram_distance(s1D, s2D, **options))

0.4375
0.058823529411764705
0.0
0.05555555555555555


In [16]:
# Q-gram distance on the POS 'tuple_list' outputs: defaults, n=2, n=3,
# and n=2 with method='overlap'.
for options in ({}, {'n': 2}, {'n': 3}, {'n': 2, 'method': 'overlap'}):
    print(qgram_distance(s1E, s2E, **options))

0.4375
0.058823529411764705
0.0
0.05555555555555555


In [17]:
# Q-gram distance on the POS 'raw_tag' outputs: defaults, n=2, n=3,
# and n=2 with method='overlap'.
for options in ({}, {'n': 2}, {'n': 3}, {'n': 2, 'method': 'overlap'}):
    print(qgram_distance(s1F, s2F, **options))

0.7058823529411765
0.4166666666666667
0.23076923076923078
0.3125


In [18]:
# Print the second sentence next to each of its preprocessed variants,
# one labelled line per variant.
labelled_variants = (
    ('-', s2),
    ('A-', s2A),
    ('B-', s2B),
    ('C-', s2C),
    ('D-', s2D),
    ('E-', s2E),
    ('F-', s2F),
)
for label, variant in labelled_variants:
    print(label, variant)

- Current Chief Operating Officer Mike Butcher and Group Chief 
        Financial Officer Alex Arena will report to So.
A- Current Chief Operating Officer Mike/PERSON Butcher/PERSON and Group Chief Financial Officer Alex/PERSON Arena/PERSON will report to So./LOCATION 
B- [('Current', 'O'), ('Chief', 'O'), ('Operating', 'O'), ('Officer', 'O'), ('Mike', 'PERSON'), ('Butcher', 'PERSON'), ('and', 'O'), ('Group', 'O'), ('Chief', 'O'), ('Financial', 'O'), ('Officer', 'O'), ('Alex', 'PERSON'), ('Arena', 'PERSON'), ('will', 'O'), ('report', 'O'), ('to', 'O'), ('So.', 'LOCATION')]
C- Current Chief Operating Officer PERSON PERSON and Group Chief Financial Officer PERSON PERSON will report to LOCATION 
D- Current/JJ Chief/NNP Operating/NNP Officer/NNP Mike/NNP Butcher/NNP and/CC Group/NNP Chief/NNP Financial/NNP Officer/NNP Alex/NNP Arena/NNP will/MD report/VB to/TO So./NNP 
E- [('Current', 'JJ'), ('Chief', 'NNP'), ('Operating', 'NNP'), ('Officer', 'NNP'), ('Mike', 'NNP'), ('Butcher', 'NNP'), ('

## Evaluating All Tokendists with Tuple-List Structures

In [21]:
# tokendists distances work with tuple lists by default.
# NOTE(review): s1A/s2A are the NER 'raw_value' outputs, while the tuple-list
# outputs are s1B/s2B -- confirm which pair this cell was meant to use.
print(jaccard_distance_textsim(s1A,s2A))
print(jaccard_distance_textsim(s1,s2))

0.6086956521739131
0.5454545454545454
