# String Matching Demo for DSE203-Fall2020

###### py_stringmatching documentation Link:
    http://anhaidgroup.github.io/py_stringmatching/v0.4.1/Tutorial.html
    
###### Jupyter notebook

In [2]:
import py_stringmatching as sm

## Select a Similarity Measure

The currently implemented similarity measures include:
- sequence-based measures: affine gap, bag distance, editex, Hamming distance, Jaro, Jaro Winkler, Levenshtein, Needleman Wunsch, partial ratio, partial token sort, ratio, Smith Waterman, token sort.
- set-based measures: cosine, Dice, Jaccard, overlap coefficient, Tversky Index.
- bag-based measures: TF/IDF.
- phonetic-based measures: soundex.

## Select a Tokenizer Type

py_stringmatching provides a set of different tokenizer types:
- alphabetical tokenizer, alphanumeric tokenizer, delimiter-based tokenizer, qgram tokenizer, and whitespace tokenizer (more tokenizer types can easily be added)

## Create a Tokenizer Object

In [5]:
# sample input with leading whitespace, punctuation, commas, a run of spaces,
# and a repeated word ('hello') to show how each tokenizer treats them
test_string = ' .hello, world!! data, science, is    amazing!!. hello.'

In [6]:
# create an alphabetical tokenizer that returns a bag of tokens
# (a bag keeps duplicate tokens; pass return_set=True to drop them)
alphabet_tok = sm.AlphabeticTokenizer()

In [7]:
# tokenize the sample string: punctuation and whitespace are dropped,
# and the repeated 'hello' is kept because the result is a bag
alphabet_tok.tokenize(test_string)

['hello', 'world', 'data', 'science', 'is', 'amazing', 'hello']

In [8]:
# create an alphanumeric tokenizer with default settings
alnum_tok = sm.AlphanumericTokenizer()

In [9]:
# the sample string contains no digits, so the result is the same as the
# alphabetical tokenizer's output
alnum_tok.tokenize(test_string)

['hello', 'world', 'data', 'science', 'is', 'amazing', 'hello']

In [11]:
# create a delimiter tokenizer using comma as the only delimiter
# (delim_set is passed as a list of delimiter strings)
delim_tok = sm.DelimiterTokenizer(delim_set=[','])

In [None]:
# create a whitespace tokenizer
# NOTE(review): ws_tok is not used by any cell visible here; the later Jaccard
# examples use ws_tok_set (created with return_set=True) instead
ws_tok = sm.WhitespaceTokenizer()

In [12]:
# split on commas only: spaces and other punctuation stay inside each token
delim_tok.tokenize(test_string)

[' .hello', ' world!! data', ' science', ' is    amazing!!. hello.']

In [19]:
# create a qgram tokenizer using q=3 (each token is a 3-character substring)
qg3_tok = sm.QgramTokenizer(qval=3)

In [20]:
# tokenize into overlapping 3-grams; as the output shows, the string is padded
# with '#' at the start and '$' at the end, and duplicate qgrams are kept
qg3_tok.tokenize(test_string)

['## ',
 '# .',
 ' .h',
 '.he',
 'hel',
 'ell',
 'llo',
 'lo,',
 'o, ',
 ', w',
 ' wo',
 'wor',
 'orl',
 'rld',
 'ld!',
 'd!!',
 '!! ',
 '! d',
 ' da',
 'dat',
 'ata',
 'ta,',
 'a, ',
 ', s',
 ' sc',
 'sci',
 'cie',
 'ien',
 'enc',
 'nce',
 'ce,',
 'e, ',
 ', i',
 ' is',
 'is ',
 's  ',
 '   ',
 '   ',
 '  a',
 ' am',
 'ama',
 'maz',
 'azi',
 'zin',
 'ing',
 'ng!',
 'g!!',
 '!!.',
 '!. ',
 '. h',
 ' he',
 'hel',
 'ell',
 'llo',
 'lo.',
 'o.$',
 '.$$']

In [27]:
# create alphabetical, whitespace, and qgram (q=3) tokenizers that each return
# a set of tokens (duplicates removed) instead of a bag
alphabet_tok_set = sm.AlphabeticTokenizer(return_set=True)
ws_tok_set = sm.WhitespaceTokenizer(return_set=True)
qg3_tok_set = sm.QgramTokenizer(qval=3, return_set=True)

In [28]:
# with return_set=True the repeated 'hello' appears only once in the result
alphabet_tok_set.tokenize(test_string)

['hello', 'world', 'data', 'science', 'is', 'amazing']

## Creating a Similarity Measure Object and Using it to Compute a Similarity Score

In [29]:
# create a Jaccard similarity measure object
# (a set-based measure: it is applied to tokenized input below)
jac = sm.Jaccard()

In [30]:
# create a Levenshtein similarity measure object
# (a sequence-based measure: it is applied directly to the raw strings below)
lev = sm.Levenshtein()

In [31]:
# two example strings that differ only in their last word
x = 'string matching package'
y = 'string matching library'

In [32]:
# compute Jaccard score over sets of tokens of x and y, tokenized using whitespace
# (2 shared tokens out of 4 distinct tokens -> 0.5)
jac.get_raw_score(ws_tok_set.tokenize(x), ws_tok_set.tokenize(y))

0.5

In [33]:
# compute Jaccard score over sets of tokens of x and y, tokenized into qgrams (with q=3)
# (compares character-level 3-grams rather than whole words)
jac.get_raw_score(qg3_tok_set.tokenize(x), qg3_tok_set.tokenize(y))

0.4375

In [34]:
# get raw Levenshtein score (the edit distance) between x and y
lev.get_raw_score(x, y)

6

In [35]:
# get normalized Levenshtein similarity score between x and y
# (here 1 - 6/23: the edit distance of 6 scaled by the 23-character string length)
lev.get_sim_score(x, y)

0.7391304347826086

In [36]:
# get normalized Jaccard similarity score
# (for Jaccard this is the same value as the raw score computed above)
jac.get_sim_score(ws_tok_set.tokenize(x), ws_tok_set.tokenize(y))

0.5