# Chapter 4: Taming tokens

## Re-creating the standard analyzer

In [70]:
import importlib

try:
    importlib.reload(my_utils)
except (NameError, TypeError): 
    pass

Imported my_utils module


In [5]:
import my_utils
from my_utils import host, index, indexBaseUrl, headers
import requests  # HTTP lib
import json  # json parsing

Imported my_utils module


In [9]:
settings = {
  "settings": {
    "analysis": {
      "analyzer": {
        "standard_clone": {
          "type": "custom", 
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "stop"
          ]
        }
      }
    }
  }
}
newIndex = "my-index"
newIndexBaseUrl = host + newIndex

settingsJson = json.dumps(settings)
requests.put(newIndexBaseUrl, data=settingsJson, headers=headers)


<Response [400]>

In [10]:
analyzeQuery = {
    "analyzer": "standard_clone",
    "text": "Dr. Strangelove: Or How I Learned to Stop Worrying and Love the Bomb"
}

response = requests.get(newIndexBaseUrl + "/_analyze", data=json.dumps(analyzeQuery), headers = headers)
tokens = my_utils.getTokenStrings(response)
   
print(tokens)

['dr', 'strangelove', 'how', 'i', 'learned', 'stop', 'worrying', 'love', 'bomb']


In [11]:
confusedUserAnalyzeQuery = {
    "analyzer": "standard_clone",
    "text": "Mr. Weirdlove: Don't worry, I'm learning to start loving bombs"
}

response = requests.get(newIndexBaseUrl + "/_analyze", data=json.dumps(confusedUserAnalyzeQuery), headers = headers)
tokens = my_utils.getTokenStrings(response)

print(tokens)

['mr', 'weirdlove', "don't", 'worry', "i'm", 'learning', 'start', 'loving', 'bombs']


In [12]:
analyzeQuery = {
    "analyzer": "english",
    "text": "Dr. Strangelove: Or How I Learned to Stop Worrying and Love the Bomb"
}

response = requests.get(newIndexBaseUrl + "/_analyze", data=json.dumps(analyzeQuery), headers = headers)
tokens = my_utils.getTokenStrings(response)
   
print(tokens)

['dr', 'strangelov', 'how', 'i', 'learn', 'stop', 'worri', 'love', 'bomb']


In [13]:
confusedUserAnalyzeQuery = {
    "analyzer": "english",
    "text": "Mr. Weirdlove: Don't worry, I'm learning to start loving bombs"
}

response = requests.get(newIndexBaseUrl + "/_analyze", data=json.dumps(confusedUserAnalyzeQuery), headers = headers)
tokens = my_utils.getTokenStrings(response)

print(tokens)

['mr', 'weirdlov', "don't", 'worri', "i'm", 'learn', 'start', 'love', 'bomb']


## Precision and recall – have your cake and eat it too

#### Creating a new index for fruit

In the folling section, we create a new index for fruit. 
Once without the English analyzer, and once we will re-index and create it with the English analyzer.
Using the English analyzer to create tokens enables us to strike a good balance between precision and recall.

In [14]:
settings = {
    "settings": {
        "number_of_shards": 1
    }
}

newIndex = "fruits"
newIndexBaseUrl = host + newIndex

def putFruitDoc(title):
    return json.dumps({"title": title})

requests.delete(newIndexBaseUrl)
requests.put(newIndexBaseUrl, data=json.dumps(settings), headers=headers)

<Response [200]>

In [15]:
requests.post(newIndexBaseUrl + "/_doc", data=putFruitDoc("apple apple apple apple apple"), headers = headers)
requests.post(newIndexBaseUrl + "/_doc", data=putFruitDoc("apple apple apple banana banana"), headers = headers)
requests.post(newIndexBaseUrl + "/_doc", data=putFruitDoc("apple banana blueberry coconut"), headers = headers)
requests.post(newIndexBaseUrl + "/_doc", data=putFruitDoc("apple apples"), headers = headers)

<Response [201]>

In [21]:
baseQuery = {
    "query": {
        "match": {
            "title": "apple",
        }
    },
    "explain": "true"
}

hits = my_utils.getSearchHits(newIndexBaseUrl, baseQuery)['hits']

print("%s\t%s\t%s" %("Num", "Score", "Title"))
for idx, hit in enumerate(hits):
    print("%s\t%s\t%s" %(idx+1, str(round(hit['_score'], 3)).rjust(4, ' '), hit['_source']['title']))
print("")
    

for idx, hit in enumerate(hits):
        print(idx+1)
        my_utils.titleAndExplanation(hit)
        print("")

Num	Score	Title
1	0.18	apple apple apple apple apple
2	0.169	apple apples
3	0.157	apple apple apple banana banana
4	0.105	apple banana blueberry coconut

1
title: apple apple apple apple apple
└──0.18038377 (weight(title:appl in 0) [PerFieldSimilarity], result of:)
   └──0.18038377 (score(freq=5.0), computed as boost * idf * tf from:)
      └──2.2 (boost)
      └──0.105360515 (idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:)
         └──4 (n, number of documents containing term)
         └──4 (N, total number of documents with field)
      └──0.7782101 (tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:)
         └──5.0 (freq, occurrences of term within document)
         └──1.2 (k1, term saturation parameter)
         └──0.75 (b, length normalization parameter)
         └──5.0 (dl, length of field)
         └──4.0 (avgdl, average length of field)

2
title: apple apples
└──0.16857684 (weight(title:appl in 3) [PerFieldSimilarity], result of:)
   └──0.16857684 

In [22]:
settings = {
  "settings": {
    "number_of_shards": 1,
    "analysis": {
      "analyzer": {
          "default": {
              "type": "english"
          },
          "default_search": {
              "type": "english"
          }
      }
    }
  }
}

newIndex = "fruits"
newIndexBaseUrl = host + newIndex

requests.delete(newIndexBaseUrl)
requests.put(newIndexBaseUrl, data=json.dumps(settings), headers=headers)

<Response [200]>

In [23]:
requests.post(newIndexBaseUrl + "/_doc", data=putFruitDoc("apple apple apple apple apple"), headers = headers)
requests.post(newIndexBaseUrl + "/_doc", data=putFruitDoc("apple apple apple banana banana"), headers = headers)
requests.post(newIndexBaseUrl + "/_doc", data=putFruitDoc("apple banana blueberry coconut"), headers = headers)
requests.post(newIndexBaseUrl + "/_doc", data=putFruitDoc("apple apples"), headers = headers)

<Response [201]>

### Scoring strength of a feature in a single field

In [24]:
baseQuery = {
    "query": {
        "match": {
            "title": "apple",
        }
    },
    "explain": "true"
}

hits = my_utils.getSearchHits(newIndexBaseUrl, baseQuery)['hits']

print("%s\t%s\t%s" %("Num", "Score", "Title"))
for idx, hit in enumerate(hits):
    print("%s\t%s\t%s" %(idx+1, str(round(hit['_score'], 3)).rjust(4, ' '), hit['_source']['title']))
print("")

for idx, hit in enumerate(hits):
        print(idx+1)
        my_utils.titleAndExplanation(hit)
        print("")

Num	Score	Title
1	0.18	apple apple apple apple apple
2	0.169	apple apples
3	0.157	apple apple apple banana banana
4	0.105	apple banana blueberry coconut

1
title: apple apple apple apple apple
└──0.18038377 (weight(title:appl in 0) [PerFieldSimilarity], result of:)
   └──0.18038377 (score(freq=5.0), computed as boost * idf * tf from:)
      └──2.2 (boost)
      └──0.105360515 (idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:)
         └──4 (n, number of documents containing term)
         └──4 (N, total number of documents with field)
      └──0.7782101 (tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:)
         └──5.0 (freq, occurrences of term within document)
         └──1.2 (k1, term saturation parameter)
         └──0.75 (b, length normalization parameter)
         └──5.0 (dl, length of field)
         └──4.0 (avgdl, average length of field)

2
title: apple apples
└──0.16857684 (weight(title:appl in 1) [PerFieldSimilarity], result of:)
   └──0.16857684 

### Scoring beyond $TF \times IDF$: multiple search terms and multiple fields

#### Searching for two terms

In [25]:
baseQuery = {
    "query": {
        "match": {
            "title": "apple banana",
        }
    },
    "explain": "true"
}

hits = my_utils.getSearchHits(newIndexBaseUrl, baseQuery)['hits']

print("%s\t%s\t%s" %("Num", "Score", "Title"))
for idx, hit in enumerate(hits):
    print("%s\t%s\t%s" %(idx+1, str(round(hit['_score'], 3)).rjust(4, ' '), hit['_source']['title']))
print("")

for idx, hit in enumerate(hits):
        print(idx+1)
        my_utils.titleAndExplanation(hit)
        print("")

Num	Score	Title
1	1.048	apple apple apple banana banana
2	0.799	apple banana blueberry coconut
3	0.18	apple apple apple apple apple
4	0.169	apple apples

1
title: apple apple apple banana banana
└──1.0476143 (sum of:)
   └──0.15714788 (weight(title:appl in 1) [PerFieldSimilarity], result of:)
      └──0.15714788 (score(freq=3.0), computed as boost * idf * tf from:)
         └──2.2 (boost)
         └──0.105360515 (idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:)
            └──4 (n, number of documents containing term)
            └──4 (N, total number of documents with field)
         └──0.6779661 (tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:)
            └──3.0 (freq, occurrences of term within document)
            └──1.2 (k1, term saturation parameter)
            └──0.75 (b, length normalization parameter)
            └──5.0 (dl, length of field)
            └──4.0 (avgdl, average length of field)
   └──0.89046645 (weight(title:banana in 1) [PerFiel

## Analysis strategies

### Delimiters

In [51]:
settings = {
  "settings": {
    "number_of_shards": 1,
    "analysis": {
        "filter": {
            "phone_num_filter": {
                "type": "word_delimiter",
                "catenate_all": "true",
                "generate_number_parts": "false"
            },
            "phone_num_parts": {
                "type": "pattern_capture",
                "patterns": ["(\\d{7}$)", "(\\d{10}$)"],
                "preserve_original": "true"
            }
        },
      "analyzer": {
          "phone_num": {
              "tokenizer": "keyword",
              "filter": ["phone_num_filter", "phone_num_parts"]
          }
      }
    }
  }
}

newIndex = "phone-nums"
newIndexBaseUrl = host + newIndex

requests.delete(newIndexBaseUrl)
requests.put(newIndexBaseUrl, data=json.dumps(settings), headers=headers)

<Response [200]>

In [52]:
phoneNumQuery = {
    "analyzer": "phone_num",
    "text": "1 (800) 867-5309"
}

resp = requests.get(newIndexBaseUrl + "/_analyze", data=json.dumps(phoneNumQuery), headers=headers)
tokens = my_utils.getTokenStrings(resp)

print(tokens)

['18008675309', '8008675309', '8675309']


### Capturing meaning with synonyms

In [75]:
settings = {
    "settings": {
        "number_of_shards": 1,
        "analysis": {
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords": "_english_"
                },
                "english_keywords": {
                    "type": "keyword_marker",
                    "keywords": [
                        "example"
                    ]
                },
                "english_stemmer": {
                    "type": "stemmer",
                    "language": "english"
                },
                "english_possessive_stemmer": {
                    "type": "stemmer",
                    "language": "possessive_english"
                },
                "retail_syn_filter": {
                    "type": "synonym",
                    "synonyms": [
                        "dress shoe, dress shoes => dress_shoe, shoe"
                    ]
                }
            },
            "analyzer": {
                "retail_analyzer": {
                    "tokenizer": "standard",
                    "filter": [
                        "english_possessive_stemmer",
                        "lowercase",
                        "retail_syn_filter",
                        "english_stop",
                        "english_keywords",
                        "english_stemmer"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "desc": {
                "type": "text",
                "analyzer": "retail_analyzer"
            }
        }
    }
}

newIndex = "synonyms"
newIndexBaseUrl = host + newIndex

requests.delete(newIndexBaseUrl)
requests.put(newIndexBaseUrl, data=json.dumps(settings), headers=headers)

<Response [200]>

In [76]:
requests.post(newIndexBaseUrl + "/_doc", data=json.dumps({"desc": "bob's brand dress shoes are the bomb"}), headers = headers) # dress shoe doc
requests.post(newIndexBaseUrl + "/_doc", data=json.dumps({"desc": "this little black dress is sure to impress"}), headers = headers) # dress doc
requests.post(newIndexBaseUrl + "/_doc", data=json.dumps({"desc": "tennis shoes... you know, for tennis"}), headers = headers) # shoe doc

<Response [201]>

The *dressQuery* yields the dress document ✅

In [77]:
dressQuery = {
    "query": {
        "match": {
            "desc": "dress"
        }
    }
}

resp = requests.get(newIndexBaseUrl + "/_search", data=json.dumps(dressQuery), headers=headers)
my_utils.getFieldFromHits(resp, 'desc')

['this little black dress is sure to impress']

The *shoe query* yields the shoe documents ✅

In [78]:
shoeQuery = {
    "query": {
        "match": {
            "desc": "shoe"
        }
    }
}

resp = requests.get(newIndexBaseUrl + "/_search", data=json.dumps(shoeQuery), headers=headers)
my_utils.getFieldFromHits(resp, 'desc')

["bob's brand dress shoes are the bomb",
 'tennis shoes... you know, for tennis']

But the *dress shoe* query also yields the shoe document ❗️

In [79]:
dressShoeQuery = {
    "query": {
        "match": {
            "desc": "dress shoe"
        }
    }
}

resp = requests.get(newIndexBaseUrl + "/_search", data=json.dumps(dressShoeQuery), headers=headers)
my_utils.getFieldFromHits(resp, 'desc')

["bob's brand dress shoes are the bomb",
 'tennis shoes... you know, for tennis']

#### Two-sided analysis

In [None]:
settings = {
    "settings": {
        "number_of_shards": 1,
        "analysis": {
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords": "_english_"
                },
                "english_keywords": {
                    "type": "keyword_marker",
                    "keywords": [
                        "example"
                    ]
                },
                "english_stemmer": {
                    "type": "stemmer",
                    "language": "english"
                },
                "english_possessive_stemmer": {
                    "type": "stemmer",
                    "language": "possessive_english"
                },
                "retail_syn_filter_index": {
                    "type": "synonym",
                    "synonyms": [
                        "dress shoe, dress shoes => dress_shoe, shoe"
                    ]
                },
                                "retail_syn_filter_search": {
                    "type": "synonym",
                    "synonyms": [
                        "dress shoe, dress shoes => dress_shoe"
                    ]
                }
            },
            "analyzer": {
                "retail_analyzer_index": {
                    "tokenizer": "standard",
                    "filter": [
                        "english_possessive_stemmer",
                        "lowercase",
                        "retail_syn_filter_index",
                        "english_stop",
                        "english_keywords",
                        "english_stemmer"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "desc": {
                "type": "text",
                "analyzer": "retail_analyzer"
            }
        }
    }
}

newIndex = "synonyms"
newIndexBaseUrl = host + newIndex

requests.delete(newIndexBaseUrl)
requests.put(newIndexBaseUrl, data=json.dumps(settings), headers=headers)