In [16]:
import pandas as pd
import nltk
import os
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm
from sklearn import feature_extraction
import dotenv
import sys
lemmatizer = WordNetLemmatizer() # Shared WordNet lemmatizer instance
dotenv.load_dotenv('../ext_variables.env') # Necessary to avoid putting absolute paths
# Resolve the working directory from the environment and fail with an explicit message when it is missing:
# os.getenv returns None for an unset variable, and os.chdir(None) would only raise an opaque TypeError
path_files_adm = os.getenv("PATH_FILES_ADM")
if not path_files_adm:
    raise EnvironmentError("PATH_FILES_ADM is not set: define it in ../ext_variables.env or in the environment")
os.chdir(path_files_adm)
tqdm.pandas() # add tqdm progress_apply method for Pandas classes (DataFrame, Series and GroupBy classes)
nltk.download('wordnet') # wordnet
nltk.download("stopwords") # stopword list
nltk.download('punkt') # sentence separation
nltk.download('averaged_perceptron_tagger') # pos-tagging
nltk.download('omw-1.4') # wordnet

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/simonedigregorio/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simonedigregorio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/simonedigregorio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/simonedigregorio/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/simonedigregorio/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [17]:
# Load the places table: the file is tab-separated despite its .csv extension,
# and its first column becomes the index of the DataFrame
places = pd.read_csv(
    filepath_or_buffer="places.csv",
    sep="\t",
    index_col=0,
)

## 2. Search Engine
<a id = "point_2"></a>

First of all, let's perform some pre-processing with lemmatization (incorporating POS tagging), stopwords removal (what remains of them at that point) and lowercase conversion on the _description_ columns.

The first function in the next block of code converts the tags of the [Penn Treebank project](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) to those of [WordNet](https://wordnet.princeton.edu).

The second function performs the whole-preprocessing itself:
 - First of all, it tokenizes the strings with `word_tokenize`. This may seem simple, but it is not. Under the hood, `nltk` uses an already trained (for English) unsupervised model able to split the string into sentences (one can retrain the unsupervised model with a particular corpus, for a specific target language). The output is then parsed with regular expressions in order to be split into words.
  - The tokenized output is then passed to the part-of-speech tagger, whose details can be found in a very good blog post, [here](https://explosion.ai/blog/part-of-speech-pos-tagger-in-python).
  - The tags are mapped to the ones from WordNet with the function `get_wordnet_pos`, the first defined in the block. Notice that tags not related to adjectives, verbs, nouns or adverbs will be cancelled out by the function, returning `None`. Once this is done, if the resulting tag has `None` value, the related token/word is not considered.
  - With the tuples `(token, pos_tag)` we can finally call the WordNet lemmatizer (more specifically, the _morphy_ function, more info [here](https://wordnet.princeton.edu/documentation/morphy7wn)).
  - The output is converted to lower case and checked against stopwords (and against some punctuation which may still be there).
  - The words are joined together with the `|` separator, a character that I do not expect to be popular in the English language.

Notice that with this process we have implicitly removed punctuation.

In [None]:
# For the following mapping, credits to this stack overflow question: https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' are mapped to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; every other tag yields None.
    """
    # (tag prefix, name of the attribute of `wordnet` to return), checked in order
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attribute)
    return None


# Residual punctuation tokens that may survive the POS-based filtering and must be discarded
_RESIDUAL_PUNCTUATION = frozenset(("’", "‘", "\'", "‟", "”", "-", "“", "."))
# Cache for the English stopword set, filled on first use so that stopwords.words is called only once
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, building it only on the first call."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


# The following is a function which performs the whole pre-processing with lemmatization (incorporating information from POS tagging), stopwords removal and lowercase conversion
def preprocessing_field(description: str) -> str:
    """Turn a free-text description into a "|"-delimited string of lowercase lemmas.

    Steps: tokenization (word_tokenize), POS tagging (nltk.pos_tag), mapping of the tags with
    get_wordnet_pos, lemmatization with the module-level WordNet lemmatizer, lowercase conversion,
    removal of English stopwords and of residual punctuation tokens.

    Parameters
    ----------
    description : str
        Text to process. Values that are not strings (e.g. None or a float NaN, presumably what
        pandas yields for a missing description) and empty/whitespace-only strings are treated
        as empty text instead of raising.

    Returns
    -------
    str
        The retained lemmas, each one preceded and followed by "|" (e.g. "|tree|forest|").
        "||" when nothing is retained.
    """
    # Nothing to tokenize: same result the pipeline gives for an empty string, without calling nltk
    if not isinstance(description, str) or not description.strip():
        return "||"
    # First of all, we need to tokenize the text.
    # We use word_tokenize from nltk
    tokenized = word_tokenize(description, language='english')
    # Then we use the pos tagger from nltk
    pos_tagged = nltk.pos_tag(tokenized)
    # Then we convert tags from Penn Treebank format to the one of WordNet
    converted_tags_words = [(token, get_wordnet_pos(tag)) for token, tag in pos_tagged]
    # Lemmatize the words (WordNet lemmatizer, morphy function), skipping everything that is not
    # an adverb, a verb, a noun or an adjective (get_wordnet_pos returned None)
    lemmatized_words = [
        lemmatizer.lemmatize(word=token, pos=pos)
        for token, pos in converted_tags_words
        if pos is not None
    ]
    # Lowercase conversion and stopwords removal (most of the stopwords were anyway removed with get_wordnet_pos and related filtering)
    # Some residual punctuation is also removed. Notice that stopwords removal could be done also later on with CountVectorizer and scikit, but we do it here
    english_stopwords = _english_stopwords()  # hoisted: one set lookup per token instead of one stopwords.words call per token
    kept_words = []
    for lemma in lemmatized_words:
        lowered = lemma.lower()
        if lowered not in english_stopwords and lemma not in _RESIDUAL_PUNCTUATION:
            kept_words.append(lowered)
    # Join the words into strings with words separated explicitly (afterwards we will only need to specify the separator for scikit learn CountVectorizer)
    return "|" + "|".join(kept_words) + "|"

The following is just a little example to show the power of what we are doing right now. It is a little homage to Breiman, from his Wikipedia page.

Also notice that, contrary to what we do with stemming, lemmatization retains something which can be understood and read; this in general improves visualization, reporting and debugging with text data. Additionally, lemmatization preserves far more variability than stemming, for obvious reasons (we are reducing words to their dictionary lemma, not to their root), and this may be useful since we are working on a search engine, whose queries may aim at specific words.

However, we may not always want this variability, and this is indeed the case for some specific applications, such as ML modelling where we prefer to catch the signal while disregarding noise due to specific lemmas.

In [None]:
# Sample text (one string literal per sentence, concatenated by the parser into a single string)
breiman_text = (
    "Leo Breiman was a distinguished statistician at the University of California, Berkeley. "
    "He was the recipient of numerous honors and awards, and was a member of the United States National Academy of Sciences. "
    "Breiman's work helped to bridge the gap between statistics and computer science, particularly in the field of machine learning. "
    "His most important contributions were his work on classification and regression trees and ensembles of trees fit to bootstrap samples. "
    "Bootstrap aggregation was given the name bagging by Breiman. "
    "Another of Breiman's ensemble approaches is the random forest."
)
# Last expression of the cell, so the processed string is displayed as output
preprocessing_field(breiman_text)

Now we apply what we have described to both _description_ columns in their entirety. We are using the integration between `pandas` and `tqdm` in order to keep track of progress.

In [None]:
# Lemmatization of Description and Short Description with POS tagging
# Each description column is overwritten with its pre-processed version (the two columns are independent)
for description_column in ("placeDesc", "placeShortDesc"):
    places.loc[:, description_column] = places[description_column].progress_apply(preprocessing_field)

In [None]:
# Show the pre-processed descriptions
places["placeDesc"]

## 2.1 Conjunctive query
<a id = "point_2.1"></a>

### 2.1.1 Create your index

The next step is building the inverted index. In order to do that, we first build a so-called Term-Document matrix, which is easily obtained with scikit-learn, more specifically with its (very, very useful) `CountVectorizer` class. The `transform` method of the class returns a Document-Term matrix, in which every row is a document and every column holds the one-hot representation (presence or absence) of a word, so we need to take its transpose: in the resulting Term-Document matrix every row is a word and every column is a document.

We need to pass a RegEx expression to enforce the separator we have placed before: `|`. We also pass other arguments to the `__init__` method, but they are relatively straightforward.


In [None]:
# The token pattern captures every non-empty run of characters other than "|" enclosed between two "|" separators;
# the lookbehind and the lookahead avoid consuming "|" in the match (a NO-GO, since one "|" closes a token and opens the next one).
# Requiring at least one character keeps the empty string out of the vocabulary when a document is just "||"
# Binary = TRUE => one hot encoding representation
# min_df=2 => we avoid considering tokens appearing in only one document
# strip_accents=None means "no accent stripping": False is not among the accepted values ('ascii', 'unicode', a callable or None)
# and is rejected by the parameter validation of recent scikit-learn releases
one_hot_vectorizer = feature_extraction.text.CountVectorizer(
    strip_accents=None,
    lowercase=False,
    token_pattern=r"(?<=\|)([^|]+)(?=\|)",
    min_df=2,
    binary=True,
)
# fit_transform gets the vocabulary from the corpus and returns the Document-Term matrix in a single pass (same result as fit followed by transform);
# the transpose has words on the rows and documents on the columns
document_term = one_hot_vectorizer.fit_transform(places.placeDesc).transpose()

We then get the dictionary mapping each word to its index in the matrix. In order to do that, we can just access an attribute of the one_hot_vectorizer object. It will be saved, serialized, in storage. For that we can just use the `pickle` Python module.

In [None]:
import pickle

# Keep the word -> index mapping learnt by the vectorizer and persist it, serialized, on disk
vocabulary_word_index = one_hot_vectorizer.vocabulary_
with open("vocabulary_word_index.pickle", "wb") as vocab_file:
    vocab_file.write(pickle.dumps(vocabulary_word_index))

In order to get the dictionary/the inverted index we just need to extract the indexes of the non-zero entries in the sparse Term-Document matrix (the transpose computed above). SciPy sparse matrices have a `nonzero` method for this (the `transform` method of the `CountVectorizer` class returns a `sparse.csr_matrix`, whose transpose is a `sparse.csc_matrix`), so we can just use that one. What is returned, as usual, are two NumPy arrays, one for the indexes referring to the rows (the words), and one for the indexes referring to the columns (the documents). They are two iterables, so we `zip` them together and iterate over them in order to build the inverted index, which maps every word index to the list of documents containing that word.

In [None]:
from collections import defaultdict

# document_term was transposed, so row indexes identify words and column indexes identify documents.
# nonzero() is called once and unpacked: calling it for each of the two arrays would scan the matrix twice
term_indexes, document_indexes = document_term.nonzero()
# Inverted index: word index -> list of the indexes of the documents containing that word
inverted_index_onehot = defaultdict(list)
for term_index, document_index in zip(term_indexes, document_indexes):
    inverted_index_onehot[term_index].append(document_index)