# Main steps for building an index:

![Sort-Based-Index](img/index_flow.png)

# Tokenization

In [8]:
# Split a short news snippet into sentences with NLTK's PunktSentenceTokenizer.
from nltk.tokenize import PunktSentenceTokenizer
sent_tokenizer = PunktSentenceTokenizer()

# Sample text; the resulting `sentences` list is reused by the word-tokenizer
# examples in the following cell.
example_document = """
Hurricane-force winds have struck central and northern Portugal, leaving 300,000 homes without power.

The remnants of Hurricane Leslie swept in overnight on Saturday, with winds gusting up to 176km/h (109mph).

Civil defence officials said 27 people suffered minor injuries, with localised flooding, hundreds of trees uprooted and a number of flights cancelled.

The storm, one of the most powerful to ever hit the country, is now passing over northern Spain.
"""
sentences = sent_tokenizer.tokenize(example_document)
# Bare last expression so the notebook displays the list of sentences.
sentences

['\nHurricane-force winds have struck central and northern Portugal, leaving 300,000 homes without power.',
 'The remnants of Hurricane Leslie swept in overnight on Saturday, with winds gusting up to 176km/h (109mph).',
 'Civil defence officials said 27 people suffered minor injuries, with localised flooding, hundreds of trees uprooted and a number of flights cancelled.',
 'The storm, one of the most powerful to ever hit the country, is now passing over northern Spain.']

In [2]:
# Compare how four NLTK word tokenizers split the same kinds of input.
from nltk.tokenize import TweetTokenizer, WhitespaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer

tweet_tok = TweetTokenizer()
whsp_tok = WhitespaceTokenizer()
wordpunct_tok = WordPunctTokenizer()
treebank_tok = TreebankWordTokenizer()

# Sample inputs that are fed to more than one tokenizer below.
mortgage_sentence = 'On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88.'
email_sentence = 'My email is admin@sth.com'

print("Whitespace tokenizer:")
print(whsp_tok.tokenize(sentences[0]))

print("\nWord Punctuation Tokenizer:")
for text in (sentences[2], mortgage_sentence):
    print(wordpunct_tok.tokenize(text))

print("\nTree bank tokenizer:")
for text in (mortgage_sentence, email_sentence):
    print(treebank_tok.tokenize(text))

print("\nTweeter Tokenizer:")
tweet_examples = [
    email_sentence,
    'Last sunny day before winter :) #sunisenergy',
    # The IP address gets broken apart - we might want to fix this.
    'Here is my ip address 192.33.24.45',
    # We might prefer "can't" to be split into can + 't instead.
    "I can't stop singing that song!",
    # Maybe km/h should stay together, separate from 176?
    sentences[1],
    # Non-English text: maybe we want language-specific tokenizers.
    'Eugenia e Jack sposi e innamorati (ma la magia di Harry e Meghan non c’è)',
]
for text in tweet_examples:
    print(tweet_tok.tokenize(text))

Whitespace tokenizer:
['Hurricane-force', 'winds', 'have', 'struck', 'central', 'and', 'northern', 'Portugal,', 'leaving', '300,000', 'homes', 'without', 'power.']

Word Punctuation Tokenizer:
['Civil', 'defence', 'officials', 'said', '27', 'people', 'suffered', 'minor', 'injuries', ',', 'with', 'localised', 'flooding', ',', 'hundreds', 'of', 'trees', 'uprooted', 'and', 'a', 'number', 'of', 'flights', 'cancelled', '.']
['On', 'a', '$', '50', ',', '000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366', '.', '88', '.']

Tree bank tokenizer:
['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.']
['My', 'email', 'is', 'admin', '@', 'sth.com']

Tweeter Tokenizer:
['My', 'email', 'is', 'admin@sth.com']
['Last', 'sunny', 'day', 'before', 'winter', ':)', '#sunisenergy']
['Here', 'is', 'my', 'ip', 'address', '192.33', '.', '24.45']
['

## Exercise: play with the tokenizers and find more vulnerabilities

# Normalization

Depending on the domain and the task at hand, we may want to normalize the text. Typical normalization steps include:
- Whole chunks of the document might be irrelevant, e.g. html tags.
- __Numbers__ - we can remove them or replace them with a common token.
- We can do the same for all other token types we don't care about, e.g. URLs, @ mentions, 😀, etc.
- We can __lower-case__ all words or even use a tool to upper-case all named entities like geographic names.
- We might want to add __equivalence classes__ for matching synonyms, abbreviations, etc.
- Remove __stopwords__ - the list of stopwords is usually manually curated
- Google N-Grams - a huge corpus of statistics, showing frequency of word n-grams in web pages. It contains statistics for many languages and can be used to further examine frequently used words. 
- __Stemming__ - crude removal of prefixes and suffixes to reduce a word to its stem, so that different forms of the same word match (e.g. *sharing* → *share*)
- __Lemmatization__ - replacing the word by its lemma

__Exercise__: Find examples when we wouldn't like to normalize text with each of the proposed methods.

In [3]:
# Print each token of a sample sentence next to its lemma and its Porter stem.
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Download the NLTK data packages first (presumably the ones the WordNet
# lemmatizer reads from); the recorded log shows they are left alone when
# already up to date.
for package in ('wordnet', 'omw-1.4'):
    nltk.download(package)

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

tokens = tweet_tok.tokenize('Car sharing will bring a whole new era to the automobile industry. Eventually, it may even decreas air pollutions.')
for token in tokens:
    lemma = lemmatizer.lemmatize(token)
    stem_form = stemmer.stem(token)
    # Output columns: original token, lemma, stem.
    print(token, lemma, stem_form)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\didimitrov\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\didimitrov\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Car Car car
sharing sharing share
will will will
bring bring bring
a a a
whole whole whole
new new new
era era era
to to to
the the the
automobile automobile automobil
industry industry industri
. . .
Eventually Eventually eventu
, , ,
it it it
may may may
even even even
decreas decreas decrea
air air air
pollutions pollution pollut
. . .


In [4]:
# Download NLTK's 'stopwords' data package and print its English stop-word list.
# `nltk` is imported in an earlier cell; `stopwords` is reused later when
# filtering the token frequencies.
nltk.download('stopwords')
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\didimitrov\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Stemming
#### Resource - https://www.geeksforgeeks.org/introduction-to-stemming/ / https://www.semanticscholar.org/paper/A-Comparative-Study-of-Stemming-Algorithms-Ms-.-Jivani/1c0c0fa35d4ff8a2f925eb955e48d655494bd167?p2df

* Stemming is used in information retrieval systems like search engines.
* It is used to determine domain vocabularies in domain analysis.
* Search engines apply stemming while indexing documents, so that search results can be matched regardless of word form and documents can be mapped to common subjects.
* Sentiment Analysis, which examines reviews and comments made by different users about anything, is frequently used for product analysis, such as for online retail stores. Stemming is applied as a text-preparation step before the text is interpreted.
* A method of group analysis used on textual materials is called document clustering (also known as text clustering). Important uses of it include subject extraction, automatic document structuring, and quick information retrieval.

##### Popular stemmers:
* **Porter’s Stemmer** -  removing the commoner morphological and inflexional endings from words in English
    * Advantage: It produces the best output compared to other stemmers and has a lower error rate.
    * Limitation:  Morphological variants produced are not always real words.
* **Lovins Stemmer** - removes the longest suffix from a word, then recodes the remaining stem to turn it into a valid word
    * Advantage: It is fast and handles irregular plurals like 'teeth' and 'tooth' etc.
    * Limitation: It is time consuming and frequently fails to form words from stem.
* **Dawson Stemmer** - suffixes are stored in the reversed order indexed by their length and last letter
    * Advantage: It is fast in execution and covers more suffixes.
    * Limitation: It is very complex to implement.
* **Krovetz Stemmer** - 1) Convert the plural form of a word to its singular form. 2) Convert the past tense of a word to its present tense and remove the suffix ‘ing’. 
    * Advantage: It is light in nature and can be used as pre-stemmer for other stemmers.
    * Limitation: It is inefficient in case of large documents.
* **Snowball Stemmer** - can map non-English words too. Also called Porter2 Stemmer (has performance improvements,  in addition to the multi-lingual application)
* **N-Gram Stemmer**
   * Advantage: It is based on string comparisons and it is language independent.
   * Limitation: It requires space to create and index the n-grams and it is not time efficient.
* **and many others...**

In [36]:
%pip install tabulate
from nltk.stem import *
import pandas as pd

from collections import defaultdict
stem_list = [porter.PorterStemmer(), lancaster.LancasterStemmer(),snowball.SnowballStemmer('english')]
#List of words
examples = ['agreed', 'children','flies', 'humbled', 'colonizer', 'owned', 'meeting','sitting', 'understood', 'whom']
result = defaultdict(list)
for word in examples:
    result['original'].append(word)
    for stemmer in stem_list:
        result[stemmer.__class__.__name__].append(stemmer.stem(word))
print(pd.DataFrame(result).to_markdown())

Note: you may need to restart the kernel to use updated packages.
|    | original   | PorterStemmer   | LancasterStemmer   | SnowballStemmer   |
|---:|:-----------|:----------------|:-------------------|:------------------|
|  0 | agreed     | agre            | agree              | agre              |
|  1 | children   | children        | childr             | children          |
|  2 | flies      | fli             | fli                | fli               |
|  3 | humbled    | humbl           | humbl              | humbl             |
|  4 | colonizer  | colon           | colon              | colon             |
|  5 | owned      | own             | own                | own               |
|  6 | meeting    | meet            | meet               | meet              |
|  7 | sitting    | sit             | sit                | sit               |
|  8 | understood | understood      | understood         | understood        |
|  9 | whom       | whom            | whom               | whom  



## Exercise: explore the dataset from 20 news groups and create another preprocessing method, which will clean the text of the news articles:
- Select one of the tokenizers or try to write one of your own
- Clean the headings and any common parts
- Lowercase the text and see if you might want to remove any token types (e.g. URLs, you might also want to split URLs to meaningful words and use them as tokens)
- Explore the most frequent words in the corpus and add more stop-words (depending on your tokenizer you might also need to modify the stopwords, too). Then, remove them from the text.
- Have a method variable for choosing lemmatization or stemming
- Finally compare how all the steps reduced/modified the index.


You will also have to include all the steps for the query words in your query methods, too!
- Explore how the querying results changed in both positive and negative ways.

In [5]:
from nltk.tokenize import sent_tokenize, TweetTokenizer
from string import punctuation
from os import scandir
# Shared TweetTokenizer instance passed to tokenize_documents in the cells below.
tw_tokenizer = TweetTokenizer()

def tokenize_documents(path, tokenizer):
    """
    Tokenize every file found directly inside a directory.

    Parameters
    ----------
    path : str or path-like
        Directory whose files are read. Sub-directories are skipped.
    tokenizer : object
        Any object exposing ``tokenize(text)`` that returns a list of string
        tokens (for example an NLTK tokenizer instance).

    Returns
    -------
    list of list of str
        One token list per file, ordered by file path so that indexing the
        result is reproducible across runs. Tokens are lower-cased and tokens
        consisting solely of punctuation characters are dropped.
    """
    punctuation_chars = set(punctuation)

    # scandir yields entries in arbitrary order; sort for a stable document order.
    with scandir(path) as entries:
        file_paths = sorted(entry.path for entry in entries if entry.is_file())

    documents = []
    for file_path in file_paths:
        # latin-1 maps every byte to a character, so a stray non-UTF-8 byte
        # in a document cannot abort the whole run with a decode error.
        with open(file_path, encoding='latin-1') as handle:
            text = handle.read()
        documents.append([
            token.lower()
            for token in tokenizer.tokenize(text)
            if not all(char in punctuation_chars for char in token)
        ])
    return documents

In [6]:
# Display the token list at index 1 of the tokenized sci.crypt sample folder.
tokenize_documents('data/mini_newsgroups/sci.crypt/',tw_tokenizer)[1]

['path',
 'cantaloupe.srv.cs.cmu.edu',
 'das-news.harvard.edu',
 'ogicse',
 'uwm.edu',
 'zaphod.mps.ohio-state.edu',
 'howland.reston.ans.net',
 'sol.ctr.columbia.edu',
 'news.cs.columbia.edu',
 'ji',
 'from',
 'ji@cs.columbia.edu',
 'john',
 'ioannidis',
 'newsgroups',
 'sci.crypt',
 'subject',
 're',
 'source',
 'of',
 'random',
 'bits',
 'on',
 'a',
 'unix',
 'workstation',
 'message-id',
 '<c5jp0k.4p5@cs.columbia.edu>',
 'date',
 '15',
 'apr',
 '93',
 '21:57',
 '55',
 'gmt',
 'article-i',
 'd',
 'cs',
 'c5jp0k',
 '4p5',
 'references',
 '<897@pivot.sbi.com>',
 '<c5ja6s.a59@cs.psu.edu>',
 'sender',
 'news@cs.columbia.edu',
 'the',
 'daily',
 'news',
 'organization',
 'columbia',
 'university',
 'department',
 'of',
 'computer',
 'science',
 'lines',
 '35',
 'in',
 'article',
 '<c5ja6s.a59@cs.psu.edu>',
 'so@eiffel.cs.psu.edu',
 'nicol',
 'c',
 'so',
 'writes',
 'in',
 'article',
 '<897@pivot.sbi.com>',
 'bet@sbi.com',
 'bennett',
 'todd',
 'salomon',
 'brothers',
 'inc',
 'ny',
 'wri

In [7]:
# Track how the most frequent tokens of the sci.crypt sample change after
# stop-word removal and after stemming.
from collections import Counter
from itertools import chain
from pandas import DataFrame
import matplotlib.pyplot as plt

print(">>>>>>Observing how the most frequent tokens change:")
# normal list of words
# chain.from_iterable flattens the per-document token lists in linear time,
# whereas sum(list_of_lists, []) re-copies the accumulated list per document.
documents = tokenize_documents('data/mini_newsgroups/sci.crypt/',tw_tokenizer)
word_freqs = Counter(chain.from_iterable(documents))
df = DataFrame(list(word_freqs.items()), columns=['text', 'freq'])
df = df.sort_values(['freq'], ascending=False)
print(">>>>>Original Tokens:")
print(df.head())

# remove stopwords
en_stopwords = set(stopwords.words('english'))
# .copy() gives an independent frame, so adding the 'stemmed' column below
# does not trigger pandas' SettingWithCopyWarning.
df_no_stopwords = df[~df['text'].isin(en_stopwords)].copy()
print(">>>>>>Removed stopwords:")
print(df_no_stopwords.head())

# stem words
# `stemmer` comes from an earlier cell (created there as a PorterStemmer).
# NOTE(review): any cell that rebinds the name `stemmer` before this one runs
# changes which stemming algorithm is applied here.
stem = stemmer.stem
df_no_stopwords['stemmed'] = df_no_stopwords['text'].apply(stem)
print(">>>>>>Applied stemming:")
print(df_no_stopwords.tail())

# merge same stems into one row
stemmed_freqs = df_no_stopwords.groupby(['stemmed'])['freq'].sum()
stemmed_freqs = stemmed_freqs.sort_values(ascending=False)
print(">>>>>>Merge same stemmed tokens into one:")
print(stemmed_freqs.head())

>>>>>>Observing how the most frequent tokens change:
>>>>>Original Tokens:
    text  freq
52   the  1718
45    to   831
19    of   711
88     a   563
173  and   506
>>>>>>Removed stopwords:
                          text  freq
15                   sci.crypt   158
1    cantaloupe.srv.cs.cmu.edu   150
271                 encryption   148
21                         key   133
269                    clipper   115
>>>>>>Applied stemming:
            text  freq     stemmed
3036   directive     1      direct
3037  additional     1       addit
3038         mat     1         mat
3039      heyman     1      heyman
5768  0)794-3017     1  0)794-3017
>>>>>>Merge same stemmed tokens into one:
stemmed
key                          198
encrypt                      187
use                          185
sci.crypt                    158
cantaloupe.srv.cs.cmu.edu    150
Name: freq, dtype: int64
