In [2]:
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.tokenize import sent_tokenize, word_tokenize 
from rank_bm25 import BM25Okapi

#### Basic Similarity Calculation

**Run this first: uncomment the cell below and execute it once to download the NLTK `punkt` tokenizer data**

In [None]:
# One-time setup: uncomment and run to download the NLTK "punkt" tokenizer data
# (the recorded output below shows it being unpacked into the local nltk_data folder).
# NOTE(review): sent_tokenize / word_tokenize presumably fail with a LookupError when
# this data is missing, and newer NLTK releases may ask for 'punkt_tab' instead — confirm.
# import nltk
# nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

**Test Data**

In [None]:
# Two-sentence passage used to demonstrate sentence splitting on a single string.
sentence = (
    "Natural language processing is an interdisciplinary subfield of computer science and linguistics. "
    "It is primarily concerned with giving computers the ability to support and manipulate human language."
)

# Small collection: the passage above followed by a longer, three-sentence passage.
sentence_collection = [
    sentence,
    (
        'Natural language processing has its roots in the 1950s. '
        'Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" '
        'which proposed what is now called the Turing test as a criterion of intelligence, '
        'though at the time that was not articulated as a problem separate from artificial intelligence. '
        'The proposed test includes a task that involves the automated interpretation and generation of natural language.'
    ),
]

**`sent_tokenize` is used to tokenize a text into sentences**
- It accepts only a single string at a time; it does not accept a collection of strings.

In [None]:
sent_tokenize(sentence)

['Natural language processing is an interdisciplinary subfield of computer science and linguistics.',
 'It is primarily concerned with giving computers the ability to support and manipulate human language.']

In [None]:
# sent_tokenize works on one string at a time, so each passage is split separately.
# The loop variable is deliberately not named `sentence`: reusing that name would
# overwrite the `sentence` defined in the test-data cell and change the result of
# re-running the single-string example above.
for passage in sentence_collection:
    print(sent_tokenize(passage))

['Natural language processing is an interdisciplinary subfield of computer science and linguistics.', 'It is primarily concerned with giving computers the ability to support and manipulate human language.']
['Natural language processing has its roots in the 1950s.', 'Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence.', 'The proposed test includes a task that involves the automated interpretation and generation of natural language.']


**`word_tokenize` is used to tokenize a sentence or a piece of text into words.**
- It accepts only a single string at a time; it does not accept a collection of strings.

In [None]:
# Toy corpus of eight short titles. It mixes punctuation ("#", "!", "'s"), digits,
# single-character entries ("S") and alphanumeric tokens ("5s", "7x7") so that the
# handling of such tokens can be inspected in the cells that follow.
docs1 = [
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
    "Great! Sorcerer's of NY 2",
    "Great Secrets of Amazon",
    "S",
    "Ss",
    "7x7"
]

In [None]:
# Print the token list of each title, one per line (original casing is kept here).
for title_tokens in map(word_tokenize, docs1):
    print(title_tokens)

['Harry', 'Potter', '#', 'and', 'the', 'Sorcerer', "'s", 'Stone']
['Harry', 'Potter', '#', '2', 'and', 'the', 'Chamber', 'of', 'Secrets']
['The', 'Sorcerer', "'s", 'Den', '!', '5s']
['Great', '!', 'Sorcerer', "'s", 'of', 'NY', '2']
['Great', 'Secrets', 'of', 'Amazon']
['S']
['Ss']
['7x7']


**Structured way of creating tokenized list of documents**

In [None]:
# Lower-case every title, then split it into tokens: one token list per document.
tokenized_documents = list(map(word_tokenize, map(str.lower, docs1)))
tokenized_documents

[['harry', 'potter', '#', 'and', 'the', 'sorcerer', "'s", 'stone'],
 ['harry', 'potter', '#', '2', 'and', 'the', 'chamber', 'of', 'secrets'],
 ['the', 'sorcerer', "'s", 'den', '!', '5s'],
 ['great', '!', 'sorcerer', "'s", 'of', 'ny', '2'],
 ['great', 'secrets', 'of', 'amazon'],
 ['s'],
 ['ss'],
 ['7x7']]

- It is a smart word tokenizer: it separates words from punctuation and keeps the punctuation marks as valid tokens of their own (e.g. `#`, `!`, `'s`)

In [None]:
bm25 = BM25Okapi(tokenized_documents)

- `BM25Okapi` accepts only a tokenized collection of documents (a list of token lists)

In [None]:
query = "Harry Potter #2"

tokenized_query = word_tokenize(query.lower())

scores = bm25.get_scores(tokenized_query)

scores

array([2.15792005, 2.68086623, 0.        , 0.77615639, 0.        ,
       0.        , 0.        , 0.        ])

- We get the scores as a 1D array, with one score per document

In [None]:
query = "Harry Potter #2"
tokenized_query = word_tokenize(query.lower())

top_n = bm25.get_top_n(tokenized_query, docs1, n=4)
top_n

['Harry Potter #2 and the Chamber of Secrets',
 "Harry Potter # and the Sorcerer's Stone",
 "Great! Sorcerer's of NY 2",
 '7x7']

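- We can get the matching documents directly, ranked by score. Only three documents have a non-zero score for this query, so with `n=4` the last result (`'7x7'`) is a zero-score filler rather than a real match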

In [None]:
query = "Harry Potter #2"
tokenized_query = word_tokenize(query.lower())

top_n = bm25.get_top_n(tokenized_query, tokenized_documents, n=4)
top_n

[['harry', 'potter', '#', '2', 'and', 'the', 'chamber', 'of', 'secrets'],
 ['harry', 'potter', '#', 'and', 'the', 'sorcerer', "'s", 'stone'],
 ['great', '!', 'sorcerer', "'s", 'of', 'ny', '2'],
 ['7x7']]

- We can also get the top matches in tokenized form by passing the tokenized documents instead of the raw strings

In [None]:
# Deliberate ZeroDivisionError used as a barrier: it stops a "Run All" at this
# point, so the sections below are only executed when run manually.
# NOTE(review): this leaves an error traceback in the saved notebook — presumably intended.

10/0

#### (TFIDF + Cosine Similarity) vs BM25 - Output Check and Comparison

In [None]:
def tfidf(docs1, query_str):
    """Score every document against a query with TF-IDF + cosine similarity.

    A ``TfidfVectorizer`` (word analyzer, unigrams only) is fitted on ``docs1``;
    the query is then projected into the same vector space and compared with
    each document vector.

    Parameters
    ----------
    docs1 : iterable of str
        Raw (untokenized) documents to rank.
    query_str : str
        Raw query text.

    Returns
    -------
    numpy.ndarray of shape (len(docs1), 1)
        Cosine similarity between each document and the query, in the same
        order as ``docs1``.
    """
    # min_df=1 keeps every term: no term learned from the corpus can occur in
    # fewer than one document, so this matches what the integer min_df=0 did.
    # Recent scikit-learn releases validate constructor parameters and reject
    # an integer min_df below 1, so 0 would raise there.
    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1)
    doc_vectors = vectorizer.fit_transform(docs1)
    query_vector = vectorizer.transform([query_str])

    return cosine_similarity(doc_vectors, query_vector)

In [None]:
def bm25(docs1, query_str):
    """Return the BM25 (Okapi) score of each document in ``docs1`` for ``query_str``.

    The query and every document are lower-cased and split with NLTK's
    ``word_tokenize``; a ``BM25Okapi`` index is then built over the tokenized
    documents and queried. The result holds one score per document, in the
    order of ``docs1``.
    """
    query_tokens = word_tokenize(query_str.lower())
    corpus_tokens = [word_tokenize(text.lower()) for text in docs1]
    index = BM25Okapi(corpus_tokens)

    return index.get_scores(query_tokens)

In [None]:
# Same eight titles as the corpus used in the BM25 walkthrough above, repeated here
# (presumably so this comparison section can be run on its own).
docs1 = [
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
    "Great! Sorcerer's of NY 2",
    "Great Secrets of Amazon",
    "S",
    "Ss",
    "7x7"
]

In [None]:
query_str = "Harry Potter"

# Indices of the four highest-scoring documents, best first:
# first line is TF-IDF + cosine similarity, second line is BM25.
tfidf_top4 = np.argsort(tfidf(docs1, query_str).flatten())[-4:][::-1]
print(list(tfidf_top4))
bm25_top4 = np.argsort(bm25(docs1, query_str))[-4:][::-1]
print(list(bm25_top4))

[0, 1, 7, 6]
[0, 1, 7, 6]


In [None]:
query_str = "Sorcerer's Stone NY"

# Indices of the four highest-scoring documents, best first:
# first line is TF-IDF + cosine similarity, second line is BM25.
tfidf_top4 = np.argsort(tfidf(docs1, query_str).flatten())[-4:][::-1]
print(list(tfidf_top4))
bm25_top4 = np.argsort(bm25(docs1, query_str))[-4:][::-1]
print(list(bm25_top4))

[3, 0, 2, 7]
[3, 0, 2, 7]


In [None]:
query_str = "the Potter Harry "

# Indices of the four highest-scoring documents, best first:
# first line is TF-IDF + cosine similarity, second line is BM25.
tfidf_top4 = np.argsort(tfidf(docs1, query_str).flatten())[-4:][::-1]
print(list(tfidf_top4))
bm25_top4 = np.argsort(bm25(docs1, query_str))[-4:][::-1]
print(list(bm25_top4))

[0, 1, 2, 7]
[0, 1, 2, 7]


- For exact token matches, both methods produce the same ranking at this scale

In [None]:
query_str = "Harry Potter Sorcerer's"

# Indices of the four highest-scoring documents, best first:
# first line is TF-IDF + cosine similarity, second line is BM25.
tfidf_top4 = np.argsort(tfidf(docs1, query_str).flatten())[-4:][::-1]
print(list(tfidf_top4))
bm25_top4 = np.argsort(bm25(docs1, query_str))[-4:][::-1]
print(list(bm25_top4))

[0, 1, 3, 2]
[0, 1, 2, 3]


- We can see a difference here because `Sorcerer's` gets tokenized differently by the two modules.
  - In BM25 (NLTK `word_tokenize`), `Sorcerer's` => `sorcerer`, `'s`
  - In TFIDF (the default `TfidfVectorizer` tokenizer), `Sorcerer's` => `sorcerer` only; there is no `'s` token
- So the tokens differ, and therefore the scores differ too

#### Combine `nltk` for Vocabulary Generation and (`tfidf` + `Cosine Similarity`) for Similarity Calculation Vs BM25

In [None]:
# Same eight titles as in the earlier sections, repeated here
# (presumably so this section can be run on its own).
docs1 = [
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
    "Great! Sorcerer's of NY 2",
    "Great Secrets of Amazon",
    "S",
    "Ss",
    "7x7"
]

In [None]:
# Vocabulary = every distinct lower-cased NLTK token found across all documents.
vocabulary_set = {
    token
    for document in docs1
    for token in word_tokenize(document.lower())
}

print(vocabulary_set)
print(len(vocabulary_set))

{'amazon', 'great', '!', 'of', '#', 'den', 'secrets', '7x7', 's', "'s", 'the', 'ny', 'and', 'potter', 'stone', 'ss', 'harry', 'sorcerer', '5s', '2', 'chamber'}
21


**Sorting the list**

In [None]:
# sorted() builds a new list in ascending string order and leaves the set untouched.
vocabulary_list = sorted(vocabulary_set)
print(vocabulary_list)

['!', '#', "'s", '2', '5s', '7x7', 'amazon', 'and', 'chamber', 'den', 'great', 'harry', 'ny', 'of', 'potter', 's', 'secrets', 'sorcerer', 'ss', 'stone', 'the']


In [None]:
# Fix the TF-IDF columns to the NLTK-derived vocabulary.
# `vocabulary` only decides which terms get a column; the documents themselves are
# still split by the vectorizer's own default tokenizer.
# min_df=1 (instead of the integer 0) keeps every term and stays inside the range
# that recent scikit-learn releases accept for an integer min_df.
tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                    min_df=1, vocabulary=vocabulary_set)

tfidf1 = tf1.fit_transform(docs1)

# Densify once and reuse the array for both the shape and the full print-out.
tfidf1_dense = tfidf1.toarray()

print(tfidf1_dense.shape)
print("="*50)
print(tfidf1_dense)

(8, 21)
[[0.         0.         0.         0.         0.         0.
  0.         0.41124081 0.         0.         0.         0.41124081
  0.         0.         0.41124081 0.         0.         0.35486708
  0.         0.49069512 0.35486708]
 [0.         0.         0.         0.         0.         0.
  0.         0.38033548 0.45381869 0.         0.         0.38033548
  0.         0.32819832 0.38033548 0.         0.38033548 0.
  0.         0.         0.32819832]
 [0.         0.         0.         0.         0.57297276 0.
  0.         0.         0.         0.57297276 0.         0.
  0.         0.         0.         0.         0.         0.41436966
  0.         0.         0.41436966]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.50552809 0.
  0.6031993  0.43622927 0.         0.         0.         0.43622927
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.58442997 0.         0.  

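- We can observe that `'!', '#', "'s", '2', 's'` never get a non-zero weight, even though they are in the vocabulary. `TfidfVectorizer` still tokenizes with its default `token_pattern` (`(?u)\b\w\w+\b`), which keeps only tokens of two or more word characters, so punctuation and single-character tokens are never produced
- For the same reason, the document `"S"` ends up as an all-zero vector even though the vocabulary contains the term `'s'`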

In [None]:
tf1.get_feature_names_out()

array(['!', '#', "'s", '2', '5s', '7x7', 'amazon', 'and', 'chamber',
       'den', 'great', 'harry', 'ny', 'of', 'potter', 's', 'secrets',
       'sorcerer', 'ss', 'stone', 'the'], dtype=object)

In [None]:
query_str = "Harry Potter Den!"
tf1.transform([query_str]).toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.64485945,
        0.        , 0.54044255, 0.        , 0.        , 0.54044255,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

- We can clearly observe that there is a tfidf score for the terms `harry`, `potter` and `den`, but there is no score for `!`, although it is present in the vocabulary

```
docs1 = [
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
    "Great! Sorcerer's of NY 2",
    "Great Secrets of Amazon",
    "S",
    "Ss",
    "7x7"
]
```

In [None]:
cosine_sim1 = cosine_similarity(tfidf1, tf1.transform([query_str]))

cosine_sim1

array([[0.44450406],
       [0.41109895],
       [0.3694869 ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ]])

- We can observe that `Harry Potter Den!` has similarities in descending order as follows.

```
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
```

In [None]:
query_str = "Harry Potter Den!"

tokenized_query = word_tokenize(query_str.lower())

tokenized_documents = [word_tokenize(document.lower()) for document in docs1]

# Named `bm25_index` rather than `bm25` so this cell does not overwrite the
# `bm25(docs1, query_str)` helper function defined earlier in the notebook
# (which would break the comparison cells if they were re-run afterwards).
bm25_index = BM25Okapi(tokenized_documents)

# One BM25 score per document, in the order of docs1.
scores = bm25_index.get_scores(tokenized_query)

scores

array([1.43861337, 1.34043312, 2.26229145, 0.77615639, 0.        ,
       0.        , 0.        , 0.        ])

- We can observe that `Harry Potter Den!` has similarities in descending order as follows.

```
    "The Sorcerer's Den! 5s",
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "Great! Sorcerer's of NY 2",
```

In [None]:
query_str = "Harry Potter Den!"

tokenized_query = word_tokenize(query_str.lower())

tokenized_documents = [word_tokenize(document.lower()) for document in docs1]

# Named `bm25_index` rather than `bm25` so this cell does not overwrite the
# `bm25(docs1, query_str)` helper function defined earlier in the notebook.
bm25_index = BM25Okapi(tokenized_documents)

# Passing the tokenized documents returns the top matches in tokenized form.
top_n = bm25_index.get_top_n(tokenized_query, tokenized_documents, n=4)

top_n

[['the', 'sorcerer', "'s", 'den', '!', '5s'],
 ['harry', 'potter', '#', 'and', 'the', 'sorcerer', "'s", 'stone'],
 ['harry', 'potter', '#', '2', 'and', 'the', 'chamber', 'of', 'secrets'],
 ['great', '!', 'sorcerer', "'s", 'of', 'ny', '2']]

In [None]:
query_str = "Harry Potter Den!"

tokenized_query = word_tokenize(query_str.lower())

tokenized_documents = [word_tokenize(document.lower()) for document in docs1]

# Named `bm25_index` rather than `bm25` so this cell does not overwrite the
# `bm25(docs1, query_str)` helper function defined earlier in the notebook.
bm25_index = BM25Okapi(tokenized_documents)

# Passing the raw documents returns the top matches as the original strings.
top_n = bm25_index.get_top_n(tokenized_query, docs1, n=4)

top_n

["The Sorcerer's Den! 5s",
 "Harry Potter # and the Sorcerer's Stone",
 'Harry Potter #2 and the Chamber of Secrets',
 "Great! Sorcerer's of NY 2"]

**Using the tfidfVectorizer's default tokenizer**

In [None]:
# No fixed vocabulary here: the vectorizer derives its terms from its own
# default tokenization of docs1.
# min_df=1 (instead of the integer 0) keeps every term and stays inside the range
# that recent scikit-learn releases accept for an integer min_df.
tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                    min_df=1)

tfidf1 = tf1.fit_transform(docs1)

print(tfidf1.toarray().shape)

(8, 16)


In [None]:
tf1.get_feature_names_out()

array(['5s', '7x7', 'amazon', 'and', 'chamber', 'den', 'great', 'harry',
       'ny', 'of', 'potter', 'secrets', 'sorcerer', 'ss', 'stone', 'the'],
      dtype=object)

In [None]:
query_str = "Harry Potter Den!"
tf1.transform([query_str]).toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.64485945, 0.        , 0.54044255, 0.        , 0.        ,
        0.54044255, 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

```
docs1 = [
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
    "Great! Sorcerer's of NY 2",
    "Great Secrets of Amazon",
    "S",
    "Ss",
    "7x7"
]
```

```
array([[0.44450406],
       [0.41109895],
       [0.3694869 ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ]])
```

In [None]:
cosine_sim1 = cosine_similarity(tfidf1, tf1.transform([query_str]))

cosine_sim1

array([[0.44450406],
       [0.41109895],
       [0.3694869 ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ]])

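- We can observe that the scores are the same as those from the earlier TfidfVectorizer + cosine similarity pair that used the NLTK-generated vocabulary
- We can conclude that, in TfidfVectorizer, punctuation is not treated as a term even if it is present in the vocabulary passed to the TfidfVectorizer object. The `vocabulary` argument only fixes the columns; tokenization is still done by the default `token_pattern`, so a custom `tokenizer` (e.g. `word_tokenize`) would be needed for such tokens to be counted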