In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.tokenize import sent_tokenize, word_tokenize 
from rank_bm25 import BM25Okapi

### Basic Similarity Calculation

**Need to run this code first**

In [None]:
# One-time setup: uncomment and run these two lines to download the NLTK
# "punkt" tokenizer data (see the download log below) before using the
# tokenizers in this notebook.
# NOTE(review): newer NLTK releases reportedly require 'punkt_tab' instead of
# 'punkt' — confirm against the installed version.
# import nltk
# nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

**Test Data**

In [3]:
# A two-sentence paragraph used to demonstrate sentence tokenization.
sentence = (
    "Natural language processing is an interdisciplinary subfield of computer science and linguistics. "
    "It is primarily concerned with giving computers the ability to support and manipulate human language."
)

# Two paragraphs: the first is the same text as `sentence`, the second is a
# longer three-sentence passage.
sentence_collection = [
    sentence,
    (
        'Natural language processing has its roots in the 1950s. '
        'Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" '
        'which proposed what is now called the Turing test as a criterion of intelligence, '
        'though at the time that was not articulated as a problem separate from artificial intelligence. '
        'The proposed test includes a task that involves the automated interpretation and generation of natural language.'
    ),
]

**`sent_tokenize` is used to tokenize a text into sentences**
- It accepts only a single string at a time; it does not accept a collection of strings.

In [4]:
# Split the sample paragraph into its individual sentences.
sent_tokenize(sentence)

['Natural language processing is an interdisciplinary subfield of computer science and linguistics.',
 'It is primarily concerned with giving computers the ability to support and manipulate human language.']

In [5]:
# `sent_tokenize` takes one string at a time, so each paragraph is tokenized
# separately. The loop variable is deliberately not called `sentence`: that
# would overwrite the `sentence` sample defined earlier and change the result
# of the previous cell when it is re-run.
for paragraph in sentence_collection:
    print(sent_tokenize(paragraph))

['Natural language processing is an interdisciplinary subfield of computer science and linguistics.', 'It is primarily concerned with giving computers the ability to support and manipulate human language.']
['Natural language processing has its roots in the 1950s.', 'Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence.', 'The proposed test includes a task that involves the automated interpretation and generation of natural language.']


**`word_tokenize` is used to tokenize a sentence or a piece of text into words.**
- It accepts only a single string at a time; it does not accept a collection of strings.

In [6]:
# Small corpus of titles. Besides ordinary words it contains punctuation
# ("#", "!"), digits and one-/two-character entries ("S", "Ss", "7x7") so the
# tokenizer's handling of such tokens can be inspected.
docs1 = [
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
    "Great! Sorcerer's of NY 2",
    "Great Secrets of Amazon",
    "S",
    "Ss",
    "7x7"
]

In [7]:
# Print the word-level tokens produced for every title in the corpus.
for tokens in map(word_tokenize, docs1):
    print(tokens)

['Harry', 'Potter', '#', 'and', 'the', 'Sorcerer', "'s", 'Stone']
['Harry', 'Potter', '#', '2', 'and', 'the', 'Chamber', 'of', 'Secrets']
['The', 'Sorcerer', "'s", 'Den', '!', '5s']
['Great', '!', 'Sorcerer', "'s", 'of', 'NY', '2']
['Great', 'Secrets', 'of', 'Amazon']
['S']
['Ss']
['7x7']


**Structured way of creating tokenized list of documents**

In [8]:
# Lower-case every document, then tokenize it; the result is a list of token
# lists in corpus order.
tokenized_documents = list(map(word_tokenize, map(str.lower, docs1)))
tokenized_documents

[['harry', 'potter', '#', 'and', 'the', 'sorcerer', "'s", 'stone'],
 ['harry', 'potter', '#', '2', 'and', 'the', 'chamber', 'of', 'secrets'],
 ['the', 'sorcerer', "'s", 'den', '!', '5s'],
 ['great', '!', 'sorcerer', "'s", 'of', 'ny', '2'],
 ['great', 'secrets', 'of', 'amazon'],
 ['s'],
 ['ss'],
 ['7x7']]

- It is a smart word tokenizer: it understands sentence structure, separates words from punctuation, and keeps the punctuation marks as valid tokens of their own

In [9]:
# Build the BM25 index from the lower-cased, tokenized corpus.
bm25 = BM25Okapi(tokenized_documents)

- `BM25Okapi` accepts only an already tokenized collection of documents (a list of token lists)

**We get the scores as 1D array**

In [10]:
query = "Harry Potter #2"

# Tokenize the query the same way as the corpus: lower-case, then word_tokenize.
tokenized_query = word_tokenize(query.lower())

# Score the tokenized query against the indexed corpus.
scores = bm25.get_scores(tokenized_query)

scores

array([2.15792005, 2.68086623, 0.        , 0.77615639, 0.        ,
       0.        , 0.        , 0.        ])

In [11]:
# Label each score with its document; the query text is used as the column name.
pd.DataFrame(data=scores, index=docs1,
             columns=[query])

Unnamed: 0,Harry Potter #2
Harry Potter # and the Sorcerer's Stone,2.15792
Harry Potter #2 and the Chamber of Secrets,2.680866
The Sorcerer's Den! 5s,0.0
Great! Sorcerer's of NY 2,0.776156
Great Secrets of Amazon,0.0
S,0.0
Ss,0.0
7x7,0.0


**We can get the matches directly based on the scores**

In [13]:
query = "Harry Potter #2"
tokenized_query = word_tokenize(query.lower())

# Ask for the 4 best-scoring original titles. Only three titles have a
# non-zero score for this query (see the scores above), so the fourth entry in
# the output is a zero-score title.
top_n = bm25.get_top_n(tokenized_query, docs1, n=4)
top_n

['Harry Potter #2 and the Chamber of Secrets',
 "Harry Potter # and the Sorcerer's Stone",
 "Great! Sorcerer's of NY 2",
 '7x7']

**We can get the matches based on the scores in the tokenized form**

In [None]:
query = "Harry Potter #2"
tokenized_query = word_tokenize(query.lower())

# Same ranking as before, but the tokenized documents are passed as the second
# argument, so the matches come back as token lists instead of raw titles.
top_n = bm25.get_top_n(tokenized_query, tokenized_documents, n=4)
top_n

[['harry', 'potter', '#', '2', 'and', 'the', 'chamber', 'of', 'secrets'],
 ['harry', 'potter', '#', 'and', 'the', 'sorcerer', "'s", 'stone'],
 ['great', '!', 'sorcerer', "'s", 'of', 'ny', '2'],
 ['7x7']]

In [None]:
# Deliberate stop: this cell always raises so that "Run All" halts here and the
# following section is only executed on purpose. A descriptive error is used
# so the traceback explains itself.
raise RuntimeError("Intentional stop: run the cells of the next section manually.")

### (TFIDF + Cosine Similarity) vs BM25 - Output Check and Comparison

In [2]:
def tfidf_cs(docs1,query_str):
    """Score every document against a query with TF-IDF + cosine similarity.

    A word-level, unigram ``TfidfVectorizer`` is fitted on ``docs1`` and the
    query is transformed with that same fitted vectorizer.

    Parameters
    ----------
    docs1 : iterable of str
        Raw documents the vectorizer is fitted on.
    query_str : str
        Raw query text.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per document (in the order of ``docs1``) and a
        single column holding the cosine similarity to the query.
    """
    # `min_df=1` instead of the former integer 0: recent scikit-learn releases
    # reject an integer `min_df` below 1 during parameter validation. Every
    # fitted term occurs in at least one document, so the scores are unchanged.
    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                                 min_df=1)
    doc_matrix = vectorizer.fit_transform(docs1)
    query_matrix = vectorizer.transform([query_str])

    return cosine_similarity(doc_matrix, query_matrix)

In [3]:
def bm25(docs1,query_str):
    """Return the BM25 (Okapi) scores of the documents in ``docs1`` for ``query_str``.

    Query and documents are lower-cased and split with ``word_tokenize``. A
    fresh ``BM25Okapi`` index is built over the tokenized corpus on every call,
    and the result of its ``get_scores`` is returned.
    """
    query_tokens = word_tokenize(query_str.lower())
    corpus_tokens = [word_tokenize(text.lower()) for text in docs1]

    # Called `index` rather than `bm25` so the local does not shadow this function.
    index = BM25Okapi(corpus_tokens)
    return index.get_scores(query_tokens)

In [4]:
# Corpus for the TF-IDF vs. BM25 comparison, held in a pandas Series so rows
# can be picked by position with `.iloc`. It extends the earlier corpus with
# four short titles that repeat or isolate "Harry" / "Potter".
docs1 = pd.Series([
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
    "Great! Sorcerer's of NY 2",
    "Great Secrets of Amazon",
    "S",
    "Ss",
    "7x7",
    "Harry Harry Potter",
    "Harry Potter Potter",
    "Little Harry",
    "Little Potter"
])

In [25]:
query_str = "Harry Potter"

# Positions of the five best-scoring documents per method, best first.
# `.tolist()` converts the NumPy integers to plain Python ints, so the printed
# lists look the same under every NumPy version (NumPy 2 would otherwise print
# `np.int64(...)` for each element).
data1 = np.argsort(tfidf_cs(docs1,query_str).flatten())[::-1][:5].tolist()
data2 = np.argsort(bm25(docs1,query_str))[::-1][:5].tolist()

print(data1)
print(data2)

[9, 8, 0, 1, 11]
[9, 8, 0, 11, 10]


In [26]:
# Place the two top-5 title lists side by side: one row per rank, first column
# TF-IDF + cosine similarity, second column BM25.
tfidf_titles = docs1.iloc[data1].values
bm25_titles = docs1.iloc[data2].values
data = np.column_stack((tfidf_titles, bm25_titles))

In [27]:
# Show both rankings in one table; the column names carry the query and the method.
pd.DataFrame(data=data,
             columns=[f"{query_str}-tfidf_cs",f"{query_str}-bm25"])

Unnamed: 0,Harry Potter-tfidf_cs,Harry Potter-bm25
0,Harry Potter Potter,Harry Potter Potter
1,Harry Harry Potter,Harry Harry Potter
2,Harry Potter # and the Sorcerer's Stone,Harry Potter # and the Sorcerer's Stone
3,Harry Potter #2 and the Chamber of Secrets,Little Potter
4,Little Potter,Little Harry


In [31]:
query_str = "Sorcerer's Stone NY"

# Positions of the five best-scoring documents per method, best first.
# `.tolist()` yields plain Python ints so the printed lists are identical
# under every NumPy version.
data1 = np.argsort(tfidf_cs(docs1,query_str).flatten())[::-1][:5].tolist()
data2 = np.argsort(bm25(docs1,query_str))[::-1][:5].tolist()

print(data1)
print(data2)

# One row per rank, one column per method.
data = np.column_stack((docs1.iloc[data1].values, docs1.iloc[data2].values))

pd.DataFrame(data=data,
             columns=[f"{query_str}-tfidf_cs",f"{query_str}-bm25"])

[3, 0, 2, 11, 10]
[3, 0, 2, 11, 10]


Unnamed: 0,Sorcerer's Stone NY-tfidf_cs,Sorcerer's Stone NY-bm25
0,Great! Sorcerer's of NY 2,Great! Sorcerer's of NY 2
1,Harry Potter # and the Sorcerer's Stone,Harry Potter # and the Sorcerer's Stone
2,The Sorcerer's Den! 5s,The Sorcerer's Den! 5s
3,Little Potter,Little Potter
4,Little Harry,Little Harry


In [32]:
query_str = "the Potter Harry"

# Positions of the five best-scoring documents per method, best first.
# `.tolist()` yields plain Python ints so the printed lists are identical
# under every NumPy version.
data1 = np.argsort(tfidf_cs(docs1,query_str).flatten())[::-1][:5].tolist()
data2 = np.argsort(bm25(docs1,query_str))[::-1][:5].tolist()

print(data1)
print(data2)

# One row per rank, one column per method.
data = np.column_stack((docs1.iloc[data1].values, docs1.iloc[data2].values))

pd.DataFrame(data=data,
             columns=[f"{query_str}-tfidf_cs",f"{query_str}-bm25"])

[9, 8, 0, 1, 11]
[0, 1, 9, 8, 2]


Unnamed: 0,the Potter Harry-tfidf_cs,the Potter Harry-bm25
0,Harry Potter Potter,Harry Potter # and the Sorcerer's Stone
1,Harry Harry Potter,Harry Potter #2 and the Chamber of Secrets
2,Harry Potter # and the Sorcerer's Stone,Harry Potter Potter
3,Harry Potter #2 and the Chamber of Secrets,Harry Harry Potter
4,Little Potter,The Sorcerer's Den! 5s


In [33]:
query_str = "Harry Potter Sorcerer's"

# Positions of the five best-scoring documents per method, best first.
# `.tolist()` yields plain Python ints so the printed lists are identical
# under every NumPy version.
data1 = np.argsort(tfidf_cs(docs1,query_str).flatten())[::-1][:5].tolist()
data2 = np.argsort(bm25(docs1,query_str))[::-1][:5].tolist()

print(data1)
print(data2)

# One row per rank, one column per method.
data = np.column_stack((docs1.iloc[data1].values, docs1.iloc[data2].values))

pd.DataFrame(data=data,
             columns=[f"{query_str}-tfidf_cs",f"{query_str}-bm25"])

[9, 8, 0, 1, 11]
[0, 2, 3, 9, 8]


Unnamed: 0,Harry Potter Sorcerer's-tfidf_cs,Harry Potter Sorcerer's-bm25
0,Harry Potter Potter,Harry Potter # and the Sorcerer's Stone
1,Harry Harry Potter,The Sorcerer's Den! 5s
2,Harry Potter # and the Sorcerer's Stone,Great! Sorcerer's of NY 2
3,Harry Potter #2 and the Chamber of Secrets,Harry Potter Potter
4,Little Potter,Harry Harry Potter


In [None]:
# Deliberate stop: this cell always raises so that "Run All" halts here and the
# following section is only executed on purpose. A descriptive error is used
# so the traceback explains itself.
raise RuntimeError("Intentional stop: run the cells of the next section manually.")

### Combine `nltk` for Vocabulary Generation and (`tfidf` + `Cosine Similarity`) for Similarity Calculation

In [74]:
# Corpus for the vocabulary experiments. Compared with the first corpus it
# also contains the single-letter document "R" (nine documents in total).
docs1 = [
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
    "Great! Sorcerer's of NY 2",
    "Great Secrets of Amazon",
    "S",
    "R",
    "Ss",
    "7x7"
]

In [75]:
# Vocabulary: every distinct token that `word_tokenize` produces from the
# lower-cased documents.
vocabulary_set = {
    token
    for document in docs1
    for token in word_tokenize(document.lower())
}

print(vocabulary_set)
print(len(vocabulary_set))

{'s', '#', '7x7', 'ny', '!', 'potter', 'great', 'den', "'s", 'r', 'stone', 'secrets', 'harry', 'amazon', 'chamber', 'the', '5s', 'of', 'sorcerer', 'ss', '2', 'and'}
22


**Sorting the vocabulary list**

In [79]:
# Sorted copy of the vocabulary, for a stable, readable listing.
vocabulary_list = sorted(vocabulary_set)
print(vocabulary_list)

['!', '#', "'s", '2', '5s', '7x7', 'amazon', 'and', 'chamber', 'den', 'great', 'harry', 'ny', 'of', 'potter', 'r', 's', 'secrets', 'sorcerer', 'ss', 'stone', 'the']


In [80]:
# Fit TF-IDF with the externally built vocabulary. Tokenization is still done
# by the vectorizer's own default analyzer.
# `min_df=1` replaces the former integer 0, which recent scikit-learn releases
# reject during parameter validation; the resulting matrix is unchanged.
tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                    min_df=1, vocabulary=vocabulary_set)

tfidf1 = tf1.fit_transform(docs1)

print(tfidf1.toarray().shape)
print("="*50)
print(tfidf1.toarray())

(9, 22)
[[0.         0.         0.         0.         0.         0.
  0.         0.41121476 0.         0.         0.         0.41121476
  0.         0.         0.41121476 0.         0.         0.
  0.35753936 0.         0.48686598 0.35753936]
 [0.         0.         0.         0.         0.         0.
  0.         0.38031487 0.45028144 0.         0.         0.38031487
  0.         0.3306728  0.38031487 0.         0.         0.38031487
  0.         0.         0.         0.3306728 ]
 [0.         0.         0.         0.         0.56993279 0.
  0.         0.         0.         0.56993279 0.         0.
  0.         0.         0.         0.         0.         0.
  0.41854106 0.         0.         0.41854106]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.5054797  0.
  0.59847285 0.43950001 0.         0.         0.         0.
  0.43950001 0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.


- We can observe that `'!'`, `'#'`, `"'s"`, `'2'`, `'s'`, `'r'` never get a non-zero weight, although they are present in the vocabulary
- As a result, the documents `"S"` and `"R"` are turned into all-zero vectors

In [81]:
# List the feature (column) names of the fitted vectorizer, in column order.
tf1.get_feature_names_out()

array(['!', '#', "'s", '2', '5s', '7x7', 'amazon', 'and', 'chamber',
       'den', 'great', 'harry', 'ny', 'of', 'potter', 'r', 's', 'secrets',
       'sorcerer', 'ss', 'stone', 'the'], dtype=object)

**The following queries do not work (they produce all-zero vectors)**

In [89]:
# Transform a query that consists only of "#" with the fitted vectorizer.
query_str = "#"
tf1.transform([query_str]).toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.]])

In [90]:
# Transform a query that consists only of the digit "2".
query_str = "2"
tf1.transform([query_str]).toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.]])

In [83]:
# Transform a query that consists only of the single letter "s".
query_str = "s"
tf1.transform([query_str]).toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.]])

In [None]:
# Single-letter query "r": checks the other one-letter vocabulary entry
# (the previous cell already covers "s").
query_str = "r"
tf1.transform([query_str]).toarray()

**The following vectors work**

In [87]:
# Transform the two-character query "5s".
query_str = "5s"
tf1.transform([query_str]).toarray()

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.]])

In [88]:
# Transform the two-letter query "ss".
query_str = "ss"
tf1.transform([query_str]).toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0.]])

In [91]:
# Transform a multi-word query that ends with a punctuation mark.
query_str = "Harry Potter Den!"
tf1.transform([query_str]).toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.64192944,
        0.        , 0.54218382, 0.        , 0.        , 0.54218382,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ]])

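- We can clearly observe that there is a tfidf score for the terms `Harry`, `Potter`, `den` but there is no score for `!` although it is present in the Vocabulary
- So, if a document or a term consists of a single character, it is ignored in the above configuration. The reason is the vectorizer's default `token_pattern` (`(?u)\b\w\w+\b`), which only keeps tokens of two or more alphanumeric characters and treats punctuation as a separator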

```
docs1 = [
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
    "Great! Sorcerer's of NY 2",
    "Great Secrets of Amazon",
    "S",
    "R",
    "Ss",
    "7x7"
]
```

In [92]:
# Cosine similarity between every document vector and the current query
# (`query_str` was last set to "Harry Potter Den!" in the preceding code cell).
cosine_sim1 = cosine_similarity(tfidf1, tf1.transform([query_str]))

cosine_sim1.flatten()

array([0.44590798, 0.41240115, 0.36585663, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        ])

- We can observe that `Harry Potter Den!` has similarities in descending order as follows.

```
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
```

In [93]:
query_str = "Harry Potter Den!"

# Lower-case and tokenize the query and the corpus in the same way.
tokenized_query = word_tokenize(query_str.lower())

tokenized_documents = [word_tokenize(document.lower()) for document in docs1]

# NOTE(review): this rebinds `bm25`, which the comparison section defines as a
# helper function `bm25(docs1, query_str)`. Re-running that section afterwards
# requires re-executing its `def bm25` cell first.
bm25 = BM25Okapi(tokenized_documents)

scores = bm25.get_scores(tokenized_query)

scores

array([1.56650157, 1.45587679, 2.38190502, 0.84766024, 0.        ,
       0.        , 0.        , 0.        , 0.        ])

- We can observe that `Harry Potter Den!` has similarities in descending order as follows.

```
    "The Sorcerer's Den! 5s",
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "Great! Sorcerer's of NY 2",
```

In [94]:
query_str = "Harry Potter Den!"

# Lower-case and tokenize the query and the corpus in the same way.
tokenized_query = word_tokenize(query_str.lower())

tokenized_documents = [word_tokenize(document.lower()) for document in docs1]

# NOTE(review): rebinds `bm25` (a helper function in the comparison section)
# to a BM25Okapi index object.
bm25 = BM25Okapi(tokenized_documents)

# The tokenized corpus is passed as the documents argument, so the four best
# matches are returned as token lists.
top_n = bm25.get_top_n(tokenized_query, tokenized_documents, n=4)

top_n

[['the', 'sorcerer', "'s", 'den', '!', '5s'],
 ['harry', 'potter', '#', 'and', 'the', 'sorcerer', "'s", 'stone'],
 ['harry', 'potter', '#', '2', 'and', 'the', 'chamber', 'of', 'secrets'],
 ['great', '!', 'sorcerer', "'s", 'of', 'ny', '2']]

**Using the tfidfVectorizer's default tokenizer**

In [95]:
# Refit TF-IDF without an external vocabulary, so the features are exactly the
# tokens found by the vectorizer's default tokenizer.
# `min_df=1` replaces the former integer 0, which recent scikit-learn releases
# reject during parameter validation; the fitted features are unchanged.
tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                    min_df=1)

tfidf1 = tf1.fit_transform(docs1)

print(tfidf1.toarray().shape)

(9, 16)


In [96]:
# Features found by the default tokenizer, in column order.
tf1.get_feature_names_out()

array(['5s', '7x7', 'amazon', 'and', 'chamber', 'den', 'great', 'harry',
       'ny', 'of', 'potter', 'secrets', 'sorcerer', 'ss', 'stone', 'the'],
      dtype=object)

In [97]:
# Transform the same multi-word query with the default-tokenizer vectorizer.
query_str = "Harry Potter Den!"
tf1.transform([query_str]).toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.64192944, 0.        , 0.54218382, 0.        , 0.        ,
        0.54218382, 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

In [99]:
# Cosine similarity of every document to the query under the default tokenizer.
cosine_sim1 = cosine_similarity(tfidf1, tf1.transform([query_str]))

cosine_sim1.flatten()

array([0.44590798, 0.41240115, 0.36585663, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        ])

- Previous Data

```
docs1 = [
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
    "Great! Sorcerer's of NY 2",
    "Great Secrets of Amazon",
    "S",
    "R",
    "Ss",
    "7x7"
]
```

```
[0.44590798, 0.41240115, 0.36585663, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        ]
```

- We can observe that the current score is the same as the previous (tfidfVectorizer + Cosine Similarity) score with external vocabulary
- We can conclude that, with the above settings, tokenization still follows the default tokenizer rule even if we supply a vocabulary from outside, so vocabulary entries that the default tokenizer never produces can never be matched

**Let's try a custom tokenizer directly**

In [100]:
def custom_tokenizer(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them.

    Meant to be passed as ``tokenizer=`` to ``TfidfVectorizer`` in place of the
    vectorizer's default tokenization.
    """
    # Hook point: additional processing or filtering of the tokens can go here.
    return word_tokenize(text)

In [102]:
# Fit TF-IDF with the NLTK-based tokenizer instead of the default one.
# - `min_df=1` replaces the former integer 0, which recent scikit-learn
#   releases reject during parameter validation (results are unchanged).
# - `token_pattern=None` states explicitly that the regex pattern is unused
#   when a tokenizer is supplied, which avoids scikit-learn's warning about it.
tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                    min_df=1, tokenizer=custom_tokenizer, token_pattern=None)

tfidf1 = tf1.fit_transform(docs1)

print(tfidf1.toarray().shape)
print(tf1.get_feature_names_out())

query_str = "Harry Potter Den!"
print(tf1.transform([query_str]).toarray())

cosine_sim1 = cosine_similarity(tfidf1, tf1.transform([query_str]))
cosine_sim1.flatten()

(9, 22)
['!' '#' "'s" '2' '5s' '7x7' 'amazon' 'and' 'chamber' 'den' 'great'
 'harry' 'ny' 'of' 'potter' 'r' 's' 'secrets' 'sorcerer' 'ss' 'stone'
 'the']
[[0.47663461 0.         0.         0.         0.         0.
  0.         0.         0.         0.56432113 0.         0.47663461
  0.         0.         0.47663461 0.         0.         0.
  0.         0.         0.         0.        ]]




array([0.34421173, 0.31929011, 0.46459161, 0.18455746, 0.        ,
       0.        , 0.        , 0.        , 0.        ])

- Following one is the score given by bm25
  
```
[1.56650157, 1.45587679, 2.38190502, 0.84766024, 0.        ,
       0.        , 0.        , 0.        , 0.        ]
```

- We can observe that the resulting ranking of the documents is the same as BM25's, although the absolute scores differ
- We can also observe that, now the term '!' is getting considered during scoring

**We can observe the same for the rest of the terms**

In [104]:
# Transform each punctuation / single-character query in turn and print its
# vector; `query_str` ends up holding the last query, "s".
for query_str in ("#", "'s", "2", "r", "s"):
    print(tf1.transform([query_str]).toarray())

[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]


**We can observe the same for the vectors**

In [109]:
# Query with the single upper-case letter "S": print its vector, then compute
# its cosine similarity to every document.
query_str = "S"
print(tf1.transform([query_str]).toarray())

cosine_sim1 = cosine_similarity(tfidf1, tf1.transform([query_str]))
cosine_sim1.flatten()

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]


array([0., 0., 0., 0., 0., 1., 0., 0., 0.])

In [110]:
# Query with the single upper-case letter "R": print its vector, then compute
# its cosine similarity to every document.
query_str = "R"
print(tf1.transform([query_str]).toarray())

cosine_sim1 = cosine_similarity(tfidf1, tf1.transform([query_str]))
cosine_sim1.flatten()

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]


array([0., 0., 0., 0., 0., 0., 1., 0., 0.])