**Module Imports**

In [3]:
import numpy as np
import pandas as pd
import re

from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity

### Similarity Score Experiment - TfidfVectorizer + Cosine Similarity

In [14]:
# Toy corpus of short titles that share words ("Harry", "Potter", "Sorcerers",
# "Secrets", "Great", ...) so the similarity scores below are easy to reason about.
docs1 = [
    "Harry Potter and the Sorcerers Stone",
    "Harry Potter and the Chamber of Secrets",
    "The Sorcerer's Den",
    "Great Sorcerers of NY",
    "Great Secrets of Amazon",
    "Peter Harry",
    "Peter Harry Potter",
]

In [15]:
# Word-level unigram TF-IDF.
# min_df=1 keeps every term that occurs in at least one document, i.e. the
# whole vocabulary. An integer min_df below 1 (such as 0) is rejected by
# scikit-learn's parameter validation in recent releases.
tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                      min_df=1)

# Learn the vocabulary and IDF weights, then encode the corpus as a sparse matrix.
tfidf1 = tf1.fit_transform(docs1)

# (number of documents, number of vocabulary terms)
tfidf1.shape

(7, 15)

In [21]:
tf1.get_feature_names_out()

array(['amazon', 'and', 'chamber', 'den', 'great', 'harry', 'ny', 'of',
       'peter', 'potter', 'secrets', 'sorcerer', 'sorcerers', 'stone',
       'the'], dtype=object)

In [17]:
# Densify the sparse TF-IDF matrix for inspection. The result is displayed as a
# numpy `matrix` (see the output); `.toarray()`, used in the next cell, gives a
# plain ndarray instead.
tfidf1.todense()

matrix([[0.        , 0.42783212, 0.        , 0.        , 0.        ,
         0.31750075, 0.        , 0.        , 0.        , 0.36569672,
         0.        , 0.        , 0.42783212, 0.51540706, 0.36569672],
        [0.        , 0.40180727, 0.48405506, 0.        , 0.        ,
         0.29818731, 0.        , 0.34345153, 0.        , 0.34345153,
         0.40180727, 0.        , 0.        , 0.        , 0.34345153],
        [0.        , 0.        , 0.        , 0.63202178, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.63202178, 0.        , 0.        , 0.44843834],
        [0.        , 0.        , 0.        , 0.        , 0.48900396,
         0.        , 0.58910044, 0.41798437, 0.        , 0.        ,
         0.        , 0.        , 0.48900396, 0.        , 0.        ],
        [0.58910044, 0.        , 0.        , 0.        , 0.48900396,
         0.        , 0.        , 0.41798437, 0.        , 0.        ,
         0.48900396, 0.       

In [22]:
pd.DataFrame(data=tfidf1.toarray(), index=docs1,
             columns=tf1.get_feature_names_out())

Unnamed: 0,amazon,and,chamber,den,great,harry,ny,of,peter,potter,secrets,sorcerer,sorcerers,stone,the
Harry Potter and the Sorcerers Stone,0.0,0.427832,0.0,0.0,0.0,0.317501,0.0,0.0,0.0,0.365697,0.0,0.0,0.427832,0.515407,0.365697
Harry Potter and the Chamber of Secrets,0.0,0.401807,0.484055,0.0,0.0,0.298187,0.0,0.343452,0.0,0.343452,0.401807,0.0,0.0,0.0,0.343452
The Sorcerer's Den,0.0,0.0,0.0,0.632022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.632022,0.0,0.0,0.448438
Great Sorcerers of NY,0.0,0.0,0.0,0.0,0.489004,0.0,0.5891,0.417984,0.0,0.0,0.0,0.0,0.489004,0.0,0.0
Great Secrets of Amazon,0.5891,0.0,0.0,0.0,0.489004,0.0,0.0,0.417984,0.0,0.0,0.489004,0.0,0.0,0.0,0.0
Peter Harry,0.0,0.0,0.0,0.0,0.0,0.59594,0.0,0.0,0.803029,0.0,0.0,0.0,0.0,0.0,0.0
Peter Harry Potter,0.0,0.0,0.0,0.0,0.0,0.491331,0.0,0.0,0.662069,0.565914,0.0,0.0,0.0,0.0,0.0


In [23]:
# Pairwise cosine similarity of every document against every document.
# Entry [i, j] compares docs1[i] with docs1[j]; the diagonal is 1 because each
# document is compared with itself.
cosine_sim1 = cosine_similarity(tfidf1, tfidf1)
cosine_sim1

array([[1.        , 0.51777895, 0.16399243, 0.2092116 , 0.        ,
        0.18921141, 0.362951  ],
       [0.51777895, 1.        , 0.15401684, 0.14355737, 0.34004272,
        0.17770175, 0.34087283],
       [0.16399243, 0.15401684, 1.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.2092116 , 0.14355737, 0.        , 1.        , 0.4138358 ,
        0.        , 0.        ],
       [0.        , 0.34004272, 0.        , 0.4138358 , 1.        ,
        0.        , 0.        ],
       [0.18921141, 0.17770175, 0.        , 0.        , 0.        ,
        1.        , 0.82446411],
       [0.362951  , 0.34087283, 0.        , 0.        , 0.        ,
        0.82446411, 1.        ]])

In [24]:
pd.DataFrame(data=cosine_sim1, index=docs1,
             columns=docs1)

Unnamed: 0,Harry Potter and the Sorcerers Stone,Harry Potter and the Chamber of Secrets,The Sorcerer's Den,Great Sorcerers of NY,Great Secrets of Amazon,Peter Harry,Peter Harry Potter
Harry Potter and the Sorcerers Stone,1.0,0.517779,0.163992,0.209212,0.0,0.189211,0.362951
Harry Potter and the Chamber of Secrets,0.517779,1.0,0.154017,0.143557,0.340043,0.177702,0.340873
The Sorcerer's Den,0.163992,0.154017,1.0,0.0,0.0,0.0,0.0
Great Sorcerers of NY,0.209212,0.143557,0.0,1.0,0.413836,0.0,0.0
Great Secrets of Amazon,0.0,0.340043,0.0,0.413836,1.0,0.0,0.0
Peter Harry,0.189211,0.177702,0.0,0.0,0.0,1.0,0.824464
Peter Harry Potter,0.362951,0.340873,0.0,0.0,0.0,0.824464,1.0


**Different similarity checks for the following cases**

In [26]:
# Score one free-text query against every document in the corpus.
query = "Harry Potter"
query_scores = cosine_similarity(tfidf1, tf1.transform([query]))

# One row per document, a single column named after the query.
pd.DataFrame(data=query_scores, index=docs1, columns=[query])

Unnamed: 0,Harry Potter
Harry Potter and the Sorcerers Stone,0.484294
Harry Potter and the Chamber of Secrets,0.454835
The Sorcerer's Den,0.0
Great Sorcerers of NY,0.0
Great Secrets of Amazon,0.0
Peter Harry,0.390695
Peter Harry Potter,0.749443


In [29]:
# Same query with the extra in-vocabulary word "and" appended.
query = "Harry Potter and"
query_scores = cosine_similarity(tfidf1, tf1.transform([query]))

# One row per document, a single column named after the query.
pd.DataFrame(data=query_scores, index=docs1, columns=[query])

Unnamed: 0,Harry Potter and
Harry Potter and the Sorcerers Stone,0.646205
Harry Potter and the Chamber of Secrets,0.606897
The Sorcerer's Den,0.0
Great Sorcerers of NY,0.0
Great Secrets of Amazon,0.0
Peter Harry,0.292804
Peter Harry Potter,0.561665


In [30]:
# Query mixing words from the "Harry Potter" titles and the "Great ... of ..." titles.
query = "Harry Potter of Great"
query_scores = cosine_similarity(tfidf1, tf1.transform([query]))

# One row per document, a single column named after the query.
pd.DataFrame(data=query_scores, index=docs1, columns=[query])

Unnamed: 0,Harry Potter of Great
Harry Potter and the Sorcerers Stone,0.315877
Harry Potter and the Chamber of Secrets,0.465818
The Sorcerer's Den,0.0
Great Sorcerers of NY,0.487629
Great Secrets of Amazon,0.487629
Peter Harry,0.254828
Peter Harry Potter,0.488819


**Word order within the string has no effect on the cosine similarity score (TF-IDF is a bag-of-words representation)**

In [33]:
# Score the same bag of words written in two different orders.
data1, data2 = (
    cosine_similarity(tfidf1, tf1.transform([phrase]))
    for phrase in ("Harry and the Potter", "and the Potter Harry")
)

# Put the two score columns side by side for comparison.
data = np.hstack((data1, data2))

In [34]:
pd.DataFrame(data=data, index=docs1,
             columns=["Harry and the Potter","and the Potter Harry"])

Unnamed: 0,Harry and the Potter,and the Potter Harry
Harry Potter and the Sorcerers Stone,0.742506,0.742506
Harry Potter and the Chamber of Secrets,0.69734,0.69734
The Sorcerer's Den,0.220863,0.220863
Great Sorcerers of NY,0.0,0.0
Great Secrets of Amazon,0.0,0.0
Peter Harry,0.254828,0.254828
Peter Harry Potter,0.488819,0.488819


**Partial words have no effect on the similarity score: tokens that are not in the fitted vocabulary (e.g. "Sor", "Sto") are ignored, so the TF-IDF vector only changes when a word is in the vocabulary**

In [37]:
# Scores for a query containing partial words ("Sor", "Sto") that are not in the vocabulary.
data3 = cosine_similarity(tfidf1, tf1.transform(["Harry Potter and the Sor Sto"]))

# Rebuild `data` from the base columns instead of appending to `data` itself,
# so re-running this cell never adds a duplicate column.
data = np.hstack((data1, data2, data3))

In [38]:
pd.DataFrame(data=data, index=docs1,
             columns=["Harry and the Potter","and the Potter Harry","Harry Potter and the Sor Sto"])

Unnamed: 0,Harry and the Potter,and the Potter Harry,Harry Potter and the Sor Sto
Harry Potter and the Sorcerers Stone,0.742506,0.742506,0.742506
Harry Potter and the Chamber of Secrets,0.69734,0.69734,0.69734
The Sorcerer's Den,0.220863,0.220863,0.220863
Great Sorcerers of NY,0.0,0.0,0.0
Great Secrets of Amazon,0.0,0.0,0.0
Peter Harry,0.254828,0.254828,0.254828
Peter Harry Potter,0.488819,0.488819,0.488819


In [None]:
# stop code execution
# Deliberately raises ZeroDivisionError so that "Run All" halts at this cell.
# NOTE(review): this prevents a clean Restart & Run All; confirm the guard is still wanted.

10/0

ZeroDivisionError: division by zero

### `stop_words='english'` vs `stop_words=None` in TfidfVectorizer

In [39]:
# Corpus with punctuation, digits and very short tokens ("#", "#2", "5s", "S",
# "Ss", "7x7") to show which tokens survive into the vocabulary.
docs1 = [
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
    "Great! Sorcerer's of NY 2",
    "Great Secrets of Amazon",
    "S",
    "Ss",
    "7x7"
]

# Second name for the same list object (an alias, not a copy).
docs2 = docs1

In [40]:
# Baseline vectorizer: no stop-word filtering.
# min_df=1 keeps every term that occurs in at least one document; an integer
# min_df below 1 (such as 0) is rejected by scikit-learn's parameter validation
# in recent releases.
tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                      min_df=1)

# Learn the vocabulary and IDF weights, then encode the corpus as a sparse matrix.
tfidf1 = tf1.fit_transform(docs1)

# (number of documents, number of vocabulary terms)
tfidf1.shape

(8, 16)

In [41]:
# Same configuration as the baseline, plus scikit-learn's built-in English stop-word list.
# min_df=1 keeps every remaining term; an integer min_df below 1 (such as 0) is
# rejected by scikit-learn's parameter validation in recent releases.
tf2 = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                      min_df=1, stop_words="english")

# Learn the vocabulary and IDF weights, then encode the corpus as a sparse matrix.
tfidf2 = tf2.fit_transform(docs2)

# (number of documents, number of vocabulary terms after stop-word removal)
tfidf2.shape

(8, 13)

In [42]:
tf1.get_feature_names_out()

array(['5s', '7x7', 'amazon', 'and', 'chamber', 'den', 'great', 'harry',
       'ny', 'of', 'potter', 'secrets', 'sorcerer', 'ss', 'stone', 'the'],
      dtype=object)

In [43]:
tf2.get_feature_names_out()

array(['5s', '7x7', 'amazon', 'chamber', 'den', 'great', 'harry', 'ny',
       'potter', 'secrets', 'sorcerer', 'ss', 'stone'], dtype=object)

In [44]:
set(tf1.get_feature_names_out())-set(tf2.get_feature_names_out())

{'and', 'of', 'the'}

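- With `stop_words='english'`, English stop words such as `{'and', 'of', 'the'}` are removed from the vocabulary.
- Characters, numbers and punctuation are handled as shown by the vocabularies above: punctuation (`#`, `!`, `'`) is dropped, single-character tokens (`S`, `2`, the `s` of `Sorcerer's`) are discarded because the default token pattern needs at least two characters, and `5s`, `ss` and `7x7` are kept.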

In [None]:
# stop code execution
# Deliberately raises ZeroDivisionError so that "Run All" halts at this cell.
# NOTE(review): this prevents a clean Restart & Run All; confirm the guard is still wanted.

10/0

ZeroDivisionError: division by zero

### Cosine Similarity `[m x n],[m x n]` vs `[m x n],[1 x n]` Output and Interpretation

In [None]:
docs1 = [
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
]

In [None]:
# Word-level unigram TF-IDF over the three-document corpus.
# min_df=1 keeps every term that occurs in at least one document; an integer
# min_df below 1 (such as 0) is rejected by scikit-learn's parameter validation
# in recent releases.
tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                      min_df=1)

# Learn the vocabulary and IDF weights, then encode the corpus as a sparse matrix.
tfidf1 = tf1.fit_transform(docs1)

# (number of documents, number of vocabulary terms)
tfidf1.shape

(3, 11)

In [None]:
print(cosine_similarity(tfidf1, tfidf1))

[[1.         0.48296408 0.28318707]
 [0.48296408 1.         0.09042292]
 [0.28318707 0.09042292 1.        ]]


In [None]:
print(cosine_similarity(tfidf1, tf1.transform(["Harry Potter of Great"])))

[[0.41159365]
 [0.65132981]
 [0.        ]]


**Insight**

```
my_matrix =
v1=> 1 2 3
v2=> 4 5 6
v3=> 7 8 9
```

`cosine_similarity(my_matrix, my_matrix) =`

```
    col1      col2      col3
[
[v1 vs v1, v2 vs v1, v3 vs v1],
[v1 vs v2, v2 vs v2, v3 vs v2],
[v1 vs v3, v2 vs v3, v3 vs v3]
]
```

---

```
other_matrix =
v4 => 1 5 9
```

`cosine_similarity(my_matrix, other_matrix) =`

```
    col1
[
[v4 vs v1],
[v4 vs v2],
[v4 vs v3]
]
```


In [None]:
# stop code execution
# Deliberately raises ZeroDivisionError so that "Run All" halts at this cell.
# NOTE(review): this prevents a clean Restart & Run All; confirm the guard is still wanted.

10/0

### np.argsort() vs np.argpartition()

In [None]:
docs1 = [
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
    "Great! Sorcerer's of NY 2",
    "Great Secrets of Amazon",
    "S",
    "Ss",
    "7x7"
]

In [None]:
# Word-level unigram TF-IDF.
# min_df=1 keeps every term that occurs in at least one document; an integer
# min_df below 1 (such as 0) is rejected by scikit-learn's parameter validation
# in recent releases.
tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                      min_df=1)

# Learn the vocabulary and IDF weights, then encode the corpus as a sparse matrix.
tfidf1 = tf1.fit_transform(docs1)

# (number of documents, number of vocabulary terms)
tfidf1.shape

(8, 16)

In [None]:
tf1.get_feature_names_out()

array(['5s', '7x7', 'amazon', 'and', 'chamber', 'den', 'great', 'harry',
       'ny', 'of', 'potter', 'secrets', 'sorcerer', 'ss', 'stone', 'the'],
      dtype=object)

In [None]:
# Score one query against all documents; the result is a column with one score
# per document. The query mixes vocabulary words with tokens that are absent
# from the fitted vocabulary ("S", "7x"), which therefore contribute nothing.
cosine_sim1 = cosine_similarity(tfidf1, tf1.transform(["Harry Potter and Great Amazon NY Ss S 7x"]))
cosine_sim1

array([[0.42897558],
       [0.39673747],
       [0.        ],
       [0.42603636],
       [0.41277968],
       [0.        ],
       [0.41488779],
       [0.        ]])

In [None]:
# Document indices ordered by ascending score, so the best match comes last.
np.argsort(cosine_sim1.flatten())

array([2, 5, 7, 1, 4, 6, 3, 0], dtype=int64)

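- `argpartition` does not fully sort the array; it only partitions it around the _kth_ element, which ends up in the position it would have in a sorted array.
- All elements smaller than the _kth_ element are placed before it and all larger (or equal) elements after it; the order within each side is not guaranteed.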


In [None]:
# kth=-2: only position -2 is guaranteed to hold the index it would have in a
# full sort, with the index of the larger-or-equal score after it. The order of
# all the other indices is unspecified.
np.argpartition(cosine_sim1.flatten(), -2)

array([6, 7, 2, 1, 4, 5, 3, 0], dtype=int64)

In [None]:
# Top-3 document indices, best first: take the last three of the ascending
# argsort and flip them with a reversed slice.
list(np.argsort(cosine_sim1.flatten())[-3:][::-1])

[0, 3, 6]

In [None]:
# argpartition only guarantees that the last 3 positions hold the indices of
# the 3 largest scores, not that they are ordered. Rank those 3 candidates
# explicitly to get a true best-first list.
scores = cosine_sim1.flatten()
top3 = np.argpartition(scores, -3)[-3:]
list(top3[np.argsort(scores[top3])[::-1]])

[0, 3, 6]

In [None]:
# stop code execution
# Deliberately raises ZeroDivisionError so that "Run All" halts at this cell.
# NOTE(review): this prevents a clean Restart & Run All; confirm the guard is still wanted.

10/0

### `analyzer=word` vs `analyzer=char` in TfidfVectorizer 

In [45]:
docs1 = [
    "Harry Potter # and the Sorcerer's Stone",
    "Harry Potter #2 and the Chamber of Secrets",
    "The Sorcerer's Den! 5s",
    "Great! Sorcerer's of NY 2",
    "Great Secrets of Amazon",
    "S",
    "Ss",
    "7x7"
]

In [50]:
# Vocabulary learned with the word analyzer (unigrams).
# min_df=1 keeps every term; an integer min_df below 1 (such as 0) is rejected
# by scikit-learn's parameter validation in recent releases.
TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                min_df=1).fit(docs1).get_feature_names_out()

array(['5s', '7x7', 'amazon', 'and', 'chamber', 'den', 'great', 'harry',
       'ny', 'of', 'potter', 'secrets', 'sorcerer', 'ss', 'stone', 'the'],
      dtype=object)

In [None]:
# Vocabulary learned with the character analyzer (single characters, including
# spaces and punctuation).
# min_df=1 keeps every feature; an integer min_df below 1 (such as 0) is
# rejected by scikit-learn's parameter validation in recent releases.
TfidfVectorizer(analyzer='char', ngram_range=(1, 1),
                min_df=1).fit(docs1).get_feature_names_out()

array([' ', '!', '#', "'", '2', '5', '7', 'a', 'b', 'c', 'd', 'e', 'f',
       'g', 'h', 'm', 'n', 'o', 'p', 'r', 's', 't', 'x', 'y', 'z'],
      dtype=object)

**Unigram and Bigram both**

In [None]:
# Character analyzer with ngram_range=(1, 2): single characters plus every
# adjacent character pair.
# min_df=1 keeps every feature; an integer min_df below 1 (such as 0) is
# rejected by scikit-learn's parameter validation in recent releases.
TfidfVectorizer(analyzer='char', ngram_range=(1, 2),
                min_df=1).fit(docs1).get_feature_names_out()

array([' ', ' #', ' 2', ' 5', ' a', ' c', ' d', ' n', ' o', ' p', ' s',
       ' t', '!', '! ', '#', '# ', '#2', "'", "'s", '2', '2 ', '5', '5s',
       '7', '7x', 'a', 'am', 'an', 'ar', 'at', 'az', 'b', 'be', 'c', 'ce',
       'ch', 'cr', 'd', 'd ', 'de', 'e', 'e ', 'ea', 'ec', 'en', 'er',
       'et', 'f', 'f ', 'g', 'gr', 'h', 'ha', 'he', 'm', 'ma', 'mb', 'n',
       'n!', 'nd', 'ne', 'ny', 'o', 'of', 'on', 'or', 'ot', 'p', 'po',
       'r', 'r ', "r'", 'rc', 're', 'rr', 'ry', 's', 's ', 'se', 'so',
       'ss', 'st', 't', 't ', 't!', 'te', 'th', 'to', 'ts', 'tt', 'x',
       'x7', 'y', 'y ', 'z', 'zo'], dtype=object)

### Custom Tokenizer in TfidfVectorizer

#### Using nltk as Tokenizer

In [2]:
# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Define a custom tokenizer using NLTK
def custom_tokenizer(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize``.

    Punctuation marks come back as tokens of their own, which is why "." and
    "?" show up in the fitted vocabulary.

    Args:
        text: The document string handed over by the vectorizer.

    Returns:
        The tokens produced by ``word_tokenize``.
    """
    # Extension point: filter or normalise the tokens here before returning them.
    return word_tokenize(text)

# Create TfidfVectorizer with the custom tokenizer.
# token_pattern=None states explicitly that the default regex is not used, which
# avoids scikit-learn's "token_pattern will not be used" UserWarning.
tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, token_pattern=None)

# Fit and transform the documents
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Get feature names (terms)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Print the TF-IDF matrix and feature names
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())
print("\nFeature Names:")
print(feature_names)

TF-IDF Matrix:
[[0.42520648 0.         0.         0.42520648 0.5252146  0.34763416
  0.         0.         0.34763416 0.         0.34763416]
 [0.32513203 0.         0.         0.65026407 0.         0.26581674
  0.         0.50938216 0.26581674 0.         0.26581674]
 [0.31055267 0.         0.48654076 0.         0.         0.25389715
  0.48654076 0.         0.25389715 0.48654076 0.25389715]
 [0.         0.59276931 0.         0.37835697 0.46734613 0.30933162
  0.         0.         0.30933162 0.         0.30933162]]

Feature Names:
['.' '?' 'and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']




#### Using Custom Regex as Tokenizer

In [5]:
# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Define a custom tokenizer using a regex pattern
def custom_tokenizer(text):
    """Split ``text`` into word tokens with a regular expression.

    Each token is a run of word characters delimited by word boundaries, so
    punctuation such as "." and "?" never becomes a token.

    Args:
        text: The document string to tokenize.

    Returns:
        A list of the matched word strings, in order of appearance.
    """
    # Extension point: filter or normalise the tokens here before returning them.
    return [match.group() for match in re.finditer(r'\b\w+\b', text)]

# Create TfidfVectorizer with the custom tokenizer.
# token_pattern=None states explicitly that the default regex is not used, which
# avoids scikit-learn's "token_pattern will not be used" UserWarning.
tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, token_pattern=None)

# Fit and transform the documents
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Get feature names (terms)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Print the TF-IDF matrix and feature names
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())
print("\nFeature Names:")
print(feature_names)


TF-IDF Matrix:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]

Feature Names:
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
