**Imports**


In [4]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Study


#### Effect of punctuation


In [20]:
# Baseline corpus: the fourth title uses the plain plural "Sorcerers".
docs1 = [
    "Harry Potter and the Sorcerer's Stone",
    "Harry Potter and the Chamber of Secrets",
    "The Sorcerer's Den",
    "Great Sorcerers of NY",
    "Great Secrets of Amazon",
]

# Same corpus, except the fourth title carries an apostrophe ("Sorcerer's"),
# which is the only difference between the two lists.
docs2 = docs1[:3] + ["Great Sorcerer's of NY"] + docs1[4:]

In [21]:
# Corpus used by the cells below (vectorizer input and DataFrame row labels).
docs = docs2

In [22]:
# Unigram TF-IDF over `docs` with English stop words removed.
# min_df=1 keeps every term that occurs in at least one document; it is the
# smallest integer value scikit-learn accepts (recent releases reject an
# integer min_df of 0 during parameter validation).
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                     min_df=1, stop_words='english')

# Learn the vocabulary from `docs` and build the document-term matrix.
tfidf = tf.fit_transform(docs)

# (number of documents, number of vocabulary terms)
tfidf.shape

(5, 10)

In [23]:
# Show the TF-IDF weights as a dense matrix: one row per document, one column per term.
tfidf.todense()

matrix([[0.        , 0.        , 0.        , 0.        , 0.48648432,
         0.        , 0.48648432, 0.        , 0.40382593, 0.60298477],
        [0.        , 0.5819515 , 0.        , 0.        , 0.4695148 ,
         0.        , 0.4695148 , 0.4695148 , 0.        , 0.        ],
        [0.        , 0.        , 0.83088075, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.55645052, 0.        ],
        [0.        , 0.        , 0.        , 0.55681615, 0.        ,
         0.69015927, 0.        , 0.        , 0.4622077 , 0.        ],
        [0.659118  , 0.        , 0.        , 0.53177225, 0.        ,
         0.        , 0.        , 0.53177225, 0.        , 0.        ]])

In [24]:
# Vocabulary learned by the vectorizer; these names label the columns of `tfidf`.
tf.get_feature_names_out()

array(['amazon', 'chamber', 'den', 'great', 'harry', 'ny', 'potter',
       'secrets', 'sorcerer', 'stone'], dtype=object)

In [25]:
# Labelled view of the TF-IDF matrix: rows are titles, columns are terms.
pd.DataFrame(
    data=tfidf.toarray(),
    index=docs,
    columns=tf.get_feature_names_out(),
)

Unnamed: 0,amazon,chamber,den,great,harry,ny,potter,secrets,sorcerer,stone
Harry Potter and the Sorcerer's Stone,0.0,0.0,0.0,0.0,0.486484,0.0,0.486484,0.0,0.403826,0.602985
Harry Potter and the Chamber of Secrets,0.0,0.581951,0.0,0.0,0.469515,0.0,0.469515,0.469515,0.0,0.0
The Sorcerer's Den,0.0,0.0,0.830881,0.0,0.0,0.0,0.0,0.0,0.556451,0.0
Great Sorcerer's of NY,0.0,0.0,0.0,0.556816,0.0,0.690159,0.0,0.0,0.462208,0.0
Great Secrets of Amazon,0.659118,0.0,0.0,0.531772,0.0,0.0,0.0,0.531772,0.0,0.0


In [26]:
# Pairwise cosine similarity between all documents: entry [i, j] compares
# document i with document j, so the diagonal is 1.
cosine_sim = cosine_similarity(X=tfidf, Y=tfidf)
cosine_sim

array([[1.        , 0.45682318, 0.22470915, 0.18665146, 0.        ],
       [0.45682318, 1.        , 0.        , 0.        , 0.24967495],
       [0.22470915, 0.        , 1.        , 0.25719572, 0.        ],
       [0.18665146, 0.        , 0.25719572, 1.        , 0.29609938],
       [0.        , 0.24967495, 0.        , 0.29609938, 1.        ]])

In [27]:
# Similarity between the 4th title (row index 3) and the 1st title (column index 0)

cosine_sim[3, 0]

0.18665145580125506

In [28]:
# Deliberately trigger a ZeroDivisionError to stop code execution at this cell.

10/0

ZeroDivisionError: division by zero

**Insight**
- Punctuation in the actual documents matters, because terms are truncated at these characters (e.g. `Sorcerer's` produces the term `sorcerer`).
- It is better to remove punctuation before calculating TF-IDF.

#### Cosine Similarity [m x n][m x n] vs [m x n][1 x n] Output and Interpretation

In [30]:
# Corpus shown for reference while reading the query result below.
# NOTE(review): this literal matches docs1 ("Great Sorcerers of NY"), but
# `tfidf` above was fitted on docs2 ("Great Sorcerer's of NY") — confirm
# which corpus this section is meant to use.
[
    "Harry Potter and the Sorcerer's Stone",
    "Harry Potter and the Chamber of Secrets",
    "The Sorcerer's Den",
    "Great Sorcerers of NY",
    "Great Secrets of Amazon",
]

["Harry Potter and the Sorcerer's Stone",
 'Harry Potter and the Chamber of Secrets',
 "The Sorcerer's Den",
 'Great Sorcerers of NY',
 'Great Secrets of Amazon']

In [31]:
# Score every fitted document against a single free-text query; the result
# has one row per document and a single column for the query.
# Note: "sorcerers" is not among the fitted feature names listed earlier
# (only "sorcerer" is), so only "harry" and "secrets" can match.
cosine_sim = cosine_similarity(
    X=tfidf,
    Y=tf.transform(["Harry Secrets Sorcerers"]),
)
cosine_sim

array([[0.34399636],
       [0.6639942 ],
       [0.        ],
       [0.        ],
       [0.37601977]])

**Insight**

```
my_matrix =
v1=> 1 2 3
v2=> 4 5 6
v3=> 7 8 9
```

`cosine_similarity(my_matrix, my_matrix) =`

```
    col1      col2      col3
[
[v1 vs v1, v2 vs v1, v3 vs v1],
[v1 vs v2, v2 vs v2, v3 vs v2],
[v1 vs v3, v2 vs v3, v3 vs v3]
]
```

---

```
other_matrix =
v4 => 1 5 9
```

`cosine_similarity(my_matrix, other_matrix) =`

```
    col1
[
[v4 vs v1],
[v4 vs v2],
[v4 vs v3]
]
```


#### np.argsort() vs np.argpartition()

In [None]:
# Indices that would order the similarity scores from lowest to highest.
np.argsort(cosine_sim.ravel())

array([2, 0, 4, 3, 1], dtype=int64)

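- `argpartition` does not fully sort the array; it only rearranges it so that the element at the _kth_ position is the one that would be there if the array were sorted.
- All elements smaller than the _kth_ element are placed before it and all elements equal to or larger than it are placed after it; the order within each of the two partitions is undefined.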


In [None]:
# Indices partitioned around the 3rd-largest score: the last three positions
# hold the indices of the three largest scores.
np.argpartition(cosine_sim.ravel(), kth=-3)

array([2, 0, 4, 3, 1], dtype=int64)

In [None]:
# Top-3 document indices, best match first: sort ascending, flip, keep three.
list(np.argsort(cosine_sim.flatten())[::-1][:3])

[1, 3, 4]

In [None]:
# Top-3 document indices, best match first, using argpartition.
# argpartition only guarantees that the last three positions hold the three
# largest scores; their relative order is unspecified, so simply reversing
# them does not give a reliable ranking. Rank those three explicitly instead.
scores = cosine_sim.flatten()
top3 = np.argpartition(scores, -3)[-3:]
list(top3[np.argsort(scores[top3])[::-1]])

[1, 3, 4]