In [5]:
from sklearn.datasets import fetch_20newsgroups

# Load four newsgroup categories from the dataset's 'test' subset, asking
# scikit-learn to strip headers, footers and quoted replies from every post.
DATA = fetch_20newsgroups(
    subset='test',
    categories=[
        'rec.autos',
        'comp.windows.x',
        'soc.religion.christian',
        'rec.sport.baseball',
    ],
    remove=('headers', 'footers', 'quotes'),
)


In [6]:
# Category names; the integer codes in DATA.target are indices into this list.
DATA.target_names

['comp.windows.x', 'rec.autos', 'rec.sport.baseball', 'soc.religion.christian']

In [7]:
# Integer class code of each document (an index into DATA.target_names).
DATA.target

array([2, 3, 2, ..., 1, 0, 0])

In [8]:
# Raw document texts, used as the model inputs.
FEATURES = DATA.data

In [9]:
# Translate every integer class code into its human-readable category name.
TARGET = [
    DATA.target_names[class_id]
    for class_id in DATA.target
]

In [10]:
# Sanity check: show the first document together with its category name.
print(
    f'DATA : {FEATURES[0]}',
    f'LABEL : {TARGET[0]}',
    sep='\n',
)

DATA : With all the recent problems the Indians have been having
with their pitching staff I have heard numerous names
thrown around about who could solve their problem.

One name I have not heard is Mike Soper (RP).  As far as
I know, Soper has had pretty good minor league stats.
Why not give the kid a chance?  Anyone know anything about
this guy?

-- 
LABEL : rec.sport.baseball


In [11]:
from collections import Counter 

In [12]:
# Class-balance check: number of documents per category name.
Counter(TARGET)

Counter({'soc.religion.christian': 398,
         'rec.sport.baseball': 397,
         'rec.autos': 396,
         'comp.windows.x': 395})

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% of the documents. Stratifying on the label keeps the class
# proportions equal in both parts, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    FEATURES,
    TARGET,
    test_size=0.2,
    stratify=TARGET,
    random_state=42,
)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features with English stop words removed. The vocabulary is
# learned from the training texts only, so nothing from the held-out texts
# leaks into the feature space.
vectorizer = CountVectorizer(stop_words='english')

# fit_transform learns the vocabulary and vectorizes the training texts in a
# single pass, instead of tokenizing the corpus once in fit() and again in
# transform().
x_train_v = vectorizer.fit_transform(x_train)

# Held-out texts are projected onto the vocabulary learned above.
x_test_v = vectorizer.transform(x_test)

In [16]:
# Raw text of the first training document.
x_train[0]

'\n\nPlease note that God commanded Adam to work before the fall:\n\n"The LORD God took the man and put him in the Garden of Eden to work\n it and take care of it." (Gen 2:15, NIV).  \n\nWork was God\'s design from the beginning.\n\n-- \nKen'

In [17]:
# Vectorized form of the first training document: one sparse row of the count matrix.
x_train_v[0]

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 18 stored elements and shape (1, 18556)>

In [18]:
# Category name of the first training document.
y_train[0]

'soc.religion.christian'

In [19]:
# Raw text of the held-out document at index 3.
x_test[3]

"\n\nEither I've just fallen for this, or you guys\nare _really_ paranoid!\n\nYou're actually worried about somebody stealing \nyour oil?\n\nC'mon, you think a vandal'll do that?!\n\nThat's absolutely ridiculous!\n\nBesides, how hard is it to get under the car to \nchange the oil?\n\nI can say from experience on the cars that I've driven and\nchanged the oil on, my Mazda 323 is pretty much a pain, but\nonce you've done it once, you don't forget how, and it\ngets easier.\n\nI can't imagine any other cars are much worse than mine.\n"

In [20]:
# Vectorized form of the held-out document at index 3: one sparse row of the count matrix.
x_test_v[3]

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 32 stored elements and shape (1, 18556)>

In [21]:
# Category name of the held-out document at index 3.
y_test[3]

'rec.autos'

In [42]:
import random
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from termcolor import colored  # Ensure termcolor is installed: `pip install termcolor`

# Qualitative check of the bag-of-words space: for a few held-out documents,
# print the true label and the labels of the most similar training documents
# (green = same label as the test document, red = different).
N_PREVIEW = 5  # number of held-out documents to inspect
TOP_K = 3      # nearest training documents shown per held-out document

# A dedicated, seeded generator makes the preview reproducible on
# "Restart & Run All" without touching the global `random` state.
preview_rng = random.Random(42)

# `sample` draws without replacement, so the same document is never shown
# twice (`random.choices` may repeat an index). `min` guards against a
# held-out set smaller than N_PREVIEW.
preview_ids = preview_rng.sample(range(len(x_test)), k=min(N_PREVIEW, len(x_test)))

for i in preview_ids:
    print(f"id: {i}")
    print("true_label:", colored(y_test[i], 'green'))

    # Cosine similarity between this test vector and every training vector.
    # Despite the name, `distances` holds similarities: higher means closer.
    distances = cosine_similarity(x_test_v[i].reshape(1, -1), x_train_v).flatten()

    # Training-document indices ordered from most to least similar.
    # (The spelling of `indicies` is kept in case later cells reference it.)
    indicies = np.argsort(distances)[::-1]

    # Print the TOP_K nearest labels
    for idx, j in enumerate(indicies[:TOP_K]):
        print(
            f"{idx} nearest label is {colored(y_train[j], 'green' if y_train[j] == y_test[i] else 'red')}, "
            f"similarity: {colored(round(distances[j], 3), 'yellow')}"
        )



id: 8
true_label: [32msoc.religion.christian[0m
0 nearest label is [32msoc.religion.christian[0m, similarity: [33m0.164[0m
1 nearest label is [32msoc.religion.christian[0m, similarity: [33m0.151[0m
2 nearest label is [32msoc.religion.christian[0m, similarity: [33m0.133[0m
id: 126
true_label: [32mrec.autos[0m
0 nearest label is [32mrec.autos[0m, similarity: [33m0.218[0m
1 nearest label is [32mrec.autos[0m, similarity: [33m0.216[0m
2 nearest label is [32mrec.autos[0m, similarity: [33m0.214[0m
id: 277
true_label: [32msoc.religion.christian[0m
0 nearest label is [32msoc.religion.christian[0m, similarity: [33m0.346[0m
1 nearest label is [32msoc.religion.christian[0m, similarity: [33m0.329[0m
2 nearest label is [32msoc.religion.christian[0m, similarity: [33m0.306[0m
id: 312
true_label: [32msoc.religion.christian[0m
0 nearest label is [32msoc.religion.christian[0m, similarity: [33m0.216[0m
1 nearest label is [32msoc.religion.christian[0m, simi