## Ans(3):

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances, euclidean_distances
from sklearn.neighbors import NearestNeighbors
import re
import string

In [4]:
# Step 1: Load and Preprocess the Dataset
dataset = pd.read_json("News_Category_Dataset_v3.json", lines=True)
dataset

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28


In [5]:
dataset['headline'][0]

'Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters'

In [6]:
dataset['headline'].mode()

0    Sunday Roundup
Name: headline, dtype: object

In [7]:
dataset['headline'].value_counts().nlargest(3)

Sunday Roundup                                            90
The 20 Funniest Tweets From Women This Week               80
Weekly Roundup of eBay Vintage Clothing Finds (PHOTOS)    59
Name: headline, dtype: int64

In [8]:
# Step 2: Data Preprocessing
# Clean the text by removing unwanted characters, symbols, and punctuation
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    text = text.strip()  # Remove leading/trailing spaces
    return text

dataset['cleaned_text'] = dataset['headline'].apply(clean_text)


dataset

Unnamed: 0,link,headline,category,short_description,authors,date,cleaned_text
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,over million americans roll up sleeves for om...
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,american airlines flyer charged banned for lif...
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23,of the funniest tweets about cats and dogs thi...
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23,the funniest tweets from parents this week sept
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22,woman who called cops on black birdwatcher los...
...,...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28,rim ceo thorsten heins significant plans for b...
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28,maria sharapova stunned by victoria azarenka i...
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28,giants over patriots jets over colts among mo...
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28,aldon smith arrested ers linebacker busted for...


In [9]:
# Step 3: Define the Given Data Point
given_data = "The 20 Funniest Tweets From Women This Week"
given_data = clean_text(given_data)
#print(given_data)

In [10]:
# Step 4: Calculate Similarity Scores
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(dataset['cleaned_text'])

In [11]:
# Convert the given data point to a TF-IDF vector
given_data_vector = tfidf_vectorizer.transform([given_data])

In [12]:
# Calculate cosine similarity
cosine_sim_scores = cosine_similarity(tfidf_matrix, given_data_vector)

In [13]:
# Calculate Manhattan distances
manhattan_dist_scores = manhattan_distances(tfidf_matrix, given_data_vector)

In [14]:
# Calculate Euclidean distances
euclidean_dist_scores = euclidean_distances(tfidf_matrix, given_data_vector)

In [18]:
# Step 5: Find the Most Similar Data
# Find the index of the most similar data point using the maximum similarity score for each algorithm
most_similar_cosine = cosine_sim_scores.argmax()
most_similar_manhattan = manhattan_dist_scores.argmax()
most_similar_euclidean = euclidean_dist_scores.argmax()

In [19]:
# Get the actual data of the most similar data points
most_similar_cosine_data = dataset.loc[most_similar_cosine]
most_similar_manhattan_data = dataset.loc[most_similar_manhattan]
most_similar_euclidean_data = dataset.loc[most_similar_euclidean]

In [20]:
# Print the results
print("Most Similar Data (Cosine Similarity):")
print(most_similar_cosine_data)
print("\nMost Similar Data (Manhattan Distance):")
print(most_similar_manhattan_data)
print("\nMost Similar Data (Euclidean Distance):")
print(most_similar_euclidean_data)

Most Similar Data (Cosine Similarity):
link                 https://www.huffingtonpost.com/entry/the-20-fu...
headline                   The 20 Funniest Tweets From Women This Week
category                                                         WOMEN
short_description    "Welcome to adulthood. You have a favorite spa...
authors                                                  Hollis Miller
date                                               2018-05-25 00:00:00
cleaned_text                 the  funniest tweets from women this week
Name: 8742, dtype: object

Most Similar Data (Manhattan Distance):
link                 https://www.huffingtonpost.com/entry/chats-wit...
headline             Chats with Esperanza Spalding, Michelle Philli...
category                                                 ENTERTAINMENT
short_description    DAVE MCGRAW & MANDY FER’S “CREATURES WE ARE” E...
authors              Mike Ragogna, ContributorTrafficbeat Entertain...
date                                      

In [21]:
# Step 6: Train the KNN Model
k = 5  # Number of neighbors to consider
knn_model = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='cosine')
knn_model.fit(tfidf_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [22]:
# Step 7: Convert the Given Data Point to a TF-IDF Vector
given_data_vector = tfidf_vectorizer.transform([given_data])

# Step 8: Find the Most Similar Data Points
distances, indices = knn_model.kneighbors(given_data_vector)

# Step 9: Get the Actual Data of the Most Similar Data Points
most_similar_data = dataset.iloc[indices[0]]

# Step 10: Print the Results
print("Most Similar Data:")
print(most_similar_data)

Most Similar Data:
                                                     link  \
70651   https://www.huffingtonpost.com/entry/the-20-fu...   
128724  https://www.huffingtonpost.com/entry/best-twee...   
81651   https://www.huffingtonpost.com/entry/the-20-fu...   
47253   https://www.huffingtonpost.com/entry/the-20-fu...   
14205   https://www.huffingtonpost.com/entry/the-20-fu...   

                                           headline category  \
70651   The 20 Funniest Tweets From Women This Week    WOMEN   
128724     The Funniest Tweets From Women This Week    WOMEN   
81651   The 20 Funniest Tweets From Women This Week    WOMEN   
47253   The 20 Funniest Tweets From Women This Week    WOMEN   
14205   The 20 Funniest Tweets From Women This Week    WOMEN   

                                        short_description          authors  \
70651   The ladies of Twitter never fail to brighten o...  Alanna Vagianos   
128724                                                     Alanna Vagiano