In [None]:
import os

import faiss
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Step 1: Data Preprocessing
# Function to process each line and extract the relevant information
def process_line(line):
    """Extract the user id and track name from one row of the listening log.

    The row is expected to be tab-separated with at least six columns; the
    user id is read from column 0 and the track name from column 5.

    Args:
        line: One raw line of the TSV file, with or without its line ending.

    Returns:
        A ``(user_id, track_name)`` tuple of strings.

    Raises:
        IndexError: If the line has fewer than six tab-separated columns.
    """
    # Strip only the line terminator before splitting. A blanket strip() also
    # eats leading/trailing tabs, so an empty first or last column would shift
    # every field and make column 5 disappear.
    fields = line.rstrip('\r\n').split('\t')
    # Outer whitespace is still trimmed from the two fields that sit at the
    # ends of a six-column row, matching what strip() did for well-formed rows.
    user_id = fields[0].lstrip()
    track_name = fields[5].rstrip()
    return user_id, track_name

# Read the listening log and group track names by user, preserving file order
DATA_PATH = r'C:\Users\saica\Downloads\lastfm-dataset-1K\lastfm-dataset-1K\userid-timestamp-artid-artname-traid-traname.tsv'

user_track_dict = {}
with open(DATA_PATH, 'r', encoding='utf-8') as file:
    for line in file:
        user_id, track_name = process_line(line)
        # setdefault creates the user's list on first sight, then appends to it
        user_track_dict.setdefault(user_id, []).append(track_name)

# User ids in first-seen order; position i here matches user_docs[i] below
users = list(user_track_dict)

# One "document" per user: the list of track names that user played
user_docs = list(user_track_dict.values())


In [None]:
# Step 2: Doc2Vec Embedding
# Each user's listening history is one document. The tag is the user's position
# in `user_docs`, so model.dv[str(i)] is the vector learned for user_docs[i].
tagged_data = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(user_docs)]

max_epochs = 200
vec_size = 54
alpha = 0.025
# gensim expects a positive thread count. A negative value is not a valid
# "use all cores" setting for Doc2Vec, so ask the OS for the core count instead.
workers = os.cpu_count() or 1
model = Doc2Vec(
    vector_size=vec_size,
    alpha=alpha,
    min_alpha=0.00025,
    min_count=1,
    dm=1,
    workers=workers,
    epochs=max_epochs,
)
model.build_vocab(tagged_data)
# Train with a single call: gensim itself decays the learning rate from `alpha`
# to `min_alpha` across the requested epochs. Calling train() in a loop while
# decrementing alpha by hand multiplies the number of passes by model.epochs and
# drives alpha below zero after 0.025 / 0.0002 = 125 iterations.
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
num_docs = len(user_docs)


In [None]:
# Collect one embedding per user document. Tags were assigned as str(position),
# so iterate over however many documents exist instead of a hard-coded count.
all_embeddings = [model.dv[str(i)] for i in range(len(user_docs))]
print(all_embeddings[0:5])
userkeys = list(user_track_dict.keys())
print(userkeys[0:5])

# Map each user id to its embedding. userkeys and all_embeddings are parallel
# because both follow the insertion order of user_track_dict.
user_embeddings_dict = dict(zip(userkeys, all_embeddings))

[array([-0.0096867 , -0.01107246, -0.01829769,  0.0158386 ,  0.00660392,
        0.0004871 , -0.01829746, -0.00956787, -0.01799623,  0.00372367,
        0.00524132,  0.00859918, -0.00795792, -0.00582538, -0.00570146,
       -0.01615169,  0.00402312,  0.01708449, -0.01759605, -0.00640386,
       -0.00698131,  0.00482849, -0.01053992,  0.00485311,  0.01074541,
       -0.0150127 , -0.01542554, -0.01843457,  0.00913527, -0.01689316,
        0.01081845,  0.01259308, -0.01204889, -0.00837015, -0.00232382,
        0.00304874, -0.00274322, -0.01581952, -0.00667151,  0.00320671,
       -0.00380911, -0.01338897,  0.00774926, -0.01587841,  0.00502136,
       -0.00854393,  0.00119524, -0.0038099 ,  0.0100245 , -0.01481958,
       -0.00392565, -0.00017744, -0.01229403, -0.01208549], dtype=float32), array([-0.00357955,  0.01630273, -0.0023392 ,  0.00654811, -0.01064873,
        0.01632373,  0.00539902,  0.0171846 ,  0.00805529, -0.00777698,
        0.00415168, -0.00817121,  0.01069804,  0.00339171, 

In [None]:
import faiss

# Stack the vectors of every document except the final five, which are held
# out and used as queries later on
indexed_tags = [str(doc_id) for doc_id in range(num_docs - 5)]
doc_vectors = np.array([model.dv[tag] for tag in indexed_tags])

# Exact (brute-force) L2 index whose dimensionality is the embedding size
index = faiss.IndexFlatL2(vec_size)

# Row j of doc_vectors receives FAISS id j
index.add(doc_vectors)

In [None]:
import faiss

# Embeddings of the five held-out documents (the last five positions), one per row
held_out_tags = [str(doc_id) for doc_id in range(num_docs - 5, num_docs)]
left_out_embeddings = np.array([model.dv[tag] for tag in held_out_tags])

# For every held-out vector, retrieve its k = 40 nearest indexed vectors.
# D holds the distances and I the matching FAISS ids, one row per query.
k = 40
D, I = index.search(left_out_embeddings, k)

# Show the neighbor ids first, then the corresponding distances
for label, values in (
    ("Indices of the most similar vectors:", I),
    ("Distances to the most similar vectors:", D),
):
    print(label)
    print(values)

Indices of the most similar vectors:
[[745 861 246 484 648  26 628 181  48 244 904  77 121 515 766 915 558 155
  292 575 294 809 757 798 625 290 323 447 273 408 710 709 471 226 722 475
  371  22 508 596]
 [294 416 788 502 532 838 589 929 509 844 132 262 575 733 666 776   4 571
  757 425 181 503 604 352 596 203 409 327 337 275 813 320 933 643 238 840
  429  30 102 753]
 [407 192 473 483 118 356 395 178 424 477 342 148 899  39 697 634 643 563
  850 981  82 826 482 842 765 294 940 607 787 938 385 688 230 189 572 889
  615 766 623 515]
 [600 845 694 283 788 125 571 721 889 933 529 647 451 132 278 541 257 869
  493 141 515 714 684 589 429  35 862 244 385 225 602 975 594 596 100 708
  409 311 113 428]
 [876 185 787 482 232 385 839 874 685 240 650 703 526 498 112  86 696 721
  192  22 426 722 316  38  13 849  48 690 831 634 602 339 727 639 978  82
  310  52 926 515]]
Distances to the most similar vectors:
[[0.00752663 0.00765096 0.00787766 0.00818963 0.00828231 0.00838386
  0.00856833 0.00874

In [None]:
# Neighbor ids returned by FAISS for the last held-out user (row -1 of I)
similar_indices_last = I[-1]

# FAISS ids are positions in user_docs, not user numbers. User ids in the
# dataset are not contiguous, so formatting the position as "user_%06d" names
# the wrong user; look the key up in the parallel `users` list instead.
similar_user_keys_last1 = [users[idx] for idx in similar_indices_last]

print("Most similar user keys (last array):")
print(similar_user_keys_last1)

Most similar user keys (last array):
['user_000876', 'user_000185', 'user_000787', 'user_000482', 'user_000232', 'user_000385', 'user_000839', 'user_000874', 'user_000685', 'user_000240', 'user_000650', 'user_000703', 'user_000526', 'user_000498', 'user_000112', 'user_000086', 'user_000696', 'user_000721', 'user_000192', 'user_000022', 'user_000426', 'user_000722', 'user_000316', 'user_000038', 'user_000013', 'user_000849', 'user_000048', 'user_000690', 'user_000831', 'user_000634', 'user_000602', 'user_000339', 'user_000727', 'user_000639', 'user_000978', 'user_000082', 'user_000310', 'user_000052', 'user_000926', 'user_000515']


In [None]:
# Neighbor ids returned by FAISS for the last held-out user (row -1 of I)
similar_indices_last = I[-1]

# Translate each FAISS id (a position in user_docs) to the real user key.
# `users` is parallel to user_docs, so this stays correct even though user ids
# have gaps; "position + 1" does not.
similar_user_keys_last = [users[idx] for idx in similar_indices_last]

print("Most similar user keys (last array):")
print(similar_user_keys_last)


Most similar user keys (last array):
['user_000877', 'user_000186', 'user_000788', 'user_000483', 'user_000233', 'user_000386', 'user_000840', 'user_000875', 'user_000686', 'user_000241', 'user_000651', 'user_000704', 'user_000527', 'user_000499', 'user_000113', 'user_000087', 'user_000697', 'user_000722', 'user_000193', 'user_000023', 'user_000427', 'user_000723', 'user_000317', 'user_000039', 'user_000014', 'user_000850', 'user_000049', 'user_000691', 'user_000832', 'user_000635', 'user_000603', 'user_000340', 'user_000728', 'user_000640', 'user_000979', 'user_000083', 'user_000311', 'user_000053', 'user_000927', 'user_000516']


In [None]:
# Listening history (list of track names) of every retrieved neighbor
similar_user_tracks_all = [user_track_dict[neighbor] for neighbor in similar_user_keys_last]

# Merge all neighbor histories into one set of distinct track names
all_tracks_set = set().union(*similar_user_tracks_all)

In [None]:
# The neighbors collected so far were retrieved for row -1 of I, i.e. the last
# document, whose user key is the last entry of `users`. Look the key up rather
# than hard-coding it so the label always names the user actually loaded.
held_out_user = users[-1]

# Distinct tracks played by the held-out user. The variable name is kept
# because later cells read it.
user_000992_tracks_set = set(user_track_dict[held_out_user])

print(f"Set of tracks for {held_out_user}:")
print(user_000992_tracks_set)
# Ad-hoc spot check for a single track title
print('Motherzone' in user_000992_tracks_set)


Set of tracks for user_000992:
False


In [None]:
# Tracks the held-out user played that none of the retrieved neighbors played
unique_tracks_user_000992 = user_000992_tracks_set.difference(all_tracks_set)

print("Unique tracks in user_000992_tracks_set:", unique_tracks_user_000992, sep="\n")


Unique tracks in user_000992_tracks_set:
{'Freestyle (Beans)', 'Miasto Doznań', "Makin' My Way [Prd. M-Phazes]", 'Reckoner Live @ Gorge', 'Crazy/Forever', "Well It'S True That We Love One Another", 'Boy Decide', 'Deafkids', "Heaven'S Demon", 'Understand?', 'The Game (Feat. Dj Premier)', 'This Is What It Became (Dubox)', 'Neony', "Just Couldn'T Tie Me Down", 'Taught To Look Away', 'Song Of Our So-Called Friend', 'Míg Nyelveden A Csoki', 'The Jesus Demeanor', 'You Got Me Up', 'Mule', 'One Bass Hit', 'Duet For Guitars #2', 'S.K.J.', 'Solar Day', 'Sages-Femmes', 'Savage Composition', 'For Minor Sky', 'The Names Of All The Animals', 'Time Of Action (Not Panic (A Better Way (Logarhythmix)))', 'A Caucus Race', 'Silverline', 'Dinner And A Movie', 'Nieces Pieces (Boat Knife Version)', 'Blue Imelda', '02 Organisms', 'Any Day', 'Thought I Was A Gun', 'Joe Dimaggio Done It Again', 'String Strikes', 'Black Republicans (Feat. Juelz Santana)', 'A Party (For Everyone (Get Involved (Logarhythmix)))', '

In [None]:
# Share of the held-out user's distinct tracks that at least one neighbor also played
missing_fraction = len(unique_tracks_user_000992) / len(user_000992_tracks_set)
print(1 - missing_fraction)

0.7083415112855741


In [None]:
# Evaluate neighbor coverage for one held-out user.
# Rows of I correspond to the last five documents, so row -n of I belongs to
# users[-n]; deriving the key from the row keeps query and ground truth aligned.
query_row = -1
held_out_user = users[query_row]

# Neighbor ids returned by FAISS for this held-out user
similar_indices_last = I[query_row]

# FAISS ids are positions in user_docs; map them through `users` because user
# ids have gaps and "position + 1" names the wrong user.
similar_user_keys_last = [users[idx] for idx in similar_indices_last]

print(f"Most similar user keys for {held_out_user}:")
print(similar_user_keys_last)

similar_user_tracks_all = [user_track_dict[key] for key in similar_user_keys_last]

# Distinct tracks played by any of the neighbors
all_tracks_set = set(track for tracks_list in similar_user_tracks_all for track in tracks_list)

# Distinct tracks played by the held-out user
held_out_tracks = set(user_track_dict[held_out_user])

print(f"Set of tracks for {held_out_user}:")
print(held_out_tracks)

# Tracks of the held-out user that no neighbor played
uncovered_tracks = held_out_tracks - all_tracks_set

print(f"Tracks of {held_out_user} not played by any neighbor:")
print(uncovered_tracks)

# Fraction of the held-out user's tracks covered by the neighbors
print(1 - len(uncovered_tracks) / len(held_out_tracks))





Most similar user keys (last array):
['user_000877', 'user_000186', 'user_000788', 'user_000483', 'user_000233', 'user_000386', 'user_000840', 'user_000875', 'user_000686', 'user_000241', 'user_000651', 'user_000704', 'user_000527', 'user_000499', 'user_000113', 'user_000087', 'user_000697', 'user_000722', 'user_000193', 'user_000023', 'user_000427', 'user_000723', 'user_000317', 'user_000039', 'user_000014', 'user_000850', 'user_000049', 'user_000691', 'user_000832', 'user_000635', 'user_000603', 'user_000340', 'user_000728', 'user_000640', 'user_000979', 'user_000083', 'user_000311', 'user_000053', 'user_000927', 'user_000516']
Set of tracks for user_001000:
Unique tracks in user_000992_tracks_set:
{'Freestyle (Beans)', 'Miasto Doznań', "Makin' My Way [Prd. M-Phazes]", 'Reckoner Live @ Gorge', 'Crazy/Forever', "Well It'S True That We Love One Another", 'Boy Decide', 'Deafkids', "Heaven'S Demon", 'Understand?', 'The Game (Feat. Dj Premier)', 'This Is What It Became (Dubox)', 'Neony', 

In [None]:
# Evaluate neighbor coverage for one held-out user.
# NOTE(review): this cell evaluates row -1 again, exactly like the preceding
# cell; consider deleting one of the two.
# Rows of I correspond to the last five documents, so row -n of I belongs to
# users[-n]; deriving the key from the row keeps query and ground truth aligned.
query_row = -1
held_out_user = users[query_row]

# Neighbor ids returned by FAISS for this held-out user
similar_indices_last = I[query_row]

# FAISS ids are positions in user_docs; map them through `users` because user
# ids have gaps and "position + 1" names the wrong user.
similar_user_keys_last = [users[idx] for idx in similar_indices_last]

print(f"Most similar user keys for {held_out_user}:")
print(similar_user_keys_last)

similar_user_tracks_all = [user_track_dict[key] for key in similar_user_keys_last]

# Distinct tracks played by any of the neighbors
all_tracks_set = set(track for tracks_list in similar_user_tracks_all for track in tracks_list)

# Distinct tracks played by the held-out user
held_out_tracks = set(user_track_dict[held_out_user])

print(f"Set of tracks for {held_out_user}:")
print(held_out_tracks)

# Tracks of the held-out user that no neighbor played
uncovered_tracks = held_out_tracks - all_tracks_set

print(f"Tracks of {held_out_user} not played by any neighbor:")
print(uncovered_tracks)

# Fraction of the held-out user's tracks covered by the neighbors
print(1 - len(uncovered_tracks) / len(held_out_tracks))





Most similar user keys (last array):
['user_000877', 'user_000186', 'user_000788', 'user_000483', 'user_000233', 'user_000386', 'user_000840', 'user_000875', 'user_000686', 'user_000241', 'user_000651', 'user_000704', 'user_000527', 'user_000499', 'user_000113', 'user_000087', 'user_000697', 'user_000722', 'user_000193', 'user_000023', 'user_000427', 'user_000723', 'user_000317', 'user_000039', 'user_000014', 'user_000850', 'user_000049', 'user_000691', 'user_000832', 'user_000635', 'user_000603', 'user_000340', 'user_000728', 'user_000640', 'user_000979', 'user_000083', 'user_000311', 'user_000053', 'user_000927', 'user_000516']
Set of tracks for user_001000:
Unique tracks in user_000992_tracks_set:
{'Freestyle (Beans)', 'Miasto Doznań', "Makin' My Way [Prd. M-Phazes]", 'Reckoner Live @ Gorge', 'Crazy/Forever', "Well It'S True That We Love One Another", 'Boy Decide', 'Deafkids', "Heaven'S Demon", 'Understand?', 'The Game (Feat. Dj Premier)', 'This Is What It Became (Dubox)', 'Neony', 

In [None]:
# Evaluate neighbor coverage for the second-to-last held-out user.
# Rows of I correspond to the last five documents, so row -n of I belongs to
# users[-n]; deriving the key from the row keeps query and ground truth aligned.
query_row = -2
held_out_user = users[query_row]

# Neighbor ids returned by FAISS for this held-out user
similar_indices_last = I[query_row]

# FAISS ids are positions in user_docs; map them through `users` because user
# ids have gaps and "position + 1" names the wrong user.
similar_user_keys_last = [users[idx] for idx in similar_indices_last]

print(f"Most similar user keys for {held_out_user}:")
print(similar_user_keys_last)

similar_user_tracks_all = [user_track_dict[key] for key in similar_user_keys_last]

# Distinct tracks played by any of the neighbors
all_tracks_set = set(track for tracks_list in similar_user_tracks_all for track in tracks_list)

# Distinct tracks played by the held-out user
held_out_tracks = set(user_track_dict[held_out_user])

print(f"Set of tracks for {held_out_user}:")
print(held_out_tracks)

# Tracks of the held-out user that no neighbor played
uncovered_tracks = held_out_tracks - all_tracks_set

print(f"Tracks of {held_out_user} not played by any neighbor:")
print(uncovered_tracks)

# Fraction of the held-out user's tracks covered by the neighbors
print(1 - len(uncovered_tracks) / len(held_out_tracks))





Most similar user keys (last array):
['user_000601', 'user_000846', 'user_000695', 'user_000284', 'user_000789', 'user_000126', 'user_000572', 'user_000722', 'user_000890', 'user_000934', 'user_000530', 'user_000648', 'user_000452', 'user_000133', 'user_000279', 'user_000542', 'user_000258', 'user_000870', 'user_000494', 'user_000142', 'user_000516', 'user_000715', 'user_000685', 'user_000590', 'user_000430', 'user_000036', 'user_000863', 'user_000245', 'user_000386', 'user_000226', 'user_000603', 'user_000976', 'user_000595', 'user_000597', 'user_000101', 'user_000709', 'user_000410', 'user_000312', 'user_000114', 'user_000429']
Set of tracks for user_001000:
Unique tracks in user_000992_tracks_set:
{'Odejdę, Gdy Zapieje Kur', 'Bubble Pop Electric (Feat. Johnny Vulture)', "Heroes' Elegy", 'Koi No Megalover', 'Ostravo', 'New Pulse', 'Tweedle', 'Take Me To The Bonuslevel Because I Need An Extralife', 'Suite-Pee (Live)', 'Droga Długa Jest', 'House By The Sea', 'W Deszczu Maleńkich, Żółty

In [None]:
# Evaluate neighbor coverage for the third-to-last held-out user.
# Rows of I correspond to the last five documents, so row -n of I belongs to
# users[-n]; deriving the key from the row keeps query and ground truth aligned.
query_row = -3
held_out_user = users[query_row]

# Neighbor ids returned by FAISS for this held-out user
similar_indices_last = I[query_row]

# FAISS ids are positions in user_docs; map them through `users` because user
# ids have gaps and "position + 1" names the wrong user.
similar_user_keys_last = [users[idx] for idx in similar_indices_last]

print(f"Most similar user keys for {held_out_user}:")
print(similar_user_keys_last)

similar_user_tracks_all = [user_track_dict[key] for key in similar_user_keys_last]

# Distinct tracks played by any of the neighbors
all_tracks_set = set(track for tracks_list in similar_user_tracks_all for track in tracks_list)

# Distinct tracks played by the held-out user
held_out_tracks = set(user_track_dict[held_out_user])

print(f"Set of tracks for {held_out_user}:")
print(held_out_tracks)

# Tracks of the held-out user that no neighbor played
uncovered_tracks = held_out_tracks - all_tracks_set

print(f"Tracks of {held_out_user} not played by any neighbor:")
print(uncovered_tracks)

# Fraction of the held-out user's tracks covered by the neighbors
print(1 - len(uncovered_tracks) / len(held_out_tracks))





Most similar user keys (last array):
['user_000408', 'user_000193', 'user_000474', 'user_000484', 'user_000119', 'user_000357', 'user_000396', 'user_000179', 'user_000425', 'user_000478', 'user_000343', 'user_000149', 'user_000900', 'user_000040', 'user_000698', 'user_000635', 'user_000644', 'user_000564', 'user_000851', 'user_000982', 'user_000083', 'user_000827', 'user_000483', 'user_000843', 'user_000766', 'user_000295', 'user_000941', 'user_000608', 'user_000788', 'user_000939', 'user_000386', 'user_000689', 'user_000231', 'user_000190', 'user_000573', 'user_000890', 'user_000616', 'user_000767', 'user_000624', 'user_000516']
Set of tracks for user_001000:
{'99 To Life', 'Dawna', 'Still In Love', 'Story Of My Life', 'Girl Anachronism', 'Incubus Succubus Ii', 'Edit', 'In The Flat Field', 'Blues From A Gun', 'Karate', "Let'S Get Def", 'Pleasure Is The Boss', 'Airwaves', 'Lust For Life', 'You Are Fading', 'Huh...What?', '1945', 'Wardance', 'From Suspicious Minds', 'Love The Virgins', 

In [None]:
# Evaluate neighbor coverage for the fourth-to-last held-out user.
# Rows of I correspond to the last five documents, so row -n of I belongs to
# users[-n]; deriving the key from the row keeps query and ground truth aligned.
query_row = -4
held_out_user = users[query_row]

# Neighbor ids returned by FAISS for this held-out user
similar_indices_last = I[query_row]

# FAISS ids are positions in user_docs; map them through `users` because user
# ids have gaps and "position + 1" names the wrong user.
similar_user_keys_last = [users[idx] for idx in similar_indices_last]

print(f"Most similar user keys for {held_out_user}:")
print(similar_user_keys_last)

similar_user_tracks_all = [user_track_dict[key] for key in similar_user_keys_last]

# Distinct tracks played by any of the neighbors
all_tracks_set = set(track for tracks_list in similar_user_tracks_all for track in tracks_list)

# Distinct tracks played by the held-out user
held_out_tracks = set(user_track_dict[held_out_user])

print(f"Set of tracks for {held_out_user}:")
print(held_out_tracks)

# Tracks of the held-out user that no neighbor played
uncovered_tracks = held_out_tracks - all_tracks_set

print(f"Tracks of {held_out_user} not played by any neighbor:")
print(uncovered_tracks)

# Fraction of the held-out user's tracks covered by the neighbors
print(1 - len(uncovered_tracks) / len(held_out_tracks))





Most similar user keys (last array):
['user_000295', 'user_000417', 'user_000789', 'user_000503', 'user_000533', 'user_000839', 'user_000590', 'user_000930', 'user_000510', 'user_000845', 'user_000133', 'user_000263', 'user_000576', 'user_000734', 'user_000667', 'user_000777', 'user_000005', 'user_000572', 'user_000758', 'user_000426', 'user_000182', 'user_000504', 'user_000605', 'user_000353', 'user_000597', 'user_000204', 'user_000410', 'user_000328', 'user_000338', 'user_000276', 'user_000814', 'user_000321', 'user_000934', 'user_000644', 'user_000239', 'user_000841', 'user_000430', 'user_000031', 'user_000103', 'user_000754']
Set of tracks for user_000990:
Unique tracks in user_000999_tracks_set:
{'Ghetto Youth', 'Alpha Female', 'Colour Me', 'Hours From My Life', 'As I Lay Me Down', 'Counting Blue Cars', 'Last Flowers To The Hospital', "Baba O'Remix", 'Walk The Line', 'Radiohead & Portishead - Blow Out', 'You Were Meant For Me', "Something'S Always Wrong", "C' Mere", 'Radiohead - S

In [None]:
# Evaluate neighbor coverage for the fifth-to-last held-out user.
# Rows of I correspond to the last five documents, so row -n of I belongs to
# users[-n]. Deriving the key from the row prevents the query row and the
# ground-truth user from drifting apart, which a hand-typed key allows.
query_row = -5
held_out_user = users[query_row]

# Neighbor ids returned by FAISS for this held-out user
similar_indices_last = I[query_row]

# FAISS ids are positions in user_docs; map them through `users` because user
# ids have gaps and "position + 1" names the wrong user.
similar_user_keys_last = [users[idx] for idx in similar_indices_last]

print(f"Most similar user keys for {held_out_user}:")
print(similar_user_keys_last)

similar_user_tracks_all = [user_track_dict[key] for key in similar_user_keys_last]

# Distinct tracks played by any of the neighbors
all_tracks_set = set(track for tracks_list in similar_user_tracks_all for track in tracks_list)

# Distinct tracks played by the held-out user
held_out_tracks = set(user_track_dict[held_out_user])

print(f"Set of tracks for {held_out_user}:")
print(held_out_tracks)

# Tracks of the held-out user that no neighbor played
uncovered_tracks = held_out_tracks - all_tracks_set

print(f"Tracks of {held_out_user} not played by any neighbor:")
print(uncovered_tracks)

# Fraction of the held-out user's tracks covered by the neighbors
print(1 - len(uncovered_tracks) / len(held_out_tracks))





Most similar user keys (last array):
['user_000746', 'user_000862', 'user_000247', 'user_000485', 'user_000649', 'user_000027', 'user_000629', 'user_000182', 'user_000049', 'user_000245', 'user_000905', 'user_000078', 'user_000122', 'user_000516', 'user_000767', 'user_000916', 'user_000559', 'user_000156', 'user_000293', 'user_000576', 'user_000295', 'user_000810', 'user_000758', 'user_000799', 'user_000626', 'user_000291', 'user_000324', 'user_000448', 'user_000274', 'user_000409', 'user_000711', 'user_000710', 'user_000472', 'user_000227', 'user_000723', 'user_000476', 'user_000372', 'user_000023', 'user_000509', 'user_000597']
Set of tracks for user_000990:
Unique tracks in user_000992_tracks_set:
{'Alpha Female', 'Hours From My Life', "Baba O'Remix", 'Radiohead & Portishead - Blow Out', "Something'S Always Wrong", "C' Mere", 'Radiohead - Sunday Bloody Sunday (U2 Cover Live)', 'People Carrier', 'Radiohead - Coke Babies', 'Feel Good Inc. (Album Version)', 'Desperately Wanting', 'B.O.