In [38]:
import gensim.downloader as api
from gensim.models import FastText

# Load the "fasttext-wiki-news-subwords-300" vectors through gensim's downloader API.
# NOTE(review): presumably this downloads/caches the model on first use — confirm before
# relying on this cell in an offline environment.
word_vectors = api.load("fasttext-wiki-news-subwords-300") 




In [49]:
# Persist the loaded vectors to 'fastvec.bin' so a later cell can reload them from disk
# with KeyedVectors.load instead of going through the downloader again.
word_vectors.save('fastvec.bin')

In [66]:
from gensim.models import KeyedVectors

# Reload the vectors previously written to "fastvec.bin" by word_vectors.save(...).
# The file must already exist, i.e. the saving cell has to have been run at least once.
fasttext_vectors = KeyedVectors.load("fastvec.bin")

In [67]:
import csv

def load_genres_from_csv(file_path):
    """
    Load genres from a CSV file into a list.

    The CSV must have a header row containing a "name" column; the value of that
    column is collected for every data row, in file order.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        list[str]: The "name" value of each row.

    Raises:
        KeyError: If the header has no "name" column.
        OSError: If the file cannot be opened.
    """
    # newline="" is what the csv module requires so that line breaks embedded in
    # quoted fields are parsed correctly; "utf-8-sig" transparently drops a leading
    # BOM (otherwise the first header would be "\ufeffname") and reads plain
    # UTF-8 files unchanged.
    with open(file_path, "r", encoding="utf-8-sig", newline="") as csv_file:
        reader = csv.DictReader(csv_file)
        return [row["name"] for row in reader]


# Function to calculate genre similarity
def log_genre_similarity(genre, genres, topn=10):
    """
    Print the ``topn`` entries of ``genres`` most similar to ``genre``.

    Scores come from ``word_vectors.similarity`` on the notebook-level
    ``word_vectors`` object. Candidates that are not present in ``word_vectors``
    are skipped silently. The query itself is not excluded, so it is listed
    when it also appears in ``genres``.

    Args:
        genre: Query genre; spaces are replaced by hyphens before lookup.
        genres: Iterable of candidate genre names (used exactly as given).
        topn: Maximum number of results to print.

    Returns:
        None. Results (or a "not found" message) are printed.
    """
    # Normalise only the query: "dance pop" -> "dance-pop".
    genre = genre.replace(" ", "-")

    if genre not in word_vectors:
        print(f"'{genre}' not found in FastText embeddings.")
        return

    # Score every candidate that the embedding model knows about.
    similar_genres = [
        (g, word_vectors.similarity(genre, g)) for g in genres if g in word_vectors
    ]

    # Highest score first, then keep the first `topn` entries.
    similar_genres = sorted(similar_genres, key=lambda x: -x[1])[:topn]

    # Print results
    print(f"Genres similar to '{genre}':")
    for similar, score in similar_genres:
        print(f"  {similar}: {score:.2f}")

# Load the candidate genre names from the CSV's "name" column.
genres_file = "./genres.csv"  # Replace with your actual path
genres = load_genres_from_csv(genres_file)

# Demo: print the candidates most similar to "rock".
# This call uses the (genre, genres) signature defined in this cell; a later cell
# redefines log_genre_similarity as (model, genre, topn), so this line only works
# while this cell's definition is the active one.
log_genre_similarity("rock", genres)

Genres similar to 'rock':
  rock: 1.00
  rock-and-roll: 0.65
  punk: 0.65
  pop: 0.63
  metal: 0.63
  blues-rock: 0.62
  protopunk: 0.61
  britpop: 0.59
  funk: 0.58
  rockabilly: 0.58


In [37]:
def get_vector_for_phrase(model, phrase):
    """
    Average the vectors of the whitespace-separated words of ``phrase``.

    Words that ``model`` does not contain are ignored.

    Args:
        model: Object supporting ``word in model`` and ``model[word]``.
        phrase: Text to split on whitespace.

    Returns:
        The sum of the found vectors divided by how many were found.

    Raises:
        ValueError: If none of the words are present in ``model``.
    """
    known = []
    for token in phrase.split():
        if token in model:
            known.append(model[token])

    if len(known) == 0:
        raise ValueError(f"None of the words in '{phrase}' are in the model vocabulary.")

    total = sum(known)
    return total / len(known)


def log_genre_similarity(model, genre, topn=10):
    """
    Print the ``topn`` nearest entries to the averaged vector of ``genre``.

    The phrase vector comes from ``get_vector_for_phrase``; neighbours are whatever
    ``model.similar_by_vector`` returns for it. Any ``ValueError`` raised along the
    way (e.g. no word of the phrase is known) is printed instead of propagated.

    Args:
        model: Embedding object passed to both helpers.
        genre: Genre phrase; may contain several whitespace-separated words.
        topn: Number of neighbours requested.

    Returns:
        None. Output goes to stdout.
    """
    try:
        phrase_vector = get_vector_for_phrase(model, genre)
        neighbours = model.similar_by_vector(phrase_vector, topn=topn)
        print(f"Genres similar to '{genre}':")
        for neighbour, similarity in neighbours:
            print(f"  {neighbour}: {similarity:.2f}")
    except ValueError as error:
        print(error)

# Demo with a multi-word genre: each word is looked up separately and the
# resulting vectors are averaged before the neighbour search.
log_genre_similarity(word_vectors, "dance pop")

# Save word vectors to 'word_vectors.model'.
# Note: another cell already saves this same object to 'fastvec.bin'.
word_vectors.save('word_vectors.model')

Genres similar to 'dance pop':
  dance: 0.86
  pop: 0.86
  music: 0.74
  songs: 0.67
  dancing: 0.65
  hop: 0.65
  song: 0.64
  musical: 0.64
  hip-hop: 0.63
  folk: 0.63


In [35]:
# Whitelist of genre names used to filter nearest-neighbour results.
valid_genres = {
    "rock",
    "punk",
    "metal",
    "pop",
    "indie",
    "alternative",
    "jazz",
    "classical",
    "hip-hop",
    "blues",
    "funk",
    "electronic",
    "soul",
    "grunge",
    "orchestral",
    "instrumental",
    "dance",
    "trap",
    "r&b",
    "rap",
}

def filter_similarities(model, genre, valid_genres, threshold=0.5, topn=10000):
    """
    Print the neighbours of ``genre`` that are whitelisted and score above ``threshold``.

    Args:
        model: Object exposing ``most_similar(key, topn=...)`` that yields
            ``(name, score)`` pairs.
        genre: Query key passed to ``model.most_similar``.
        valid_genres: Container of names allowed in the output.
        threshold: Scores must be strictly greater than this value.
        topn: How many neighbours to request from the model before filtering
            (previously hard-coded to 10000; the default keeps that behaviour).

    Returns:
        None. Results are printed in the order the model returned them.
    """
    similar_genres = model.most_similar(genre, topn=topn)
    filtered = [
        (g, score)
        for g, score in similar_genres
        if g in valid_genres and score > threshold
    ]

    print(f"Genres similar to '{genre}' (filtered):")
    for g, score in filtered:
        print(f"  {g}: {score:.2f}")

# Demo: whitelisted neighbours of "soul", using a 0.4 cut-off instead of the 0.5 default.
filter_similarities(word_vectors, "soul", valid_genres, threshold=0.4)

Genres similar to 'soul' (filtered):
  r&b: 0.53
  pop: 0.52
  blues: 0.48
  funk: 0.47
  rap: 0.47
  rock: 0.45
  hip-hop: 0.45
  jazz: 0.44
  punk: 0.40


In [68]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sample data: user id -> list of genre names in that user's playlist.
# Key order matters: it determines the row/column order of the similarity matrix.
playlists = {
    "User1": [
        "pop",
        "dance-pop",
        "tropical-house",
        "edm",
    ],
    "User2": [
        "rock",
        "classic-rock",
        "pop-rock",
        "alternative-rock",
    ],
    "User3": [
        "rap",
        "hip-hop",
        "trap-music",
        "pop-rap",
    ],
    "User4": [
        "folk-pop",
        "acoustic",
        "jazz",
        "blues",
    ],
    "User5": [
        "pop",
        "neo-mellow",
        "latin",
        "r&b",
    ],
}



def get_user_embedding(playlist):
    """
    Build one embedding for a playlist by averaging its genre vectors.

    Each genre is looked up in the notebook-level ``fasttext_vectors``; genres that
    are missing are reported on stdout and left out of the average.

    Args:
        playlist: Iterable of genre names.

    Returns:
        The element-wise mean of the found vectors, or an all-zero vector of
        length ``fasttext_vectors.vector_size`` when none were found.
    """
    found = []
    for genre in playlist:
        if genre not in fasttext_vectors:
            print(f"'{genre}' not found in FastText embeddings.")
            continue
        found.append(fasttext_vectors[genre])

    if not found:
        # No usable genre: fall back to a zero vector of the model's dimensionality.
        return np.zeros(fasttext_vectors.vector_size)
    return np.mean(found, axis=0)  # Average vector
    
# Build one averaged embedding per user from the sample playlists.
user_embeddings = {user: get_user_embedding(playlist) for user, playlist in playlists.items()}


# Freeze the user order once so that row/column i of the matrices below
# corresponds to user_ids[i]; group_users relies on this same list.
user_ids = list(user_embeddings.keys())
embedding_matrix = np.array([user_embeddings[user] for user in user_ids])
# Pairwise cosine similarity between every pair of user embeddings.
similarity_matrix = cosine_similarity(embedding_matrix)


print("User Similarity Matrix:")
print(similarity_matrix)


def group_users(similarity_matrix, threshold=0.9, users=None):
    """
    Group users based on similarity threshold.

    Grouping is greedy: users are visited in order, each not-yet-grouped user
    starts a new group and pulls in every later ungrouped user whose similarity
    *to that first user* is at least ``threshold``. Members of a group are not
    compared with each other, and each user ends up in exactly one group.

    Args:
        similarity_matrix: Square array indexable as ``similarity_matrix[i, j]``,
            where index ``i`` corresponds to ``users[i]``.
        threshold: Minimum similarity (inclusive) to join a group.
        users: Sequence of user labels in matrix order. Defaults to the
            notebook-level ``user_ids`` so existing calls keep working.

    Returns:
        list[list]: Groups of user labels, in order of their first member.
    """
    if users is None:
        # Backward-compatible fallback to the notebook global.
        users = user_ids

    groups = []
    visited = set()

    for i, user in enumerate(users):
        if i in visited:
            continue
        group = [user]
        visited.add(i)
        # Every index below i is already grouped by the time i is processed,
        # so only later users can still join.
        for j in range(i + 1, len(users)):
            if j not in visited and similarity_matrix[i, j] >= threshold:
                group.append(users[j])
                visited.add(j)
        groups.append(group)

    return groups


# Group users and print results.
# threshold=0.8 overrides group_users' default of 0.9.
groups = group_users(similarity_matrix, threshold=0.8)
print("\nUser Groups Based on Similarity:")
# enumerate(..., 1) numbers the groups starting from 1 for display.
for i, group in enumerate(groups, 1):
    print(f"Group {i}: {', '.join(group)}")

'tropical-house' not found in FastText embeddings.
'trap-music' not found in FastText embeddings.
'neo-mellow' not found in FastText embeddings.
'r&b' not found in FastText embeddings.
User Similarity Matrix:
[[1.         0.77048117 0.7126486  0.6961173  0.8373386 ]
 [0.77048117 1.         0.69991845 0.8043604  0.73256654]
 [0.7126486  0.69991845 1.         0.67184263 0.6879008 ]
 [0.6961173  0.8043604  0.67184263 1.0000005  0.6854836 ]
 [0.8373386  0.73256654 0.6879008  0.6854836  0.99999994]]

User Groups Based on Similarity:
Group 1: User1, User5
Group 2: User2, User4
Group 3: User3
