### Test script for app

This notebook includes all of the test code I ran to try out different query methods, including: 
- Sentiment analysis for lyrics: uses VADER and a HuggingFace pipeline; results may be usable for English lyrics, but were not optimal for other languages
- TF-IDF to query for top keywords: use [text2text library](https://github.com/artitw/text2text#tf-idf)
- Summarize lyrics and mood to use as query: use NLTK

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

import spotipy
from spotipy.oauth2 import SpotifyOAuth, SpotifyClientCredentials

# Spotify app credentials are read from environment variables
client_id = os.getenv("SPOTIFY_CLIENT_ID")
client_secret = os.getenv("SPOTIFY_CLIENT_SECRET")
redirect_uri = os.getenv("SPOTIFY_REDIRECT_URI")
# Space-separated list of the scopes requested from the user
scope = " ".join(["user-library-read", "playlist-read-private", "user-top-read"])

# App-only (client credentials) manager; built here but not passed to the client below
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)

# User-authorised OAuth manager
oauth_manager = SpotifyOAuth(client_id=client_id, client_secret=client_secret, redirect_uri=redirect_uri, scope=scope)

# The client authenticates through the OAuth manager; the final expression shows the current user's profile
sp = spotipy.Spotify(oauth_manager=oauth_manager)
sp.current_user()

Fetch Spotify songs

In [2]:
# Request the current user's top tracks (limit 50) and keep only the fields used later
top_songs = sp.current_user_top_tracks(limit=50)
user_songs = [
    {
        'id': track['id'],
        'title': track['name'],
        # only the first listed artist is kept
        'artist': track['artists'][0]['name'],
        'album': track['album']['name'],
        'release_date': track['album']['release_date'],
        'popularity': track['popularity'],
    }
    for track in top_songs['items']
]

In [None]:
import pandas as pd
# Rebind user_songs from the list of per-track dicts to a DataFrame, then display it
user_songs = pd.DataFrame(user_songs)
user_songs

Match songs with lyrics

In [None]:
import lyricsgenius
gen_client_access_token = os.getenv("GENIUS_CLIENT_TOKEN")
genius = lyricsgenius.Genius(gen_client_access_token, sleep_time = 5)
list_lyrics = []

for _, row in user_songs.iterrows():
    title, artist = row['title'], row['artist']
    # Search by title and artist first; if nothing is found, retry with the title alone
    match = genius.search_song(title, artist) or genius.search_song(title)
    if not match:
        # no lyrics found either way: the song is skipped
        continue
    list_lyrics.append({'title': title, 'lyrics': match.lyrics})

Clean and tokenize lyrics

In [98]:
import random
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# nltk.download('stopwords')
# nltk.download('wordnet')   
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')   

def clean_lyrics(df, column):
    """
    Normalise a lyrics column: lower-case the text, then remove section markers,
    bracketed annotations, newlines and special characters.

    Section and instrument words are removed only when they appear as whole
    words, so words that merely contain them ("universe", "cambridge",
    "introduce", "soloist") are left intact. The input DataFrame is not
    modified; the cleaning is applied to a copy.

    Args:
        df (DataFrame): df containing song information
        column (str): column to clean
    Returns:
        df (DataFrame): copy of df whose `column` holds the cleaned lyrics
    """
    cleaned = df.copy()
    text = cleaned[column].str.lower()

    # (pattern, replacement) pairs, applied in order
    substitutions = [
        # section markers, whole words only
        (r"\b(verse\s?\d*|chorus|bridge|outro|intro)\b", ""),
        (r"\b(instrumental|guitar|solo)\b", ""),
        # anything left inside square brackets, e.g. "[: artist name]"
        (r"\[.*?\]", ""),
        # a line break becomes a sentence break
        (r"\n", ". "),
        # drop everything except word characters, apostrophes, whitespace and periods
        (r"[^\w\d'\s.]+", ""),
    ]
    for pattern, replacement in substitutions:
        text = text.str.replace(pattern, replacement, regex=True)

    cleaned[column] = text.str.strip()
    return cleaned

In [None]:
# Build a DataFrame from the fetched lyrics records and clean its 'lyrics' column in one step
list_lyrics_df = clean_lyrics(pd.DataFrame(list_lyrics), 'lyrics')
list_lyrics_df

Sentiment analysis with VADER

In [None]:
from nltk.sentiment import vader
# nltk.download('vader_lexicon')

analyzer = vader.SentimentIntensityAnalyzer()

# Score every lyric once, then split the score dicts into one list per component
all_scores = [analyzer.polarity_scores(text) for text in list_lyrics_df['lyrics']]
negative = [scores['neg'] for scores in all_scores]
neutral = [scores['neu'] for scores in all_scores]
positive = [scores['pos'] for scores in all_scores]
compound = [scores['compound'] for scores in all_scores]

# Attach each component as its own column
for col_name, values in (
    ('negative', negative),
    ('neutral', neutral),
    ('positive', positive),
    ('compound', compound),
):
    list_lyrics_df[col_name] = values

In [None]:
# Mean of each VADER score over the whole dataset.
# The scores are averaged per column, not across columns: negative/neutral/positive
# are proportions of the same text, while compound is a separate normalised score,
# so a single average over all four mixes unrelated quantities.
sentiment_columns = ['negative', 'neutral', 'positive', 'compound']
list_sentiment = list_lyrics_df[sentiment_columns].mean()
list_sentiment

Sentiment analysis using [this model](https://huggingface.co/tabularisai/multilingual-sentiment-analysis) from HuggingFace:
- Better than VADER: 
  - Fewer neutral results
  - Supports multiple languages
- VADER: 
  - Excellent sentiment for English
  - Tends to lean toward neutral?

In [None]:
# Example
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

# Hugging Face model id used for sentiment classification
model_name = "tabularisai/multilingual-sentiment-analysis"
# NOTE(review): in this notebook `tokenizer` and `model` are referenced only by the
# commented-out predict_sentiment variant; the active predict_sentiment builds a
# pipeline from `model_name`, and `model` is rebound to a SentenceTransformer in a
# later cell — confirm whether these two loads are still needed.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# def predict_sentiment(text):
#         inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
#         with torch.no_grad():
#             outputs = model(**inputs)
#         probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
#         sentiment_map = {
#             0: "sombre",
#             1: "sad",
#             2: "neutral",
#             3: "happy",
#             4: "ecstatic"
#         }
#         return [sentiment_map[p] for p in torch.argmax(probabilities, dim=-1).tolist()] 

# model name -> already-built pipeline, so the model is loaded once instead of on every call
_SENTIMENT_PIPELINES = {}


def predict_sentiment(texts):
    """
    Classify the sentiment of each text with the Hugging Face pipeline for `model_name`.

    The pipeline is created on first use and cached per model name; later calls
    reuse it rather than reloading the model. Inputs longer than the model's
    maximum length are truncated by the tokenizer instead of raising.

    Args:
        texts (list[str]): texts to classify
    Returns:
        list[str]: the predicted label for each text, in input order
    """
    pipe = _SENTIMENT_PIPELINES.get(model_name)
    if pipe is None:
        pipe = pipeline(task="sentiment-analysis", model=model_name)
        _SENTIMENT_PIPELINES[model_name] = pipe

    sentiments = pipe(texts, truncation=True)
    return [sentiment['label'] for sentiment in sentiments]
    
# texts = [
#     # English
#     "I absolutely love the new design of this app!", "The customer service was disappointing.", "The weather is fine, nothing special.",
#     # Chinese
#     "这家餐厅的菜味道非常棒！", "我对他的回答很失望。", "天气今天一般。",
#     # Spanish
#     "¡Me encanta cómo quedó la decoración!", "El servicio fue terrible y muy lento.", "El libro estuvo más o menos.",
#     # Arabic
#     "الخدمة في هذا الفندق رائعة جدًا!", "لم يعجبني الطعام في هذا المطعم.", "كانت الرحلة عادية。",
#     # Ukrainian
#     "Мені дуже сподобалася ця вистава!", "Обслуговування було жахливим.", "Книга була посередньою。",
#     # Hindi
#     "यह जगह सच में अद्भुत है!", "यह अनुभव बहुत खराब था।", "फिल्म ठीक-ठाक थी।",
#     # Bengali
#     "এখানকার পরিবেশ অসাধারণ!", "সেবার মান একেবারেই খারাপ।", "খাবারটা মোটামুটি ছিল।",
#     # Portuguese
#     "Este livro é fantástico! Eu aprendi muitas coisas novas e inspiradoras.", 
#     "Não gostei do produto, veio quebrado.", "O filme foi ok, nada de especial.",
#     # Japanese
#     "このレストランの料理は本当に美味しいです！", "このホテルのサービスはがっかりしました。", "天気はまあまあです。",
#     # Russian
#     "Я в восторге от этого нового гаджета!", "Этот сервис оставил у меня только разочарование.", "Встреча была обычной, ничего особенного.",
#     # French
#     "J'adore ce restaurant, c'est excellent !", "L'attente était trop longue et frustrante.", "Le film était moyen, sans plus.",
#     # Turkish
#     "Bu otelin manzarasına bayıldım!", "Ürün tam bir hayal kırıklığıydı.", "Konser fena değildi, ortalamaydı.",
#     # Italian
#     "Adoro questo posto, è fantastico!", "Il servizio clienti è stato pessimo.", "La cena era nella media.",
#     # Polish
#     "Uwielbiam tę restaurację, jedzenie jest świetne!", "Obsługa klienta była rozczarowująca.", "Pogoda jest w porządku, nic szczególnego.",
#     # Tagalog
#     "Ang ganda ng lugar na ito, sobrang aliwalas!", "Hindi maganda ang serbisyo nila dito.", "Maayos lang ang palabas, walang espesyal.",
#     # Dutch
#     "Ik ben echt blij met mijn nieuwe aankoop!", "De klantenservice was echt slecht.", "De presentatie was gewoon oké, niet bijzonder.",
#     # Malay
#     "Saya suka makanan di sini, sangat sedap!", "Pengalaman ini sangat mengecewakan.", "Hari ini cuacanya biasa sahaja.",
#     # Korean
#     "이 가게의 케이크는 정말 맛있어요!", "서비스가 너무 별로였어요.", "날씨가 그저 그렇네요.",
#     # Swiss German
#     "Ich find dä Service i de Beiz mega guet!", "Däs Esä het mir nöd gfalle.", "D Wätter hüt isch so naja.",
#     # Vietnamese
#     "Tôi thích cách trang trí mới của quán!", "Dịch vụ khách hàng thì thất vọng.", "Bộ phim này tạm ổn, không gì đặc biệt."
# ]

# for text, sentiment in zip(texts, predict_sentiment(texts)):
#     print(f"Text: {text}\nSentiment: {sentiment}\n")

In [None]:
# Classify each lyric on its own, passing only its first 512 characters to the model
sentiments = [
    predict_sentiment([text[:512]])[0]
    for text in list_lyrics_df['lyrics']
]

list_lyrics_df['sentiment'] = sentiments

Get embeddings for similarity search with [this model](https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased)

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# The same sentence in four languages
sentences = ["This is an example", "Đây là một ví dụ", "C'est un exemple", "这是一个例子"]

model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased')
embeddings = model.encode(sentences)

# One call yields the full pairwise cosine-similarity matrix (row i vs. every sentence)
similarities = cosine_similarity(embeddings)

for i, sentence in enumerate(sentences):
    print(f"Sentences: {sentence}")
    # iterate over scores so the outer `sentence` is not rebound by the inner loop
    for j, score in enumerate(similarities[i]):
        print(str(j) + f": {score:.4f}")

In [None]:
# Shape of the first sentence's embedding vector
print(embeddings[0].shape)

In [None]:
# Encode each cleaned lyric separately, giving one embedding per row
lyrics_embed = [model.encode(text) for text in list_lyrics_df['lyrics']]

list_lyrics_df['lyrics_embed'] = lyrics_embed

Get top keywords

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Keep the 50 most frequent non-stop-word terms
vectorizer = TfidfVectorizer(stop_words='english', max_features=50)
# Join songs with a space: the cleaned lyrics are stripped, so an empty separator
# would fuse the last word of one song with the first word of the next
all_lyrics = " ".join(list_lyrics_df['lyrics'])
lyrics_tfidf = vectorizer.fit_transform([all_lyrics])
# Vocabulary terms retained by the vectorizer
query = vectorizer.get_feature_names_out()
query.tolist()

In [None]:
# example: https://colab.research.google.com/drive/1RaWj5SqWvyC2SsCTGg8IAVcl9G5hOB50?usp=sharing
import text2text as t2t

top_kw = t2t.Tfidfer().transform(all_lyrics)

# Flatten the keys of every returned mapping into a single keyword list
kw_query = [keyword for kw in top_kw for keyword in kw.keys()]

kw_query

Use spotipy search with query as top keywords

In [None]:
# Take up to 50 random top keywords. The sample goes into its own variable so that
# re-running the cell draws from the full keyword list again, and the size is capped
# so that fewer than 50 keywords does not raise.
kw_sample = random.sample(kw_query, min(50, len(kw_query)))
# Keywords are space-separated; joining without a separator would send one fused token
search_recs = sp.search(q=" ".join(kw_sample), type="track", limit=30)
recs = [
    {
        'name': track['name'],
        'artist': track['artists'][0]['name'],
    }
    for track in search_recs['tracks']['items']
]
pd.DataFrame(recs)

Summarize lyrics and mood to enhance top words querying

In [118]:
def text_summarizer(text, num_sen = 1):
    """
    Build a short extractive summary by picking high-scoring sentences at random.

    Sentences are the non-blank pieces of `text` split on '.'. Each sentence is
    scored by summing the frequencies of its tokens, where frequencies are counted
    over the non-stop-word tokens of the whole text. `num_sen` sentences are drawn
    at random from the 10 best-scoring ones and returned in their original order.

    Args:
        text (str): text to summarise
        num_sen (int): number of sentences wanted; capped at the number of
            candidate sentences available
    Returns:
        str: the chosen sentences joined by a single space ('' when the text has
            no non-blank sentence or num_sen is not positive)
    """
    # stop words for every language in the NLTK stopwords corpus
    stop_words = set(stopwords.words(stopwords.fileids()))

    # drop blank pieces (e.g. from ". ." runs) so they can never be picked as the summary
    sentences = [piece.strip() for piece in text.split('.') if piece.strip()]
    if not sentences or num_sen <= 0:
        return ''

    words = [word for word in word_tokenize(text) if word not in stop_words]
    fdict = FreqDist(words)  # frequency distribution; unseen tokens count as 0

    # (original position, score) for every sentence
    sentence_scores = [
        (position, sum(fdict[word] for word in word_tokenize(sentence)))
        for position, sentence in enumerate(sentences)
    ]

    # the 10 best-scoring sentences are the candidates
    candidates = sorted(sentence_scores, key=lambda item: item[1], reverse=True)[:10]

    # never ask for more sentences than there are candidates
    chosen = random.sample(candidates, min(num_sen, len(candidates)))

    # restore the order in which the sentences appear in the text
    chosen.sort(key=lambda item: item[0])

    return ' '.join(sentences[position] for position, _ in chosen)

In [None]:
# One summary (default: a single sentence) per cleaned lyric
summary = [text_summarizer(lyrics) for lyrics in list_lyrics_df['lyrics']]
list_lyrics_df['summary'] = summary
list_lyrics_df

In [None]:
mood = "Working out"
# Sample at most 6 summaries so a frame with fewer rows does not raise
lyrics_sample = list_lyrics_df.sample(min(6, len(list_lyrics_df)))['summary']
# Join the mood and the summaries with a separator; plain concatenation fused
# the mood text with the first summary
mood = ". ".join([mood, *lyrics_sample])
mood

In [None]:
# Search Spotify tracks with the mood text as the query and tabulate name and first artist
search_recs = sp.search(q = mood, type="track", limit=30)
recs = [
    {'name': track['name'], 'artist': track['artists'][0]['name']}
    for track in search_recs['tracks']['items']
]
pd.DataFrame(recs)