In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import itertools
from nltk.tokenize import WordPunctTokenizer
import nltk
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import re
import langid
from nltk.corpus import stopwords


In [3]:
def preprocess_lyrics(lyrics, song_name):  

    language, _ = langid.classify(lyrics)
    if language != 'en':
        lyrics = song_name
    try:
        lyrics = re.sub(r'[^a-zA-Z\s]', '', lyrics)
    except Exception as e:
        print("error:", lyrics)
        return "" 
    if len(lyrics) > 5000:
        lyrics = song_name
    # Convert to lowercase
    lyrics = lyrics.lower()

    return lyrics

class POSTogging:
    def __init__(self):
        self.wpt = WordPunctTokenizer()
    def __call__(self, doc):
        text = self.wpt.tokenize(doc)
        return [word + '/' + tag for word, tag in nltk.pos_tag(text)]

In [4]:
df = pd.read_csv('shuffled_900.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,name,album,artist,id,release_date,popularity,length,danceability,acousticness,...,instrumentalness,liveness,valence,loudness,speechiness,tempo,key,time_signature,mood,lyrics
0,70,Wish You Were Sober,Kid Krow,Conan Gray,0kn2gu8Pd03DiYHzRvX2Xk,2020-03-20,77,168880,0.701,0.0178,...,0.0,0.17,0.696,-5.226,0.0468,91.071,2,4,Energetic,74 ContributorsTranslationsPortuguêsTürkçeEspa...
1,827,Unfaithful,A Girl Like Me,Rihanna,13xxBnXOuiBxVxJI458B0i,2006-04-10,73,226973,0.588,0.839,...,0.0,0.227,0.349,-8.607,0.0334,144.069,0,4,Calm,49 ContributorsUnfaithful Lyrics\nStory of my ...
2,231,Orbits,Orbits,Halfway Escape,6jFX2sHOzh7tpIIOomlrqJ,2020-07-07,33,204827,0.438,0.855,...,0.881,0.11,0.4,-13.924,0.039,116.142,6,4,Calm,3 ContributorsThe Age of Lead LyricsThe man ha...
3,588,Water Colors,Water Colors,Star Slide,7rpGOF9MwCP2Oaz9yDO452,2019-09-18,50,182079,0.465,0.633,...,0.937,0.085,0.0659,-19.328,0.0386,72.021,7,4,Calm,1 ContributorRed Cartel Lyrics\nUsed to pour a...
4,39,Sweet Nothing,Midnights (The Til Dawn Edition),Taylor Swift,2L09RYwH5Pjzca6PmbUAw3,2023-05-26,67,188496,0.335,0.967,...,0.00012,0.115,0.391,-14.958,0.048,175.917,0,4,Calm,


In [5]:
df = df.fillna('')
df['lyrics'] = df.apply(lambda row: row['name'] if pd.isnull(row['lyrics']) else row['lyrics'], axis=1)
lyrics_list = df['lyrics'].to_list()
lyrics_name = df['name'].to_list()
df['filtered_lyrics'] = [preprocess_lyrics(lyrics, song_name) for lyrics, song_name in zip(lyrics_list, lyrics_name)]

df = df.dropna(subset=['filtered_lyrics']).reset_index(drop=True)
preprocessed_lyrics_list = df['filtered_lyrics'].to_list()

In [6]:
col_features = df.columns[6:-3]
print(col_features)
X = MinMaxScaler().fit_transform(df[col_features])
Y = df['mood']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(Y)

Index(['popularity', 'length', 'danceability', 'acousticness', 'energy',
       'instrumentalness', 'liveness', 'valence', 'loudness', 'speechiness',
       'tempo', 'key', 'time_signature'],
      dtype='object')


In [7]:
tf_idf_vectorizer = TfidfVectorizer(min_df=3, max_features=250, strip_accents='unicode', 
                                  lowercase=True, analyzer='word', token_pattern=r'\w{3,}', 
                                  ngram_range=(1, 1), use_idf=True, smooth_idf=True, 
                                  sublinear_tf=True, stop_words="english")
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.25, random_state=42)
train_indices, test_indices = train_test_split(range(len(X)), test_size=0.25, random_state=42)
preprocessed_lyrics_train = [preprocessed_lyrics_list[i] for i in train_indices]
preprocessed_lyrics_test = [preprocessed_lyrics_list[i] for i in test_indices]

model = xgb.XGBClassifier(n_estimators=60, random_state=42)
lyrics_tfidf_features = tf_idf_vectorizer.fit_transform(preprocessed_lyrics_train).toarray()
X_train = np.concatenate((X_train, lyrics_tfidf_features), axis=1)
model.fit(X_train, y_train)

test_lyrics_features = tf_idf_vectorizer.transform(preprocessed_lyrics_test).toarray()
X_test1 = X_test.copy()
X_test = np.concatenate((X_test, test_lyrics_features), axis = 1)
model.predict(X_test)


array([0, 3, 2, 1, 2, 1, 1, 3, 3, 0, 0, 0, 3, 2, 0, 1, 0, 2, 1, 0, 1, 0,
       2, 0, 0, 1, 3, 1, 1, 2, 1, 0, 1, 1, 0, 1, 3, 3, 2, 2, 0, 3, 3, 2,
       1, 2, 3, 3, 3, 0, 2, 1, 3, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 3, 2, 1,
       0, 2, 1, 3, 3, 1, 1, 0, 3, 1, 3, 3, 2, 0, 0, 0, 3, 1, 2, 2, 1, 3,
       0, 1, 0, 0, 3, 1, 3, 1, 0, 1, 3, 3, 3, 1, 1, 2, 0, 1, 1, 0, 3, 0,
       0, 2, 3, 3, 2, 2, 3, 3, 1, 3, 2, 2, 0, 0, 0, 3, 1, 3, 1, 2, 0, 0,
       0, 2, 2, 3, 0, 3, 3, 2, 3, 3, 3, 1, 1, 0, 1, 3, 1, 1, 2, 3, 3, 0,
       0, 0, 3, 0, 1, 3, 2, 0, 3, 0, 3, 0, 0, 2, 2, 1, 0, 2, 3, 0, 1, 1,
       1, 2, 1, 1, 3, 3, 1, 0, 3, 3, 0, 3, 3, 3, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 2, 2, 3, 2, 1, 2, 1, 0, 2, 1, 0, 1, 3, 0, 2, 3, 2, 2, 3, 1, 3,
       2, 3, 0, 2, 3])

In [8]:
import requests
import spotipy
import time
from IPython.core.display import clear_output
from spotipy import SpotifyClientCredentials, util
import lyricsgenius as lg
import numpy as np
import json

def get_song_mood(song_id):
    client_id="YOUR CLIENT_ID"
    client_secret='YOUR CLIENT_SECRET'

    manager = SpotifyClientCredentials(client_id,client_secret)
    sp = spotipy.Spotify(client_credentials_manager=manager)
    api_key = "YOUR GENIUS API KEY"
    genius = lg.Genius(api_key, skip_non_songs=True, excluded_terms=["(Remix)", "(Live)"], remove_section_headers=True)

    features = sp.audio_features(song_id)
    meta = sp.track(song_id)
    if features and meta:
        name = meta['name']
        album = meta['album']['name']
        artist = meta['album']['artists'][0]['name']
        release_date = meta['album']['release_date']
        length = meta['duration_ms']
        popularity = meta['popularity']
        ids =  meta['id']
        danceability = features[0]['danceability']
        acousticness = features[0]['acousticness']
        energy = features[0]['energy']
        instrumentalness = features[0]['instrumentalness']
        liveness = features[0]['liveness']
        valence = features[0]['valence']
        loudness = features[0]['loudness']
        speechiness = features[0]['speechiness']
        tempo = features[0]['tempo']
        key = features[0]['key']
        time_signature = features[0]['time_signature']

        try:
            song = genius.search_song(name, artist)
            lyrics = song.lyrics
            X_test_new = [popularity, length, danceability, acousticness, energy, instrumentalness, liveness, valence, loudness, speechiness, tempo, key, time_signature]
            preprocessed_lyrics_list_web = preprocess_lyrics(lyrics, name) 
            total_lyrics = preprocessed_lyrics_test.copy()
            total_lyrics.append(preprocessed_lyrics_list_web)
            lyrics_tfidf_features_web = tf_idf_vectorizer.transform(total_lyrics).toarray()
            X_test_new = np.array(X_test_new).reshape(1, -1)
            X_test2 = np.concatenate((X_test1, X_test_new), axis = 0)
            X_web = MinMaxScaler().fit_transform(X_test2)
            X_web = np.concatenate((X_web, lyrics_tfidf_features_web), axis=1) 
            y_pred = model.predict(X_web)
            mood = ""
            if y_pred[225] == 0:
               mood = "Calm"
            elif y_pred[225] == 1:
               mood = "Energetic"
            elif y_pred[225] == 2:
               mood = "Happy"
            else:
               mood = "Sad"
            print(mood)
            result= {"Title":name, "Artist":artist, "Album":album, "Release Date":release_date, "Popularity":popularity, "Lyrics":lyrics, "Mood": mood}
            return result
        except Exception as e:
            lyrics = "nan"
            print(e)
            print("no lyrics found for the song")
        
    else:
        print("Failed to retrieve song features.")


  from IPython.core.display import clear_output


In [None]:
from flask import Flask, request
import json
app = Flask(__name__)
 
@app.route('/your_endpoint', methods=['POST'])
def process_song_link():
    song_link = request.form['song_link']
    result = get_song_mood(song_link)
    result_json = json.dumps(result, default=lambda o: str(o))
    return result_json
 
if __name__ == '__main__':
 app.run(port=8100)