In [1]:
import numpy as np

In [2]:
import pandas as pd
df = pd.read_csv("muse_with_english_lyrics_only.csv")
df.drop(["lang"],axis=1,inplace=True)
df.head()

Unnamed: 0,track,artist,final_emotion,lyrics
0,'Till I Collapse,Eminem,anger,"'Cause sometimes you just feel tired, feel wea..."
1,St. Anger,Metallica,anger,St. Anger 'round my neck\r\nSt. Anger 'round m...
2,Speedin',Rick Ross,anger,Legendary\r\nRunners\r\nYou know me\r\n\r\nTri...
3,Bamboo Banga,M.I.A.,anger,"Road runner, road runner\r\nGoing hundred mile..."
4,Die MF Die,Dope,anger,I don't need your forgiveness\r\nI don't need ...


In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt_tab')  # 👈 new addition for latest NLTK

from nltk.tokenize import word_tokenize
nltk.download('punkt'); nltk.download('wordnet'); nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_lyrics(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)              # remove [chorus], [verse]
    text = re.sub(r'[^a-z\s]', '', text)             # remove punctuation/numbers
    text = re.sub(r'\b\d+\b', '', text)
    text = re.sub(r'\d+x', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

df["cleaned_lyrics"] = df["lyrics"].apply(clean_lyrics)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\LENOVO/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\LENOVO/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df.tail()

Unnamed: 0,track,artist,final_emotion,lyrics,cleaned_lyrics
45487,spirit,delerium,fear,This is a fucking spiral\r\nYeah\r\nI have tes...,fucking spiral yeah tested limit hatred ive ma...
45488,spooky,kevoz,fear,"Yeah, yeah, yeah\r\nBrainstorm went crazy on t...",yeah yeah yeah brainstorm went crazy beat dang...
45489,spooky,tok tok tok,fear,I was born in a dump\r\nMama died and my daddy...,born dump mama died daddy got drunk left die g...
45490,wild,the reds,anger,"Tooth to bone\r\nNail to stone, yeah\r\nPlaces...",tooth bone nail stone yeah place never go ooh ...
45491,you don't know me,deadbolt,fear,Ya ya ya ya ya (4x)\r\nYou don't know me\r\nYo...,ya ya ya ya ya x dont know youll never never p...


In [7]:
df["cleaned_lyrics"].replace("", np.nan, inplace=True)
df.dropna(subset=["cleaned_lyrics"], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["cleaned_lyrics"].replace("", np.nan, inplace=True)


In [8]:
df.head()

Unnamed: 0,track,artist,final_emotion,lyrics,cleaned_lyrics
0,'Till I Collapse,Eminem,anger,"'Cause sometimes you just feel tired, feel wea...",cause sometimes feel tired feel weak feel weak...
1,St. Anger,Metallica,anger,St. Anger 'round my neck\r\nSt. Anger 'round m...,st anger round neck st anger round neck never ...
2,Speedin',Rick Ross,anger,Legendary\r\nRunners\r\nYou know me\r\n\r\nTri...,legendary runner know trilla every dollar coun...
3,Bamboo Banga,M.I.A.,anger,"Road runner, road runner\r\nGoing hundred mile...",road runner road runner going hundred mile per...
4,Die MF Die,Dope,anger,I don't need your forgiveness\r\nI don't need ...,dont need forgiveness dont need hate dont need...


In [9]:
df.to_csv("cleaned_muse_with_english_lyrics_only.csv", index=False)
