In [1]:
import pandas as pd
from pathlib import Path

# Load the audio-only training table and collect the set of track ids it
# covers; a later cell uses `audio_ids` to filter the lyrics CSV.
# NOTE(review): the recorded output of this cell shows more rows than unique
# ids, so some spotify_id values repeat in this table - confirm that is expected.
root = Path("..")
audio_parquet = root / "data/merged/train_audio_only_final.parquet"
audio_final = pd.read_parquet(audio_parquet)

# Ids are cast to str so they compare cleanly with the str-cast ids read from the CSV.
audio_ids = {*audio_final["spotify_id"].astype(str)}

n_rows, n_unique = len(audio_final), len(audio_ids)
print("audio_final rows:", n_rows)
print("unique audio ids:", n_unique)


audio_final rows: 1432
unique audio ids: 1427


In [2]:
import pandas as pd
from pathlib import Path

# NOTE(review): this cell repeats the preceding cell's work (same file, same
# variables, same printed counts); it looks like a leftover copy - consider
# removing it.
root = Path("..")
audio_final = pd.read_parquet(root.joinpath("data/merged/train_audio_only_final.parquet"))

# Distinct track ids, as strings.
id_strings = audio_final["spotify_id"].astype(str)
audio_ids = set(id_strings.tolist())

print("audio_final rows:", audio_final.shape[0])
print("unique audio ids:", len(audio_ids))


audio_final rows: 1432
unique audio ids: 1427


In [3]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Build the lyrics table for the tracks that have audio: stream the merged CSV
# in chunks, keep rows whose id is in `audio_ids` (built in an earlier cell)
# and whose lyrics are present, then write the result to parquet.
# NOTE(review): rows are written in CSV order; if this table is later stacked
# row-wise with an audio feature matrix, confirm both share the same row order.
root = Path("..")
csv_path = root / "data/merged/merged_with_lyrics.csv"
out = root / "data/merged/train_audio_lyrics_final.parquet"

usecols = ["spotify_id", "name", "artist", "lyrics"]  # only the columns that are needed
chunksize = 50000  # lower this (e.g. 20000) if RAM is tight

kept = []
seen = matched = 0

reader = pd.read_csv(csv_path, usecols=usecols, chunksize=chunksize)
for chunk in tqdm(reader, desc="Reading merged_with_lyrics.csv"):
    seen += len(chunk)

    # Cast ids to str before the membership test, then apply both conditions
    # (known audio id, non-null lyrics) with a single boolean mask.
    chunk = chunk.assign(spotify_id=chunk["spotify_id"].astype(str))
    keep_mask = chunk["spotify_id"].isin(audio_ids) & chunk["lyrics"].notna()
    hits = chunk.loc[keep_mask].copy()

    matched += len(hits)
    if not hits.empty:
        kept.append(hits)

    # One running-total line per chunk, emitted through tqdm.
    tqdm.write(f"seen={seen}, matched={matched}")

if kept:
    df = pd.concat(kept, ignore_index=True)
else:
    # Nothing matched: fall back to an empty frame with the expected columns.
    df = pd.DataFrame(columns=usecols)
df.to_parquet(out, index=False)

print("✅ saved:", out)
print("✅ rows for lyrics embedding:", len(df))


Reading merged_with_lyrics.csv: 1it [00:00, 29.59it/s]

seen=2006, matched=1432
✅ saved: ..\data\merged\train_audio_lyrics_final.parquet
✅ rows for lyrics embedding: 1432





In [4]:
import pandas as pd
from pathlib import Path

# Read the lyrics table back from disk to check its row count and columns.
root = Path("..")
lyrics_parquet = root / "data/merged/train_audio_lyrics_final.parquet"
df = pd.read_parquet(lyrics_parquet)

print("rows:", df.shape[0])
print(df.columns)
df.head(2)  # last expression, so the notebook renders the first two rows


rows: 1432
Index(['spotify_id', 'name', 'artist', 'lyrics'], dtype='object')


Unnamed: 0,spotify_id,name,artist,lyrics
0,0z4sFQ2yzLbjzSsNC5qAkD,Shadow Moses,Bring Me The Horizon,[Chorus]\nCan you tell from the look in our ey...
1,0qQBuGsT79rKtmmfC0V38x,Haru Haru,BIGBANG,"[Intro: G-Dragon, Seungri]\nYeah\nFinally, I r..."


In [5]:
import numpy as np
from pathlib import Path

# Load the array stored in X_audio_mfcc.npy (presumably the MFCC audio
# features, going by the file name - confirm) and print its shape.
root = Path("..")
mfcc_path = root / "data/merged/X_audio_mfcc.npy"
X = np.load(mfcc_path)
print(X.shape)


(1432, 12000)


In [6]:
import pandas as pd
from pathlib import Path

# Load the lyrics table into `df`; the TF-IDF cell further down reads its
# "lyrics" column.
root = Path("..")
df = pd.read_parquet(root.joinpath("data/merged/train_audio_lyrics_final.parquet"))

row_count = len(df)
print("rows:", row_count)
df.head(2)  # rendered by the notebook as the cell's output


rows: 1432


Unnamed: 0,spotify_id,name,artist,lyrics
0,0z4sFQ2yzLbjzSsNC5qAkD,Shadow Moses,Bring Me The Horizon,[Chorus]\nCan you tell from the look in our ey...
1,0qQBuGsT79rKtmmfC0V38x,Haru Haru,BIGBANG,"[Intro: G-Dragon, Seungri]\nYeah\nFinally, I r..."


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import joblib

# Turn each row's lyrics into a TF-IDF vector. `df` is the lyrics table
# loaded by an earlier cell.
texts = list(df["lyrics"].astype(str))  # every entry is passed to the vectorizer as str

# Vocabulary is capped at 5000 terms and English stop words are dropped.
tfidf_options = {"max_features": 5000, "stop_words": "english"}
vectorizer = TfidfVectorizer(**tfidf_options)
X_lyrics = vectorizer.fit_transform(texts)

print("X_lyrics shape:", X_lyrics.shape)


X_lyrics shape: (1432, 5000)


In [8]:
# Persist the TF-IDF matrix (`X_lyrics`) and the fitted `vectorizer` created
# in the previous cell.
root = Path("..")
tfidf_matrix_path = root / "data/merged/X_lyrics_tfidf.npz"
vectorizer_path = root / "data/merged/tfidf_vectorizer.joblib"

sparse.save_npz(tfidf_matrix_path, X_lyrics)
joblib.dump(vectorizer, vectorizer_path)

# The messages below are fixed strings; they are not derived from `root`.
print("saved: data/merged/X_lyrics_tfidf.npz")
print("saved: data/merged/tfidf_vectorizer.joblib")


saved: data/merged/X_lyrics_tfidf.npz
saved: data/merged/tfidf_vectorizer.joblib
