In [4]:
import pandas as pd
import json

# === File paths ===
MUSE_CSV = "muse_final_emotion_dataset.csv"       # original dataset
LYRICS_CSV = "muse_with_lyrics.csv"               # after fetching
FAILED_CSV = "failed_lyrics.csv"                  # existing failed file
PROGRESS_JSON = "progress_smh.json"                   # tracks traversed
SKIPPED_OUT_CSV = "skipped_refetch.csv"           # output skipped songs

# === Step 1: Move 'lyrics_not_found' from fetched CSV to failed list ===
print("🔸 Loading datasets...")
lyrics_df = pd.read_csv(LYRICS_CSV)
failed_df = pd.read_csv(FAILED_CSV) if not pd.read_csv(FAILED_CSV).empty else pd.DataFrame(columns=lyrics_df.columns)

# Identify rows with lyrics_not_found
not_found_mask = lyrics_df['lyrics'].eq('LYRICS_NOT_FOUND') | lyrics_df['lyrics'].isna()
not_found_df = lyrics_df[not_found_mask]

print(f"Found {len(not_found_df)} entries with LYRICS_NOT_FOUND or no lyrics.")

# Append them to failed list (avoid duplicates)
failed_df = pd.concat([failed_df, not_found_df]).drop_duplicates(subset=["track", "artist"])
failed_df.to_csv(FAILED_CSV, index=False)
print(f"✅ Failed file updated → {FAILED_CSV}")

# Optionally, remove them from lyrics_df if needed (doesn't modify original file here)
# lyrics_df = lyrics_df[~not_found_mask]



🔸 Loading datasets...
Found 2434 entries with LYRICS_NOT_FOUND or no lyrics.
✅ Failed file updated → failed_lyrics.csv


In [3]:
print(lyrics_df.columns)


Index(['track', 'artist', 'final_emotion', 'lyrics'], dtype='object')


In [6]:
import pandas as pd

# === File paths ===
INPUT_CSV = "muse_with_lyrics.csv"
OUTPUT_CSV = "muse_with_lyrics_cleaned.csv"

# === Load the dataset ===
df = pd.read_csv(INPUT_CSV)
initial_count = len(df)

# === Identify rows to drop ===
drop_mask = df['lyrics'].eq('LYRICS_NOT_FOUND') | df['lyrics'].isna()
drop_count = drop_mask.sum()

# === Filter out unwanted rows ===
cleaned_df = df[~drop_mask].copy()
final_count = len(cleaned_df)

# === Save to new CSV ===
cleaned_df.to_csv(OUTPUT_CSV, index=False)

print(f"🧹 Cleaned dataset saved → {OUTPUT_CSV}")
print(f"❌ Removed {drop_count} rows (LYRICS_NOT_FOUND or empty lyrics)")
print(f"✅ Remaining rows: {final_count} / {initial_count}")


🧹 Cleaned dataset saved → muse_with_lyrics_cleaned.csv
❌ Removed 2434 rows (LYRICS_NOT_FOUND or empty lyrics)
✅ Remaining rows: 42328 / 44762


In [29]:
df = pd.read_csv(OUTPUT_CSV)

print("\n📊 Emotion distribution after first fetching:")
print(df['final_emotion'].value_counts())


📊 Emotion distribution after first fetching:
final_emotion
sadness       6980
joy           6916
reflective    6127
excitement    5965
calm          4614
neutral       4597
romantic      3290
anger         2713
fear          1126
Name: count, dtype: int64


In [11]:
# === Step 2: Extract skipped songs ===
print("\n🔸 Extracting skipped songs...")

# Load original dataset
muse_df = pd.read_csv(MUSE_CSV)

# Load progress.json
with open(PROGRESS_JSON, "r", encoding="utf-8") as f:
    progress = json.load(f)

# iterate through values (which are the actual song entries)
traversed_titles = set(item["track"] for item in progress.values())
traversed_artists = set(item["artist"] for item in progress.values())
traversed_pairs = set((item["track"], item["artist"]) for item in progress.values())

# Load progress.json
# with open(PROGRESS_JSON, "r", encoding="utf-8") as f:
#     progress = json.load(f)

# traversed_titles = set(item["track"] for item in progress)
# traversed_artists = set(item["artist"] for item in progress)
# traversed_pairs = set((item["track"], item["artist"]) for item in progress)

# Filter out songs that were traversed
mask = muse_df.apply(lambda row: (row["track"], row["artist"]) not in traversed_pairs, axis=1)
skipped_df = muse_df[mask]

print(f"Skipped songs found: {len(skipped_df)}")
skipped_df.to_csv(SKIPPED_OUT_CSV, index=False)
print(f"✅ Skipped songs saved → {SKIPPED_OUT_CSV}")



🔸 Extracting skipped songs...


TypeError: 'bool' object is not subscriptable

In [10]:
import json

with open(PROGRESS_JSON, "r", encoding="utf-8") as f:
    progress = json.load(f)

print(type(progress))
print(list(progress.items())[:3])   # preview first 3 entries


<class 'dict'>
[('45', True), ('463', True), ('1923', True)]


In [18]:
import pandas as pd
import json

# === File paths ===
MUSE_CSV = "muse_final_emotion_dataset.csv"
PROGRESS_JSON = "progress_smh.json"
SKIPPED_OUT_CSV = "skipped_refetch.csv"

# === Load original dataset ===
muse_df = pd.read_csv(MUSE_CSV)
print(f"📄 Original dataset size: {len(muse_df)}")

# === Load progress.json (all keys are track titles) ===
with open(PROGRESS_JSON, "r", encoding="utf-8") as f:
    progress = json.load(f)



# Optional: handle artist matching as well
# Create combined "track|artist" keys if you want exact match
progress_keys = set(progress.keys())

# === Find skipped songs ===
# Filter out rows where track (or track|artist) is in progress
skipped_df = muse_df[~muse_df['track'].isin(progress_keys)]
print(f"⏸️ Skipped entries identified: {len(skipped_df)}")

# === Save skipped songs ===
skipped_df.to_csv(SKIPPED_OUT_CSV, index=False)
print(f"💾 Skipped songs saved to: {SKIPPED_OUT_CSV}")


📄 Original dataset size: 90001
⏸️ Skipped entries identified: 0
⏸️ Skipped entries identified: 0
💾 Skipped songs saved to: skipped_refetch.csv


In [22]:
print("Sample keys from progress.json:", list(progress.keys())[:10])
print("Sample tracks from dataset:", muse_df['track'].head(10).tolist())


Sample keys from progress.json: ["'Till I Collapse", 'St. Anger', "Speedin'", 'Bamboo Banga', 'Die MF Die', 'Step Up', 'Feedback', '7 Words', 'Limp', 'Sweet Amber']
Sample tracks from dataset: ["'Till I Collapse", 'St. Anger', "Speedin'", 'Bamboo Banga', 'Die MF Die', 'Step Up', 'Feedback', '7 Words', 'Limp', 'Sweet Amber']


In [28]:
import pandas as pd
import json
import unicodedata

# === File paths ===
MUSE_CSV = "muse_final_emotion_dataset.csv"
PROGRESS_JSON = "progress_smh.json"
SKIPPED_OUT_CSV = "skipped_refetch.csv"

# === Helper: Normalize strings ===
def normalize_text(s):
    if not isinstance(s, str):
        return ""
    return unicodedata.normalize('NFKC', s.strip().lower())

# === Load original dataset ===
muse_df = pd.read_csv(MUSE_CSV)
print(f"📄 Original dataset size: {len(muse_df)}")

# Normalize track titles in dataset
muse_df["track_normalized"] = muse_df["track"].apply(normalize_text)

# === Load progress.json ===
with open(PROGRESS_JSON, "r", encoding="utf-8") as f:
    progress = json.load(f)

# Normalize progress keys
progress_keys_normalized = set(normalize_text(k) for k in progress.keys())

# === Filter skipped songs ===
# Create a mask instead of slicing first
skipped_mask = ~muse_df["track_normalized"].isin(progress_keys_normalized)

# Then apply to the original dataset (without the normalized column)
skipped_df = muse_df.loc[skipped_mask, muse_df.columns.difference(["track_normalized"])]
print(f"⏸️ Skipped entries identified: {len(skipped_df)}")

# === Save to CSV ===
skipped_df.to_csv(SKIPPED_OUT_CSV, index=False)
print(f"💾 Skipped songs saved to: {SKIPPED_OUT_CSV}")

import os

skipped_df.to_csv(SKIPPED_OUT_CSV, index=False)
print(f"💾 Skipped songs saved to: {os.path.abspath(SKIPPED_OUT_CSV)}")

# Double-check file content
with open(SKIPPED_OUT_CSV, "r", encoding="utf-8") as f:
    content = f.read()
    print("\n📄 CSV File Content Preview:\n")
    print(content[:500] if content else "[Empty file]")


print("Preview of skipped entries:")
print(skipped_df.head(5))



📄 Original dataset size: 90001
⏸️ Skipped entries identified: 121
💾 Skipped songs saved to: skipped_refetch.csv
💾 Skipped songs saved to: c:\Users\LENOVO\Desktop\LSRA\Sentiment_Classification\Lyrics_Fetching\updated\skipped_refetch.csv

📄 CSV File Content Preview:

artist,final_emotion,track
Dry Kill Logic,anger,4039
Elvis Costello,anger,45
Buck 65,anger,463
Marissa Nadler,sadness,1923
Decapitated,excitement,404
Сплин,excitement,3007
Daturah,excitement,9
Sunny Day Real Estate,excitement,9
Cornelius,excitement,2010
Squarepusher,reflective,4001
坂本龍一,anger,1919
Café Tacvba,sadness,53100
Bohren & der Club of Gore,sadness,3
The Dresden Dolls,reflective,672
Interpol,fear,5
Emilie Autumn,fear,306
Sedativ,sadness,9
The Leather Nun,sadness,506
Mattafix,fear,1130
Th
Preview of skipped entries:
              artist final_emotion track
503   Dry Kill Logic         anger  4039
1740  Elvis Costello         anger    45
1772         Buck 65         anger   463
2670  Marissa Nadler       sadness  1923


In [33]:
import csv

INPUT_CSV = "akipped_muse_with_lyrics.csv"
TEMP_CSV = "skipped_temp_clean.csv"

with open(INPUT_CSV, 'r', encoding='utf-8', errors='ignore') as infile, \
     open(TEMP_CSV, 'w', encoding='utf-8', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    for row in reader:
        if len(row) >= 3:    # make sure track, artist, lyrics exist
            writer.writerow(row[:3])  # truncate any extra messy fields

# Now load the cleaned CSV
df = pd.read_csv(TEMP_CSV)
print(f"✅ Loaded {len(df)} rows after pre-cleaning")


✅ Loaded 9614 rows after pre-cleaning


In [35]:
import pandas as pd

# === File paths ===
INPUT_CSV = "skipped_temp_clean.csv"
OUTPUT_CSV = "skipped_muse_with_lyrics_cleaned.csv"

# === Load the dataset safely ===
df = pd.read_csv(INPUT_CSV, quotechar='"', escapechar='\\')
initial_count = len(df)

# === Clean lyrics column and identify rows to drop ===
df['lyrics'] = df['lyrics'].astype(str).str.strip()
drop_mask = (df['lyrics'].eq('LYRICS_NOT_FOUND')) | (df['lyrics'].isna()) | (df['lyrics'].eq('nan'))
drop_count = drop_mask.sum()

# === Filter out unwanted rows ===
cleaned_df = df[~drop_mask].copy()
final_count = len(cleaned_df)

# === Save to new CSV ===
cleaned_df.to_csv(OUTPUT_CSV, index=False)

print(f"🧹 Cleaned dataset saved → {OUTPUT_CSV}")
print(f"❌ Removed {drop_count} rows (LYRICS_NOT_FOUND or empty lyrics)")
print(f"✅ Remaining rows: {final_count} / {initial_count}")


🧹 Cleaned dataset saved → skipped_muse_with_lyrics_cleaned.csv
❌ Removed 2496 rows (LYRICS_NOT_FOUND or empty lyrics)
✅ Remaining rows: 7118 / 9614
