In [1]:
import pandas as pd

# === File paths ===
failed_files = [
    "failed_akipped_lyrics.csv",
    "failed_lyrics.csv",
    "failed_skipped_lyrics2.csv"
]
merged_failed_csv = "failed_lyrics_merged.csv"

# === Load and standardize each ===
dfs = []
for file in failed_files:
    df = pd.read_csv(file)
    # Keep only track & artist columns (drop reason, lyrics if exist)
    df = df[['track', 'artist']]
    dfs.append(df)

# === Concatenate all ===
merged_failed = pd.concat(dfs, ignore_index=True).drop_duplicates()
merged_failed.to_csv(merged_failed_csv, index=False)

print(f"✅ Merged failed files → {merged_failed_csv}")
print(f"📝 Total unique failed entries: {len(merged_failed)}")


✅ Merged failed files → failed_lyrics_merged.csv
📝 Total unique failed entries: 40504


In [2]:
df.drop_duplicates

<bound method DataFrame.drop_duplicates of                 track                     artist
0                  #1                       石井妥師
1     'round midnight    anne guus teerhuis trio
2                  07                     enigma
3                   3               florent ghys
4                   3  sunburned hand of the man
...               ...                        ...
1731     zero gravity    the transmissionary six
1732             zero                     island
1733             zero                     loscil
1734             zero            t. raumschmiere
1735         zoetrope                     keshco

[1736 rows x 2 columns]>

In [None]:
import pandas as pd

# === File paths ===
merged_failed_csv = "failed_lyrics_merged.csv"          # already merged file
muse_csv = "muse_90k_final.csv"                  # original dataset with emotion labels
output_csv = "failed_merged_with_emotion.csv"    # final clean output

# === Load datasets ===
failed_df = pd.read_csv(merged_failed_csv)
muse_df = pd.read_csv(muse_csv)

# === Merge to add final_emotion from original dataset ===
failed_with_emotion = failed_df.merge(
    muse_df[['track', 'artist', 'final_emotion']],
    on=['track', 'artist'],
    how='left'
)

# === Drop duplicates just in case ===
failed_with_emotion = failed_with_emotion.drop_duplicates(subset=['track', 'artist'])

# === Keep only required columns ===
failed_with_emotion = failed_with_emotion[['track', 'artist', 'final_emotion']]

# === Save result ===
failed_with_emotion.to_csv(output_csv, index=False)

print(f"✅ Final failed dataset with emotion saved → {output_csv}")
print(f"📊 Total rows: {len(failed_with_emotion)}")
print(f"⚠️ Missing emotion labels: {failed_with_emotion['final_emotion'].isna().sum()}")


📝 Total unique failed entries: 40504


In [4]:
import pandas as pd

# === File paths ===
merged_failed_csv = "failed_lyrics_merged.csv"          # already merged file
muse_csv = "muse_final_emotion_dataset.csv"                  # original dataset with emotion labels
output_csv = "failed_merged_with_emotion.csv"    # final clean output

# === Load datasets ===
failed_df = pd.read_csv(merged_failed_csv)
muse_df = pd.read_csv(muse_csv)

# === Merge to add final_emotion from original dataset ===
failed_with_emotion = failed_df.merge(
    muse_df[['track', 'artist', 'final_emotion']],
    on=['track', 'artist'],
    how='left'
)

# === Drop duplicates just in case ===
failed_with_emotion = failed_with_emotion.drop_duplicates(subset=['track', 'artist'])

# === Keep only required columns ===
failed_with_emotion = failed_with_emotion[['track', 'artist', 'final_emotion']]

# === Save result ===
failed_with_emotion.to_csv(output_csv, index=False)

print(f"✅ Final failed dataset with emotion saved → {output_csv}")
print(f"📊 Total rows: {len(failed_with_emotion)}")
print(f"⚠️ Missing emotion labels: {failed_with_emotion['final_emotion'].isna().sum()}")


✅ Final failed dataset with emotion saved → failed_merged_with_emotion.csv
📊 Total rows: 40504
⚠️ Missing emotion labels: 3414


In [7]:
import pandas as pd

# Load datasets
failed = pd.read_csv("failed_merged_with_emotion.csv")
muse = pd.read_csv("muse_final_emotion_dataset.csv")  # original

# Identify rows with missing emotion
missing_emotion = failed[failed['final_emotion'].isna()]
print("Missing emotion count:", len(missing_emotion))

# Check sample mismatches
print(missing_emotion.head())

# See if stripping spaces helps match more
failed['track_clean'] = failed['track'].str.strip().str.lower()
failed['artist_clean'] = failed['artist'].str.strip().str.lower()
muse['track_clean'] = muse['track'].str.strip().str.lower()
muse['artist_clean'] = muse['artist'].str.strip().str.lower()

# Check how many of missing ones exist in muse after cleaning
missing_clean_match = missing_emotion.merge(
    muse[['track_clean', 'artist_clean']],
    on=['track_clean', 'artist_clean'],
    how='inner'
)
print("After cleaning, matches found:", len(missing_clean_match))


Missing emotion count: 3414
                          track                         artist final_emotion
0                            #1                      spambient           NaN
1          (left a) pretty scar                  adrian sieber           NaN
2  04 - de mende dans les mende  zeru ta lur / cérou ta lourre           NaN
3                            05                     panda bear           NaN
5                          2012                    lifeisround           NaN


KeyError: 'track_clean'

In [8]:
import pandas as pd
import re

# Load
failed = pd.read_csv("failed_lyrics_merged.csv")  # before emotion merge
muse = pd.read_csv("muse_final_emotion_dataset.csv")

# Clean function for track/artist
def clean_text(x):
    if pd.isna(x): return ""
    x = x.strip().lower()
    x = re.sub(r'^\d+\s*[-–]?\s*', '', x)   # remove leading track numbers like "01 -"
    x = re.sub(r'[^\w\s]', '', x)           # remove punctuation
    x = re.sub(r'\s+', ' ', x)              # normalize spaces
    return x

# Apply cleaning
failed['track_clean'] = failed['track'].apply(clean_text)
failed['artist_clean'] = failed['artist'].apply(clean_text)
muse['track_clean'] = muse['track'].apply(clean_text)
muse['artist_clean'] = muse['artist'].apply(clean_text)

# Merge on cleaned fields
merged = failed.merge(
    muse[['track_clean','artist_clean','final_emotion']],
    on=['track_clean','artist_clean'],
    how='left'
)

# Check result
print("✅ Total rows after merge:", len(merged))
print("⚠️ Still missing emotions:", merged['final_emotion'].isna().sum())

# Save
merged.to_csv("failed_merged_with_emotion_cleaned.csv", index=False)


✅ Total rows after merge: 40899
⚠️ Still missing emotions: 0
