In [2]:
# cell 0 - count rows in all CSV files in current folder and print per-file counts + total
from pathlib import Path
import csv

def count_csv_rows(path):
    try:
        with path.open(newline='', encoding='utf-8') as f:
            return sum(1 for _ in csv.reader(f))
    except UnicodeDecodeError:
        with path.open(newline='', encoding='latin-1') as f:
            return sum(1 for _ in csv.reader(f))

p = Path('.')
csv_files = sorted(p.glob('*.csv'))

if not csv_files:
    print("No CSV files found in current folder.")
else:
    totals = {}
    for f in csv_files:
        rows = count_csv_rows(f)
        totals[f.name] = rows
        print(f"{f.name}: {rows}")
    total_sum = sum(totals.values())
    print("Total rows across all CSV files:", total_sum)

Billie_Ilish.csv: 41
Drake.csv: 106
Ed_Sheeran.csv: 126
Eminem.csv: 100
Justin_Bieber.csv: 96
Kanye_West.csv: 145
Kendrick_Lamar.csv: 97
Postmalone.csv: 63
Rihanna.csv: 128
Taylor_Swift.csv: 127
The_Weeknd.csv: 116
Travis_Scott.csv: 110
Total rows across all CSV files: 1255


In [7]:
import pandas as pd

# File names and final target record counts
files_targets = {
    "Billie_Ilish.csv": 41,
    "Drake.csv": 87,
    "Ed_Sheeran.csv": 86,
    "Eminem.csv": 100,
    "Justin_Bieber.csv": 96,
    "Kanye_West.csv": 86,
    "Kendrick_Lamar.csv": 97,
    "Postmalone.csv": 63,
    "Rihanna.csv": 90,
    "Taylor_Swift.csv": 87,
    "The_Weeknd.csv": 86,
    "Travis_Scott.csv": 86
}

master_df = []

for file, target_count in files_targets.items():
    # Load CSV
    df = pd.read_csv(file)

    # Shuffle so selection is random
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Trim to the required number of rows
    df = df.head(target_count)

    # Add artist column (file name without .csv)
    artist_name = file.replace(".csv", "")
    df["artist"] = artist_name

    master_df.append(df)

# Combine all final trimmed data
balanced_master = pd.concat(master_df, ignore_index=True)

# Save final dataset
balanced_master.to_csv("balanced_master.csv", index=False)

print("Balanced dataset created: balanced_master.csv")
print("Final shape:", balanced_master.shape)


Balanced dataset created: balanced_master.csv
Final shape: (1000, 9)


In [8]:
df = pd.read_csv("balanced_master.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   SongName   1000 non-null   object
 1   AlbumName  1000 non-null   object
 2   AlbumLink  1000 non-null   object
 3   Year       1000 non-null   int64 
 4   PlayCount  1000 non-null   int64 
 5   Lyrics     1000 non-null   object
 6   SongLink   1000 non-null   object
 7   Duration   1000 non-null   int64 
 8   artist     1000 non-null   object
dtypes: int64(3), object(6)
memory usage: 70.4+ KB


In [11]:
print(df['artist'].value_counts())

artist
Eminem            99
Kendrick_Lamar    96
Justin_Bieber     95
Rihanna           90
Drake             87
Taylor_Swift      87
Ed_Sheeran        86
Kanye_West        86
The_Weeknd        86
Travis_Scott      86
Postmalone        62
Billie_Ilish      40
Name: count, dtype: int64


In [10]:
len(df)

1000

In [12]:
import re
from pathlib import Path

out_dir = Path("Top_Artists")
out_dir.mkdir(exist_ok=True)

def sanitize_filename(s: str) -> str:
    s = s.strip()
    s = re.sub(r'[\\/*?:"<>|]', "", s)  # remove illegal filename chars
    s = re.sub(r'\s+', '_', s)  # replace whitespace with underscore
    return s[:200]  # cap length to avoid OS limits

seen = {}
count = 0

for idx, row in df.iterrows():
    artist = str(row.get("artist", "")).strip()
    song = str(row.get("SongName", "")).strip()
    lyrics = row.get("Lyrics", "")

    a = sanitize_filename(artist) or "unknown_artist"
    s = sanitize_filename(song) or f"song_{idx}"

    base_name = f"{a}_{s}.txt"
    if base_name in seen:
        seen[base_name] += 1
        filename = f"{a}_{s}_{seen[base_name]}.txt"
    else:
        seen[base_name] = 0
        filename = base_name

    path = out_dir / filename
    path.write_text(str(lyrics), encoding="utf-8")
    count += 1

print(f"Saved {count} lyric files to {out_dir}")

Saved 1000 lyric files to Top_Artists
