In [1]:
# cell 0 - count rows in all CSV files in current folder and print per-file counts + total
from pathlib import Path
import csv

def count_csv_rows(path):
    """Count the records in a CSV file.

    Every record produced by ``csv.reader`` is counted, so a header line (when
    the file has one) is included in the result, and a quoted field that spans
    several physical lines still counts as a single row.

    Args:
        path: Location of the CSV file. May be a ``pathlib.Path``, a ``str`` or
            any other ``os.PathLike`` object.

    Returns:
        int: Number of records in the file (0 for an empty file).

    Raises:
        OSError: If the file cannot be opened.
    """
    # Accept plain strings too; Path(Path(...)) is a no-op for existing callers.
    path = Path(path)

    def _count(encoding):
        # newline='' lets the csv module handle embedded line breaks itself.
        with path.open(newline='', encoding=encoding) as f:
            return sum(1 for _ in csv.reader(f))

    try:
        return _count('utf-8')
    except UnicodeDecodeError:
        # latin-1 maps every possible byte to a character, so this second
        # attempt cannot fail on decoding; the count restarts from zero.
        return _count('latin-1')

# Gather every CSV in the working directory, ordered by path.
p = Path('.')
csv_files = sorted(p.glob('*.csv'))

if csv_files:
    # File name -> row count, filled in (and echoed) one file at a time.
    totals = {}
    for f in csv_files:
        totals[f.name] = rows = count_csv_rows(f)
        print(f"{f.name}: {rows}")

    # Grand total over all files that were counted.
    total_sum = sum(totals.values())
    print("Total rows across all CSV files:", total_sum)
else:
    print("No CSV files found in current folder.")

DieAntwoord.csv: 74
JPEGMAFIA.csv: 108
WittLowry.csv: 62
bjork.csv: 83
charlesaznavour.csv: 54
duster.csv: 68
gangstarr.csv: 86
goonrock.csv: 2
justinskye.csv: 51
kenyagrace.csv: 18
kyan.csv: 9
mako.csv: 20
markmorrison.csv: 11
maydayparade.csv: 78
metronomy.csv: 50
natalieimbruglia.csv: 67
reneeelisegoldsberry.csv: 14
roar.csv: 32
stevieray vaughan.csv: 42
thehollies.csv: 220
troy.csv: 11
vigiland.csv: 16
Total rows across all CSV files: 1176


In [None]:
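# NOTE: this cell is disabled (every line below is commented out and the cell
# has no execution count); the output shown beneath it is left over from an
# earlier run of the live code.
# import pandas as pd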

# # File names and final target record counts
# files_targets = {
#     "Billie_Ilish.csv": 41,
#     "Drake.csv": 87,
#     "Ed_Sheeran.csv": 86,
#     "Eminem.csv": 100,
#     "Justin_Bieber.csv": 96,
#     "Kanye_West.csv": 86,
#     "Kendrick_Lamar.csv": 97,
#     "Postmalone.csv": 63,
#     "Rihanna.csv": 90,
#     "Taylor_Swift.csv": 87,
#     "The_Weeknd.csv": 86,
#     "Travis_Scott.csv": 86
# }

# master_df = []

# for file, target_count in files_targets.items():
#     # Load CSV
#     df = pd.read_csv(file)

#     # Shuffle so selection is random
#     df = df.sample(frac=1, random_state=42).reset_index(drop=True)

#     # Trim to the required number of rows
#     df = df.head(target_count)

#     # Add artist column (file name without .csv)
#     artist_name = file.replace(".csv", "")
#     df["artist"] = artist_name

#     master_df.append(df)

# # Combine all final trimmed data
# balanced_master = pd.concat(master_df, ignore_index=True)

# # Save final dataset
# balanced_master.to_csv("balanced_master.csv", index=False)

# print("Balanced dataset created: balanced_master.csv")
# print("Final shape:", balanced_master.shape)


Balanced dataset created: balanced_master.csv
Final shape: (1000, 9)


In [75]:
import pandas as pd

# The combined file is written into the same folder the inputs were globbed
# from, so a previously written copy can show up in `csv_files` (for example
# when the listing cell is re-run after this one). It must never be read back
# as an input, otherwise the master file would be merged into itself.
MASTER_CSV_NAME = "master_df.csv"

master_df_list = []

for f in csv_files:
    if f.name == MASTER_CSV_NAME:
        continue

    try:
        # Load the CSV file. Fall back to latin-1 when the file is not valid
        # UTF-8, matching the fallback count_csv_rows applies when counting.
        try:
            df_temp = pd.read_csv(f)
        except UnicodeDecodeError:
            df_temp = pd.read_csv(f, encoding='latin-1')

        # Extract artist name from filename (file name without the extension)
        artist_name = f.stem

        # Add the 'artist' column so rows stay attributable after concatenation
        df_temp['artist'] = artist_name

        # Append the dataframe to our list
        master_df_list.append(df_temp)

    except Exception as e:
        # Best effort: report the problem file and carry on with the others.
        print(f"Could not process file {f.name}: {e}")

# Concatenate all dataframes in the list into one
if master_df_list:
    master_df = pd.concat(master_df_list, ignore_index=True)

    # Save the combined dataframe to a new CSV file
    master_df.to_csv(MASTER_CSV_NAME, index=False)

    print("Successfully combined all CSV files into master_df.csv")
    print("Final shape of the master dataframe:", master_df.shape)
else:
    print("No dataframes were created to combine.")

Successfully combined all CSV files into master_df.csv
Final shape of the master dataframe: (1152, 9)


In [76]:
# Re-read the combined file from disk into `df` and print a per-column summary
# (non-null counts and dtypes).
df = pd.read_csv("master_df.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1152 entries, 0 to 1151
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   SongName   1152 non-null   object
 1   AlbumName  1152 non-null   object
 2   AlbumLink  1152 non-null   object
 3   Year       1152 non-null   int64 
 4   PlayCount  1152 non-null   int64 
 5   Lyrics     1152 non-null   object
 6   SongLink   1152 non-null   object
 7   Duration   1152 non-null   int64 
 8   artist     1152 non-null   object
dtypes: int64(3), object(6)
memory usage: 81.1+ KB


In [77]:
# Number of rows for each distinct value of the 'artist' column.
print(df['artist'].value_counts())

artist
thehollies              219
JPEGMAFIA               107
gangstarr                85
bjork                    82
maydayparade             77
DieAntwoord              73
duster                   67
natalieimbruglia         66
WittLowry                61
charlesaznavour          53
justinskye               50
metronomy                49
stevieray vaughan        41
roar                     31
mako                     19
kenyagrace               15
vigiland                 15
reneeelisegoldsberry     13
markmorrison             10
troy                     10
kyan                      8
goonrock                  1
Name: count, dtype: int64


In [78]:
# Total number of rows in `df`; left as a bare expression so the notebook displays it.
len(df)

1152

In [79]:
import re
from pathlib import Path

# Destination folder for the per-song text files written further down.
# exist_ok=True makes re-running this cell safe when the folder already exists.
out_dir = Path("Low Artists")
out_dir.mkdir(exist_ok=True)

def sanitize_filename(s: str) -> str:
    """Reduce arbitrary text to something usable as part of a filename.

    Steps, in order:
      1. trim surrounding whitespace;
      2. delete the characters ``\\ / * ? : " < > |``;
      3. collapse each run of whitespace into a single underscore
         (including whitespace left at the edges by step 2);
      4. truncate to at most 200 characters.

    Args:
        s: Text to clean.

    Returns:
        The cleaned text; may be an empty string if nothing survives.
    """
    max_len = 200  # cap length to avoid OS limits
    without_illegal = re.sub(r'[\\/*?:"<>|]', "", s.strip())
    underscored = re.sub(r'\s+', '_', without_illegal)
    return underscored[:max_len]

# Write one UTF-8 text file per row of `df` into `out_dir`, named
# "<artist>_<song>.txt", with a numeric suffix added when a name is taken.
#
# `seen` maps a base filename to the highest suffix handed out for it so far.
# `used_names` holds every filename actually written (case-folded), so that a
# generated "<base>_<n>.txt" can never overwrite another song whose own name
# happens to produce the same string, and so that names differing only by
# letter case do not clobber each other on case-insensitive filesystems.
seen = {}
used_names = set()
count = 0

for idx, row in df.iterrows():
    artist = str(row.get("artist", "")).strip()
    song = str(row.get("SongName", "")).strip()
    lyrics = row.get("Lyrics", "")
    # A missing value would otherwise be written out as the literal text "nan".
    lyrics_text = "" if pd.isna(lyrics) else str(lyrics)

    # Fall back to placeholders when sanitising leaves nothing behind.
    a = sanitize_filename(artist) or "unknown_artist"
    s = sanitize_filename(song) or f"song_{idx}"

    base_name = f"{a}_{s}.txt"
    filename = base_name
    # Keep bumping the suffix until the candidate is not already in use.
    while filename.casefold() in used_names:
        seen[base_name] = seen.get(base_name, 0) + 1
        filename = f"{a}_{s}_{seen[base_name]}.txt"
    seen.setdefault(base_name, 0)
    used_names.add(filename.casefold())

    path = out_dir / filename
    path.write_text(lyrics_text, encoding="utf-8")
    count += 1

print(f"Saved {count} lyric files to {out_dir}")

Saved 1152 lyric files to Low Artists


In [71]:
import pandas as pd
# Rebind `df` to the contents of a single CSV file ("WittLowry.csv"); from this
# point on `df` no longer refers to the data read from "master_df.csv".
df = pd.read_csv("WittLowry.csv")

In [72]:
# Remove leading and trailing whitespace from every song name, replacing the column.
df['SongName'] = df['SongName'].str.strip()

In [73]:

# Get the 'SongName' column and its corresponding index labels
songs = df['SongName'].tolist()
indices = df.index.tolist()

# Normalise every name once up front (trimmed, lower-cased) instead of redoing
# it on each comparison. Missing or blank names become None and are never
# compared: str(NaN) is "nan", which would "match" names such as "Banana", and
# an empty string is a substring of every other name.
normalized = []
for name in songs:
    text = "" if pd.isna(name) else str(name).strip().lower()
    normalized.append(text or None)

# Positions already printed, either as an "Original" or as a "Match".
# A position in this set is not used as an "Original" afterwards, so any
# further matches of that particular name are not listed separately.
reported_indices = set()

print("Checking for partial duplicate song names...")

# Compare each name with every later name
for i, s1 in enumerate(normalized):
    if s1 is None or i in reported_indices:
        continue

    # Later positions where one name is contained in the other
    matches = [
        j
        for j in range(i + 1, len(normalized))
        if normalized[j] is not None
        and (s1 in normalized[j] or normalized[j] in s1)
    ]

    if matches:
        print("\n--- Match Found ---")
        print(f"Original: Index {indices[i]}, Name: {songs[i]}")
        reported_indices.add(i)
        for match_idx in matches:
            print(f"   Match: Index {indices[match_idx]}, Name: {songs[match_idx]}")
            reported_indices.add(match_idx)

if not reported_indices:
    print("No partial duplicates found in 'SongName' column.")

Checking for partial duplicate song names...

--- Match Found ---
Original: Index 0, Name: ALONE
   Match: Index 19, Name: Hurt Alone

--- Match Found ---
Original: Index 18, Name: HURT
   Match: Index 19, Name: Hurt Alone


In [33]:
# Drop the row whose index label is 13 (the result replaces `df`)
df = df.drop(13)


In [34]:

# Save the updated DataFrame back to the CSV file (index column omitted) and
# report its new dimensions.
# NOTE(review): read top to bottom, `df` was most recently loaded from
# "WittLowry.csv", yet this writes to "kenyagrace.csv". Running the notebook in
# order would overwrite that file with another file's rows — confirm the target
# filename matches the file `df` was loaded from before executing.
df.to_csv("kenyagrace.csv", index=False)
print("New shape of the DataFrame:", df.shape)

New shape of the DataFrame: (15, 8)
