In [104]:

from datasets import load_dataset
import pandas as pd
import numpy as np
import os
import re

#dataset = load_dataset("maharshipandya/spotify-tracks-dataset")

In [106]:
lyrics1_path = 'lyrics1_dataset.csv'
lyrics2_folder_path = 'lyrics2_dataset'

# Placeholder text that marks a row as having no real lyrics (matched case-insensitively below).
no_lyrics_text = "lyrics for this song have yet to be released please check back once the song has been released"

# Read the first dataset.
# read_csv(usecols=...) returns the columns in the order they appear in the file, not in the
# order of the usecols list, so rename by name and then select explicitly instead of
# assigning positional labels.
df_master = pd.read_csv(lyrics1_path, usecols=['Artist Name', 'Song Name', 'Lyrics'])
df_master = df_master.rename(columns={'Artist Name': 'Artist', 'Song Name': 'Title'})
df_master = df_master[['Artist', 'Title', 'Lyrics']]  # Standardize column names and order

# Collect every frame first and concatenate once at the end; concatenating inside the loop
# re-copies the growing master frame on every iteration.
lyrics_frames = [df_master]

# Iterate through each CSV file in the lyrics2_dataset folder.
# os.listdir returns entries in arbitrary order; sorting makes the combined row order (and
# therefore any later keep='first' de-duplication) reproducible.
for file in sorted(os.listdir(lyrics2_folder_path)):
    file_path = os.path.join(lyrics2_folder_path, file)
    if os.path.isfile(file_path) and file_path.endswith('.csv'):
        # Read the dataset
        df_temp = pd.read_csv(file_path, usecols=['Artist', 'Title', 'Lyric'])

        # Filter out rows with no lyrics (rows with missing lyrics are kept: na=False)
        df_temp = df_temp.loc[~df_temp['Lyric'].str.contains(no_lyrics_text, case=False, na=False)]

        # Standardize column names by name, independent of the file's column order
        df_temp = df_temp.rename(columns={'Lyric': 'Lyrics'})[['Artist', 'Title', 'Lyrics']]

        lyrics_frames.append(df_temp)

# Single concatenation of the first dataset plus every lyrics2 file
df_master = pd.concat(lyrics_frames, ignore_index=True)



# Despite its name, this pattern matches the characters to REMOVE: anything that is not a
# letter, digit, space, or one of . , ; ' " ? ! ( ) -
allowed_chars_pattern = re.compile(r'[^a-zA-Z0-9 .,;\'"?!()-]')

def clean_text(text):
    """Return ``text`` without double quotes and without any non-whitelisted character.

    Double quotes are dropped first, then every character matched by
    ``allowed_chars_pattern`` is deleted (nothing is substituted in its place).
    """
    without_quotes = text.replace('"', '')
    return allowed_chars_pattern.sub('', without_quotes)

def clean_dataframe(df):
    """Sanitize every column of ``df`` in place and return the same object.

    Each column is converted with ``astype(str)`` and every resulting value is
    passed through ``clean_text``. Because the columns are reassigned on ``df``
    itself, the caller's frame is modified; the return value is that same frame.
    """
    for column_name in df.columns:
        as_text = df[column_name].astype(str)
        df[column_name] = as_text.map(clean_text)
    return df

# Sanitize the combined frame. clean_dataframe modifies its argument and returns it, so
# df_master itself holds the cleaned values after this call.
df_master_cleaned = clean_dataframe(df_master)

# Persist the cleaned, combined lyrics
combined_output_path = 'combined_lyrics_dataset.csv'
df_master.to_csv(combined_output_path, index=False)

print("Dataset combined and saved.")

Dataset combined and saved.


In [98]:
# Load the track features, keeping the first row for each (artists, track_name) pair.
features_df = pd.read_csv('features_dataset.csv').drop_duplicates(subset=['artists', 'track_name'], keep='first')
combined_lyrics_df = df_master
# 'artists' is split on ';' and exploded so each listed artist gets its own row.
features_df['artists'] = features_df['artists'].str.split(';')
features_exploded_df = features_df.explode('artists')

# NOTE(review): the active join matches on title only, so a lyrics row can be paired with the
# features of a same-titled track by a different artist. The stricter artist+title join is
# kept below for reference -- confirm which one is intended.
#merged_df = pd.merge(combined_lyrics_df, features_exploded_df, left_on=['Artist', 'Title'], right_on=['artists', 'track_name'])
merged_df = pd.merge(combined_lyrics_df, features_exploded_df, left_on=['Title'], right_on=['track_name'])

final_columns = ['Artist', 'Title', 'Lyrics', 'track_id', 'popularity', 'duration_ms', 'explicit', 
                 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'track_genre']
final_merged_df = merged_df[final_columns]


# Keep a single row per (Artist, Title); the title-only merge can produce several.
final_merged_df = final_merged_df.drop_duplicates(subset=['Artist', 'Title'], keep='first')
final_merged_df.to_csv('final_lyrics_features_combined.csv', index=False)
# Snapshot, not an alias: later cells overwrite columns of final_merged_df in place (padding
# them into strings). Copying keeps state1 as the untouched merged data, so cells that read
# state1 do not require this cell to be re-run first.
state1 = final_merged_df.copy()

In [96]:
PAD_CHAR = u"\u25A1"  # the empty square character

def pad_column(series):
    """Return ``series`` as strings, right-padded with PAD_CHAR to a common width.

    Every value is stringified; the target width is the length of the longest
    stringified value, and shorter values get PAD_CHAR appended until they reach it.
    """
    as_text = series.astype(str)
    width = as_text.map(len).max()  # longest stringified value in the column
    return as_text.map(lambda cell: cell.ljust(width, PAD_CHAR))

# Columns that are padded to a fixed width and joined (in this order) into the metadata string
columns_to_concatenate = ['Artist', 'Title', 'popularity', 'duration_ms', 'explicit', 
                          'danceability', 'energy', 'key', 'loudness', 'mode', 
                          'speechiness', 'acousticness', 'instrumentalness', 
                          'liveness', 'valence', 'tempo', 'time_signature', 
                          'track_genre']

# Rebind to a private copy before overwriting columns. Other names (for example state1 from
# the merge cell) may refer to the same frame object; padding it in place would silently turn
# their numeric columns into padded strings. Copying also avoids chained-assignment warnings.
final_merged_df = final_merged_df.copy()

# Apply padding: every listed column becomes fixed-width text
for col in columns_to_concatenate:
    final_merged_df[col] = pad_column(final_merged_df[col])

# Concatenate the padded columns, separated by single spaces
final_merged_df['metadata'] = final_merged_df[columns_to_concatenate].agg(' '.join, axis=1)

# Select the 'metadata' and 'Lyrics' columns to form a new DataFrame, one row per distinct lyric
new_df = final_merged_df[['metadata', 'Lyrics']].drop_duplicates(subset=['Lyrics'], keep='first')
# Output the new DataFrame to a TSV file
new_df.to_csv('metadata.tsv', sep='\t', index=False)


In [97]:
# new_df had 1127 rows in total when this notebook was last run.
DEV_SIZE = 300    # number of shuffled rows written to lyrics_dev.tsv
SPLIT_SEED = 42   # fixed seed so the dev/train split is the same on every run

# shuffle rows (seeded, so re-running the cell reproduces the same split)
shuffled_df = new_df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
# lyrics_dev.tsv: the first DEV_SIZE shuffled rows
shuffled_df[:DEV_SIZE].to_csv('lyrics_dev.tsv', sep='\t', index=False)

# lyrics_train.tsv: every remaining row (827 of the 1127 at the time of writing)
shuffled_df[DEV_SIZE:].to_csv('lyrics_train.tsv', sep='\t', index=False)

In [99]:
# state1 must hold the unpadded merged data (numeric feature columns). If another cell has
# padded that frame in place through a shared reference, RUN CELL 3 AGAIN before running this.
# make lyrics_inputs.tsv 100 --> randomly shuffle the columns
# NOTE: this cell rebinds the names shuffled_df and new_df that earlier cells also assign.

# One seeded generator drives every shuffle and fluctuation so the output is reproducible.
# The same generator object is passed to each sample() call so its state advances and every
# column gets a different permutation (a plain integer seed would permute all columns alike).
inputs_rng = np.random.RandomState(0)

# Shuffle each column independently, leaving 'Lyrics' in place. The source index is reset
# first so the untouched 'Lyrics' column lines up with the re-indexed shuffled columns.
# (The original compared against 'lyrics', which never matched the actual column name.)
shuffle_source = state1.reset_index(drop=True)
shuffled_df = shuffle_source.apply(
    lambda x: x if x.name == 'Lyrics' else x.sample(frac=1, random_state=inputs_rng).reset_index(drop=True)
)

# Define fluctuation ranges: (low, high) bounds of the uniform noise added to each column
fluctuation_ranges = {
    'popularity': (-10, 10),
    'duration_ms': (-300, 300),
    'danceability': (-0.2, 0.2),
    'energy': (-0.2, 0.2),
    'speechiness': (-0.1, 0.1),
    'acousticness': (-0.1, 0.1),
    'instrumentalness': (-0.1, 0.1),
    'liveness': (-0.1, 0.1)
}

# Apply fluctuations
# NOTE(review): results are not clipped, so values can leave their original range -- confirm
# whether that is acceptable for the generated inputs.
for column, fluct_range in fluctuation_ranges.items():
    if column in shuffled_df.columns:
        random_fluctuation = inputs_rng.uniform(fluct_range[0], fluct_range[1], shuffled_df.shape[0])
        shuffled_df[column] = shuffled_df[column].astype(float) + random_fluctuation
        print(f"Fluctuated {column} within range {fluct_range}.")

# Padding
PAD_CHAR = u"\u25A1"  # Define padding character

columns_to_concatenate = ['Artist', 'Title', 'popularity', 'duration_ms', 'explicit', 
                          'danceability', 'energy', 'key', 'loudness', 'mode', 
                          'speechiness', 'acousticness', 'instrumentalness', 
                          'liveness', 'valence', 'tempo', 'time_signature', 
                          'track_genre']

shuffled_df['explicit'] = shuffled_df['explicit'].astype(str)
# The original also remapped state1['explicit'] here. That statement never affected
# shuffled_df (already derived above) and replaced the values with NaN when the cell was run
# a second time, so it has been removed to keep state1 intact and the cell re-runnable.

# pad_column is defined in an earlier cell
for col in columns_to_concatenate:
    if col in shuffled_df.columns:
        shuffled_df[col] = pad_column(shuffled_df[col])
        print(f"Padded {col}.")

# Create a new DataFrame for metadata
shuffled_df['metadata'] = shuffled_df[columns_to_concatenate].agg(' '.join, axis=1)
new_df = shuffled_df[['metadata', 'Lyrics']]  # 'Lyrics' is carried through unshuffled

# Only the first 100 rows are written out
new_df[:100].to_csv('lyrics_inputs.tsv', sep='\t', index=False)

Fluctuated popularity within range (-10, 10).
Fluctuated duration_ms within range (-300, 300).
Fluctuated danceability within range (-0.2, 0.2).
Fluctuated energy within range (-0.2, 0.2).
Fluctuated speechiness within range (-0.1, 0.1).
Fluctuated acousticness within range (-0.1, 0.1).
Fluctuated instrumentalness within range (-0.1, 0.1).
Fluctuated liveness within range (-0.1, 0.1).
Padded Artist.
Padded Title.
Padded popularity.
Padded duration_ms.
Padded explicit.
Padded danceability.
Padded energy.
Padded key.
Padded loudness.
Padded mode.
Padded speechiness.
Padded acousticness.
Padded instrumentalness.
Padded liveness.
Padded valence.
Padded tempo.
Padded time_signature.
Padded track_genre.


In [103]:
import re

# Pad character that must survive cleaning
PAD_CHAR = u"\u25A1"

# Matches the characters to strip: everything except letters, digits, space, . , ' ? ! and the
# pad character. Tabs and newlines are NOT whitelisted, so they are stripped as well.
# NOTE: this rebinds allowed_chars_pattern and clean_text, which are also defined earlier in
# this file with a different whitelist.
allowed_chars_pattern = re.compile(r"[^a-zA-Z0-9.,'?! \u25A1]")

def clean_text(text):
    """
    Return ``text`` with every character outside the whitelist deleted.
    """
    return allowed_chars_pattern.sub('', text)

def clean_tsv(input_file_path, output_file_path, skip_header=False):
    """
    Write a sanitized copy of a TSV file.

    Every tab-separated field of every line is passed through ``clean_text``.
    Fields are cleaned individually and re-joined with tabs, and each record is
    written on its own line, so the row/column layout of the file is preserved
    even when ``clean_text`` itself strips tabs and newlines.

    Parameters:
        input_file_path: path of the UTF-8 TSV file to read.
        output_file_path: path of the UTF-8 file to write (overwritten if present).
        skip_header: when True, the first line of the input is not copied to the
            output. Defaults to False, i.e. the first line is cleaned and written
            like any other line.
    """
    with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
        # Iterate over the opened input handle. The original looped over the unrelated
        # global name `file` and therefore never read the input file at all.
        for line_number, line in enumerate(infile):
            if skip_header and line_number == 0:
                continue
            fields = line.rstrip('\r\n').split('\t')
            cleaned_line = '\t'.join(clean_text(field) for field in fields)
            outfile.write(cleaned_line + '\n')

# TSV files produced by the earlier cells that still need sanitizing
tsv_files = ['lyrics_dev.tsv', 'lyrics_inputs.tsv', 'lyrics_train.tsv']

# Write a sanitized copy of each file, named after the input with a "cleaned_" prefix
for file_name in tsv_files:
    output_file_name = "cleaned_" + file_name
    clean_tsv(file_name, output_file_name)
    print(f"Cleaned file written to: {output_file_name}")


Cleaned file written to: cleaned_lyrics_dev.tsv
Cleaned file written to: cleaned_lyrics_inputs.tsv
Cleaned file written to: cleaned_lyrics_train.tsv
