# Dataset Preperation

In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to C:\Users\SHASHANK
[nltk_data]     P\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\SHASHANK
[nltk_data]     P\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Loading Data

In [3]:
# Get 50,000 records of each tag
tags = set(["pop", "rap", "rock", "misc", "rb", "country"])

In [4]:
SAMPLES_PER_CLASS = 5000

In [5]:
from collections import defaultdict

i = 0
counter = defaultdict(int)
sampled_df = pd.DataFrame()

for chunk in pd.read_csv('./datasets/ds2.csv', chunksize=100_000):
    print(f"Chunk [{i}] : Processing...")
    for tag in tags:
        count = counter[tag]

        if count == SAMPLES_PER_CLASS:
            continue
        else:
            remaining = SAMPLES_PER_CLASS - count
            filtered_chunk: pd.DataFrame = chunk[chunk["tag"] == tag]

            sample_size = min(len(filtered_chunk), remaining)
            sampled_chunk = filtered_chunk.sample(n=sample_size, random_state=42)

            sampled_df = pd.concat([sampled_df, sampled_chunk])
            count = len(sampled_chunk)
            counter[tag] += count


    # Check when have 50k records
    print(f"Chunk [{i}] :", dict(counter), end="\n\n")
    
    bools = [val == SAMPLES_PER_CLASS for val in counter.values()]
    if all(bools):
        print("\nProcessed")
        break

    i += 1

Chunk [0] : Processing...
Chunk [0] : {'misc': 3437, 'rb': 1517, 'rap': 5000, 'pop': 1676, 'rock': 4466, 'country': 220}

Chunk [1] : Processing...
Chunk [1] : {'misc': 5000, 'rb': 2427, 'rap': 5000, 'pop': 5000, 'rock': 5000, 'country': 3280}

Chunk [2] : Processing...
Chunk [2] : {'misc': 5000, 'rb': 3757, 'rap': 5000, 'pop': 5000, 'rock': 5000, 'country': 5000}

Chunk [3] : Processing...

Processed


In [6]:
sampled_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 87256 to 384558
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     29998 non-null  object
 1   tag       30000 non-null  object
 2   artist    30000 non-null  object
 3   year      30000 non-null  int64 
 4   views     30000 non-null  int64 
 5   features  30000 non-null  object
 6   lyrics    30000 non-null  object
 7   id        30000 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 2.1+ MB


In [8]:
sampled_df.to_pickle("./pickles/01_sampled_df.pkl")

## Remove null rows

In [9]:
df = pd.read_pickle("./pickles/01_sampled_df.pkl")

In [10]:
def clean_df(df: pd.DataFrame):
    df.dropna(inplace=True)

In [11]:
clean_df(df)

In [12]:
df["tag"].value_counts()

tag
rb         5000
rap        5000
pop        5000
country    5000
misc       4999
rock       4999
Name: count, dtype: int64

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29998 entries, 87256 to 384558
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     29998 non-null  object
 1   tag       29998 non-null  object
 2   artist    29998 non-null  object
 3   year      29998 non-null  int64 
 4   views     29998 non-null  int64 
 5   features  29998 non-null  object
 6   lyrics    29998 non-null  object
 7   id        29998 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 2.1+ MB


## Text Cleaning

In [19]:
df.reset_index(inplace=True, drop=True)
print(df["lyrics"][0])

(You...)

[Verse 1]
Thought it was love; memories sculpt the lies
That, lead me far from here (here... from here)
Trying hard to silence all you say but
You stay in my ear (you stay in my ear)
Because everything you didn't want,
Became a part of me.
All the things you'd asked for,
I just could not see
Everything you are... (everything you are)
Everything you are, left a broken heartbeat

[Hook]
Through the darkest shades of grey
I see beauty in the rain
'Nd I find myself (hold me tears are blinding my eyes)
Let me find myself (go free and walk alone in the night)
Through the darkest shades of grey
I see beauty in the rain...
(Heart stops on the ground where I'm lying
Called out to a stranger and I
But it was you... you...)
(Everything you didn't want became a part of me)

[Verse 2]
Sleeping alone I cried out, for you in my dream...
No matter how hard I try to change you;
You won't change with me
Because everything you didn't want
Became a part of me
All the things you'd asked for
I jus

In [20]:
import re

def clean_text(text: str) -> str:
    text = text.lower()

    text = re.sub(r"\[.*?\]", "", text) # Remove text between square brackets
    text = re.sub(r"[^\w\s]", " ", text)  # Remove punctuations
    text = re.sub(r"\d+", "", text)  # Remove numbers
    
    text = text.strip()
    return text

In [21]:
df["lyrics"] = df["lyrics"].apply(clean_text)

In [22]:
print(df["lyrics"][0])

you    


thought it was love  memories sculpt the lies
that  lead me far from here  here    from here 
trying hard to silence all you say but
you stay in my ear  you stay in my ear 
because everything you didn t want 
became a part of me 
all the things you d asked for 
i just could not see
everything you are     everything you are 
everything you are  left a broken heartbeat


through the darkest shades of grey
i see beauty in the rain
 nd i find myself  hold me tears are blinding my eyes 
let me find myself  go free and walk alone in the night 
through the darkest shades of grey
i see beauty in the rain   
 heart stops on the ground where i m lying
called out to a stranger and i
but it was you    you    
 everything you didn t want became a part of me 


sleeping alone i cried out  for you in my dream   
no matter how hard i try to change you 
you won t change with me
because everything you didn t want
became a part of me
all the things you d asked for
i just could not see
everythin

In [23]:
df.to_pickle("./pickles/02_cleaned_lyrics_df.pkl")

### Tokenization

In [24]:
df = pd.read_pickle("./pickles/02_cleaned_lyrics_df.pkl")

In [25]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from typing import List

stop_words = set(stopwords.words("english"))

def tokenize_lyrics(text: str) -> List[str]:
    tokens = word_tokenize(text)

    return [token for token in tokens if token not in stop_words]

In [26]:
df["lyrics"] = df["lyrics"].apply(tokenize_lyrics)

In [27]:
largest = max(df["lyrics"], key=len)
len(largest)

22621