In [8]:
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import coalesce, col

### Dataset link
<b> https://www.kaggle.com/datasets/carlosgdcj/genius-song-lyrics-with-language-information </b>

##### Configuring access to s3 bucket

In [9]:
# AWS credentials are read from the environment so they never appear in the notebook.
access_key = os.environ.get("AWS_ACCESS_KEY_ID")
secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")

# Location of the lyrics CSV, addressed through the s3a connector.
s3_url = 's3a://genius-lyrics-dataset/song_lyrics.csv'

##### Setting up spark environment to use the s3 bucket

In [10]:
# Fail fast when a credential is missing: PySpark's SparkConf.set() converts its
# value with str(), so an unset variable would otherwise be configured as the
# literal text "None" and only surface later as an opaque S3 authentication error.
missing_credentials = [
    name
    for name, value in (
        ("AWS_ACCESS_KEY_ID", access_key),
        ("AWS_SECRET_ACCESS_KEY", secret_key),
    )
    if not value
]
if missing_credentials:
    raise EnvironmentError(
        "Missing AWS credential environment variable(s): "
        + ", ".join(missing_credentials)
    )

conf = SparkConf()
# Static key/secret authentication for the s3a filesystem.
conf.set('spark.hadoop.fs.s3a.access.key', access_key)
conf.set('spark.hadoop.fs.s3a.secret.key', secret_key)
conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')
# hadoop-aws provides the s3a:// filesystem implementation.
conf.set('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.4')

<pyspark.conf.SparkConf at 0x7fb770ad5520>

##### Creating spark session

In [11]:
# Build the session from the S3-enabled configuration, or reuse one that is already active.
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

##### Reading the data, using the quote and escape options to account for the commas inside quoted fields

In [12]:
# header:    the first record holds the column names.
# multiline: a quoted field (the lyrics) may span several physical lines.
# quote/escape: fields are wrapped in double quotes, and a double quote inside
#               a field is escaped by another double quote.
csv_reader = spark.read.options(
    header="true",
    multiline="true",
    quote="\"",
    escape="\"",
)
df = csv_reader.csv(s3_url)

In [13]:
# Preview the first 20 rows of the raw data (long values are truncated).
df.show(n=20, truncate=True)

+--------------------+---+---------+----+------+--------------------+--------------------+---+-------------+-----------+--------+
|               title|tag|   artist|year| views|            features|              lyrics| id|language_cld3|language_ft|language|
+--------------------+---+---------+----+------+--------------------+--------------------+---+-------------+-----------+--------+
|           Killa Cam|rap|  Cam'ron|2004|173166|{"Cam\\'ron","Ope...|[Chorus: Opera St...|  1|           en|         en|      en|
|          Can I Live|rap|    JAY-Z|1996|468624|                  {}|[Produced by Irv ...|  3|           en|         en|      en|
|   Forgive Me Father|rap| Fabolous|2003|  4743|                  {}|Maybe cause I'm e...|  4|           en|         en|      en|
|        Down and Out|rap|  Cam'ron|2004|144404|{"Cam\\'ron","Kan...|[Produced by Kany...|  5|           en|         en|      en|
|              Fly In|rap|Lil Wayne|2005| 78271|                  {}|[Intro]\nSo they ...|

##### Filtering out the books, podcasts and other unnecessary data

In [7]:
# Keep every row whose tag is anything other than "misc".
music = df.where(df.tag != "misc")

In [8]:
# Preview the first 20 rows after the "misc" rows were removed.
music.show(20)

[Stage 2:>                                                          (0 + 1) / 1]

+--------------------+---+---------+----+------+--------------------+--------------------+---+-------------+-----------+--------+
|               title|tag|   artist|year| views|            features|              lyrics| id|language_cld3|language_ft|language|
+--------------------+---+---------+----+------+--------------------+--------------------+---+-------------+-----------+--------+
|           Killa Cam|rap|  Cam'ron|2004|173166|{"Cam\\'ron","Ope...|[Chorus: Opera St...|  1|           en|         en|      en|
|          Can I Live|rap|    JAY-Z|1996|468624|                  {}|[Produced by Irv ...|  3|           en|         en|      en|
|   Forgive Me Father|rap| Fabolous|2003|  4743|                  {}|Maybe cause I'm e...|  4|           en|         en|      en|
|        Down and Out|rap|  Cam'ron|2004|144404|{"Cam\\'ron","Kan...|[Produced by Kany...|  5|           en|         en|      en|
|              Fly In|rap|Lil Wayne|2005| 78271|                  {}|[Intro]\nSo they ...|

                                                                                

##### Filling null values in language_cld3 with language_ft

In [9]:
# fillna()/na.fill() can only fill nulls with a literal value (its second
# argument is the list of columns to fill), not with another column's value.
# coalesce() returns the first non-null argument per row, so language_cld3
# keeps its own value and falls back to language_ft only where it is null.
# The result is assigned back so the back-fill survives the later column drop.
music = music.withColumn(
    "language_cld3",
    coalesce(col("language_cld3"), col("language_ft")),
)
music.show()

+--------------------+---+---------+----+------+--------------------+--------------------+---+-------------+-----------+--------+
|               title|tag|   artist|year| views|            features|              lyrics| id|language_cld3|language_ft|language|
+--------------------+---+---------+----+------+--------------------+--------------------+---+-------------+-----------+--------+
|           Killa Cam|rap|  Cam'ron|2004|173166|{"Cam\\'ron","Ope...|[Chorus: Opera St...|  1|           en|         en|      en|
|          Can I Live|rap|    JAY-Z|1996|468624|                  {}|[Produced by Irv ...|  3|           en|         en|      en|
|   Forgive Me Father|rap| Fabolous|2003|  4743|                  {}|Maybe cause I'm e...|  4|           en|         en|      en|
|        Down and Out|rap|  Cam'ron|2004|144404|{"Cam\\'ron","Kan...|[Produced by Kany...|  5|           en|         en|      en|
|              Fly In|rap|Lil Wayne|2005| 78271|                  {}|[Intro]\nSo they ...|

##### Filling null values in language_cld3 with language

In [10]:
# na.fill() can only fill nulls with a literal value (its second argument is
# the list of columns to fill), not with another column's value.
# coalesce() returns the first non-null argument per row, so language_cld3
# keeps its own value and falls back to language only where it is null.
# The result is assigned back so the back-fill survives the later column drop.
music = music.withColumn(
    "language_cld3",
    coalesce(col("language_cld3"), col("language")),
)
music.show()

+--------------------+---+---------+----+------+--------------------+--------------------+---+-------------+-----------+--------+
|               title|tag|   artist|year| views|            features|              lyrics| id|language_cld3|language_ft|language|
+--------------------+---+---------+----+------+--------------------+--------------------+---+-------------+-----------+--------+
|           Killa Cam|rap|  Cam'ron|2004|173166|{"Cam\\'ron","Ope...|[Chorus: Opera St...|  1|           en|         en|      en|
|          Can I Live|rap|    JAY-Z|1996|468624|                  {}|[Produced by Irv ...|  3|           en|         en|      en|
|   Forgive Me Father|rap| Fabolous|2003|  4743|                  {}|Maybe cause I'm e...|  4|           en|         en|      en|
|        Down and Out|rap|  Cam'ron|2004|144404|{"Cam\\'ron","Kan...|[Produced by Kany...|  5|           en|         en|      en|
|              Fly In|rap|Lil Wayne|2005| 78271|                  {}|[Intro]\nSo they ...|

##### Dropping all nas

In [11]:
# Display-only preview of the rows that have no null in any column;
# the result is not assigned, so `music` itself is left unchanged.
rows_without_nulls = music.dropna(how="any")
rows_without_nulls.show(truncate=False)

+--------------------------+---+---------+----+------+--------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

##### Dropping duplicate columns

In [12]:
# Only language_cld3 is kept as the language column from here on.
redundant_language_columns = ("language_ft", "language")
music = music.drop(*redundant_language_columns)

In [13]:
# Preview the first 20 rows with the two extra language columns removed.
music.show(n=20)

[Stage 6:>                                                          (0 + 1) / 1]

+--------------------+---+---------+----+------+--------------------+--------------------+---+-------------+
|               title|tag|   artist|year| views|            features|              lyrics| id|language_cld3|
+--------------------+---+---------+----+------+--------------------+--------------------+---+-------------+
|           Killa Cam|rap|  Cam'ron|2004|173166|{"Cam\\'ron","Ope...|[Chorus: Opera St...|  1|           en|
|          Can I Live|rap|    JAY-Z|1996|468624|                  {}|[Produced by Irv ...|  3|           en|
|   Forgive Me Father|rap| Fabolous|2003|  4743|                  {}|Maybe cause I'm e...|  4|           en|
|        Down and Out|rap|  Cam'ron|2004|144404|{"Cam\\'ron","Kan...|[Produced by Kany...|  5|           en|
|              Fly In|rap|Lil Wayne|2005| 78271|                  {}|[Intro]\nSo they ...|  6|           en|
|      Lollipop Remix|rap|Lil Wayne|2008|580832|{"Kanye West","St...|[Intro: Lil Wayne...|  7|           en|
|          Im Not Y

                                                                                

##### Replacing new line characters with spaces

In [14]:
from pyspark.sql.functions import col, regexp_replace

# Flatten each lyric onto a single line: every newline becomes one space.
music = music.withColumn(
    "lyrics",
    regexp_replace("lyrics", "\n", " "),
)

In [15]:
# Preview the first 20 rows; the lyrics column no longer contains "\n".
music.show(truncate=True)

+--------------------+---+---------+----+------+--------------------+--------------------+---+-------------+
|               title|tag|   artist|year| views|            features|              lyrics| id|language_cld3|
+--------------------+---+---------+----+------+--------------------+--------------------+---+-------------+
|           Killa Cam|rap|  Cam'ron|2004|173166|{"Cam\\'ron","Ope...|[Chorus: Opera St...|  1|           en|
|          Can I Live|rap|    JAY-Z|1996|468624|                  {}|[Produced by Irv ...|  3|           en|
|   Forgive Me Father|rap| Fabolous|2003|  4743|                  {}|Maybe cause I'm e...|  4|           en|
|        Down and Out|rap|  Cam'ron|2004|144404|{"Cam\\'ron","Kan...|[Produced by Kany...|  5|           en|
|              Fly In|rap|Lil Wayne|2005| 78271|                  {}|[Intro] So they a...|  6|           en|
|      Lollipop Remix|rap|Lil Wayne|2008|580832|{"Kanye West","St...|[Intro: Lil Wayne...|  7|           en|
|          Im Not Y

##### Considering only rap, pop and rock songs, as they account for 80% of the data

In [16]:
# Genres to keep; rows with any other tag are discarded.
kept_tags = ["rap", "pop", "rock"]

# True for rows whose tag is one of the kept genres.
subset_condition = col("tag").isin(kept_tags)

# Restrict the data set to those rows.
music = music.where(subset_condition)

In [17]:
# Number of songs per remaining tag.
tag_counts = music.groupBy('tag').count()
tag_counts.show()

[Stage 8:>                                                          (0 + 1) / 1]

+----+-------+
| tag|  count|
+----+-------+
| pop|2138587|
| rap|1724816|
|rock| 793220|
+----+-------+



                                                                                

##### Removing data which was translated to english by genius

In [18]:
# Keep only rows whose artist name does not contain "genius".
# rlike() is case-sensitive and the artist column has not been lower-cased yet
# at this point in the notebook, so the pattern carries the (?i) flag; a plain
# "genius" pattern would let capitalised names through.
# NOTE(review): presumably the translation accounts are named like
# "Genius English Translations" - verify against the data.
substring_condition = ~(col("artist").rlike("(?i)genius"))

# Use .filter to apply the condition
music = music.filter(substring_condition)

##### Removing the text inside square brackets from the lyrics column

In [19]:
import re

# Plain Python helper that removes text inside square brackets
# (it is applied through RDD.map, not registered as a Spark UDF).
def remove_text_inside_brackets(input_string):
    """Remove every ``[...]`` group from a string.

    Each match runs from a ``[`` to the nearest following ``]`` on the same
    line (``.`` does not match a newline), so ``"[a] x [b]"`` becomes
    ``" x "``. Surrounding whitespace is left in place for the caller to strip.

    Args:
        input_string: Text to clean, or ``None`` (a null cell coming from Spark).

    Returns:
        The text without its bracketed groups; ``''`` when the input is ``None``.
    """
    # A null lyrics cell arrives as None; re.sub would raise TypeError on it
    # and fail the whole Spark job, so it is treated as empty text instead.
    if input_string is None:
        return ''
    return re.sub(r'\[.*?\]', '', input_string)

# Reduce every song to an (artist, lyrics) pair, with the bracketed text removed
# from the lyrics and the surrounding whitespace stripped.
# Fields are read by name; "artist" and "lyrics" are columns 2 and 6 of `music`.
rdd2 = music.rdd.map(
    lambda row: (
        row["artist"],
        remove_text_inside_brackets(row["lyrics"]).strip(),
    )
)
lyrics = rdd2.toDF(["artist", "lyrics"])
lyrics.show()

                                                                                

+---------+--------------------+
|   artist|              lyrics|
+---------+--------------------+
|  Cam'ron|Killa Cam, Killa ...|
|    JAY-Z|Yeah, hah, yeah, ...|
| Fabolous|Maybe cause I'm e...|
|  Cam'ron|Ugh, Killa! Baby!...|
|Lil Wayne|So they ask me "Y...|
|Lil Wayne|Haha Uh-huh No ho...|
|   Clipse|No, no, no! I tol...|
|  Cam'ron|Killa, Dipset Man...|
|  Cam'ron|Ay yo you wonder ...|
|  Cam'ron|Now Lord you know...|
|Lil Wayne|Yeah Money on my ...|
| Fabolous|You ain't never s...|
|    JAY-Z|"Dear God – I won...|
|    JAY-Z|Shawn Carter was ...|
|    Big L|Yo, fuck all the ...|
|  Birdman|Ayy-ayy, ayy, ayy...|
|    JAY-Z|Yo, show closer, ...|
|  Cam'ron|Ayo, fuck losing ...|
|    JAY-Z|Yeah, uh-huh, wat...|
|  Cam'ron|Gangsta Music, pa...|
+---------+--------------------+
only showing top 20 rows



##### Removing special characters and converting to lowercase

In [20]:
def toLower(input_string):
    """Return ``input_string`` converted to lower case via its ``lower()`` method."""
    lowered = input_string.lower()
    return lowered

def removeSpecial(input_string):
    """Delete every character that is not an ASCII letter, a digit or whitespace.

    Punctuation is dropped without leaving a space ("jay-z" -> "jayz"), and
    non-ASCII letters are dropped as well ("beyoncé" -> "beyonc").

    Args:
        input_string: Text to clean.

    Returns:
        The text with all other characters removed.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return re.sub(r'[^A-Za-z0-9\s]', '', input_string)


# Both columns of `lyrics` (0 = artist, 1 = lyrics) get the same clean-up:
# strip surrounding whitespace, lower-case, then remove special characters.
rdd3 = lyrics.rdd.map(
    lambda row: tuple(
        removeSpecial(toLower(field.strip()))
        for field in (row[0], row[1])
    )
)
imp_features = rdd3.toDF(["artist", "lyrics"])
imp_features.show()

[Stage 14:>                                                         (0 + 1) / 1]

+---------+--------------------+
|   artist|              lyrics|
+---------+--------------------+
|   camron|killa cam killa c...|
|     jayz|yeah hah yeah roc...|
| fabolous|maybe cause im ea...|
|   camron|ugh killa baby ka...|
|lil wayne|so they ask me yo...|
|lil wayne|haha uhhuh no hom...|
|   clipse|no no no i told y...|
|   camron|killa dipset man ...|
|   camron|ay yo you wonder ...|
|   camron|now lord you know...|
|lil wayne|yeah money on my ...|
| fabolous|you aint never se...|
|     jayz|dear god  i wonde...|
|     jayz|shawn carter was ...|
|    big l|yo fuck all the g...|
|  birdman|ayyayy ayy ayy ye...|
|     jayz|yo show closer jt...|
|   camron|ayo fuck losing w...|
|     jayz|yeah uhhuh watch ...|
|   camron|gangsta music par...|
+---------+--------------------+
only showing top 20 rows



                                                                                

##### Reading the top 100 artists from popular_artists.txt, which has 50 pop and 50 rap artists

In [22]:
# Read one artist name per line, lower-cased to match the cleaned artist column.
# The encoding is explicit because the names contain non-ASCII characters
# (e.g. "beyoncé"); NOTE(review): assumes the file is saved as UTF-8 - confirm.
with open('popular_artists.txt', 'r', encoding='utf-8') as artists_file:
    artists = [
        name.strip()
        for name in artists_file.read().lower().splitlines()
        # Skip blank lines (such as the one after a trailing newline): an empty
        # name would later match rows whose cleaned artist name is empty.
        if name.strip()
    ]
print(artists[:5])

['taylor swift', 'adele', 'ed sheeran', 'justin bieber', 'beyoncé']


##### Writing data related to all 100 artists into .txt files

In [23]:
# Write the lyrics of every listed artist to data/TextData/<artist>.txt.
output_dir = "data/TextData/"
os.makedirs(output_dir, exist_ok=True)

# The artist column of imp_features was stripped, lower-cased and cleaned with
# removeSpecial, so the names from the file get the same treatment before
# matching; otherwise a name such as "beyoncé" can never equal the column value.
# Maps cleaned name -> name as listed in the file (used for the file name).
wanted_artists = {}
for artist in artists:
    listed_name = artist.strip()
    cleaned_name = removeSpecial(toLower(listed_name))
    if cleaned_name:
        wanted_artists.setdefault(cleaned_name, listed_name)

# A single Spark job fetches the rows of all listed artists, instead of
# re-running the whole pipeline once per artist.
matching_rows = (
    imp_features
    .filter(col("artist").isin(list(wanted_artists)))
    .select("artist", "lyrics")
    .collect()
)

# Group the collected lyrics by cleaned artist name.
lyrics_by_artist = {}
for row in matching_rows:
    lyrics_by_artist.setdefault(row["artist"], []).append(row["lyrics"])

# Artists without any matching row get no file.
for cleaned_name, songs in lyrics_by_artist.items():
    file_name = output_dir + wanted_artists[cleaned_name] + ".txt"

    # All songs of the artist are written, separated by newlines,
    # not only the first collected row.
    with open(file_name, "w", encoding="utf-8") as out_file:
        out_file.write("\n".join(songs))

                                                                                

In [14]:
# Shut down the Spark session at the end of the notebook.
spark.stop()