In [1]:
import pandas as pd
import numpy as np
import re

## Loading Data and Preprocessing

In [2]:
df = pd.read_csv("data/song_lyrics.csv")

In [3]:
df.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en


In [4]:
df.isna().sum()

title               165
tag                   0
artist                0
year                  0
views                 0
features              0
lyrics                0
id                    0
language_cld3     90966
language_ft      134322
language         226918
dtype: int64

#### Many Null Values in langauges, data description says that language_ft and language has language_cld3 values.

In [5]:
music = df[df!='misc']

#### Using only the tags which have music lyrics in it, misc contains books too.

In [6]:
music['language_cld3'] = music['language_cld3'].fillna(music['language_ft'])
music['language_cld3'] = music['language_cld3'].fillna(music['language'])

In [7]:
music.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en


In [8]:
music.tag.value_counts()

pop        2138587
rap        1724816
rock        793220
rb          196462
country     100316
Name: tag, dtype: int64

#### Pop has more number of tracks followed by rap

In [9]:
music.isna().sum()

title               165
tag              181455
artist                0
year                  0
views                 0
features              0
lyrics                0
id                    0
language_cld3     39817
language_ft      134322
language         226918
dtype: int64

#### dropping columns like language_ft and lanugage as they are repetative and not useful.

In [10]:
music.drop(['language_ft', 'language'], axis = 1, inplace = True)

In [11]:
music.dropna(subset = ['tag', 'language_cld3'], axis = 0, inplace = True)

#### Dropping NA's from the dataset

In [12]:
(music.isna().sum()/music.shape[0])*100

title            0.003172
tag              0.000000
artist           0.000000
year             0.000000
views            0.000000
features         0.000000
lyrics           0.000000
id               0.000000
language_cld3    0.000000
dtype: float64

In [13]:
music.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en


#### Counting the no. of lines for each song

In [14]:
music['no_of_lines'] = music.lyrics.apply(lambda x: x.count("\n"))

In [15]:
music.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,no_of_lines
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,147
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,75
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,64
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,105
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,56


#### Replacing new line character with space

In [16]:
music.lyrics = music.lyrics.apply(lambda x: x.replace("\n", " "))

#### Removing all characters from inside the brackets, including the brackets

In [17]:
import re

def remove_text_inside_brackets(input_string):
    # Define a regular expression pattern for matching text inside square brackets
    pattern = re.compile(r'\[.*?\]')
    
    # Use sub() function to replace the matched pattern with an empty string
    result = re.sub(pattern, '', input_string)
    
    return result

music.lyrics = music.lyrics.map(remove_text_inside_brackets)

In [18]:
music.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,no_of_lines
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","Killa Cam, Killa Cam, Cam Killa Cam, Killa Ca...",1,en,147
1,Can I Live,rap,JAY-Z,1996,468624,{},"Yeah, hah, yeah, Roc-A-Fella We invite you ...",3,en,75
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin And these bastards fiend...,4,en,64
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}","Ugh, Killa! Baby! Kanye, this that 1970s He...",5,en,105
4,Fly In,rap,Lil Wayne,2005,78271,{},"So they ask me ""Young boy What you gon' do th...",6,en,56


#### Strip whitespaces from starting and ending of the string

In [19]:
music.lyrics = music.lyrics.apply(lambda x: x.strip())

In [20]:
(music.tag.value_counts()/music.shape[0])*100

pop        43.122411
rap        34.795202
rock       16.090896
rb          3.960433
country     2.031058
Name: tag, dtype: float64

#### Keeping the tags which has major contribution

In [21]:
subset = music[(music.tag=='rap')|(music.tag=='pop')|(music.tag=='rock')]

In [22]:
subset.artist.value_counts()

Genius English Translations      13117
Genius Romanizations              9283
Genius Brasil Tradues             7938
Genius Traducciones al Espaol     6493
Genius Traductions Franaises      4180
                                 ...  
Jack Tomascak                        1
Partner Up                           1
Charm School                         1
Kolb                                 1
Culture Code, Pag & Mylo             1
Name: artist, Length: 584506, dtype: int64

#### Removing the songs which were translated from other languages, for lack of rhyming schemes and english grammer

In [23]:
subset_filtered = subset[~subset['artist'].str.contains('genius', case=False)]

In [24]:
subset_filtered.to_csv('data/progress.csv')

In [1]:
import pandas as pd
import numpy as np

In [2]:
subset_filtered = pd.read_csv("data/progress.csv")

In [3]:
subset_filtered.artist.value_counts()

The Grateful Dead           2113
Lil B                       1321
Frank Zappa                 1311
Tendon Levey                1297
KIDZ BOP Kids               1270
                            ... 
Yosef                          1
Riot-Act.                      1
Jake Carmody                   1
Ca$h Rohl$                     1
Culture Code, Pag & Mylo       1
Name: artist, Length: 584199, dtype: int64

In [4]:
subset_filtered.head()

Unnamed: 0.1,Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,no_of_lines
0,0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","Killa Cam, Killa Cam, Cam Killa Cam, Killa Cam...",1,en,147
1,1,Can I Live,rap,JAY-Z,1996,468624,{},"Yeah, hah, yeah, Roc-A-Fella We invite you to ...",3,en,75
2,2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin And these bastards fiend...,4,en,64
3,3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}","Ugh, Killa! Baby! Kanye, this that 1970s Heron...",5,en,105
4,4,Fly In,rap,Lil Wayne,2005,78271,{},"So they ask me ""Young boy What you gon' do the...",6,en,56


In [5]:
subset_filtered.isna().sum()

Unnamed: 0         0
title            148
tag                0
artist             0
year               0
views              0
features           0
lyrics            69
id                 0
language_cld3      0
no_of_lines        0
dtype: int64

In [6]:
subset_filtered.dropna(subset = ['lyrics'], inplace = True)

In [7]:
subset_filtered.head()

Unnamed: 0.1,Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,no_of_lines
0,0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","Killa Cam, Killa Cam, Cam Killa Cam, Killa Cam...",1,en,147
1,1,Can I Live,rap,JAY-Z,1996,468624,{},"Yeah, hah, yeah, Roc-A-Fella We invite you to ...",3,en,75
2,2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin And these bastards fiend...,4,en,64
3,3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}","Ugh, Killa! Baby! Kanye, this that 1970s Heron...",5,en,105
4,4,Fly In,rap,Lil Wayne,2005,78271,{},"So they ask me ""Young boy What you gon' do the...",6,en,56


In [8]:
imp_features = subset_filtered[['artist', 'lyrics']]

In [7]:
# imp_features.to_csv('data/imp_features.csv')
import pandas as pd
import numpy as np

imp_features = pd.read_csv('data/imp_features.csv')

In [None]:
imp_features.isna().sum()

artist    0
lyrics    0
dtype: int64

In [None]:
imp_features.head()

Unnamed: 0,artist,lyrics
0,Cam'ron,"Killa Cam, Killa Cam, Cam Killa Cam, Killa Cam..."
1,JAY-Z,"Yeah, hah, yeah, Roc-A-Fella We invite you to ..."
2,Fabolous,Maybe cause I'm eatin And these bastards fiend...
3,Cam'ron,"Ugh, Killa! Baby! Kanye, this that 1970s Heron..."
4,Lil Wayne,"So they ask me ""Young boy What you gon' do the..."


In [17]:
import re

imp_features.loc[:, "lyrics"] = imp_features.lyrics.apply(lambda x: x.lower())
imp_features.lyrics = imp_features.lyrics.apply(lambda x: x.strip())
imp_features.lyrics = imp_features.lyrics.apply(lambda x: re.sub('[^A-Za-z0-9\s]', '', x))
imp_features.artist = imp_features.artist.apply(lambda x: x.lower())
imp_features.artist = imp_features.artist.apply(lambda x: x.strip())
imp_features.artist = imp_features.artist.apply(lambda x: re.sub('[^A-Za-z0-9\s]', '', x))

In [1]:
# imp_features.to_csv('data/imp_features.csv')
import pandas as pd
import numpy as np

imp_features = pd.read_csv("data/imp_features.csv")

In [2]:
imp_features.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,artist,lyrics
0,0,0,camron,killa cam killa cam cam killa cam killa cam ki...
1,1,1,jayz,yeah hah yeah rocafella we invite you to somet...
2,2,2,fabolous,maybe cause im eatin and these bastards fiend ...
3,3,3,camron,ugh killa baby kanye this that 1970s heron flo...
4,4,4,lil wayne,so they ask me young boy what you gon do the s...


In [3]:
file = open('popular_artists.txt', 'r')
artists = file.read()
file.close()
artists = artists.lower()
artists = artists.split("\n")
print(artists[:5])

['taylor swift', 'adele', 'ed sheeran', 'justin bieber', 'beyoncé']


In [6]:
for artist in artists:
    text = ''
    text = str(imp_features[imp_features.artist==artist].lyrics.values)
    
    file_name = "data/TextDataPandas/"+artist+".txt"
    file = open(file_name, "w+")
    file.write(text)
    file.close()

#### Grouping by artist and saving the lyrics in a .txt format to access from the model.

## Model Building

https://colab.research.google.com/drive/1HRdWkp7hVxr1-0s6T0paAucE8NfC2aVm#scrollTo=ZOBnf9VqZ9e0