# Lyrics Match

The data preprocessed in this file can be found at
https://www.kaggle.com/gyani95/380000-lyrics-from-metrolyrics/download

When unzipped, the original download yields a `lyrics.csv` file. This notebook expects it at `data/lyrics.csv` and preprocesses it for use in the project.

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Load the raw lyrics export; the path is relative to the notebook's working directory
df = pd.read_csv('data/lyrics.csv')

In [3]:
# Column dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362237 entries, 0 to 362236
Data columns (total 6 columns):
index     362237 non-null int64
song      362235 non-null object
year      362237 non-null int64
artist    362237 non-null object
genre     362237 non-null object
lyrics    266557 non-null object
dtypes: int64(2), object(4)
memory usage: 16.6+ MB


In [4]:
# Preview the first rows of the raw frame
df.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


### Removing invalid data

In [5]:
# Remove invalid data in one re-runnable chain (no in-place mutation):
#   1. drop rows with a missing value in any column,
#   2. drop the redundant 'index' column (errors='ignore' keeps the cell
#      re-runnable once the column is already gone),
#   3. keep only years in the inclusive range 1900-2020,
#   4. copy, so later column assignments write to an independent frame
#      instead of a slice of the raw data.
df = (
    df
    .dropna()
    .drop(columns='index', errors='ignore')
    .loc[lambda frame: frame['year'].between(1900, 2020)]
    .copy()
)

In [6]:
# Re-check dtypes and non-null counts after removing invalid rows
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266550 entries, 0 to 362236
Data columns (total 5 columns):
song      266550 non-null object
year      266550 non-null int64
artist    266550 non-null object
genre     266550 non-null object
lyrics    266550 non-null object
dtypes: int64(1), object(4)
memory usage: 12.2+ MB


In [7]:
# Sanity check: sorted list of the distinct years left after filtering
np.sort(df['year'].unique())

array([1968, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979,
       1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
       1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016])

### Cleaning and tokenizing lyrics for traditional NLP approaches

In [8]:
def lyrics_cleaner(lyrics):
    """Normalise raw lyrics into one lower-case, space-separated string.

    Every character that is not an ASCII letter (digits, punctuation,
    newlines, accented letters, ...) is treated as a separator, so
    "isn't" becomes "isn t".

    Parameters
    ----------
    lyrics : object
        Raw lyrics. Non-string values are converted with ``str``.
        Missing values (``None`` or NaN) produce an empty string.

    Returns
    -------
    str
        The words of ``lyrics`` in lower case, joined by single spaces.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan". NaN is the only float that is not equal to itself.
    if lyrics is None or (isinstance(lyrics, float) and lyrics != lyrics):
        return ''

    # Replace everything except ASCII letters with a space
    letters_only = re.sub("[^a-zA-Z]", " ", str(lyrics))

    # Lower-case, split on whitespace and re-join to collapse repeated spaces
    return ' '.join(letters_only.lower().split())

In [9]:
# Keep only rows that have lyrics *before* cleaning, and work on an explicit
# copy: `df` may be a filtered slice at this point, and assigning new columns
# into a slice triggers pandas' SettingWithCopyWarning.
df = df[df['lyrics'].notna()].copy()

# Normalised lyrics text and its length in characters
df['cleaned_lyrics'] = df['lyrics'].apply(lyrics_cleaner)
df['length'] = df['cleaned_lyrics'].str.len().astype(int)

In [10]:
# Discard songs whose cleaned lyrics ended up empty
df = df.loc[df['cleaned_lyrics'].map(len).gt(0)]

<div class="alert alert-danger">
The following two cells take a while to run (about 30 minutes on a powerful CPU).<br>
Uncomment and run them when you need to (re)generate <code>data/lyrics_updated.csv</code>.
</div>

In [11]:
# Optional, slow step (intentionally left commented out): detect the language
# of every song from its cleaned lyrics and store it in a new 'lang' column.
# Requires the third-party `langdetect` package.
# NOTE(review): the try/except wraps the whole `apply`, so a single failing
# row aborts detection for all rows and 'lang' is never created; the error is
# only printed. Consider catching per row instead.
# from langdetect import detect

# try:
#     df['lang'] = df['cleaned_lyrics'].apply(lambda x: detect(x))
# except Exception as e:
#     print(str(e))
#     pass

In [12]:
# Optional (commented out): persist the frame to 'data/lyrics_updated.csv',
# the file that the next cell reads back.
# df.to_csv('data/lyrics_updated.csv', index=False)

In [13]:
# 'data/lyrics_updated.csv' is only written by the optional, commented-out
# export cell above, so fail with an actionable message when it has not been
# generated yet instead of an opaque read error.
try:
    dfu = pd.read_csv('data/lyrics_updated.csv')
except FileNotFoundError as err:
    raise FileNotFoundError(
        "data/lyrics_updated.csv not found: uncomment and run the language "
        "detection and export cells above once to create it."
    ) from err

In [14]:
# Preview the first rows of the frame loaded from lyrics_updated.csv
dfu.head()

Unnamed: 0,song,year,artist,genre,lyrics,cleaned_lyrics,length,lang
0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu...",oh baby how you doing you know i m gonna cut r...,1913,en
1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see...",playin everything so easy it s like you seem s...,1234,en
2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...,if you search for tenderness it isn t hard to ...,820,en
3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote...",oh oh oh i oh oh oh i verse if i wrote a book ...,2293,en
4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po...",party the people the people the party it s pop...,1484,en


In [15]:
# List the columns available in the reloaded frame
dfu.columns

Index(['song', 'year', 'artist', 'genre', 'lyrics', 'cleaned_lyrics', 'length',
       'lang'],
      dtype='object')

In [16]:
# Full-size export restricted to the original data set's columns
df.to_csv(
    'data/lyrics_big.csv',
    index=False,
    columns=['song', 'year', 'artist', 'genre', 'lyrics'],
)

In [17]:
# Full-size export, original columns, rows whose 'lang' is 'en' only
dfu.loc[dfu['lang'].eq('en')].to_csv(
    'data/lyrics_big_english.csv',
    index=False,
    columns=['song', 'year', 'artist', 'genre', 'lyrics'],
)

In [18]:
# Full-size export that also carries the cleaned lyrics and the language code
dfu.to_csv(
    'data/lyrics_big_cleaned.csv',
    index=False,
    columns=['song', 'year', 'artist', 'genre', 'lyrics','cleaned_lyrics','lang'],
)

In [19]:
# Random 10% subset of df; the fixed random_state makes the draw repeatable
df_small = df.sample(frac=0.1, random_state=42)

In [20]:
# Size and column overview of the 10% sample
df_small.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26529 entries, 76694 to 166288
Data columns (total 7 columns):
song              26529 non-null object
year              26529 non-null int64
artist            26529 non-null object
genre             26529 non-null object
lyrics            26529 non-null object
cleaned_lyrics    26529 non-null object
length            26529 non-null int64
dtypes: int64(2), object(5)
memory usage: 1.6+ MB


In [21]:
# Random 10% subset of dfu, drawn with the same seed as df_small.
# NOTE(review): df and dfu are separate frames; the same seed presumably only
# selects the same songs if both have identical length and row order - confirm.
dfu_small = dfu.sample(frac=0.1, random_state=42)

In [22]:
# Writing the smaller original format file (all languages: df_small has no
# 'lang' column to filter on). Restrict to the original columns so the file
# matches the layout of 'data/lyrics_big.csv'; the cleaned lyrics are
# exported separately to 'data/lyrics_small_cleaned.csv'.
df_small.to_csv('data/lyrics_small.csv', columns=['song', 'year', 'artist', 'genre', 'lyrics'], index=False)

In [23]:
# Sampled export, original columns, rows whose 'lang' is 'en' only
dfu_small.loc[dfu_small['lang'].eq('en')].to_csv(
    'data/lyrics_small_english.csv',
    index=False,
    columns=['song', 'year', 'artist', 'genre', 'lyrics'],
)

In [24]:
# Sampled export that also carries the cleaned lyrics and the language code
dfu_small.to_csv(
    'data/lyrics_small_cleaned.csv',
    index=False,
    columns=['song', 'year', 'artist', 'genre', 'lyrics','cleaned_lyrics','lang'],
)