## Import libraries

In [None]:
import pandas as pd, numpy as np
import pickle
import string
from string import digits
import dateutil.parser as parser
from datetime import datetime
import re
import unicodedata
import nltk, nltk.stem as stem
from nltk.corpus import stopwords, words as nltk_words

## Scraping Lyrics from Genius.com

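- Sign up for a Genius account and create an API client to get access to the Genius API. Genius will give you a `client_id` and a `client_secret`, and lets you generate a client access token; the access token is what the code below uses for authorization.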
- Install `lyricsgenius` (Python wrapper) by executing `$pip install lyricsgenius`

In [None]:
import os

import lyricsgenius as genius

# Read the Genius API token from the environment instead of hardcoding it, so
# the credential never ends up in the notebook or in version control.
# Set it before launching Jupyter, e.g. `export GENIUS_ACCESS_TOKEN=<token>`.
# A missing variable raises KeyError here rather than failing later in a request.
# NOTE(review): the token that used to be hardcoded in this cell should be revoked.
my_access_token = os.environ['GENIUS_ACCESS_TOKEN']
api = genius.Genius(my_access_token)
artist = api.search_artist('BTS')

Let's get the titles of all the BTS songs that were found on Genius.

In [None]:
# Collect the title of every song returned for the artist
titles = []
for found_song in artist.songs:
    titles.append(found_song.title)

Genius provides English translations of the songs, but lists them under the artist "Genius Translations", so we need to modify the titles accordingly so that they can be searched.

In [None]:
# Build the search string for each song's English translation
titles_1 = []
for original_title in titles:
    titles_1.append('BTS - ' + original_title + ' (English Translation)')

In [None]:
# Search for the translated version of every title, keeping each result once.
# search_song returns None when nothing is found; those results are skipped so
# that only the songs that are available end up in the list.
songs = []
for query_title in titles_1:
    match = api.search_song(query_title, 'Genius Translations')
    if match is None or match in songs:
        continue
    songs.append(match)

Let's pickle the list of songs so we don't have to do all of this again.

In [None]:
# Pickle the songs.
# The context manager closes the file even if pickle.dump raises, so a failed
# dump cannot leave the handle open.
with open("BTS_songs.pickle", "wb") as song_pickle:
    pickle.dump(songs, song_pickle)

In [None]:
# Check how many songs were collected in total (entries in the `songs` list)
len(songs)

Now let's build a DataFrame that contains the necessary information: Title, Album, Release Year, and Lyrics.

In [None]:
# Gather one list per column in a single pass over the songs, then build the
# DataFrame with the columns in the order Title, Album, Release, Lyrics.
song_titles, song_albums, song_releases, song_lyrics = [], [], [], []
for entry in songs:
    song_titles.append(entry.title)
    song_albums.append(entry.album)
    song_releases.append(entry.year)
    song_lyrics.append(entry.lyrics)

song_df = pd.DataFrame({
    'Title': song_titles,
    'Album': song_albums,
    'Release': song_releases,
    'Lyrics': song_lyrics,
})

## Pre-process the titles
Just to make our lives easier; we will not be using Titles for analysis so this doesn't have to be pretty

In [None]:
def remove_non_ascii(text):
    """Return ``text`` with every character whose code point is 128 or above removed.

    Note: a later cell in this notebook defines another function with the same
    name that operates on a list of tokens; once that cell has run, this
    string-based version is no longer reachable under this name.
    """
    ascii_chars = []
    for char in text:
        if ord(char) < 128:
            ascii_chars.append(char)
    return "".join(ascii_chars)

def remove_substr(text):
    """Strip the search-title decorations from a title string.

    Removes, in order: the ``BTS - `` prefix, the `` (English Translation)``
    and `` [English Translation]`` suffixes, and any parentheses that contain
    only whitespace.

    Args:
        text: The title to clean.

    Returns:
        The title with all matching substrings removed. Surrounding whitespace
        is not trimmed here.
    """
    # Raw strings: sequences such as '\s' and '\(' are invalid escapes in a
    # normal string literal and trigger warnings on recent Python versions.
    patterns = (
        r'BTS\s-\s',
        r'\s\(English\sTranslation\)',
        r'\s\[English\sTranslation\]',
        r'\(\s+\)',
    )
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    return text

In [None]:
# Clean every title and assign the whole column at once.
# Writing to the rows yielded by DataFrame.iterrows() is not guaranteed to
# change the DataFrame (the rows may be copies), so the column is rebuilt.
song_df['Title'] = [
    remove_substr(remove_non_ascii(title)).lstrip()
    for title in song_df['Title']
]

## Pre-process the album titles

In [None]:
# Clean every album title and assign the whole column at once; songs without
# an album are labelled 'Unknown'.
# Writing to the rows yielded by DataFrame.iterrows() is not guaranteed to
# change the DataFrame (the rows may be copies), so the column is rebuilt.
song_df['Album'] = [
    'Unknown' if album is None else remove_substr(remove_non_ascii(album)).lstrip()
    for album in song_df['Album']
]

## Pre-process the lyrics
We need to normalize the lyrics and strip out all the stopwords. There are also a lot of words that don't make sense, or that are written in a particular way, and these need fixing as well.

In [None]:
def remove_between_square_brackets(text):
    """Remove every ``[...]`` span (brackets included) from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each bracketed span replaced by an empty string; the
        whitespace around a removed span is left untouched.
    """
    # Raw string: '\[' is an invalid escape sequence in a normal string literal.
    return re.sub(r'\[[^]]*\]', '', text)

def remove_non_ascii(words):
    """Convert each token to its closest ASCII form.

    Every token is NFKD-normalized, so accented letters decompose into a base
    letter plus combining marks, and then all non-ASCII characters are dropped.

    Note: this definition reuses the name of the string-based
    ``remove_non_ascii(text)`` helper defined earlier in the notebook and
    replaces it once this cell has run.

    Args:
        words: An iterable of string tokens.

    Returns:
        A list with one ASCII-only string per input token. A token made up
        solely of non-ASCII characters becomes an empty string.
    """
    new_words = []
    for word in words:
        # The original assigned this result to `new_words`, which overwrote the
        # accumulator list and made the following append fail.
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Return a new list holding the lowercase form of each token in ``words``."""
    lowered = []
    for token in words:
        lowered.append(token.lower())
    return lowered

def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens that end up empty.

    Any character that is neither a word character nor whitespace is removed.
    """
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped_tokens if token != '']

def remove_stopwords(words):
    """Drop English stopwords and a hand-picked list of filler tokens.

    The stop list is NLTK's English stopword list extended with ``del_words``,
    a manually collected set of tokens to discard from the lyrics.

    Args:
        words: An iterable of tokens; matching is exact, so tokens are expected
            to be in the same form as the stop list entries.

    Returns:
        A list of the tokens that are not in the stop list, in their original
        order.
    """
    del_words = [
        'ay', 'oh', 'yeah', 'la', 'lala', 'lalala', 'lalalala', 'lalalalala',
        'hey', 'heh', 'na', 'ah', 'ye', 'ey', 'woah', 'woo', 'gim', 'wo', 'wow',
        'brr', 'yo', 'yah', 'ya', 'eris', 'th', 'st', 'rd', 'selfreproach',
        'bwahahaha',
        'errday', 'ca', 'nt', 'gon', 'lem',
        'panman', 'bang', 'tan', 'nyeon', 'dan', 'ge', 'cuz',
        'kang', 'baek', 'ho', 'til', 'jinjim', 'abcdefgh',
        'errthing', 'errthang', 'errbody', 'stich', 'sucka', 'betta', 'kaws', 'ahh', 'randa', 'rida',
        'hehe', 'haha', 'imma', 'plaing', 'yeon', 'ai', 'hiphip',
        'bu', 'hukhuk', 'uslike', 'wan', 'uh', 'hoo', 'eh', 'ddaeng',
        'oneeight', 'onethree', 'threeeight', 'hunnit', 'calmy', 'lalalack',
        'hhope', 'accel', 'lilililike', 'lilike', 'bubut', 'illegirl', 'ta', 'bout', 'ayo',
        'dunno', 'wus', 'wassup', 'shim', 'chung', 'krsone', 'illmatic',
        'eolssu', 'ulsoo', 'ulssu', 'ohohowoah', 'ohohowoahowoah', 'dunkiduk',
        'kungduruhruh', 'thang', 'mymy', 'deonggideok', 'kungdeoreoreo', 'em', 'heoeohoh', 'heyheyho',
        'jungun', 'im', 'whatchu', 'cmon', 'beging', 'friz', 'milli', 'bwa', 'doo',
        'joo', 'ching', 'leggo', 'bgm', 'mma', 'within', 'ilsan', 'mon', 'han',
        'mt', 'mudeung', 'hoshigi', 'dboy', 'dom',
        'hugok', 'festa', 'baam', 'lodi', 'dodi', 'pyung', 'monster',
        'hurryhurryhurryhurryhurryhurryhurry', 'nanakon', 'ohho', 'nwappwappwappwa',
        'taehyung', 'hopehopehopehope', 'jeon', 'jindda', 'lobullshitter', 'soondae',
        'yoongi', 'hahahaha', 'ohhh', 'namib', 'ouuuuu', 'aaahahh', 'ahhahahhhhm', 'ctrlc', 'ctrlv',
        'dont', 'jklmnopqrst', 'hol', 'holeinone', 'handlib', 'killa', 'illa', 'dat',
        'runch', 'tombullshittone',
    ]
    # A set gives constant-time membership tests; the original scanned a list
    # of several hundred entries for every token.
    stop_words = set(stopwords.words('english'))
    stop_words.update(del_words)
    return [word for word in words if word not in stop_words]

def lemmatize_words(words):
    """Return a list with each token passed through NLTK's WordNetLemmatizer.

    ``lemmatize`` is called without a part-of-speech argument.
    """
    wordnet = stem.WordNetLemmatizer()
    lemmas = []
    for token in words:
        lemmas.append(wordnet.lemmatize(token))
    return lemmas

def stem_verbs(words,p='v'):
    """Return a list with each token reduced by NLTK's LancasterStemmer.

    Every token is stemmed, not only verbs. The ``p`` parameter is accepted
    but is not used by the implementation.
    """
    lancaster = stem.LancasterStemmer()
    stems = []
    for token in words:
        stems.append(lancaster.stem(token))
    return stems

def normalize(words):
    """Run the token clean-up steps in order and return the resulting list.

    Steps: ``to_lowercase``, ``remove_punctuation``, ``remove_numbers`` and
    ``remove_stopwords``. ``remove_non_ascii`` is not part of the pipeline
    (its call was commented out).
    """
    pipeline = (to_lowercase, remove_punctuation, remove_numbers, remove_stopwords)
    for step in pipeline:
        words = step(words)
    return words

def stem_lemmatize(words):
    """Lemmatize the tokens first, then stem the lemmatized tokens."""
    return stem_verbs(lemmatize_words(words))

def remove_numbers(words):
    """Strip every character that is not an ASCII letter from each token.

    Despite the name, this removes digits, underscores and any other
    non-``a-zA-Z`` character. Tokens that end up empty (for example a token
    made only of digits) are dropped, matching ``remove_punctuation``;
    previously they were returned as empty strings, which left stray double
    spaces once the tokens were joined.

    Args:
        words: An iterable of string tokens.

    Returns:
        A list of the non-empty, letters-only tokens in their original order.
    """
    letters_only = (re.sub(r'[^a-zA-Z]+', '', word) for word in words)
    return [word for word in letters_only if word]

def fix_lyrics(words):
    """Replace known misspelled or mangled tokens with their corrected form.

    Only whole tokens are replaced. The previous implementation applied each
    entry as a substring regex, which corrupted correct words (``kidding``
    became ``kiddingg``, ``jobs`` became ``jobullshit``).

    Args:
        words: An iterable of string tokens.

    Returns:
        A list of the same length in which every token found in the
        correction table is replaced and every other token is unchanged.
    """
    corrections = {
        'fullofregrets': 'regrets',
        'stststutter': 'stutter',
        'kkeut': 'end',
        'kiddin': 'kidding',
        'rockin': 'rocking',
        'rollin': 'rolling',
        'gegegetting': 'getting',
        'hiddeni': 'hidden',
        'zerointerest': 'zero',
        'fk': 'fuck',
        'bs': 'bullshit',
        'swaggin': 'swag',
        'skool': 'school',
        'origing': 'origin',
        'knowitall': 'know-it-all',
        'tugofwar': 'tug-of-war',
        'coupley': 'couple',
        'jujujujump': 'jump',
        'heroesintraining': 'hero',
        'feeli': 'feeling',
        'lastditch': 'last',
        # The entries below that embed 'feeling' or 'bullshit' appear to repair
        # artefacts of the earlier substring replacement; they are kept so that
        # text already mangled that way is still corrected.
        'feelingng': 'feeling',
        'feelingn': 'feeling',
        'obullshitcure': 'bullshit',
        'thumbullshit': 'bullshit',
        'obullshitcurities': 'insecurities',
        'rreal': 'real',
        'lonelily': 'lonely',
        'crowtit': 'crow',
    }
    return [corrections.get(word, word) for word in words]

In [None]:
def _clean_lyrics(raw_lyrics):
    """Return the lyrics as one string of normalized, corrected tokens."""
    lyrics = remove_between_square_brackets(raw_lyrics)
    tokens = nltk.word_tokenize(lyrics)
    tokens = normalize(tokens)
    tokens = fix_lyrics(tokens)
    return " ".join(tokens).lstrip()


# Assign the whole column at once. Writing to the rows yielded by
# DataFrame.iterrows() is not guaranteed to change the DataFrame (the rows may
# be copies), so the column is rebuilt.
song_df['Lyrics'] = [_clean_lyrics(raw_lyrics) for raw_lyrics in song_df['Lyrics']]

## Delete duplicated songs (remixes)

In [None]:
# Keep only the rows whose title does not contain a match for '[mM]ix'
title_has_mix = song_df['Title'].str.contains('[mM]ix')
song_df = song_df[title_has_mix.eq(False)]

In [None]:
# Preview the first 35 rows of the DataFrame
song_df.head(35)

In [None]:
# (number of rows, number of columns) of the DataFrame
song_df.shape

## Inspecting albums
Some albums were missing their release dates, so let's add them back using information from Wikipedia.

In [None]:
# Fill in some missing album release dates and reduce every date to its year.
# Release dates (YYYY-MM-DD) keyed by album title. The keys must match the
# album titles in the DataFrame exactly, including spacing.
album_release_dates = {
    'O!RUL8,2?': '2013-09-11',
    'Dark&Wild': '2014-08-19',
    '2 Cool 4 Skool ': '2013-06-12',
    'Skool Luv Affair (Special Edition)': '2014-02-12',
    "Love Yourself  'Answer'": '2018-08-24',
}
# Fallback year for songs that still have no release date. Stored as an int so
# the column has a single type (the original mixed the string '2016' with ints).
default_release_year = 2016

release_years = []
for album, release in zip(song_df['Album'], song_df['Release']):
    # A date from the table takes precedence over the scraped value.
    release = album_release_dates.get(album, release)
    if pd.isna(release):
        release_years.append(default_release_year)
    else:
        release_years.append(datetime.strptime(release, '%Y-%m-%d').year)

# Replace the whole column at once. Writing to the rows yielded by
# DataFrame.iterrows() is not guaranteed to change the DataFrame, and assign()
# returns a new frame instead of writing into a filtered slice.
song_df = song_df.assign(Release=release_years)

## Export files

In [None]:
# Export the DataFrame to a CSV file; index=False leaves the row index out of the file
file_name = 'BTS_lyrics.csv'
song_df.to_csv(file_name, index=False)