In [1]:
import os
import pandas as pd

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langdetect import detect

import spacy

nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kamilabystron/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kamilabystron/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kamilabystron/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kamilabystron/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kamilabystron/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kamilabystron/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
STOPWORDS = stopwords.words('english')
NER = spacy.load("en_core_web_sm")
LEMMATIZER = WordNetLemmatizer()

In [4]:
data_path = os.path.join('..', 'data', 'data.csv')
df = pd.read_csv(data_path).drop_duplicates(subset='lyrics')

In [5]:
df.shape

(7392, 5)

In [6]:
def drop_errors(df):
    df = df[~df.lyrics.str.startswith('Error in')].reset_index(drop=True)
    df.lyrics = df.lyrics.apply(lambda x: x.rstrip('EmbedShare URLCopyEmbedCopy'))
    df.lyrics = df.lyrics.replace('\d{,}$', '', regex=True)
    return df

def detect_language(text):
    try:
        return detect(text)
    except:
        return 'unknown'
    
def tokenize_lyrics(df):
    df.lyrics = df.lyrics.str.replace('\W+', ' ', regex=True)
    df['tokens'] = df.lyrics.str.lower().apply(lambda x: word_tokenize(x))
    return df

def remove_stopwords(tokens, stopwords):
    return [el for el in tokens if el not in stopwords]

def get_entities(text, ner):
    entities = {ent.text: ent.label_ for ent in ner(text).ents}
    return entities
    
def get_wordnet_pos(tokens):
    pos_tokens = nltk.pos_tag(tokens)
    pos_tokens_wordnet = [(el[0], map_pos_wordnet(el[1])) for el in pos_tokens]
    return pos_tokens_wordnet

def map_pos_wordnet(token):
    if token[0].startswith('J'):
        return wordnet.ADJ
    elif token[0].startswith('V'):
        return wordnet.VERB
    elif token[0].startswith('N'):
        return wordnet.NOUN
    elif token[0].startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
def lemmatize_tokens(pos_tokens, lemmatizer):
    return [lemmatizer.lemmatize(token[0], pos=token[1]) for token in pos_tokens]

In [7]:
def prepare_dataframe(df, ner, lemmatizer, stopwords):
    df = drop_errors(df)
    df['language'] = df['lyrics'].apply(detect_language)
    df = df[df['language'] == 'en']
    df = tokenize_lyrics(df)
    df['tokens'] = df['tokens'].apply(lambda x: remove_stopwords(x, stopwords))
    df['entities'] = df['lyrics'].apply(lambda x: get_entities(x, ner))
    df['pos_tokens'] = df['tokens'].apply(lambda x: get_wordnet_pos(x))
    df['tokens_lemma'] = df['pos_tokens'].apply(lambda x: lemmatize_tokens(x, lemmatizer))
    return df

In [13]:
df = prepare_dataframe(df, NER, LEMMATIZER, STOPWORDS)

In [14]:
df.head()

Unnamed: 0,artist_name,track_name,popularity,genre,lyrics,language,tokens,entities,pos_tokens,tokens_lemma
0,James Bay,Let It Go,73,rock,From walking home and talking loads To seeing ...,en,"[walking, home, talking, loads, seeing, shows,...","{'evening': 'TIME', 'Tryna': 'PERSON', 'Cause'...","[(walking, v), (home, n), (talking, v), (loads...","[walk, home, talk, load, see, show, even, clot..."
1,Bonobo,From You,67,jazz,Gone like changing seasons Alright alright You...,en,"[gone, like, changing, seasons, alright, alrig...",{},"[(gone, v), (like, n), (changing, v), (seasons...","[go, like, change, season, alright, alright, s..."
2,Lee Brice,One Of Them Girls,69,country,Are you one of them girls that peels off the B...,en,"[one, girls, peels, bud, light, label, might, ...","{'Kinda': 'PERSON', 'one': 'CARDINAL', 'all ni...","[(one, n), (girls, n), (peels, n), (bud, v), (...","[one, girl, peel, bud, light, label, might, ru..."
3,Andy Gibb,I Just Want To Be Your Everything,62,disco,For so long You and me been finding each other...,en,"[long, finding, long, feeling, feel, strong, g...",{'Build': 'FAC'},"[(long, r), (finding, v), (long, r), (feeling,...","[long, find, long, feel, feel, strong, girl, t..."
4,"Earth, Wind & Fire",You Want My Love,61,jazz,You ain t gotta say much I can tell that love ...,en,"[got, ta, say, much, tell, love, means, someth...","{'Don': 'PERSON', 'Lies': 'PERSON'}","[(got, v), (ta, n), (say, v), (much, r), (tell...","[get, ta, say, much, tell, love, mean, somethi..."


In [15]:
df.to_csv(os.path.join('..', 'data', 'prepared_data.csv'), index=False)