# Data Preprocessor
**Input** : Unprocessed CSV file
**Output**: Processed (cleaned) pkl file
- 'Instrumental' genres are removed
- Remove newlines
- Remove songs with no lyrics (nan)
- Remove songs with 10 or fewer words (word_count <= 10)
- Lemmatize data
- Append data with polarity for sentiment analysis
- Remove stop words

Language detection reference (NLTK and language detection):
https://stackoverflow.com/questions/3182268/nltk-and-language-detection

In [85]:
### Preprocessor globals ###

# Location and name of the raw lyrics CSV
INPUT_FILE_PATH = "./"
INPUT_FILE_NAME = "lyrics.csv"

# Location and name for the processed pickle
OUTPUT_FILE_PATH = "./"
OUTPUT_FILE_NAME = "processed.pkl"

# Row cap used when loading: any value other than -1 samples that many rows,
# -1 keeps every row.
MAX_ROWS = 1000
# MAX_ROWS = -1  # all rows

In [86]:
# Import packages
import pandas as pd
import numpy as np
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('words')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langdetect import DetectorFactory 
DetectorFactory.seed = 0
from langdetect import detect

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jorge/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package words to /home/jorge/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jorge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [111]:
# Load unprocessed lyrics data from the configured input location
import os

df = pd.read_csv(os.path.join(INPUT_FILE_PATH, INPUT_FILE_NAME))
if MAX_ROWS != -1:
    # Cap the request at the number of rows actually available:
    # DataFrame.sample raises ValueError when n exceeds the population
    # (sampling without replacement). random_state keeps the subset
    # reproducible between runs.
    df = df.sample(n=min(MAX_ROWS, len(df)), random_state=1)

In [112]:
# Clean lyrics data
def removeDigits(l):
    """Return `l` with every digit character stripped out.

    A character is dropped when `str.isdigit()` is true for it; all other
    characters are kept in their original order.
    """
    kept = (ch for ch in l if not ch.isdigit())
    return "".join(kept)
    
# Single-character substitutions applied to every lyric: a newline becomes a
# space, while '.', ',', '!' and '?' are deleted outright.
_char_map = str.maketrans({'\n': ' ', '.': None, ',': None, '!': None, '?': None})

# First pass: stringify the cell, apply the substitutions, trim surrounding
# whitespace and lower-case the text.
normalised = df['lyrics'].apply(
    lambda raw: str(raw).translate(_char_map).strip().lower()
)

# Second pass: strip digit characters from the normalised text.
df['lyrics'] = normalised.apply(removeDigits)


In [113]:
# Merge Genres
def mergeGenre(genre):
    """Map a genre onto its combined label.

    "Country"/"Folk", "Hip-Hop"/"R&B" and "Rock"/"Metal" are each folded
    into a single slash-separated label; any other value is returned as is.
    """
    merged_groups = (
        (("Country", "Folk"), "Country/Folk"),
        (("Hip-Hop", "R&B"), "Hip-Hop/R&B"),
        (("Rock", "Metal"), "Rock/Metal"),
    )
    for members, label in merged_groups:
        if genre in members:
            return label
    return genre
    
# Collapse related genres into their combined labels
df['genre'] = df['genre'].apply(mergeGenre)

# Drop the genres that are excluded from the dataset
df = df[~df['genre'].isin(['Other', 'Not Available', 'Indie'])]

# Drop songs whose title contains 'remix'. na=False treats a missing title as
# "not a remix"; the earlier `str.contains('remix') == False` compared
# NaN == False (-> False) and so silently discarded every untitled row.
df = df[~df['song'].str.contains('remix', na=False)]

In [114]:
# Remove songs whose lyrics contain 10 words or fewer.
# str.split() with no separator splits on runs of whitespace, so consecutive
# spaces are not counted as empty "words" the way split(' ') counted them.
df['word_count'] = df['lyrics'].str.split().str.len()
df = df[df['word_count'] > 10]

In [117]:
# Remove rows with too many non english words
# Lower-cased entries of NLTK's `words` corpus, collected into a set.
# NOTE(review): not referenced by the visible cells - confirm it is still needed.
english_vocab = {entry.lower() for entry in nltk.corpus.words.words()}
def isEnglish(l):
    """Return True when langdetect classifies the text `l` as English.

    Text that `detect` cannot classify is reported on stdout and treated as
    non-English, so one bad row does not abort the whole filter.
    """
    try:
        return detect(l) == 'en'
    except Exception:
        # Deliberately not a bare `except:` - that would also trap
        # KeyboardInterrupt/SystemExit and make a long-running apply
        # impossible to interrupt.
        print('THESE LYRICS FAILED::', l)
        return False


# Keep only the rows whose lyrics are classified as English
english_mask = df['lyrics'].apply(isEnglish)
df = df[english_mask]

In [104]:
# Remove stop words
# Start from NLTK's English stop-word list and add extra entries on top of it:
# filler verbs, repeat/section markers, mis-encoded fragments and a few
# common function words.
sWords = stopwords.words('english')
sWords.extend((
    # A comma was missing after 'gets', which fused the next two entries into
    # the single string 'getsgetting' so neither word was ever filtered.
    'got', 'get', 'gets', 'getting',
    # NOTE(review): lyrics are lower-cased and digit-stripped before this list
    # is applied, so the digit-bearing markers probably never match - confirm.
    '2X', '2x', 'x2', 'x3', 'x4', 'x2chorus',
    'chorus', 'verse', 'bridge',
    'd\xe3', 'n\xe3', 'm\xe3',
    'the', 'it', 'is', "it's", 'are', 'were', 'a', 'an', 'its', 'of', 'for',
))

def removeStopWords(lyrics, stop_words=None):
    """Return `lyrics` with every stop word removed.

    Args:
        lyrics: Text to filter; it is split on whitespace into tokens.
        stop_words: Iterable of words to drop. Defaults to the module-level
            `sWords` list, read at call time.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = sWords
    # A set gives O(1) membership tests; scanning the list for every token
    # made the filter O(tokens * stop words) per song.
    blocked = set(stop_words)
    return " ".join(w for w in lyrics.split() if w not in blocked)

# Strip the stop words from every song's lyrics
filtered_lyrics = df['lyrics'].apply(removeStopWords)
df['lyrics'] = filtered_lyrics

In [30]:
# Obtain polarity scores and append to dataframe
sid = SentimentIntensityAnalyzer()

# Score each lyric once and reuse the result for all four columns; scoring
# the same text separately per column quadrupled the analysis work.
# NOTE(review): this runs on lyrics that already had stop words removed -
# confirm that is the text the sentiment scores should be based on.
polarity = df['lyrics'].apply(sid.polarity_scores)
for key in ('pos', 'neg', 'neu', 'compound'):
    df[key + '_score'] = [scores[key] for scores in polarity]

In [118]:
# did it work?
#df.to_pickle("./processed.pkl")

en
es
