In [None]:
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from langdetect import DetectorFactory
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer

In [None]:
# Load the raw lyrics table; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/lyrics.csv")

In [None]:
# Discard the 'Unnamed: 0' column (presumably a row index saved by an earlier export).
# Rebinding instead of mutating in place, together with errors='ignore', keeps this
# cell safe to re-run: a second execution no longer raises KeyError once the column
# has already been removed.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Normalise case up front so every later step only has to deal with lowercase text.
df = df.assign(lyrics=df['lyrics'].str.lower())

### Define which [ASCII code points](https://learn.parallax.com/support/reference/ascii-table-0-127) to keep. We only want lowercase letters, spaces, apostrophes, and hyphens.

In [None]:
def keep(char):
    """Return True when ``char`` is one of the characters the cleaned lyrics may contain.

    Allowed characters are the space (ASCII 32), the apostrophe (ASCII 39),
    the hyphen (ASCII 45) and the lowercase letters ``a``-``z`` (ASCII 97-122).

    Parameters
    ----------
    char : str
        A single character; ``ord`` raises ``TypeError`` for longer strings.

    Returns
    -------
    bool
        True if the character should be kept, False otherwise.
    """
    o = ord(char)
    # The letter range starts at 97 ('a'); 96 is the backtick, which is not wanted.
    # The apostrophe lives at 39 and has to be listed explicitly.
    return o in (32, 39, 45) or 97 <= o <= 122

In [None]:
def remove_fluff(text):
    """Return ``text`` with every character that ``keep`` rejects turned into a space.

    The result has the same length as the input: rejected characters are
    blanked out rather than deleted, so neighbouring words never get glued
    together.
    """
    cleaned = []
    for ch in text:
        cleaned.append(ch if keep(ch) else ' ')
    return ''.join(cleaned)

In [None]:
# Blank out every unwanted character in each lyric.
df['lyrics'] = df['lyrics'].map(remove_fluff)

In [None]:
def replace_multiple_spaces(text):
    """Collapse each run of consecutive space characters in ``text`` into one space.

    Only the plain space is affected; tabs, newlines and other whitespace
    are returned untouched.
    """
    space_run = re.compile(' +')
    collapsed = space_run.sub(' ', text)
    return collapsed

In [None]:
# Squeeze the runs of spaces left behind by the character clean-up.
df['lyrics'] = df['lyrics'].map(replace_multiple_spaces)

In [None]:
# langdetect is randomised by default; pinning the seed makes a re-run of the
# notebook assign the same language to the same lyric.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the language code langdetect assigns to ``text``, or ``'unknown'``.

    ``detect`` raises ``LangDetectException`` when the text offers no usable
    features (for example a lyric that the earlier clean-up reduced to
    whitespace). Such rows are labelled ``'unknown'`` instead of aborting the
    whole ``apply``; they simply fail any later ``language == 'en'`` check.
    """
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'


df = df.copy()
df['language'] = df['lyrics'].apply(detect_language)

In [None]:
# Keep only the lyrics detected as English. The explicit copy makes the result an
# independent frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
df_english = df[df['language'] == 'en'].copy()
df = df_english

### Remove Stop Words, Lemmatization

In [None]:
# NLTK's English stop-word list as a set, for constant-time membership tests.
stop_words = set(stopwords.words('english'))


def remove_stop_words(lyric):
    """Return ``lyric`` with every English stop word dropped.

    The lyric is split on whitespace, tokens present in ``stop_words`` are
    discarded, and the remaining tokens are joined with single spaces. Stop
    words are filtered out rather than swapped for blanks, so the result has
    no doubled, leading or trailing spaces.

    Parameters
    ----------
    lyric : str
        Lowercased lyric text.

    Returns
    -------
    str
        The surviving words separated by single spaces; ``''`` when nothing is left.
    """
    return ' '.join(word for word in lyric.split() if word not in stop_words)

In [None]:
# Store the stop-word-free version of each lyric in its own column.
df['lyrics_no_sw'] = df['lyrics'].map(remove_stop_words)

In [None]:
# Collapse any runs of spaces in the stop-word-free lyrics.
df['lyrics_no_sw'] = df['lyrics_no_sw'].map(replace_multiple_spaces)

In [None]:
# Shared WordNet lemmatizer instance, created once and reused by lemmatize() for every lyric.
lemmatizer = WordNetLemmatizer()
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Only the first letter of the tag matters: ``J`` -> adjective, ``V`` -> verb,
    ``N`` -> noun, ``R`` -> adverb. Every other tag (including an empty one)
    falls back to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)
def lemmatize(lyric):
    """Return ``lyric`` with each word replaced by its lowercased WordNet lemma.

    The text is split into ``\\w+`` tokens, POS-tagged with ``nltk.pos_tag``,
    and every token is lemmatized using the WordNet POS derived from its tag.
    The lemmas are joined back together with single spaces.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(lyric)
    lemmas = []
    for token, tag in nltk.pos_tag(tokens):
        lemma = lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        lemmas.append(lemma.lower())
    return ' '.join(lemmas)

In [None]:
# Lemmatize the stop-word-free lyrics.
df['lemmatized'] = df['lyrics_no_sw'].map(lemmatize)

In [None]:
# Count how often each lemma occurs in a song. most_common() yields the pairs from
# most to least frequent (ties stay in first-seen order) and dict() preserves that order.
df['counter'] = df['lemmatized'].apply(
    lambda text: dict(Counter(text.split()).most_common())
)

In [None]:
# Keep only the identifying columns and the word counts.
df = df.loc[:, ['song', 'artist', 'counter']]

In [None]:
# Persist the song/artist/counter table; index=False leaves the row index out of the file.
df.to_csv('../data/counter.csv', index=False)