In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from langdetect import detect
from collections import Counter
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer
import nltk

In [10]:
df = pd.read_csv("/Users/samfinard/src/1PA/Final-Project/v2/updated_billboard/toClean_dateCount.csv", low_memory = False)

In [11]:
df.drop('Unnamed: 0', axis=1, inplace=True)

KeyError: "['Unnamed: 0'] not found in axis"

In [12]:
df['lyrics'] = df['lyrics'].str.lower()

### Defines which [ordinal ASCII values](https://learn.parallax.com/support/reference/ascii-table-0-127) to keep. We only want lowercase letters, spaces, apostrophes, and hyphens

In [13]:
def keep(char):
    o = ord(char)
    return(o == 32 or o == 45 or (96 <= o and o <= 122))

In [16]:
def remove_fluff(text):
    if pd.isnull(text):  # Check if the text is NaN
        return text  # Return NaN as it is
    return ''.join(char if keep(char) else ' ' for char in text)

In [17]:
df['lyrics'] = df['lyrics'].apply(remove_fluff)

In [20]:
def replace_multiple_spaces(text):
    if pd.isnull(text):
        return text
    return re.sub(' +', ' ', text)

In [21]:
df['lyrics'] = df['lyrics'].apply(replace_multiple_spaces)

In [24]:
# Count the number of songs with empty or NaN lyrics
num_empty_lyrics = df['lyrics'].isna().sum() + df['lyrics'].eq('').sum()
print(num_empty_lyrics)
# Drop the rows with empty or NaN lyrics
df_cleaned = df.dropna(subset=['lyrics'])  # Drops rows with NaN lyrics
df_cleaned = df_cleaned[df_cleaned['lyrics'].str.strip() != '']  # Drops rows with empty strings


47042


In [25]:
df = df.copy()
df['language'] = df['lyrics'].apply(detect)

TypeError: expected string or bytes-like object, got 'float'

In [None]:
df_english = df[df['language'] == 'en']
df = df_english

### Remove Stop Words, Stemming

In [26]:
stop_words = set(stopwords.words('english'))
def remove_stop_words(lyric):
    return ' '.join([word if word not in stop_words else ' ' for word in lyric.split()])

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/samfinard/nltk_data'
    - '/opt/homebrew/opt/python@3.11/Frameworks/Python.framework/Versions/3.11/nltk_data'
    - '/opt/homebrew/opt/python@3.11/Frameworks/Python.framework/Versions/3.11/share/nltk_data'
    - '/opt/homebrew/opt/python@3.11/Frameworks/Python.framework/Versions/3.11/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
df['lyrics_no_sw'] = df['lyrics'].apply(remove_stop_words)

In [None]:
df['lyrics_no_sw'] = df['lyrics_no_sw'].apply(replace_multiple_spaces)

In [None]:
lemmatizer = WordNetLemmatizer()
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
def lemmatize(lyric):
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(lyric)  # Tokenizing
    pos_tags = nltk.pos_tag(words)  # Getting POS tags
    return ' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(p)).lower() for w, p in pos_tags])

In [None]:
df['lemmatized'] = df['lyrics_no_sw'].apply(lemmatize)

In [None]:
df['counter'] = df['lemmatized'].apply(lambda x: Counter(x.split()))
df['counter'] = df['counter'].apply(lambda x: {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)})

In [None]:
df = df[['song','artist','counter']]

In [None]:
df.to_csv('../data/counter.csv', index=False)