# Loading libraries

In [None]:
import pandas as pd
import re
from nltk import word_tokenize as tokenize
from nltk.corpus import stopwords
import plotly.express as px 


# Loading dataset and files

In [None]:
# Load the bad-word list: one entry per line, with the trailing newline removed.
# The context manager closes the file handle as soon as the lines are read.
with open("./data/badwords.txt") as badwords_file:
  badwords_raw = badwords_file.readlines()
badwords = [word.rstrip("\n") for word in badwords_raw]

# Load the lyrics dataset and discard songs released before 1968.
# The mask only removes rows where year < 1968, so rows with a missing year
# are kept; the index is then renumbered from 0.
df = pd.read_csv('./data/lyrics_def_noDupl.csv')
df = df[~(df.year < 1968)].reset_index(drop=True)
df.head()

Unnamed: 0,song,year,artist,genre,lyrics,language
0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu...",en
1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see...",en
2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...,en
3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote...",en
4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po...",en


Initialize a dictionary divided by genre

In [None]:
genres_list = df['genre'].unique()
# key: genre, value: an independent dataframe containing only the records of that genre.
# .copy() detaches each subset from `df`, so adding a column to it later does not
# trigger pandas' SettingWithCopyWarning.
genres = {genre: df[df['genre'] == genre].copy() for genre in genres_list}

# Side functions

In [None]:
def clean_lyric(full_lyric):
  """Normalize the raw text of a lyric.

  Steps, applied in order:
    1. runs of non-ASCII characters become a space;
    2. text enclosed in () or [] on a single line (e.g. "[Verse 1:]") is removed;
    3. standalone repetition markers such as "x2" or "12x" are removed;
    4. punctuation is replaced by a space (apostrophes are kept);
    5. newlines become spaces and runs of spaces collapse to a single space.

  Args:
    full_lyric: the lyric as a string.

  Returns:
    The cleaned lyric as a string. Leading/trailing spaces are not stripped.
  """
  # Raw strings keep the regex escapes (\[, \w, \s, ...) from being read as
  # invalid Python string escapes.
  text = re.sub(r"[^\x00-\x7F]+", " ", full_lyric)
  text = re.sub(r"[(\[].*?[)\]]", " ", text)
  # Word boundaries restrict the match to standalone markers, so ordinary words
  # containing "x<digit>" (e.g. "box2") are untouched and all digits of "12x" go.
  text = re.sub(r"\bx[0-9]+\b", " ", text)
  text = re.sub(r"\b[0-9]+x\b", " ", text)
  # Everything that is not a word character, whitespace or an apostrophe.
  text = re.sub(r"[^\w\s']", " ", text)
  text = re.sub(r"\n", " ", text)
  text = re.sub(r" {2,}", " ", text)
  return text

In [None]:
def is_explicit(token_list, treshold = 5):
  """Tell whether a tokenized lyric contains enough bad words to be explicit.

  A token counts as a bad word when it appears in the module-level `badwords`
  collection, or when it is censored with asterisks between letters (e.g. ni**a).

  Args:
    token_list: iterable of string tokens belonging to one lyric.
    treshold: minimum number of bad-word tokens needed to call the lyric explicit.

  Returns:
    True as soon as `treshold` bad-word tokens have been counted, otherwise False.
  """
  n = 0
  for token in token_list:
    # Listed bad word, or letters + one or more '*' + letters.
    if token in badwords or re.search(r"[a-zA-Z]+\*+[a-zA-Z]+", token) is not None:
      n += 1
    # Stop scanning once the threshold is reached.
    if n >= treshold:
      return True
  return False

# Labeling explicit lyrics

In [None]:
# Add an 'is_explicit' column to the dataframe of each genre, holding the label
# ('explicit' or 'not_explicit') of every lyric.
# NOTE(review): the raw lyric is tokenized rather than the output of clean_lyric():
# clean_lyric replaces '*' with a space, which would prevent is_explicit from
# spotting censored words. Confirm whether cleaning should be applied here.
for genre in genres:
  exp_lyrics = [
    'explicit' if is_explicit(tokenize(lyric), 5) else 'not_explicit'
    for lyric in genres[genre]['lyrics']
  ]
  # assign() returns a new dataframe instead of writing into a slice of `df`,
  # which avoids pandas' SettingWithCopyWarning.
  genres[genre] = genres[genre].assign(is_explicit=exp_lyrics)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genres[genre]['is_explicit'] = exp_lyrics
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genres[genre]['is_explicit'] = exp_lyrics
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genres[genre]['is_explicit'] = exp_lyrics
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [None]:
# Organize the share of explicit / not explicit lyrics of each genre for plotting.
values = {}
gen = []
exp = []
notexp = []
for genre in genres:
  # Normalized label counts, computed once per genre.
  shares = genres[genre]['is_explicit'].value_counts(normalize = True)
  gen.append(genre)
  # .get() falls back to 0.0 when a genre has no lyric with that label,
  # where plain indexing would raise KeyError.
  exp.append(shares.get('explicit', 0.0))
  notexp.append(shares.get('not_explicit', 0.0))
values['genre'] = gen
values['explicit'] = exp
values['not_explicit'] = notexp


In [None]:
# Distribution of explicit lyrics per genre, as a table with one row per genre.
values = pd.DataFrame(values)
values

Unnamed: 0,genre,explicit,not_explicit
0,Pop,0.075003,0.924997
1,Hip-Hop,0.726966,0.273034
2,Rock,0.109625,0.890375
3,Metal,0.186901,0.813099
4,Country,0.029382,0.970618
5,Jazz,0.048624,0.951376
6,Electronic,0.108156,0.891844
7,Folk,0.059157,0.940843
8,R&B,0.137212,0.862788
9,Indie,0.093626,0.906374


In [None]:
# Bar chart of the explicit / not_explicit shares per genre, with a fixed color per label.
label_colors = {'not_explicit': '#1F77B4', 'explicit':'#17BECF'}
fig = px.bar(
  values,
  x="genre",
  y=["explicit", "not_explicit"],
  title="Explicit lyrics distribution per genre",
  color_discrete_map=label_colors,
)
fig.show()