In [66]:
import pandas as pd
import spacy
import nltk
import numpy as np

In [67]:
df = pd.read_csv('music_lyrics.csv', index_col = 0) 
cols_to_keep = ['artist_name', 'track_name', 'release_date', 'genre', 'lyrics', 'len', 'danceability', 'loudness',
       'acousticness', 'instrumentalness', 'valence', 'energy']
lyrics  = df[cols_to_keep]

In [105]:
lyrics['decade'] = pd.cut(lyrics.release_date, range(1949, 2020, 10))
lyrics['decade'] = lyrics['decade'].apply(lambda x: f'{x.left + 1} - {x.right}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lyrics['decade'] = pd.cut(lyrics.release_date, range(1949, 2020, 10))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lyrics['decade'] = lyrics['decade'].apply(lambda x: f'{x.left + 1} - {x.right}')


In [108]:
df['decade'] = lyrics['decade']
cols_to_keep = ['artist_name', 'track_name', 'release_date', 'genre', 'lyrics', 'len', 'danceability', 'loudness',
       'acousticness', 'instrumentalness', 'valence', 'energy', 'decade']

In [68]:
lyrics.lyrics.str.len()

0        522
4        316
6        153
10       314
12       273
        ... 
82447    476
82448    400
82449    490
82450    397
82451    518
Name: lyrics, Length: 28372, dtype: int64

In [69]:
lyrics.columns

Index(['artist_name', 'track_name', 'release_date', 'genre', 'lyrics', 'len',
       'danceability', 'loudness', 'acousticness', 'instrumentalness',
       'valence', 'energy'],
      dtype='object')

In [70]:
lyrics['avg_word_length'] = lyrics.lyrics.str.split(' ').apply(lambda x : np.mean([len(word)for word in x]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lyrics['avg_word_length'] = lyrics.lyrics.str.split(' ').apply(lambda x : np.mean([len(word)for word in x]))


In [71]:
# finding adjectives, verbs and nouns
nlp = spacy.load('en_core_web_sm')
pos_tags = []
for doc in nlp.pipe(lyrics.lyrics.values.tolist()):
    doc_tags = []
    for token in doc:
        doc_tags.append(token.pos_)
    pos_tags.append(doc_tags)

In [72]:
adj_count = [doc_pos.count('ADJ') for doc_pos in pos_tags]
lyrics['adj_count'] = adj_count

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lyrics['adj_count'] = adj_count


In [73]:
verb_count = [doc_pos.count('VERB') for doc_pos in pos_tags]
lyrics['verb_count'] = verb_count

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lyrics['verb_count'] = verb_count


In [86]:
noun_count = [doc_pos.count('NOUN') for doc_pos in pos_tags]
lyrics['noun_count'] = noun_count

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lyrics['noun_count'] = noun_count


In [74]:
# extracting the top 500 most used words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 500, stop_words = 'english')
words = pd.DataFrame(cv.fit_transform(lyrics.lyrics).toarray(), columns  = cv.get_feature_names())
words.sum().sort_values()

lay            642
escape         643
lonesome       644
happiness      646
holy           648
             ...  
feel         16795
come         23619
time         26504
like         29649
know         33526
Length: 500, dtype: int64

In [75]:
# extracting the top 500 two or three word phrases
cv2 = CountVectorizer(max_features = 500, stop_words = 'english', ngram_range = (2,3))
phrases = pd.DataFrame(cv2.fit_transform(lyrics.lyrics).toarray(), columns  = cv2.get_feature_names())
phrases.sum().sort_values()

yeah like         157
spend life        157
holy holy         157
come feel         157
nothin nothin     157
                 ... 
break heart      1812
know know        1935
feel like        2012
time time        2262
yeah yeah        2874
Length: 500, dtype: int64

In [88]:
lyrics['percentage_adj'] = lyrics.adj_count / lyrics.len
lyrics['percentage_noun'] = lyrics.noun_count / lyrics.len
lyrics['percentage_verb'] = lyrics.verb_count / lyrics.len

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lyrics['percentage_adj'] = lyrics.adj_count / lyrics.len
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lyrics['percentage_noun'] = lyrics.noun_count / lyrics.len
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lyrics['percentage_verb'] = lyrics.verb_count / lyrics.len


In [90]:
lyrics.to_csv('summary.csv')

In [101]:
# grouping years into their decade
lyrics['decade'] = pd.cut(lyrics.release_date, range(1949, 2020, 10))
lyrics['decade'] = lyrics['decade'].apply(lambda x: f'{x.left + 1} - {x.right}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lyrics['decade'] = pd.cut(lyrics.release_date, range(1949, 2020, 10))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lyrics['decade'] = lyrics['decade'].apply(lambda x: f'{x.left + 1} - {x.right}')


In [109]:
df[cols_to_keep].reset_index(drop = True).join(words).to_csv('words.csv')

In [110]:
df[cols_to_keep].reset_index(drop = True).join(phrases).to_csv('phrases.csv')

In [116]:
words.sum().sort_values(ascending = False).head(50)

know       33526
like       29649
time       26504
come       23619
feel       16795
yeah       16787
away       16773
heart      16737
life       15906
want       14303
cause      14265
baby       13707
right      12977
live       12806
leave      12790
tell       11947
night      11920
think      11389
world      10519
need       10424
long       10376
gonna      10317
look       10062
break      10016
good        9984
hold        9822
fall        9358
hear        9044
mind        8767
dream       8069
say         7969
hand        7793
wanna       7735
home        7733
eye         7373
little      7233
play        6967
lose        6800
better      6774
walk        6736
fuck        6699
stay        6579
head        6489
turn        6483
believe     6441
stand       6428
girl        6419
change      6368
sing        6289
things      5754
dtype: int64

In [117]:
phrases.sum().sort_values(ascending = False).head(50)

yeah yeah           2874
time time           2262
feel like           2012
know know           1935
break heart         1812
away away           1709
long long           1440
lyric commercial    1423
hold hold           1366
come come           1338
right right         1329
yeah yeah yeah      1213
walk away           1191
come home           1167
fall fall           1138
long time           1138
feel feel           1115
live live           1051
baby baby           1011
live life            986
feel good            980
life life            955
good good            950
night night          904
sing song            887
heart break          863
leave leave          838
want want            829
believe believe      817
good time            802
night long           786
close eye            782
away away away       780
hold tight           769
look like            758
hold hold hold       727
time time time       722
money money          711
come true            691
like like            690


In [125]:
pd.DataFrame({'Response' : df.lyrics, 'U_ID': range(len(df))}).reset_index(drop = True).to_json('lyrics_to_run.json')

In [127]:
words = pd.read_csv('words.csv', index_col = 0)

In [1]:
arrBad = [
'2g1c',
'2 girls 1 cup',
'acrotomophilia',
'anal',
'anilingus',
'anus',
'arsehole',
'ass',
'asshole',
'assmunch',
'auto erotic',
'autoerotic',
'babeland',
'baby batter',
'ball gag',
'ball gravy',
'ball kicking',
'ball licking',
'ball sack',
'ball sucking',
'bangbros',
'bareback',
'barely legal',
'barenaked',
'bastardo',
'bastinado',
'bbw',
'bdsm',
'beaver cleaver',
'beaver lips',
'bestiality',
'bi curious',
'big black',
'big breasts',
'big knockers',
'big tits',
'bimbos',
'birdlock',
'bitch',
'black cock',
'blonde action',
'blonde on blonde action',
'blow j',
'blow your l',
'blue waffle',
'blumpkin',
'bollocks',
'bondage',
'boner',
'boob',
'boobs',
'booty call',
'brown showers',
'brunette action',
'bukkake',
'bulldyke',
'bullet vibe',
'bung hole',
'bunghole',
'busty',
'butt',
'buttcheeks',
'butthole',
'camel toe',
'camgirl',
'camslut',
'camwhore',
'carpet muncher',
'carpetmuncher',
'chocolate rosebuds',
'circlejerk',
'cleveland steamer',
'clit',
'clitoris',
'clover clamps',
'clusterfuck',
'cock',
'cocks',
'coprolagnia',
'coprophilia',
'cornhole',
'cum',
'cumming',
'cunnilingus',
'cunt',
'darkie',
'date rape',
'daterape',
'deep throat',
'deepthroat',
'dick',
'dildo',
'dirty pillows',
'dirty sanchez',
'dog style',
'doggie style',
'doggiestyle',
'doggy style',
'doggystyle',
'dolcett',
'domination',
'dominatrix',
'dommes',
'donkey punch',
'double dong',
'double penetration',
'dp action',
'eat my ass',
'ecchi',
'ejaculation',
'erotic',
'erotism',
'escort',
'ethical slut',
'eunuch',
'faggot',
'fecal',
'felch',
'fellatio',
'feltch',
'female squirting',
'femdom',
'figging',
'fingering',
'fisting',
'foot fetish',
'footjob',
'frotting',
'fuck',
'fucking',
'fuck buttons',
'fudge packer',
'fudgepacker',
'futanari',
'g-spot',
'gang bang',
'gay sex',
'genitals',
'giant cock',
'girl on',
'girl on top',
'girls gone wild',
'goatcx',
'goatse',
'gokkun',
'golden shower',
'goo girl',
'goodpoop',
'goregasm',
'grope',
'group sex',
'guro',
'hand job',
'handjob',
'hard core',
'hardcore',
'hentai',
'homoerotic',
'honkey',
'hooker',
'hot chick',
'how to kill',
'how to murder',
'huge fat',
'humping',
'incest',
'intercourse',
'jack off',
'jail bait',
'jailbait',
'jerk off',
'jigaboo',
'jiggaboo',
'jiggerboo',
'jizz',
'juggs',
'kike',
'kinbaku',
'kinkster',
'kinky',
'knobbing',
'leather restraint',
'leather straight jacket',
'lemon party',
'lolita',
'lovemaking',
'make me come',
'male squirting',
'masturbate',
'menage a trois',
'milf',
'missionary position',
'motherfucker',
'mound of venus',
'mr hands',
'muff diver',
'muffdiving',
'nambla',
'nawashi',
'negro',
'neonazi',
'nig nog',
'nigga',
'nigger',
'nimphomania',
'nipple',
'nipples',
'nsfw images',
'nude',
'nudity',
'nympho',
'nymphomania',
'octopussy',
'omorashi',
'one cup two girls',
'one guy one jar',
'orgasm',
'orgy',
'paedophile',
'panties',
'panty',
'pedobear',
'pedophile',
'pegging',
'penis',
'phone sex',
'piece of shit',
'piss pig',
'pissing',
'pisspig',
'playboy',
'pleasure chest',
'pole smoker',
'ponyplay',
'poof',
'poop chute',
'poopchute',
'porn',
'porno',
'pornography',
'prince albert piercing',
'pthc',
'pubes',
'pussy',
'queaf',
'raghead',
'raging boner',
'rape',
'raping',
'rapist',
'rectum',
'reverse cowgirl',
'rimjob',
'rimming',
'rosy palm',
'rosy palm and her 5 sisters',
'rusty trombone',
's&m',
'sadism',
'scat',
'schlong',
'scissoring',
'semen',
'sex',
'sexo',
'sexy',
'shaved beaver',
'shaved pussy',
'shemale',
'shibari',
'shit',
'shota',
'shrimping',
'slanteye',
'slut',
'smut',
'snatch',
'snowballing',
'sodomize',
'sodomy',
'spic',
'spooge',
'spread legs',
'strap on',
'strapon',
'strappado',
'strip club',
'style doggy',
'suck',
'sucks',
'suicide girls',
'sultry women',
'swastika',
'swinger',
'tainted love',
'taste my',
'tea bagging',
'threesome',
'throating',
'tied up',
'tight white',
'tit',
'tits',
'titties',
'titty',
'tongue in a',
'topless',
'tosser',
'towelhead',
'tranny',
'tribadism',
'tub girl',
'tubgirl',
'tushy',
'twat',
'twink',
'twinkie',
'two girls one cup',
'undressing',
'upskirt',
'urethra play',
'urophilia',
'vagina',
'venus mound',
'vibrator',
'violet blue',
'violet wand',
'vorarephilia',
'voyeur',
'vulva',
'wank',
'wet dream',
'wetback',
'white power',
'women rapping',
'wrapping men',
'wrinkled starfish',
'xx',
'xxx',
'yaoi',
'yellow showers',
'yiffy',
'zoophilia']

In [2]:
len(arrBad)

343

In [140]:
# function to check profanity 
def check_profanity(text):
    for prof in arrBad:
        if prof in text:
            return 1
    return 0

In [141]:
words['Profanity'] = words.lyrics.apply(lambda x: check_profanity(x))

In [142]:
words.to_csv('words_profanity.csv')