In [188]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [189]:
artists = pd.read_csv('../data/artist_cleaned.csv')
songs = pd.read_csv('../data/songs_cleaned.csv')

In [190]:
artists.head()

Unnamed: 0,artist,description,images_url,followers_count,gender,birth_year,birth_decade
0,Post Malone,"Austin Richard Post (born July 4, 1995), bette...",https://images.genius.com/1010194fa644be099aa2...,4542.0,Male,1995.0,1990s
1,The Weeknd,"Abel Makkonen Tesfaye (born February 16, 1990 ...",https://images.genius.com/f0813e600d43b8b43c94...,8241.0,Male,1990.0,1990s
2,Roddy Ricch,"Rodrick Wayne Moore, Jr. (b. October 22, 1998)...",https://images.genius.com/78cf8142bcc24004ee49...,722.0,Male,1998.0,1990s
3,DaBaby,"Jonathan Lyndale Kirk (b. December 22, 1991), ...",https://images.genius.com/c8f10e784589ba9ae897...,948.0,Male,1991.0,1990s
4,Drake,"Aubrey Drake Graham (born October 24, 1986) is...",https://images.genius.com/fc7fecd8b3701bdc9e1e...,14982.0,Male,1986.0,1980s


In [191]:
songs.head()

Unnamed: 0,song_name,artist,lyrics,description,accepted_annotations,contributors,pageviews
0,​​rockstar,Post Malone,"[Intro: Post Malone] Hahahahaha Tank God Ayy, ...","On “rockstar,” Post Malone compares his habits...",18.0,438.0,6859347.0
1,White Iverson,Post Malone,[Intro] Double OT I m a new three [Chorus] Sa...,"Virtually unknown before this track, Post Malo...",31.0,221.0,4134800.0
2,Congratulations,Post Malone,"[Intro: Post Malone] Mm-mmm Yeah, yeah Mm-mmm ...","On “Congratulations,“ Post Malone and Quavo ce...",13.0,211.0,3615768.0
3,Psycho,Post Malone,"[Chorus: Post Malone] Damn, my AP goin psycho...",“Psycho” is the third single from Post Malone’...,21.0,231.0,3044653.0
4,I Fall Apart,Post Malone,"[Intro] Ooh, I fall apart Ooh, yeah, mmm, yeah...","On “I Fall Apart,” Post reminisces about a hea...",7.0,126.0,2631815.0


# Dropping Songs Not in English

In [192]:
songs = songs[songs['artist'] != 'Bad Bunny']
songs = songs[songs['artist'] != 'BTS']

In [193]:
songs[songs['artist'] == 'Michael Jackson']

Unnamed: 0,song_name,artist,lyrics,description,accepted_annotations,contributors,pageviews
434,Billie Jean,Michael Jackson,[Verse 1] She was more like a beauty queen fro...,"One of Michael Jackson’s signature songs, “Bil...",15.0,206.0,1305679.0
435,Smooth Criminal,Michael Jackson,[Intro] Aaow! Cha! Shoo-cha-choo-cha! [Verse...,“Smooth Criminal” was an idea Michael had in 1...,6.0,147.0,1199843.0
436,They Don’t Care About Us,Michael Jackson,[Intro] All I want to say is that they don t r...,"The 4th single off Jackson’s album HIStory, th...",22.0,180.0,879751.0
437,Wanna Be Startin’ Somethin’,Michael Jackson,[Chorus] I said you wanna be startin somethin...,As the opening song and fourth single of Thril...,13.0,79.0,659674.0
438,Man in the Mirror,Michael Jackson,[Verse 1] I m gonna make a change For once in ...,“Man in the Mirror” is the fourth single from ...,15.0,146.0,588092.0


# First Round Cleaning

In [194]:
def clean_text(row):
    cleaned = row['lyrics'].lower()
    cleaned = cleaned.replace("ain t", "aint")
    cleaned = cleaned.replace("\u2005", "")
    cleaned = cleaned.replace("won t", "wont")
    cleaned = cleaned.replace("could ve", "couldve")
    cleaned = cleaned.replace("shoul ve", "shouldve")
    cleaned = cleaned.replace("would ve", "wouldve")
    cleaned = cleaned.replace("i ve", "ive")
    cleaned = cleaned.replace("-", "_")
    cleaned = cleaned.replace("in ", "ing ")
    cleaned = cleaned.replace(" ing ", "ing ")
    cleaned = cleaned.replace("ma ma coo sa ma ma se", "")
    cleaned = cleaned.replace("ma ma se", "")
    cleaned = cleaned.replace("ma ma sa", "")
    return cleaned

songs['cleaned_lyrics'] = songs.apply(clean_text, axis = 1)

# Building Stop Words

In [195]:
# adding artist names to stopwords
artist_names = []
for a in list(artists['artist']):
    artist_names.append(a.lower())

In [196]:
from sklearn.feature_extraction import text 
my_stop_words = ['intro','chorus','verse','don','s','jason derulo','nick jonas'] + artist_names
stop_words = text.ENGLISH_STOP_WORDS.union(my_stop_words)

# Count Vectorizer

In [197]:
from sklearn.feature_extraction.text import CountVectorizer

word_vectorizer = CountVectorizer(ngram_range=(2,2), analyzer='word',stop_words=stop_words)
sparse_matrix = word_vectorizer.fit_transform(songs['cleaned_lyrics'])

In [198]:
frequencies = sum(sparse_matrix).toarray()[0]
df2 = pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency']).reset_index()

df2.sort_values('frequency').tail(100)

Unnamed: 0,index,frequency
35023,murder mind,30
28270,let ride,30
26636,know aint,30
1494,annie okay,30
1078,aint ready,30
18421,girl know,31
368,act like,31
32684,man gotta,31
17252,fuck bitches,31
59680,young dumb,31


# Analysis By Artist Gender

In [199]:
songs = songs.merge(artists, how = 'left', on = 'artist', suffixes = ('song','artist'))
songs.head()

Unnamed: 0,song_name,artist,lyrics,descriptionsong,accepted_annotations,contributors,pageviews,cleaned_lyrics,descriptionartist,images_url,followers_count,gender,birth_year,birth_decade
0,​​rockstar,Post Malone,"[Intro: Post Malone] Hahahahaha Tank God Ayy, ...","On “rockstar,” Post Malone compares his habits...",18.0,438.0,6859347.0,"[intro: post malone] hahahahaha tank god ayy, ...","Austin Richard Post (born July 4, 1995), bette...",https://images.genius.com/1010194fa644be099aa2...,4542.0,Male,1995.0,1990s
1,White Iverson,Post Malone,[Intro] Double OT I m a new three [Chorus] Sa...,"Virtually unknown before this track, Post Malo...",31.0,221.0,4134800.0,[intro] double ot i m a new three [chorus] sa...,"Austin Richard Post (born July 4, 1995), bette...",https://images.genius.com/1010194fa644be099aa2...,4542.0,Male,1995.0,1990s
2,Congratulations,Post Malone,"[Intro: Post Malone] Mm-mmm Yeah, yeah Mm-mmm ...","On “Congratulations,“ Post Malone and Quavo ce...",13.0,211.0,3615768.0,"[intro: post malone] mm_mmm yeah, yeah mm_mmm ...","Austin Richard Post (born July 4, 1995), bette...",https://images.genius.com/1010194fa644be099aa2...,4542.0,Male,1995.0,1990s
3,Psycho,Post Malone,"[Chorus: Post Malone] Damn, my AP goin psycho...",“Psycho” is the third single from Post Malone’...,21.0,231.0,3044653.0,"[chorus: post malone] damn, my ap going psych...","Austin Richard Post (born July 4, 1995), bette...",https://images.genius.com/1010194fa644be099aa2...,4542.0,Male,1995.0,1990s
4,I Fall Apart,Post Malone,"[Intro] Ooh, I fall apart Ooh, yeah, mmm, yeah...","On “I Fall Apart,” Post reminisces about a hea...",7.0,126.0,2631815.0,"[intro] ooh, i fall apart ooh, yeah, mmm, yeah...","Austin Richard Post (born July 4, 1995), bette...",https://images.genius.com/1010194fa644be099aa2...,4542.0,Male,1995.0,1990s


In [200]:
female = songs[songs['gender'] == 'Female']
male = songs[songs['gender'] == 'Male']

# Analysis By Artist Age

In [201]:
Seven = songs[songs['birth_decade'] == '1970s']
Eight = songs[songs['birth_decade'] == '1980s]
Nine = songs[songs['birth_decade'] == '1990s]
Two_Thousands = songs[songs['birth_decade'] == '2000s]

SyntaxError: EOL while scanning string literal (<ipython-input-201-0d0786ca7ad4>, line 2)