In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [41]:
artists = pd.read_csv('../data/artist_cleaned.csv')
songs = pd.read_csv('../data/songs_cleaned.csv')

# Dropping Songs Not in English

In [42]:
songs = songs[songs['artist'] != 'Bad Bunny']
songs = songs[songs['artist'] != 'BTS']

# Merging Songs and Artists to get artist information

In [43]:
songs = songs.merge(artists, how = 'left', on = 'artist', suffixes = ('song','artist'))

# First Round Cleaning

In [44]:
def clean_text(row):
    cleaned = row['lyrics'].lower()
    
    clean_dict = {"'":'',
                  ", ":" ",
                  'ain t':'aint',
                  '\u2005':'',
                  'won t':'wont',
                  'could ve':'couldve',
                  'shoul ve':'shouldve',
                  'would ve':'wouldve',
                  'i ve':'ive',
                  '-':'_',
                  'ain ':'aint ',
                  'in ':'ing ',
                  ' ing ':'ing '}
    
    for key in clean_dict:
        cleaned = cleaned.replace(key, clean_dict[key])
 
    rep_words = ['hol ','ooh ','oh ','love ','know ','yeah ','baby, ', 'gone, ', 'down, ',
                 'woah ','hey ','fight ','tryin ','skrt ', 'ma ', 'ooh yeah, ','hey, ',
                 'wild ','uh ','hillbilly ','baby ','wiggle ','fight ','want ']
    
    for rep in rep_words:
        for i in range (25):
            temp_text = rep * 3
            cleaned = cleaned.replace(temp_text,'')
            temp_text = rep * 2
            cleaned = cleaned.replace(temp_text,'')
       
    return cleaned

songs['cleaned_lyrics'] = songs.apply(clean_text, axis = 1)

# Building Stop Words

In [45]:
# adding artist names to stopwords
artist_names = []
for a in list(artists['artist']):
    artist_names.append(a.lower())

In [46]:
from sklearn.feature_extraction import text 
my_stop_words = ['intro','outro','chorus','verse','don','s']
stop_words = text.ENGLISH_STOP_WORDS.union(my_stop_words)

# Function to Return a N-gram Count DF

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

def freq_df(n, column_name):
    word_vectorizer = CountVectorizer(ngram_range=(n,n), analyzer='word',stop_words=stop_words)
    sparse_matrix = word_vectorizer.fit_transform(column_name)
    frequencies = sum(sparse_matrix).toarray()[0]
    count_df = pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency']).reset_index()
    count_df = count_df[~count_df['index'].isin(artist_names)]
    return count_df

# Count Vectorizer

In [48]:
all_songs_count = freq_df(2,songs['cleaned_lyrics'])
all_songs_count.sort_values('frequency').tail(10)

Unnamed: 0,index,frequency
7042,bitch im,158
97116,yeah im,186
49747,like im,190
52933,love love,198
45974,know im,204
46005,know know,205
25812,feel like,216
97614,yeah yeah,296
21223,dont know,327
21423,dont wanna,360


# Analysis By Artist Gender

In [49]:
female = songs[songs['gender'] == 'Female']
male = songs[songs['gender'] == 'Male']

#### female

In [50]:
f_count = freq_df(2,female['cleaned_lyrics'])
f_count.sort_values('frequency').tail(10)

Unnamed: 0,index,frequency
21226,want want,45
14390,oh_oh oh_oh,45
20366,trying im,48
1610,bitch im,55
19322,thats type,57
9557,im trying,57
22548,yeah yeah,58
12281,love love,64
4862,dont know,86
4939,dont wanna,107


#### male

In [51]:
m_count = freq_df(2,male['cleaned_lyrics'])
m_count.sort_values('frequency').tail(10)

Unnamed: 0,index,frequency
33382,just like,123
73334,yeah im,142
28762,hol hol,145
34813,know im,147
37663,like im,161
34837,know know,163
19469,feel like,167
16038,dont know,210
73739,yeah yeah,213
16198,dont wanna,228


# Analysis By Artist Age

In [52]:
Seven = songs[songs['birth_decade'] == '1970s']
Eight = songs[songs['birth_decade'] == '1980s']
Nine = songs[songs['birth_decade'] == '1990s']
Two_Thousands = songs[songs['birth_decade'] == '2000s']

In [53]:
seven_count = freq_df(2,Seven['cleaned_lyrics'])
seven_count.sort_values('frequency').tail(10)

Unnamed: 0,index,frequency
2344,hard love,16
72,againt play,16
489,bites dust,16
4488,play againt,17
1363,dont know,18
1387,dont stop,19
4293,ooh lets,21
3292,lets ride,22
6086,turning turning,27
1389,dont wanna,32


In [54]:
eight_count = freq_df(2,Eight['cleaned_lyrics'])
eight_count.sort_values('frequency').tail(10)

Unnamed: 0,index,frequency
9898,low life,33
16769,want body,33
9182,like girl,34
3235,da da,35
9610,look look,35
1252,bitch dont,36
8501,know know,38
16834,want want,38
3883,dont know,57
3937,dont wanna,67


In [55]:
nine_count = freq_df(2,Nine['cleaned_lyrics'])
nine_count.sort_values('frequency').tail(10)

Unnamed: 0,index,frequency
29057,like im,105
56798,yeah im,108
26828,know know,113
4222,bitch im,131
26808,know im,135
14912,feel like,138
30945,love love,162
12342,dont know,167
12496,dont wanna,173
57174,yeah yeah,222


In [56]:
thousand_count = freq_df(2,Two_Thousands['cleaned_lyrics'])
thousand_count.sort_values('frequency').tail(10)

Unnamed: 0,index,frequency
1980,dont want,24
3879,im stupid,24
8579,walk em,25
2499,fly away,26
6732,run away,28
418,bad yeah,29
4708,like im,32
9224,yeah yeah,38
1946,dont know,40
1979,dont wanna,48


# Analysis By Gender