In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [22]:
# Load the two pre-cleaned inputs: one table of artists and one table of songs.
artists = pd.read_csv(filepath_or_buffer='../data/artist_cleaned.csv')
songs = pd.read_csv(filepath_or_buffer='../data/songs_cleaned.csv')

# Dropping Songs Not in English

In [23]:
# Language is filtered by artist name: every song credited to one of these
# artists is removed in a single pass.
non_english_artists = ['Bad Bunny', 'BTS']
songs = songs[~songs['artist'].isin(non_english_artists)]

# Merging Songs and Artists to get artist information

In [24]:
# Left join on 'artist' so every song row is kept even when no artist record
# matches. Columns present in both frames get 'song' / 'artist' appended
# directly to their names (no separator).
songs = pd.merge(
    songs,
    artists,
    on='artist',
    how='left',
    suffixes=('song', 'artist'),
)

# First Round Cleaning

In [25]:
def clean_text(row):
    """Normalise the lyrics of one song row for n-gram counting.

    Steps, in order:
      1. lower-case the text;
      2. apply a fixed sequence of literal substring replacements (apostrophe
         removal, re-joining split contractions, ``-`` -> ``_`` ...);
      3. delete runs of two or three consecutive copies of selected filler
         words/phrases.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'lyrics'`` (a DataFrame row when used with
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The cleaned lyrics, or ``''`` when ``row['lyrics']`` is not a string
        (e.g. NaN for a missing value) instead of raising on ``.lower()``.
    """
    lyrics = row['lyrics']
    if not isinstance(lyrics, str):
        return ''
    cleaned = lyrics.lower()

    # Order matters: replacements are applied top to bottom, each on the
    # result of the previous one (dicts preserve insertion order).
    # NOTE(review): these are plain substring replacements with no word
    # boundaries, so 'ain ' and 'in ' also rewrite the standalone word "in"
    # and the tails of words such as "rain" or "again". Left as-is because
    # downstream counts depend on the current output -- confirm intent.
    clean_dict = {"'": '',
                  ", ": " ",
                  'ain t': 'aint',
                  '\u2005': '',
                  'won t': 'wont',
                  'could ve': 'couldve',
                  'should ve': 'shouldve',  # was the typo 'shoul ve', which never matched "should ve"
                  'would ve': 'wouldve',
                  'i ve': 'ive',
                  '-': '_',
                  'ain ': 'aint ',
                  'in ': 'ing ',
                  ' ing ': 'ing '}

    for old, new in clean_dict.items():
        cleaned = cleaned.replace(old, new)

    # Filler words/phrases whose doubled or tripled repetitions are removed.
    # NOTE(review): entries containing ', ' and 'tryin ' will rarely match,
    # because ', ' and 'in ' have already been rewritten above.
    rep_words = ['hol ', 'ooh ', 'oh ', 'love ', 'know ', 'yeah ', 'baby, ', 'gone, ', 'down, ',
                 'woah ', 'hey ', 'fight ', 'tryin ', 'skrt ', 'ma ', 'ooh yeah, ', 'hey, ',
                 'wild ', 'uh ', 'hillbilly ', 'baby ', 'wiggle ', 'fight ', 'want ']

    for rep in rep_words:
        # Build the patterns once per word rather than on every pass.
        triple = rep * 3
        double = rep * 2
        # Up to 25 passes, as before; stop early once a pass changes nothing,
        # since every later pass would then be a no-op as well.
        for _ in range(25):
            before = cleaned
            cleaned = cleaned.replace(triple, '')
            cleaned = cleaned.replace(double, '')
            if cleaned == before:
                break

    return cleaned

# Run clean_text over every row and store the result in 'cleaned_lyrics'.
songs = songs.assign(cleaned_lyrics=songs.apply(clean_text, axis=1))

In [26]:
# Persist the merged frame, including 'cleaned_lyrics', without the row index.
# NOTE(review): the file name says "round_2" while this section is headed
# "First Round Cleaning" -- confirm which is intended.
songs.to_csv(path_or_buf='../data/songs_after_round_2_cleaning.csv', index=False)

# Building Stop Words

In [27]:
# Lower-cased artist names; freq_df drops any n-gram that exactly equals one.
artist_names = [name.lower() for name in artists['artist']]

In [28]:
from sklearn.feature_extraction import text 
# Extra tokens to ignore on top of scikit-learn's English list
# (presumably lyric-sheet section labels and contraction fragments -- confirm).
my_stop_words = ['intro', 'outro', 'chorus', 'verse', 'don', 's']
stop_words = text.ENGLISH_STOP_WORDS | frozenset(my_stop_words)

# Function to Return an N-gram Count DataFrame

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

def freq_df(n, column_name):
    """Count how often each word n-gram occurs across a collection of texts.

    Parameters
    ----------
    n : int
        N-gram size; used as both the lower and upper bound of ``ngram_range``.
    column_name : iterable of str
        The documents themselves (callers pass a Series such as
        ``songs['cleaned_lyrics']``), not a column label.

    Returns
    -------
    pandas.DataFrame
        One row per n-gram with columns ``'index'`` (the n-gram text) and
        ``'frequency'`` (its total count over all documents). Rows whose
        n-gram exactly equals an entry of ``artist_names`` are removed.

    Notes
    -----
    Reads the notebook-level ``stop_words`` and ``artist_names``.
    """
    # Recent scikit-learn releases validate that stop_words is 'english', a
    # list or None, so a frozenset is rejected; list() accepts either form.
    word_vectorizer = CountVectorizer(ngram_range=(n, n), analyzer='word',
                                      stop_words=list(stop_words))
    sparse_matrix = word_vectorizer.fit_transform(column_name)

    # Column totals in one sparse operation, rather than adding the matrix
    # row by row with the builtin sum().
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on versions that predate it.
    get_names = (getattr(word_vectorizer, 'get_feature_names_out', None)
                 or word_vectorizer.get_feature_names)

    count_df = pd.DataFrame(frequencies, index=get_names(), columns=['frequency']).reset_index()
    count_df = count_df[~count_df['index'].isin(artist_names)]
    return count_df

# Count Vectorizer

In [30]:
# Bigram counts over every song; the ascending sort puts the ten most
# frequent bigrams in the last ten rows.
all_songs_count = freq_df(n=2, column_name=songs['cleaned_lyrics'])
all_songs_count.sort_values(by='frequency').tail(10)

Unnamed: 0,index,frequency
41113,dont need,336
88107,know know,358
186120,yeah im,364
101005,love love,372
94996,like im,409
50146,feel like,457
88055,know im,474
186915,yeah yeah,556
41362,dont wanna,640
41042,dont know,811


# Analysis By Artist Gender

In [31]:
# Split songs by the artist's recorded gender; rows with any other value
# (or a missing one) land in neither subset.
female = songs.loc[songs['gender'].eq('Female')]
male = songs.loc[songs['gender'].eq('Male')]

In [32]:
female.shape

(490, 15)

In [33]:
male.shape

(1310, 15)

#### female

In [34]:
# Ten most frequent bigrams in songs by female artists (ascending order).
f_count = freq_df(n=2, column_name=female['cleaned_lyrics'])
f_count.sort_values(by='frequency').tail(10)

Unnamed: 0,index,frequency
10189,eh eh,89
3061,bitch im,89
19304,just like,89
383,ah ah,90
42932,yeah yeah,91
11290,feel like,91
20105,know im,100
9446,dont wanna,140
23357,love love,158
9334,dont know,183


#### male

In [35]:
# Ten most frequent bigrams in songs by male artists (ascending order).
m_count = freq_df(n=2, column_name=male['cleaned_lyrics'])
m_count.sort_values(by='frequency').tail(10)

Unnamed: 0,index,frequency
31766,dont need,249
61016,im just,255
68504,know know,261
144431,yeah im,296
73928,like im,342
68463,know im,345
38733,feel like,347
145106,yeah yeah,423
31972,dont wanna,447
31710,dont know,583


# Analysis By Artist Age

In [36]:
# One subset of songs per artist birth decade; 'birth_decade' is compared
# against string labels, so other or missing values fall into no subset.
Seven = songs.loc[songs['birth_decade'].eq('1970s')]
Eight = songs.loc[songs['birth_decade'].eq('1980s')]
Nine = songs.loc[songs['birth_decade'].eq('1990s')]
Two_Thousands = songs.loc[songs['birth_decade'].eq('2000s')]

In [37]:
# Ten most frequent bigrams for artists born in the 1970s (ascending order).
seven_count = freq_df(n=2, column_name=Seven['cleaned_lyrics'])
seven_count.sort_values(by='frequency').tail(10)

Unnamed: 0,index,frequency
9489,right im,24
12114,turning turning,27
5400,im gonna,28
9246,ready right,28
6306,la la,30
4839,hell right,32
5483,im ready,35
7254,love love,38
2718,dont know,41
2763,dont wanna,42


In [38]:
# Ten most frequent bigrams for artists born in the 1980s (ascending order).
eight_count = freq_df(n=2, column_name=Eight['cleaned_lyrics'])
eight_count.sort_values(by='frequency').tail(10)

Unnamed: 0,index,frequency
22064,nigga dope,60
34327,wet wet,63
22814,oh yeah,67
36071,young nigga,78
7139,diddily diddily,81
16874,know know,85
7670,dont know,86
8474,eh eh,88
7744,dont wanna,91
19577,love love,107


In [39]:
# Ten most frequent bigrams for artists born in the 1990s (ascending order).
nine_count = freq_df(n=2, column_name=Nine['cleaned_lyrics'])
nine_count.sort_values(by='frequency').tail(10)

Unnamed: 0,index,frequency
1857,aint got,205
24654,dont need,211
8210,bitch im,220
111218,yeah im,224
57039,like im,249
29882,feel like,258
52759,know im,292
24849,dont wanna,357
111808,yeah yeah,374
24601,dont know,471


In [40]:
# Ten most frequent bigrams for artists born in the 2000s (ascending order).
thousand_count = freq_df(n=2, column_name=Two_Thousands['cleaned_lyrics'])
thousand_count.sort_values(by='frequency').tail(10)

Unnamed: 0,index,frequency
4686,feel like,39
8342,just want,43
18582,zoo york,46
3929,dont want,46
9316,like im,48
1310,bitch im,57
8627,know im,62
3928,dont wanna,70
18325,yeah yeah,85
3874,dont know,103
