In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import warnings
# Silence every warning for the rest of the session (this also hides
# deprecation notices raised by the libraries used below).
warnings.filterwarnings('ignore')
# Let pandas render up to 500 rows and 500 columns before truncating output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [23]:
# Load the artist and song tables; both paths are relative to the notebook's
# working directory.
artists = pd.read_csv('../data/artist_cleaned.csv')
songs = pd.read_csv('../data/songs_cleaned.csv')

# Dropping Songs Not in English

In [24]:
# Drop every song credited to the artists excluded from this analysis
# (the section treats their lyrics as non-English).
songs = songs[~songs['artist'].isin(['Bad Bunny', 'BTS'])]

# Merging Songs and Artists to get artist information

In [25]:
# Left join on 'artist': every song row is kept and gains the columns of its
# artist from `artists`.
# NOTE(review): the suffixes contain no separator, so a column name present in
# both frames would come out as `<name>song` / `<name>artist` — confirm that is
# intended.
# NOTE(review): `songs` is overwritten, so re-running this cell on its own
# merges the artist columns a second time.
songs = songs.merge(artists, how = 'left', on = 'artist', suffixes = ('song','artist'))

# First Round Cleaning

In [26]:
def clean_text(row):
    """Normalise the lyrics of one song for n-gram counting.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'lyrics'`` (a DataFrame row when used with
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The lower-cased lyrics after the literal substitutions below and after
        deleting doubled/tripled runs of common filler words. An empty string
        is returned when the lyrics value is not a string (e.g. NaN or None),
        so a missing lyric no longer aborts the whole ``apply``.
    """
    lyrics = row['lyrics']
    if not isinstance(lyrics, str):
        # Missing values have no .lower(); treat them as "no lyrics".
        return ''
    cleaned = lyrics.lower()

    # Literal substring replacements, applied in insertion order. Order matters:
    # apostrophes are stripped first, and 'ain ' is rewritten before the more
    # general 'in ' -> 'ing ' rule.
    # NOTE(review): these are plain substring matches, so 'ain ' also fires
    # inside words such as "again " and 'in ' also rewrites the word "in " and
    # endings such as "begin " — confirm this is acceptable for the analysis.
    clean_dict = {"'":'',
                  ", ":" ",
                  'ain t':'aint',
                  '\u2005':'',
                  'won t':'wont',
                  'could ve':'couldve',
                  'should ve':'shouldve',  # was misspelled 'shoul ve' and never matched "should ve"
                  'would ve':'wouldve',
                  'i ve':'ive',
                  '-':'_',
                  'ain ':'aint ',
                  'in ':'ing ',
                  ' ing ':'ing '}

    for old, new in clean_dict.items():
        cleaned = cleaned.replace(old, new)

    # Filler words/phrases whose immediate repetitions are deleted.
    rep_words = ['hol ','ooh ','oh ','love ','know ','yeah ','baby, ', 'gone, ', 'down, ',
                 'woah ','hey ','fight ','tryin ','skrt ', 'ma ', 'ooh yeah, ','hey, ',
                 'wild ','uh ','hillbilly ','baby ','wiggle ','fight ','want ']

    for rep in rep_words:
        triple, double = rep * 3, rep * 2
        # Up to 25 passes, as before; a pass that changes nothing means every
        # later pass would be a no-op too, so stop early.
        for _ in range(25):
            collapsed = cleaned.replace(triple, '').replace(double, '')
            if collapsed == cleaned:
                break
            cleaned = collapsed

    return cleaned

# Run clean_text once per row and store the result next to the raw lyrics.
songs['cleaned_lyrics'] = songs.apply(clean_text, axis='columns')

In [27]:
# Write the merged frame, including the new cleaned_lyrics column, to CSV
# without the row index.
# NOTE(review): the file name says "round_2" while this section is titled
# "First Round Cleaning" — confirm which one is right.
songs.to_csv('../data/songs_after_round_2_cleaning.csv', index=False)

# Building Stop Words

In [28]:
# Lower-cased artist names; freq_df drops any n-gram that is exactly one of them.
artist_names = [name.lower() for name in artists['artist']]

In [29]:
from sklearn.feature_extraction import text 
# Song-section labels plus the fragments 'don' and 's', ignored in addition to
# scikit-learn's built-in English stop-word list.
my_stop_words = [
    'intro', 'outro', 'chorus', 'verse', 'don', 's',
]
stop_words = text.ENGLISH_STOP_WORDS | frozenset(my_stop_words)

# Function to Return an N-gram Count DataFrame

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

def freq_df(n, column_name):
    """Count how often each word n-gram occurs across a collection of texts.

    Parameters
    ----------
    n : int
        Size of the n-grams; only n-grams of exactly this length are counted.
    column_name : iterable of str
        The documents to count over (e.g. a Series of cleaned lyrics).

    Returns
    -------
    pandas.DataFrame
        One row per n-gram with columns ``'index'`` (the n-gram text) and
        ``'frequency'`` (its total count over all documents). N-grams that are
        exactly equal to an entry of the module-level ``artist_names`` are
        removed. Stop words come from the module-level ``stop_words``.
    """
    word_vectorizer = CountVectorizer(
        ngram_range=(n, n),
        analyzer='word',
        # Recent scikit-learn releases validate this parameter and reject a
        # frozenset; a list is accepted by old and new versions alike.
        stop_words=list(stop_words),
    )
    sparse_matrix = word_vectorizer.fit_transform(column_name)

    # Column totals computed by the sparse matrix itself instead of adding the
    # rows one by one with the builtin sum().
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on versions that predate get_feature_names_out().
    if hasattr(word_vectorizer, 'get_feature_names_out'):
        terms = word_vectorizer.get_feature_names_out()
    else:
        terms = word_vectorizer.get_feature_names()

    count_df = pd.DataFrame(frequencies, index=terms, columns=['frequency']).reset_index()
    count_df = count_df[~count_df['index'].isin(artist_names)]
    return count_df

# Count Vectorizer

In [31]:
# Bigram counts over every song; display the ten most frequent (ascending).
all_songs_count = freq_df(n=2, column_name=songs['cleaned_lyrics'])
all_songs_count.sort_values(by='frequency').tail(10)

Unnamed: 0,index,frequency
41113,dont need,336
88107,know know,358
186120,yeah im,364
101005,love love,372
94996,like im,409
50146,feel like,457
88055,know im,474
186915,yeah yeah,556
41362,dont wanna,640
41042,dont know,811


# Analysis By Artist Gender

In [32]:
# One subset of `songs` per value of the 'gender' column used in this section.
female, male = (
    songs[songs['gender'] == label]
    for label in ('Female', 'Male')
)

In [33]:
# (rows, columns) of the subset where gender == 'Female'.
female.shape

(490, 15)

In [34]:
# (rows, columns) of the subset where gender == 'Male'.
male.shape

(1310, 15)

#### female

In [35]:
# Bigram counts for the female subset.
f_count = freq_df(n=2, column_name=female['cleaned_lyrics'])

#### male

In [36]:
# Bigram counts for the male subset.
m_count = freq_df(n=2, column_name=male['cleaned_lyrics'])

# Analysis By Artist Age

In [37]:
# One subset of `songs` per birth-decade label, in chronological order.
Six, Seven, Eight, Nine, Two_Thousands = (
    songs[songs['birth_decade'] == decade]
    for decade in ('1960s', '1970s', '1980s', '1990s', '2000s')
)

In [38]:
# Unigram counts for the '1960s' subset.
six_count = freq_df(n=1, column_name=Six['cleaned_lyrics'])

In [39]:
# Unigram counts for the '1970s' subset.
seven_count = freq_df(n=1, column_name=Seven['cleaned_lyrics'])

In [40]:
# Unigram counts for the '1980s' subset.
eight_count = freq_df(n=1, column_name=Eight['cleaned_lyrics'])

In [41]:
# Unigram counts for the '1990s' subset.
nine_count = freq_df(n=1, column_name=Nine['cleaned_lyrics'])

In [42]:
# Unigram counts for the '2000s' subset.
thousand_count = freq_df(n=1, column_name=Two_Thousands['cleaned_lyrics'])