In [1]:
# imports
import pandas as pd

In [2]:
# reading in the csv
path = '../data/top10s.csv'

data_df = pd.read_csv(path, encoding='cp1252')
data_df.head()

Unnamed: 0.1,Unnamed: 0,title,artist,top genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
0,1,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83
1,2,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,82
2,3,TiK ToK,Kesha,dance pop,2010,120,84,76,-3,29,71,200,10,14,80
3,4,Bad Romance,Lady Gaga,dance pop,2010,119,92,70,-4,8,71,295,0,4,79
4,5,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,78


In [3]:
# checking the cols
data_df.columns

Index(['Unnamed: 0', 'title', 'artist', 'top genre', 'year', 'bpm', 'nrgy',
       'dnce', 'dB', 'live', 'val', 'dur', 'acous', 'spch', 'pop'],
      dtype='object')

In [4]:
# subsetting the cols to remove 'unnamed:0' file
data_df = data_df[['title', 'artist', 'top genre', 'year', 'bpm', 'nrgy',
       'dnce', 'dB', 'live', 'val', 'dur', 'acous', 'spch', 'pop']]
data_df.head()

Unnamed: 0,title,artist,top genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
0,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83
1,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,82
2,TiK ToK,Kesha,dance pop,2010,120,84,76,-3,29,71,200,10,14,80
3,Bad Romance,Lady Gaga,dance pop,2010,119,92,70,-4,8,71,295,0,4,79
4,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,78


In [5]:
# renaming columns
data_df.columns = ['title', 'artist', 'subgenre', 'year', 'bpm', 'nrgy',
       'dnce', 'dB', 'live', 'val', 'dur', 'acous', 'spch', 'pop']
data_df.head()

Unnamed: 0,title,artist,subgenre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
0,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83
1,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,82
2,TiK ToK,Kesha,dance pop,2010,120,84,76,-3,29,71,200,10,14,80
3,Bad Romance,Lady Gaga,dance pop,2010,119,92,70,-4,8,71,295,0,4,79
4,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,78


In [6]:
# array of unique genres
genres = data_df['subgenre'].unique()
genres

array(['neo mellow', 'detroit hip hop', 'dance pop', 'pop',
       'canadian pop', 'hip pop', 'barbadian pop', 'atl hip hop',
       'australian pop', 'indie pop', 'art pop', 'colombian pop',
       'big room', 'british soul', 'chicago rap', 'acoustic pop',
       'permanent wave', 'boy band', 'baroque pop', 'celtic rock',
       'electro', 'complextro', 'canadian hip hop', 'candy pop',
       'alaska indie', 'folk-pop', 'metropopolis', 'house',
       'australian hip hop', 'electropop', 'australian dance',
       'hollywood', 'canadian contemporary r&b',
       'irish singer-songwriter', 'tropical house', 'belgian edm',
       'french indie pop', 'hip hop', 'danish pop', 'latin',
       'canadian latin', 'electronic trap', 'edm', 'electro house',
       'downtempo', 'brostep', 'contemporary country', 'moroccan pop',
       'escape room', 'alternative r&b'], dtype=object)

In [7]:
# for loop to reduce genres to broader groups
genre = []
for x in range(0, len(data_df)):
    this_genre = data_df.iloc[x, 2]
    genre_count = data_df[data_df['subgenre'] == this_genre]['subgenre'].count()
    if this_genre[-3:] == 'pop':
        genre.append('pop')
    elif this_genre[-7:] == 'hip hop':
        genre.append('hip hop')
    elif this_genre[-3:] == 'edm':
        genre.append('edm')
    elif genre_count == 1:
        genre.append('other')
    else:
        genre.append(this_genre)

In [8]:
# checking number of genres after the for loop
print(len(set(genre)))
set(genre)

19


{'australian dance',
 'big room',
 'boy band',
 'british soul',
 'brostep',
 'canadian contemporary r&b',
 'complextro',
 'downtempo',
 'edm',
 'electro',
 'electronic trap',
 'escape room',
 'hip hop',
 'latin',
 'neo mellow',
 'other',
 'permanent wave',
 'pop',
 'tropical house'}

In [9]:
# creating a list of the unique genres
unique_genres = list(set(genre))

In [10]:
# making a dict to assign numerical vals to each genre
genre_range = [x for x in range(len(set(genre)))]
genre_dict = dict(zip(unique_genres, genre_range)) 
genre_dict

{'pop': 0,
 'latin': 1,
 'complextro': 2,
 'escape room': 3,
 'australian dance': 4,
 'canadian contemporary r&b': 5,
 'hip hop': 6,
 'electro': 7,
 'downtempo': 8,
 'brostep': 9,
 'electronic trap': 10,
 'other': 11,
 'big room': 12,
 'british soul': 13,
 'boy band': 14,
 'permanent wave': 15,
 'tropical house': 16,
 'edm': 17,
 'neo mellow': 18}

In [11]:
# creating a number list for use with genres in charts
genre_numbers = [genre_dict[genre[x]] for x in range(0, len(genre))]

In [19]:
# adding the genre and genre number cols and re-ordering
data_df['genre'] = genre
data_df['genre_num'] = genre_numbers
data_df = data_df[['title', 'artist', 'genre', 'genre_num', 'subgenre', 'year', 'bpm', 'nrgy', 'dnce', 'dB',
       'live', 'val', 'dur', 'acous', 'spch', 'pop']]
data_df.head(20)

Unnamed: 0,title,artist,genre,genre_num,subgenre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
0,"Hey, Soul Sister",Train,neo mellow,18,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83
1,Love The Way You Lie,Eminem,hip hop,6,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,82
2,TiK ToK,Kesha,pop,0,dance pop,2010,120,84,76,-3,29,71,200,10,14,80
3,Bad Romance,Lady Gaga,pop,0,dance pop,2010,119,92,70,-4,8,71,295,0,4,79
4,Just the Way You Are,Bruno Mars,pop,0,pop,2010,109,84,64,-5,9,43,221,2,4,78
5,Baby,Justin Bieber,pop,0,canadian pop,2010,65,86,73,-5,11,54,214,4,14,77
6,Dynamite,Taio Cruz,pop,0,dance pop,2010,120,78,75,-4,4,82,203,0,9,77
7,Secrets,OneRepublic,pop,0,dance pop,2010,148,76,52,-6,12,38,225,7,4,77
8,Empire State of Mind (Part II) Broken Down,Alicia Keys,pop,0,hip pop,2010,93,37,48,-8,12,14,216,74,3,76
9,Only Girl (In The World),Rihanna,pop,0,barbadian pop,2010,126,72,79,-4,7,61,235,13,4,73


In [18]:
grouped_by_df = data_df.groupby('genre').count().sort_values('title', ascending=False)
grouped_by_df.head()

Unnamed: 0_level_0,title,artist,genre_num,subgenre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
pop,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484
boy band,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15
hip hop,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14
british soul,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11
other,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11


In [14]:
# checking for null values
data_df.isnull().sum()

title        0
artist       0
genre        0
genre_num    0
subgenre     0
year         0
bpm          0
nrgy         0
dnce         0
dB           0
live         0
val          0
dur          0
acous        0
spch         0
pop          0
dtype: int64

In [15]:
# exporting cleaned data
data_df.to_csv('../data/data_cleaned.csv', index=False)

In [16]:
# all unique artist names
unique_artists = list(data_df['artist'].unique())
unique_artists

['Train',
 'Eminem',
 'Kesha',
 'Lady Gaga',
 'Bruno Mars',
 'Justin Bieber',
 'Taio Cruz',
 'OneRepublic',
 'Alicia Keys',
 'Rihanna',
 'Flo Rida',
 'Mike Posner',
 'Far East Movement',
 'Usher',
 'Sean Kingston',
 'The Black Eyed Peas',
 'Adam Lambert',
 'Maroon 5',
 'Neon Trees',
 'Selena Gomez & The Scene',
 'Enrique Iglesias',
 'Katy Perry',
 'Britney Spears',
 '3OH!3',
 'David Guetta',
 'Christina Aguilera',
 'Florence + The Machine',
 'Shakira',
 'Tinie Tempah',
 'T.I.',
 'Martin Solveig',
 'Christina Perri',
 'Adele',
 'Pitbull',
 'Beyoncé',
 'Hot Chelle Rae',
 'Avril Lavigne',
 'Kanye West',
 'LMFAO',
 'Jessie J',
 'Jennifer Lopez',
 'Chris Brown',
 'Sleeping At Last',
 'Nicki Minaj',
 'P!nk',
 'Coldplay',
 'One Direction',
 'Taylor Swift',
 'Carly Rae Jepsen',
 'Kelly Clarkson',
 'Owl City',
 'The Wanted',
 'fun.',
 'Ellie Goulding',
 'Gym Class Heroes',
 'Avicii',
 'The Script',
 'Miley Cyrus',
 'Swedish House Mafia',
 'Daft Punk',
 'James Arthur',
 'Robin Thicke',
 'Demi Lo