Author: Elaine C, Dillion D  
Organized by: Elaine C

In [1]:
import pandas as pd
import numpy as np

## Step 1. Read Data

In [2]:
spotify = pd.read_csv('../Data/SpotifyFeatures.csv')

In [3]:
spotify.columns

Index(['genre', 'artist_name', 'track_name', 'track_id', 'popularity',
       'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')

In [4]:
spotify.sort_values('popularity', ascending = False).head(10)

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
9027,Dance,Ariana Grande,7 rings,14msK75pk3pA33pzPVNtBF,100,0.578,0.725,178640,0.321,0.0,C#,0.0884,-10.744,Minor,0.323,70.142,4/4,0.319
107804,Pop,Ariana Grande,7 rings,14msK75pk3pA33pzPVNtBF,100,0.578,0.725,178640,0.321,0.0,C#,0.0884,-10.744,Minor,0.323,70.142,4/4,0.319
86951,Rap,Post Malone,Wow.,6MWtB6iiXyIwun0YzU6DFP,99,0.163,0.833,149520,0.539,2e-06,B,0.101,-7.399,Minor,0.178,99.947,4/4,0.385
107803,Pop,Post Malone,Wow.,6MWtB6iiXyIwun0YzU6DFP,99,0.163,0.833,149520,0.539,2e-06,B,0.101,-7.399,Minor,0.178,99.947,4/4,0.385
107802,Pop,Ariana Grande,"break up with your girlfriend, i'm bored",4kV4N9D1iKVxx1KLvtTpjS,99,0.0421,0.726,190440,0.554,0.0,F,0.106,-5.29,Minor,0.0917,169.999,4/4,0.335
9026,Dance,Ariana Grande,"break up with your girlfriend, i'm bored",4kV4N9D1iKVxx1KLvtTpjS,99,0.0421,0.726,190440,0.554,0.0,F,0.106,-5.29,Minor,0.0917,169.999,4/4,0.335
66643,Hip-Hop,Daddy Yankee,Con Calma,5w9c2J52mkdntKOmRLeM2m,98,0.11,0.737,193227,0.86,2e-06,G#,0.0574,-2.652,Minor,0.0593,93.989,4/4,0.656
107909,Pop,Daddy Yankee,Con Calma,5w9c2J52mkdntKOmRLeM2m,98,0.11,0.737,193227,0.86,2e-06,G#,0.0574,-2.652,Minor,0.0593,93.989,4/4,0.656
138918,Reggaeton,Daddy Yankee,Con Calma,5w9c2J52mkdntKOmRLeM2m,98,0.11,0.737,193227,0.86,2e-06,G#,0.0574,-2.652,Minor,0.0593,93.989,4/4,0.656
107829,Pop,Ava Max,Sweet but Psycho,25sgk305KZfyuqVBQIahim,97,0.0691,0.719,187436,0.704,0.0,C#,0.166,-4.724,Major,0.0476,133.002,4/4,0.628


## Step 2. Features Engineering

### Column Genre

In [5]:
# Citation from Elaine
genre_dum = pd.get_dummies(data=spotify[['track_id', 'genre']], columns=['genre'], dtype=int)
genre_dum_2 = genre_dum.groupby('track_id').sum()
genre_dum_2.shape

(176774, 27)

In [6]:
genre_dum_2.columns

Index(['genre_A Capella', 'genre_Alternative', 'genre_Anime', 'genre_Blues',
       'genre_Children's Music', 'genre_Children’s Music', 'genre_Classical',
       'genre_Comedy', 'genre_Country', 'genre_Dance', 'genre_Electronic',
       'genre_Folk', 'genre_Hip-Hop', 'genre_Indie', 'genre_Jazz',
       'genre_Movie', 'genre_Opera', 'genre_Pop', 'genre_R&B', 'genre_Rap',
       'genre_Reggae', 'genre_Reggaeton', 'genre_Rock', 'genre_Ska',
       'genre_Soul', 'genre_Soundtrack', 'genre_World'],
      dtype='object')

In [7]:
# Citation from Elaine
# Drop Genre column in origin data
spotify_pre_merge = spotify.drop(columns = ['genre'],axis = 1).drop_duplicates()

In [8]:
# Citation from Elaine
# Merge origin data w/o Genre with genre_dum
spotify_df = pd.merge(
    left = spotify_pre_merge, 
    right = genre_dum_2,
    left_on = 'track_id',
    right_on = genre_dum_2.index,
    how = 'right'
    
)
spotify_df.sort_values('popularity', ascending = False).head(10)

Unnamed: 0,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,...,genre_Pop,genre_R&B,genre_Rap,genre_Reggae,genre_Reggaeton,genre_Rock,genre_Ska,genre_Soul,genre_Soundtrack,genre_World
26315,Ariana Grande,7 rings,14msK75pk3pA33pzPVNtBF,100,0.578,0.725,178640,0.321,0.0,C#,...,1,0,0,0,0,0,0,0,0,0
156214,Post Malone,Wow.,6MWtB6iiXyIwun0YzU6DFP,99,0.163,0.833,149520,0.539,2e-06,B,...,1,0,1,0,0,0,0,0,0,0
116777,Ariana Grande,"break up with your girlfriend, i'm bored",4kV4N9D1iKVxx1KLvtTpjS,99,0.0421,0.726,190440,0.554,0.0,F,...,1,0,0,0,0,0,0,0,0,0
145882,Daddy Yankee,Con Calma,5w9c2J52mkdntKOmRLeM2m,98,0.11,0.737,193227,0.86,2e-06,G#,...,1,0,0,0,1,0,0,0,0,0
81894,Post Malone,Sunflower - Spider-Man: Into the Spider-Verse,3KkXRkHbMCARz0aVfEt68P,97,0.556,0.76,158040,0.479,0.0,D,...,1,0,1,0,0,0,0,0,0,0
157962,Sam Smith,Dancing With A Stranger (with Normani),6Qs4SXO9dwPj5GKvVOv8Ki,97,0.45,0.741,171030,0.52,2e-06,G#,...,1,0,0,0,0,0,0,0,0,0
51328,Ava Max,Sweet but Psycho,25sgk305KZfyuqVBQIahim,97,0.0691,0.719,187436,0.704,0.0,C#,...,1,0,0,0,0,0,0,0,0,0
64844,Marshmello,Happier,2dpaYNEQHiRxtZbfNsse99,97,0.191,0.687,214290,0.792,0.0,F,...,1,0,0,0,0,0,0,0,0,0
143086,Halsey,Without Me,5p7ujcrUXASCNwRaWNHR1C,97,0.297,0.752,201661,0.488,9e-06,F#,...,1,0,0,0,0,0,0,0,0,0
140622,Pedro Capó,Calma - Remix,5iwz1NiezX7WWjnCgY5TH4,97,0.323,0.826,238200,0.773,0.0,B,...,1,0,0,0,0,0,0,0,0,0


In [9]:
spotify_df.shape

(191056, 44)

In [10]:
spotify_df.artist_name.nunique()

14564

### Map other Categorical Features

In [11]:
# Citation from Dillon
spotify_df['mode'] = spotify_df['mode'].map({'Major': 1, 'Minor': 0})
spotify_df['mode'].value_counts()

mode
1    125788
0     65268
Name: count, dtype: int64

In [12]:
# Citation from Dillon
# Turning time_signature column to a float

def frc_to_float(col):
    new_vals = [i.split("/") for i in col]
    float_list = []
    for pair in new_vals:
        num = int(pair[0])/int(pair[1])
        float_list.append(num)
    return float_list

spotify_df['time_signature'] = frc_to_float(spotify_df['time_signature'])

In [13]:
spotify_df['time_signature'].value_counts()

time_signature
1.00    162200
0.75     21842
1.25      4591
0.25      2416
0.00         7
Name: count, dtype: int64

In [14]:
spotify_df = pd.get_dummies(columns = ['key'],data=spotify_df, dtype='int', drop_first=True)
spotify_df.shape

(191056, 54)

## Step 3. Features Selection

In [15]:
# Convert the milliseconds to minutes
spotify_df['duration_min'] = spotify_df['duration_ms']/60_000
# Drop duration_ms
spotify_df.drop(columns = ['duration_ms'], inplace = True)

In [19]:
# Citation from Dillon
df_numeric = spotify_df[['popularity', 'danceability', 'energy', 'instrumentalness', 
                        'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature', 
                        'valence','duration_min']]
df_numeric.corr()[['popularity']].sort_values(by='popularity',ascending =False)

Unnamed: 0,popularity
popularity,1.0
loudness,0.340087
energy,0.251953
danceability,0.216698
time_signature,0.131815
tempo,0.08457
valence,0.055765
duration_min,0.016117
liveness,-0.163029
instrumentalness,-0.165713


In [20]:
# Drop low correlation for duration_min
spotify_df.drop(columns=['duration_min'],inplace = True)

In [21]:
spotify.columns

Index(['genre', 'artist_name', 'track_name', 'track_id', 'popularity',
       'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')

In [22]:
# Drop artist_name, track_name, track_id
spotify_df.drop(columns=['artist_name','track_name','track_id'],inplace = True)

In [23]:
spotify_df.shape

(191056, 50)

In [24]:
spotify_df.to_csv('../Data/Cleaned Data/Cleaned_SpotifyFeatures.csv',index=False)